# 10 Minutes to pandas
请参阅[官方文档](https://pandas.pydata.org/docs/user_guide/10min.html)

### 处理丢失数据

Pandas 使用 numpy.nan（NaN）来表示丢失的数据，默认情况下它不参与计算。

In [1]:
# Render matplotlib figures inline in the notebook
%matplotlib inline
# Package imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Six consecutive daily timestamps starting 2016-03-01, used as the row index
dates = pd.date_range('20160301', periods=6)
# 6x4 frame of random values (np.random.randn) with columns A-D
df = pd.DataFrame(data=np.random.randn(6, 4), index=dates, columns=list('ABCD'))
# Last expression in the cell: display the frame
df

Unnamed: 0,A,B,C,D
2016-03-01,-0.985666,0.240058,0.716721,0.352009
2016-03-02,-1.563644,0.091766,1.081764,0.951541
2016-03-03,0.27976,-0.316136,1.198073,-0.562947
2016-03-04,1.174777,-0.225305,-0.280256,-0.074768
2016-03-05,2.173366,0.907038,-1.104678,-0.921779
2016-03-06,0.200422,0.442619,1.97033,-0.609867


In [3]:
# Keep only the first four dates and add a new column 'E'; 'E' has no source
# data, so every value in it is missing (NaN)
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1

Unnamed: 0,A,B,C,D,E
2016-03-01,-0.985666,0.240058,0.716721,0.352009,
2016-03-02,-1.563644,0.091766,1.081764,0.951541,
2016-03-03,0.27976,-0.316136,1.198073,-0.562947,
2016-03-04,1.174777,-0.225305,-0.280256,-0.074768,


In [4]:
# Set 'E' to 1 for the 2nd and 3rd dates only; the other rows keep their NaN
df1.loc[dates[1:3], 'E'] = 1
df1

Unnamed: 0,A,B,C,D,E
2016-03-01,-0.985666,0.240058,0.716721,0.352009,
2016-03-02,-1.563644,0.091766,1.081764,0.951541,1.0
2016-03-03,0.27976,-0.316136,1.198073,-0.562947,1.0
2016-03-04,1.174777,-0.225305,-0.280256,-0.074768,


In [5]:
# Return a new frame without any row that contains at least one NaN
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2016-03-02,-1.563644,0.091766,1.081764,0.951541,1
2016-03-03,0.27976,-0.316136,1.198073,-0.562947,1


In [6]:
# df1 itself still contains the NaN rows: dropna returned a new frame
df1

Unnamed: 0,A,B,C,D,E
2016-03-01,-0.985666,0.240058,0.716721,0.352009,
2016-03-02,-1.563644,0.091766,1.081764,0.951541,1.0
2016-03-03,0.27976,-0.316136,1.198073,-0.562947,1.0
2016-03-04,1.174777,-0.225305,-0.280256,-0.074768,


In [7]:
# Return a new frame with every NaN replaced by 5
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,E
2016-03-01,-0.985666,0.240058,0.716721,0.352009,5
2016-03-02,-1.563644,0.091766,1.081764,0.951541,1
2016-03-03,0.27976,-0.316136,1.198073,-0.562947,1
2016-03-04,1.174777,-0.225305,-0.280256,-0.074768,5


In [8]:
# df1 is unchanged: fillna returned a new frame
df1

Unnamed: 0,A,B,C,D,E
2016-03-01,-0.985666,0.240058,0.716721,0.352009,
2016-03-02,-1.563644,0.091766,1.081764,0.951541,1.0
2016-03-03,0.27976,-0.316136,1.198073,-0.562947,1.0
2016-03-04,1.174777,-0.225305,-0.280256,-0.074768,


In [9]:
# Boolean mask of the same shape: True where a value is missing
pd.isnull(df1)

Unnamed: 0,A,B,C,D,E
2016-03-01,False,False,False,False,True
2016-03-02,False,False,False,False,False
2016-03-03,False,False,False,False,False
2016-03-04,False,False,False,False,True


### 统计

numpy.nan（NaN）不参与计算：统计时默认会跳过缺失值。

In [10]:
# Column means; NaN is skipped, so 'E' averages only its two non-missing values
df1.mean()

A   -0.273693
B   -0.052404
C    0.679076
D    0.166459
E    1.000000
dtype: float64

In [11]:
# Row means; rows where 'E' is NaN are averaged over the remaining four columns
df1.mean(axis=1)

2016-03-01    0.080781
2016-03-02    0.312285
2016-03-03    0.319750
2016-03-04    0.148612
Freq: D, dtype: float64

In [12]:
# Row means of the original six-row frame (no missing values here)
df.mean(axis=1)

2016-03-01    0.080781
2016-03-02    0.140357
2016-03-03    0.149687
2016-03-04    0.148612
2016-03-05    0.263487
2016-03-06    0.500876
Freq: D, dtype: float64

In [13]:
# Column sums
df.sum()

A    1.279015
B    1.140040
C    3.581954
D   -0.865811
dtype: float64

In [14]:
# Row sums: axis='columns' adds across the columns, yielding one value per row
df.sum(axis='columns')

2016-03-01    0.323123
2016-03-02    0.561427
2016-03-03    0.598749
2016-03-04    0.594449
2016-03-05    1.053947
2016-03-06    2.003504
Freq: D, dtype: float64

In [15]:
# Running total down each column
df.cumsum()

Unnamed: 0,A,B,C,D
2016-03-01,-0.985666,0.240058,0.716721,0.352009
2016-03-02,-2.549311,0.331824,1.798486,1.30355
2016-03-03,-2.269551,0.015688,2.996558,0.740602
2016-03-04,-1.094774,-0.209616,2.716302,0.665835
2016-03-05,1.078593,0.697421,1.611624,-0.255944
2016-03-06,1.279015,1.14004,3.581954,-0.865811


In [16]:
# Shift the values down two positions along the date index: the first two
# slots become NaN and the last two values (6, 8) fall off the end
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
s

2016-03-01   NaN
2016-03-02   NaN
2016-03-03     1
2016-03-04     3
2016-03-05     5
2016-03-06   NaN
Freq: D, dtype: float64

In [17]:
# Display df for comparison with the subtraction result
df

Unnamed: 0,A,B,C,D
2016-03-01,-0.985666,0.240058,0.716721,0.352009
2016-03-02,-1.563644,0.091766,1.081764,0.951541
2016-03-03,0.27976,-0.316136,1.198073,-0.562947
2016-03-04,1.174777,-0.225305,-0.280256,-0.074768
2016-03-05,2.173366,0.907038,-1.104678,-0.921779
2016-03-06,0.200422,0.442619,1.97033,-0.609867


In [18]:
# Subtract s from every column, matching on the row index; rows where s is
# NaN come out as NaN
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D
2016-03-01,,,,
2016-03-02,,,,
2016-03-03,-0.72024,-1.316136,0.198073,-1.562947
2016-03-04,-1.825223,-3.225305,-3.280256,-3.074768
2016-03-05,-2.826634,-4.092962,-6.104678,-5.921779
2016-03-06,,,,


In [19]:
# Apply np.cumsum to each column; the result matches df.cumsum()
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D
2016-03-01,-0.985666,0.240058,0.716721,0.352009
2016-03-02,-2.549311,0.331824,1.798486,1.30355
2016-03-03,-2.269551,0.015688,2.996558,0.740602
2016-03-04,-1.094774,-0.209616,2.716302,0.665835
2016-03-05,1.078593,0.697421,1.611624,-0.255944
2016-03-06,1.279015,1.14004,3.581954,-0.865811


In [20]:
# Range (max minus min) of each column
df.apply(lambda x: x.max() - x.min())

A    3.737010
B    1.223174
C    3.075008
D    1.873320
dtype: float64

In [21]:
# Ten random integers from 0 to 6 (the upper bound 7 is exclusive)
s = pd.Series(np.random.randint(0, 7, size=10))
s

0    1
1    2
2    4
3    2
4    1
5    3
6    1
7    2
8    4
9    3
dtype: int64

In [22]:
# Number of occurrences of each distinct value, most frequent first
s.value_counts()

2    3
1    3
4    2
3    2
dtype: int64

In [23]:
# Most frequent value(s); when several values tie, all of them are returned
s.mode()

0    1
1    2
dtype: int64

### 数据合并

In [24]:
# Rebind df to a new 10x4 frame of random values with the default integer index
df = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,1.098103,-0.843356,-0.379135,0.419353
1,-0.177702,-0.225926,-0.363542,-0.153022
2,1.938231,0.154881,0.291382,0.152774
3,-0.460645,-0.268697,-1.509469,0.698776
4,-0.397048,-0.958223,0.212833,-0.435485
5,0.525406,-0.177595,0.453216,-0.093792
6,0.531912,-0.832667,0.200721,0.943878
7,-0.740845,0.098634,0.27402,1.671997
8,2.182379,1.72901,1.306269,0.580677
9,-0.031538,0.159714,0.736667,-0.122326


In [25]:
# First three rows (positions 0-2)
df.iloc[:3]

Unnamed: 0,A,B,C,D
0,1.098103,-0.843356,-0.379135,0.419353
1,-0.177702,-0.225926,-0.363542,-0.153022
2,1.938231,0.154881,0.291382,0.152774


In [26]:
# Rows at positions 3 through 6 (the end position 7 is exclusive)
df.iloc[3:7]

Unnamed: 0,A,B,C,D
3,-0.460645,-0.268697,-1.509469,0.698776
4,-0.397048,-0.958223,0.212833,-0.435485
5,0.525406,-0.177595,0.453216,-0.093792
6,0.531912,-0.832667,0.200721,0.943878


In [27]:
# Rows from position 7 to the end
df.iloc[7:]

Unnamed: 0,A,B,C,D
7,-0.740845,0.098634,0.27402,1.671997
8,2.182379,1.72901,1.306269,0.580677
9,-0.031538,0.159714,0.736667,-0.122326


In [28]:
# Stack the three row slices back together, in order, into a single frame
df1 = pd.concat([df.iloc[:3], df.iloc[3:7], df.iloc[7:]])
df1

Unnamed: 0,A,B,C,D
0,1.098103,-0.843356,-0.379135,0.419353
1,-0.177702,-0.225926,-0.363542,-0.153022
2,1.938231,0.154881,0.291382,0.152774
3,-0.460645,-0.268697,-1.509469,0.698776
4,-0.397048,-0.958223,0.212833,-0.435485
5,0.525406,-0.177595,0.453216,-0.093792
6,0.531912,-0.832667,0.200721,0.943878
7,-0.740845,0.098634,0.27402,1.671997
8,2.182379,1.72901,1.306269,0.580677
9,-0.031538,0.159714,0.736667,-0.122326


In [29]:
# Element-wise comparison reduced over columns, then over the result:
# True only if every cell of the reassembled frame equals the original
(df1 == df).all().all()

True

In [30]:
# Inputs for a SQL-style join: two small frames that share a 'key' column
# in which the same key value appears twice on each side
left = pd.DataFrame(dict(key=['foo'] * 2, lval=[1, 2]))
right = pd.DataFrame(dict(key=['foo'] * 2, rval=[4, 5]))

In [31]:
# Display the left-hand frame of the join
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [32]:
# Display the right-hand frame of the join
right

Unnamed: 0,key,rval
0,foo,4
1,foo,5


In [33]:
# SELECT * FROM left INNER JOIN right ON left.key = right.key;
# Each of the two 'foo' rows on the left pairs with each of the two on the
# right, so the result has 2 x 2 = 4 rows
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


In [34]:
# Display the 10-row frame before a row is added to it
df

Unnamed: 0,A,B,C,D
0,1.098103,-0.843356,-0.379135,0.419353
1,-0.177702,-0.225926,-0.363542,-0.153022
2,1.938231,0.154881,0.291382,0.152774
3,-0.460645,-0.268697,-1.509469,0.698776
4,-0.397048,-0.958223,0.212833,-0.435485
5,0.525406,-0.177595,0.453216,-0.093792
6,0.531912,-0.832667,0.200721,0.943878
7,-0.740845,0.098634,0.27402,1.671997
8,2.182379,1.72901,1.306269,0.580677
9,-0.031538,0.159714,0.736667,-0.122326


In [35]:
# Build one row of random integers (1-4) labelled with df's column names and
# add it as a new last row. DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0, so the Series is turned into a one-row frame and
# concatenated instead; ignore_index=True renumbers the rows 0..n.
s = pd.Series(np.random.randint(1, 5, size=4), index=list('ABCD'))
pd.concat([df, s.to_frame().T], ignore_index=True)

Unnamed: 0,A,B,C,D
0,1.098103,-0.843356,-0.379135,0.419353
1,-0.177702,-0.225926,-0.363542,-0.153022
2,1.938231,0.154881,0.291382,0.152774
3,-0.460645,-0.268697,-1.509469,0.698776
4,-0.397048,-0.958223,0.212833,-0.435485
5,0.525406,-0.177595,0.453216,-0.093792
6,0.531912,-0.832667,0.200721,0.943878
7,-0.740845,0.098634,0.27402,1.671997
8,2.182379,1.72901,1.306269,0.580677
9,-0.031538,0.159714,0.736667,-0.122326


### 分组统计

* 数据分组
* 每个组应用一个函数，输出一个结果
* 合并每个组的结果构成最终输出

In [36]:
# Rebind df to an 8-row frame for the grouping examples: two string key
# columns (A, B) and two columns of random numeric values (C, D)
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three',
                           'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.58032,-1.460149
1,bar,one,1.471201,-1.079598
2,foo,two,0.094836,1.513204
3,bar,three,-1.49881,0.754968
4,foo,two,0.180709,0.415266
5,bar,two,0.358515,-0.341988
6,foo,one,-0.121082,-0.408148
7,foo,three,0.404648,-0.320882


In [37]:
# Sum the numeric columns C and D within each group of 'A'. The columns are
# selected explicitly because pandas 2.0+ no longer silently drops
# non-numeric columns such as 'B' from groupby reductions.
df.groupby('A')[['C', 'D']].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.330906,-0.666618
foo,-0.021208,-0.260709


In [38]:
# Group by the (A, B) pair and sum the remaining columns of each group.
# Alternative with the key order swapped (B becomes the outer level):
# df.groupby(['B', 'A']).sum()
df.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.471201,-1.079598
bar,three,-1.49881,0.754968
bar,two,0.358515,-0.341988
foo,one,-0.701402,-1.868297
foo,three,0.404648,-0.320882
foo,two,0.275545,1.92847
