### 导入相关包

In [1]:
import numpy as np
import pandas as pd

### 对象创建

##### 通过series创建

In [2]:
s=pd.Series([1,3,5,np.nan,6,8]) #注意是Series,第一个字符大写
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
s2=pd.Series([[1,2,3],[4,5,6]])
s2

0    [1, 2, 3]
1    [4, 5, 6]
dtype: object

##### 通过dataframe创建

In [4]:
#先创建时间索引
dates=pd.date_range('20130101',periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
#这里的index和columns可选，默认是从0开始的阿拉伯数字
df=pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.192409,1.073759,-2.006184,-2.036606
2013-01-02,-1.445643,-1.442846,0.45664,1.298548
2013-01-03,-1.651316,1.406623,0.430699,0.83383
2013-01-04,-1.350756,-1.03195,0.118208,1.179488
2013-01-05,-0.066555,-0.292787,2.749318,1.442446
2013-01-06,1.978121,0.502092,0.280596,1.004316


### 查看数据


##### 查看数据的顶行和底行

In [6]:
df.head(3)

Unnamed: 0,A,B,C,D
2013-01-01,-1.192409,1.073759,-2.006184,-2.036606
2013-01-02,-1.445643,-1.442846,0.45664,1.298548
2013-01-03,-1.651316,1.406623,0.430699,0.83383


In [7]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-1.350756,-1.03195,0.118208,1.179488
2013-01-05,-0.066555,-0.292787,2.749318,1.442446
2013-01-06,1.978121,0.502092,0.280596,1.004316


##### 显示索引与列

In [8]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [9]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

##### 显示数据快速统计摘要

In [10]:
df.describe()# 显示常见的一些统计数据

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.621426,0.035815,0.338213,0.620337
std,1.390214,1.149643,1.508902,1.319155
min,-1.651316,-1.442846,-2.006184,-2.036606
25%,-1.421922,-0.847159,0.158805,0.876451
50%,-1.271583,0.104652,0.355647,1.091902
75%,-0.348019,0.930842,0.450154,1.268783
max,1.978121,1.406623,2.749318,1.442446


### 数据操作

##### 转置数据(行列转换)

In [11]:
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-1.192409,-1.445643,-1.651316,-1.350756,-0.066555,1.978121
B,1.073759,-1.442846,1.406623,-1.03195,-0.292787,0.502092
C,-2.006184,0.45664,0.430699,0.118208,2.749318,0.280596
D,-2.036606,1.298548,0.83383,1.179488,1.442446,1.004316


##### 按索引排序

使用sort_index()按索引排序。默认按行索引排序，指定axis=1则按列索引排序。

In [12]:
frame=pd.DataFrame(np.random.randn(12).reshape(3,4),index=list('CAB'),columns=list('cfda'))
frame

Unnamed: 0,c,f,d,a
C,-0.513371,0.706747,-2.315256,0.549465
A,1.562902,-0.169914,1.264456,-0.362105
B,-0.31837,-0.752203,-1.043262,1.487384


In [13]:
frame.sort_index()

Unnamed: 0,c,f,d,a
A,1.562902,-0.169914,1.264456,-0.362105
B,-0.31837,-0.752203,-1.043262,1.487384
C,-0.513371,0.706747,-2.315256,0.549465


In [14]:
frame.sort_index(axis=1)

Unnamed: 0,a,c,d,f
C,0.549465,-0.513371,-2.315256,0.706747
A,-0.362105,1.562902,1.264456,-0.169914
B,1.487384,-0.31837,-1.043262,-0.752203


##### 按值排序

使用sort_values(by,axis=0,...):
* by:这个参数要求传入一个字符或者是一个字符列表,用来指定按照哪个元素进行排序
* axis=0表示调整行,axis=1表示调整列

In [15]:
frame

Unnamed: 0,c,f,d,a
C,-0.513371,0.706747,-2.315256,0.549465
A,1.562902,-0.169914,1.264456,-0.362105
B,-0.31837,-0.752203,-1.043262,1.487384


In [16]:
frame.sort_values(by='c')

Unnamed: 0,c,f,d,a
C,-0.513371,0.706747,-2.315256,0.549465
B,-0.31837,-0.752203,-1.043262,1.487384
A,1.562902,-0.169914,1.264456,-0.362105


In [17]:
frame.sort_values(by='A',axis=1) #默认升序排序

Unnamed: 0,a,f,d,c
C,0.549465,0.706747,-2.315256,-0.513371
A,-0.362105,-0.169914,1.264456,1.562902
B,1.487384,-0.752203,-1.043262,-0.31837


### 数据访问

#### 获取数据

In [18]:
#选择一列,获取到一个Series
df['A']

2013-01-01   -1.192409
2013-01-02   -1.445643
2013-01-03   -1.651316
2013-01-04   -1.350756
2013-01-05   -0.066555
2013-01-06    1.978121
Freq: D, Name: A, dtype: float64

In [19]:
#通过[]获取行
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-1.192409,1.073759,-2.006184,-2.036606
2013-01-02,-1.445643,-1.442846,0.45664,1.298548
2013-01-03,-1.651316,1.406623,0.430699,0.83383


In [20]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-1.445643,-1.442846,0.45664,1.298548
2013-01-03,-1.651316,1.406623,0.430699,0.83383
2013-01-04,-1.350756,-1.03195,0.118208,1.179488


##### 通过标签选择(loc)

In [21]:
#选择第一个日期的数据
#第一个参数代表行索引，第二个参数代表列索引
df.loc[dates[0]] #选择第一个日期的数据

A   -1.192409
B    1.073759
C   -2.006184
D   -2.036606
Name: 2013-01-01 00:00:00, dtype: float64

In [22]:
#通过标签选择多个维度
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,-1.192409,1.073759
2013-01-02,-1.445643,-1.442846
2013-01-03,-1.651316,1.406623
2013-01-04,-1.350756,-1.03195
2013-01-05,-0.066555,-0.292787
2013-01-06,1.978121,0.502092


In [23]:
df.loc['20130102':'20130104',['A','B']]

Unnamed: 0,A,B
2013-01-02,-1.445643,-1.442846
2013-01-03,-1.651316,1.406623
2013-01-04,-1.350756,-1.03195


##### 获取标量值

In [24]:
df.loc[dates[0],'A']

-1.1924091129077303

In [25]:
#at函数:通过行名和列名来取值
df.at[dates[0],'A']

-1.1924091129077303

##### 通过位置选择(iloc)

loc和iloc的区别:
* loc:通过选取行(列)标签索引数据
* iloc:通过选取行(列)位置编号索引数据

In [26]:
df.iloc[3]

A   -1.350756
B   -1.031950
C    0.118208
D    1.179488
Name: 2013-01-04 00:00:00, dtype: float64

In [27]:
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,-1.350756,-1.03195
2013-01-05,-0.066555,-0.292787


In [28]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,-1.445643,0.45664
2013-01-03,-1.651316,0.430699
2013-01-05,-0.066555,2.749318


In [29]:
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,1.073759,-2.006184
2013-01-02,-1.442846,0.45664
2013-01-03,1.406623,0.430699
2013-01-04,-1.03195,0.118208
2013-01-05,-0.292787,2.749318
2013-01-06,0.502092,0.280596


###### 快速访问标量

at和iat的区别:
* at:通过选取行列标签索引标量
* iat:通过选取行列位置编号索引标量

In [30]:
df.iat[1,1]

-1.4428462769928572

#### 布尔值索引

In [31]:
#通过判断单列的值选择数据
df[df.A>0] #通过df.A快速获取‘A’所指向的列

Unnamed: 0,A,B,C,D
2013-01-06,1.978121,0.502092,0.280596,1.004316


In [32]:
df[df>0]

Unnamed: 0,A,B,C,D
2013-01-01,,1.073759,,
2013-01-02,,,0.45664,1.298548
2013-01-03,,1.406623,0.430699,0.83383
2013-01-04,,,0.118208,1.179488
2013-01-05,,,2.749318,1.442446
2013-01-06,1.978121,0.502092,0.280596,1.004316


### 缺少数据

pandas使用np.nan表示缺少数据。

In [33]:
#丢弃存在缺少数据的行
df2=df[df>0.3]
print(df2)
df2.dropna(how='all') #how='all'表示滤除全为NaN的行,how='any'表示滤除有NaN的行


                   A         B         C         D
2013-01-01       NaN  1.073759       NaN       NaN
2013-01-02       NaN       NaN  0.456640  1.298548
2013-01-03       NaN  1.406623  0.430699  0.833830
2013-01-04       NaN       NaN       NaN  1.179488
2013-01-05       NaN       NaN  2.749318  1.442446
2013-01-06  1.978121  0.502092       NaN  1.004316


Unnamed: 0,A,B,C,D
2013-01-01,,1.073759,,
2013-01-02,,,0.45664,1.298548
2013-01-03,,1.406623,0.430699,0.83383
2013-01-04,,,,1.179488
2013-01-05,,,2.749318,1.442446
2013-01-06,1.978121,0.502092,,1.004316


In [34]:
#填充缺失值
df2.fillna(value=5)

Unnamed: 0,A,B,C,D
2013-01-01,5.0,1.073759,5.0,5.0
2013-01-02,5.0,5.0,0.45664,1.298548
2013-01-03,5.0,1.406623,0.430699,0.83383
2013-01-04,5.0,5.0,5.0,1.179488
2013-01-05,5.0,5.0,2.749318,1.442446
2013-01-06,1.978121,0.502092,5.0,1.004316


In [35]:
#判断是否为nan值得boolean掩码
pd.isna(df2)

Unnamed: 0,A,B,C,D
2013-01-01,True,False,True,True
2013-01-02,True,True,False,False
2013-01-03,True,False,False,False
2013-01-04,True,True,True,False
2013-01-05,True,True,False,False
2013-01-06,False,False,True,False


### 运算

运算通常不包含缺失值

#### 统计数据

In [36]:
df.mean()

A   -0.621426
B    0.035815
C    0.338213
D    0.620337
dtype: float64

In [37]:
df.mean(1)

2013-01-01   -1.040360
2013-01-02   -0.283326
2013-01-03    0.254959
2013-01-04   -0.271252
2013-01-05    0.958105
2013-01-06    0.941281
Freq: D, dtype: float64

#### 对数据运用函数

DataFrame.apply(func,axis=0,...)

参数说明:
* func：传入的函数，相当于C/C++的函数指针
* axis=0则将一列数据传入函数,axis=1则每次将一行数据传入函数

In [38]:
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.192409,1.073759,-2.006184,-2.036606
2013-01-02,-1.445643,-1.442846,0.45664,1.298548
2013-01-03,-1.651316,1.406623,0.430699,0.83383
2013-01-04,-1.350756,-1.03195,0.118208,1.179488
2013-01-05,-0.066555,-0.292787,2.749318,1.442446
2013-01-06,1.978121,0.502092,0.280596,1.004316


In [39]:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D
2013-01-01,-1.192409,1.073759,-2.006184,-2.036606
2013-01-02,-2.638053,-0.369087,-1.549545,-0.738058
2013-01-03,-4.289369,1.037535,-1.118846,0.095772
2013-01-04,-5.640125,0.005586,-1.000638,1.27526
2013-01-05,-5.70668,-0.287202,1.74868,2.717705
2013-01-06,-3.728559,0.21489,2.029276,3.722021


In [40]:
df.apply(lambda x:x.max()-x.min()) #这里的x是一列数据

A    3.629437
B    2.849469
C    4.755502
D    3.479052
dtype: float64

### 合并

#### 连接

In [41]:
df=pd.DataFrame(np.random.randn(10,4))
df

Unnamed: 0,0,1,2,3
0,2.003238,-1.519566,0.520048,-2.309081
1,1.598641,0.67392,-1.640884,-0.952867
2,0.907345,-0.619054,0.616556,0.682466
3,-2.369224,-0.364612,1.580985,-0.373948
4,-0.360547,-1.210696,1.250338,-1.98998
5,0.671682,0.643648,-0.22961,0.588377
6,2.347827,-1.206138,0.485166,1.459472
7,0.096798,0.087442,0.772021,1.291261
8,-1.178935,0.963666,0.690469,0.860861
9,-1.052,-0.179438,1.321585,0.240784


In [42]:
#使用concat进行连接
pieces=[df[:3],df[3:7],df[7:]]
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,2.003238,-1.519566,0.520048,-2.309081
1,1.598641,0.67392,-1.640884,-0.952867
2,0.907345,-0.619054,0.616556,0.682466
3,-2.369224,-0.364612,1.580985,-0.373948
4,-0.360547,-1.210696,1.250338,-1.98998
5,0.671682,0.643648,-0.22961,0.588377
6,2.347827,-1.206138,0.485166,1.459472
7,0.096798,0.087442,0.772021,1.291261
8,-1.178935,0.963666,0.690469,0.860861
9,-1.052,-0.179438,1.321585,0.240784


#### 加入( merge)

In [43]:
left=pd.DataFrame({'key':['foo','bar'],'lval':[1,2]})
left

Unnamed: 0,key,lval
0,foo,1
1,bar,2


In [44]:
right=pd.DataFrame({'key':['foo','bar'],'rval':[4,5]})
right

Unnamed: 0,key,rval
0,foo,4
1,bar,5


In [45]:
#通过merge取并集
pd.merge(left,right,on='key') #on指用于连接的列索引

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


#### 添加(append)

In [46]:
df

Unnamed: 0,0,1,2,3
0,2.003238,-1.519566,0.520048,-2.309081
1,1.598641,0.67392,-1.640884,-0.952867
2,0.907345,-0.619054,0.616556,0.682466
3,-2.369224,-0.364612,1.580985,-0.373948
4,-0.360547,-1.210696,1.250338,-1.98998
5,0.671682,0.643648,-0.22961,0.588377
6,2.347827,-1.206138,0.485166,1.459472
7,0.096798,0.087442,0.772021,1.291261
8,-1.178935,0.963666,0.690469,0.860861
9,-1.052,-0.179438,1.321585,0.240784


In [47]:
s=df.iloc[3]
s

0   -2.369224
1   -0.364612
2    1.580985
3   -0.373948
Name: 3, dtype: float64

In [48]:
df.append(s,ignore_index=True)

Unnamed: 0,0,1,2,3
0,2.003238,-1.519566,0.520048,-2.309081
1,1.598641,0.67392,-1.640884,-0.952867
2,0.907345,-0.619054,0.616556,0.682466
3,-2.369224,-0.364612,1.580985,-0.373948
4,-0.360547,-1.210696,1.250338,-1.98998
5,0.671682,0.643648,-0.22961,0.588377
6,2.347827,-1.206138,0.485166,1.459472
7,0.096798,0.087442,0.772021,1.291261
8,-1.178935,0.963666,0.690469,0.860861
9,-1.052,-0.179438,1.321585,0.240784


### 获取数据

#### CSV

In [49]:
#写入csv文件
df.to_csv('foo.csv')
#读csv文件
pd.read_csv('foo.csv')

Unnamed: 0.1,Unnamed: 0,0,1,2,3
0,0,2.003238,-1.519566,0.520048,-2.309081
1,1,1.598641,0.67392,-1.640884,-0.952867
2,2,0.907345,-0.619054,0.616556,0.682466
3,3,-2.369224,-0.364612,1.580985,-0.373948
4,4,-0.360547,-1.210696,1.250338,-1.98998
5,5,0.671682,0.643648,-0.22961,0.588377
6,6,2.347827,-1.206138,0.485166,1.459472
7,7,0.096798,0.087442,0.772021,1.291261
8,8,-1.178935,0.963666,0.690469,0.860861
9,9,-1.052,-0.179438,1.321585,0.240784
