#### 导入相关包

In [1]:
import numpy as np
import pandas as pd

#### 对象创建

##### 通过Series创建

In [3]:
# Build a Series from a plain list; np.nan stands in for a missing entry.
# Note the class name is "Series" with a capital S.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [32]:
# A Series whose two elements are themselves Python lists.
s2 = pd.Series(data=[[1, 2, 3], [4, 5, 6]])
s2

0    [1, 2, 3]
1    [4, 5, 6]
dtype: object

##### 通过DataFrame创建

In [5]:
# Create the date index first: six periods beginning at 2013-01-01.
dates = pd.date_range(start='20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [11]:
# index and columns are optional here; when omitted, both default to
# integer labels counting up from 0.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2013-01-01,0.283674,-1.258124,-1.143281,-1.318982
2013-01-02,-0.526053,-1.250063,-0.235331,-0.374807
2013-01-03,-0.891255,0.306235,1.106933,0.685079
2013-01-04,1.215972,-0.680178,0.495161,-0.070901
2013-01-05,-1.159601,0.381593,-2.124262,-1.675186
2013-01-06,0.045088,-0.276936,1.110488,0.046553


#### 查看数据


##### 查看数据的顶行和底行

In [12]:
# Show the first three rows.
df.head(n=3)

Unnamed: 0,A,B,C,D
2013-01-01,0.283674,-1.258124,-1.143281,-1.318982
2013-01-02,-0.526053,-1.250063,-0.235331,-0.374807
2013-01-03,-0.891255,0.306235,1.106933,0.685079


In [13]:
# Show the last three rows.
df.tail(n=3)

Unnamed: 0,A,B,C,D
2013-01-04,1.215972,-0.680178,0.495161,-0.070901
2013-01-05,-1.159601,0.381593,-2.124262,-1.675186
2013-01-06,0.045088,-0.276936,1.110488,0.046553


##### 显示索引与列

In [14]:
# Display the row index of df.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [15]:
# Display the column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

##### 显示数据快速统计摘要

In [16]:
df.describe()  # show common summary statistics for each column

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.172029,-0.462912,-0.131715,-0.451374
std,0.871696,0.726435,1.301886,0.887754
min,-1.159601,-1.258124,-2.124262,-1.675186
25%,-0.799954,-1.107592,-0.916294,-1.082938
50%,-0.240482,-0.478557,0.129915,-0.222854
75%,0.224028,0.160443,0.95399,0.017189
max,1.215972,0.381593,1.110488,0.685079


### 数据操作

##### 转置数据(行列转换)

In [17]:
# Transpose: rows become columns and columns become rows.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.283674,-0.526053,-0.891255,1.215972,-1.159601,0.045088
B,-1.258124,-1.250063,0.306235,-0.680178,0.381593,-0.276936
C,-1.143281,-0.235331,1.106933,0.495161,-2.124262,1.110488
D,-1.318982,-0.374807,0.685079,-0.070901,-1.675186,0.046553


##### 按索引排序

使用sort_index()按索引排序。默认按行索引排序，指定axis=1则按列索引排序。

In [22]:
# A small 3x4 frame of random values whose row and column labels are
# deliberately out of alphabetical order, for the sorting examples.
frame = pd.DataFrame(
    data=np.random.randn(12).reshape((3, 4)),
    index=list('CAB'),
    columns=list('cfda'),
)
frame

Unnamed: 0,c,f,d,a
C,0.774778,0.473843,0.559815,1.444086
A,-0.075902,0.979828,-2.261178,0.233571
B,0.274298,0.375499,-0.167876,-0.083925


In [23]:
# Sort by the row index (axis=0 is the default).
frame.sort_index(axis=0)

Unnamed: 0,c,f,d,a
A,-0.075902,0.979828,-2.261178,0.233571
B,0.274298,0.375499,-0.167876,-0.083925
C,0.774778,0.473843,0.559815,1.444086


In [24]:
# Sort by the column labels instead of the row index.
frame.sort_index(axis='columns')

Unnamed: 0,a,c,d,f
C,1.444086,0.774778,0.559815,0.473843
A,0.233571,-0.075902,-2.261178,0.979828
B,-0.083925,0.274298,-0.167876,0.375499


##### 按值排序

使用sort_values(by,axis=0,...):
* by:要求传入一个标签(字符串)或标签列表,用来指定按照哪一列(axis=0时)或哪一行(axis=1时)的值进行排序
* axis=0(默认)表示按列的值对行重新排序,axis=1表示按行的值对列重新排序

In [25]:
# Display the unsorted frame again for comparison with the sorted results.
frame

Unnamed: 0,c,f,d,a
C,0.774778,0.473843,0.559815,1.444086
A,-0.075902,0.979828,-2.261178,0.233571
B,0.274298,0.375499,-0.167876,-0.083925


In [31]:
# Reorder the rows by the values in column 'c' (axis=0 is the default).
frame.sort_values(by='c', axis=0)

Unnamed: 0,c,f,d,a
A,-0.075902,0.979828,-2.261178,0.233571
B,0.274298,0.375499,-0.167876,-0.083925
C,0.774778,0.473843,0.559815,1.444086


In [30]:
# Reorder the columns by the values in row 'A'; ascending order is the default.
frame.sort_values(by='A', axis=1, ascending=True)

Unnamed: 0,d,c,a,f
C,0.559815,0.774778,1.444086,0.473843
A,-2.261178,-0.075902,0.233571,0.979828
B,-0.167876,0.274298,-0.083925,0.375499


### 数据访问

#### 获取数据

In [33]:
# Selecting a single column yields a Series.
df['A']

2013-01-01    0.283674
2013-01-02   -0.526053
2013-01-03   -0.891255
2013-01-04    1.215972
2013-01-05   -1.159601
2013-01-06    0.045088
Freq: D, Name: A, dtype: float64

In [34]:
# Rows can be selected by slicing with [].

In [35]:
# Slice rows with [] using integer bounds.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.283674,-1.258124,-1.143281,-1.318982
2013-01-02,-0.526053,-1.250063,-0.235331,-0.374807
2013-01-03,-0.891255,0.306235,1.106933,0.685079


In [36]:
# Slice rows with [] using date strings; the displayed result includes both endpoints.
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.526053,-1.250063,-0.235331,-0.374807
2013-01-03,-0.891255,0.306235,1.106933,0.685079
2013-01-04,1.215972,-0.680178,0.495161,-0.070901


##### 通过标签选择(loc)

In [37]:
# Select the data for the first date by label.
# With loc, the first argument addresses the row index and the second the column index.
df.loc[dates[0]]

A    0.283674
B   -1.258124
C   -1.143281
D   -1.318982
Name: 2013-01-01 00:00:00, dtype: float64

In [38]:
# Select along both axes by label: all rows, columns 'A' and 'B'.
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,0.283674,-1.258124
2013-01-02,-0.526053,-1.250063
2013-01-03,-0.891255,0.306235
2013-01-04,1.215972,-0.680178
2013-01-05,-1.159601,0.381593
2013-01-06,0.045088,-0.276936


In [39]:
# Label slice on the rows combined with a list of column labels.
df.loc['20130102':'20130104',['A','B']]

Unnamed: 0,A,B
2013-01-02,-0.526053,-1.250063
2013-01-03,-0.891255,0.306235
2013-01-04,1.215972,-0.680178


##### 获取标量值

In [40]:
# A row label plus a column label returns a single scalar value.
df.loc[dates[0],'A']

0.2836742359300112

In [42]:
# at: fetch a single value by row label and column label.
df.at[dates[0],'A']

0.2836742359300112

##### 通过位置选择(iloc)

loc和iloc的区别:
* loc:通过选取行(列)标签索引数据
* iloc:通过选取行(列)位置编号索引数据

In [43]:
# Select the row at integer position 3 (returned as a Series).
df.iloc[3]

A    1.215972
B   -0.680178
C    0.495161
D   -0.070901
Name: 2013-01-04 00:00:00, dtype: float64

In [44]:
# Positional slices on both axes: rows 3-4 and columns 0-1.
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,1.215972,-0.680178
2013-01-05,-1.159601,0.381593


In [45]:
# Lists of integer positions pick out specific rows and columns.
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,-0.526053,-0.235331
2013-01-03,-0.891255,1.106933
2013-01-05,-1.159601,-2.124262


In [46]:
# All rows, columns at positions 1 and 2.
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,-1.258124,-1.143281
2013-01-02,-1.250063,-0.235331
2013-01-03,0.306235,1.106933
2013-01-04,-0.680178,0.495161
2013-01-05,0.381593,-2.124262
2013-01-06,-0.276936,1.110488


###### 快速访问标量

at和iat的区别:
* at:通过选取行列标签索引标量
* iat:通过选取行列位置编号索引标量

In [47]:
# iat: fetch a single value by integer row and column position.
df.iat[1,1]

-1.2500628279575439

#### 布尔值索引

In [49]:
# Select rows based on the values of a single column.
df[df.A>0] # df.A is attribute-style shorthand for the column labelled 'A'

Unnamed: 0,A,B,C,D
2013-01-01,0.283674,-1.258124,-1.143281,-1.318982
2013-01-04,1.215972,-0.680178,0.495161,-0.070901
2013-01-06,0.045088,-0.276936,1.110488,0.046553


In [50]:
# Element-wise condition: values that fail it are shown as missing (NaN) in the result.
df[df>0]

Unnamed: 0,A,B,C,D
2013-01-01,0.283674,,,
2013-01-02,,,,
2013-01-03,,0.306235,1.106933,0.685079
2013-01-04,1.215972,,0.495161,
2013-01-05,,0.381593,,
2013-01-06,0.045088,,1.110488,0.046553


### 缺少数据

pandas使用np.nan表示缺少数据。

In [61]:
# Keep only the values greater than 0.3 (everything else becomes NaN), then
# drop the rows in which every value is missing.
df2 = df[df > 0.3]
print(df2)
# how='all' removes rows that are entirely NaN; how='any' would remove rows containing any NaN.
df2.dropna(axis=0, how='all')


                   A         B         C         D
2013-01-01       NaN       NaN       NaN       NaN
2013-01-02       NaN       NaN       NaN       NaN
2013-01-03       NaN  0.306235  1.106933  0.685079
2013-01-04  1.215972       NaN  0.495161       NaN
2013-01-05       NaN  0.381593       NaN       NaN
2013-01-06       NaN       NaN  1.110488       NaN


Unnamed: 0,A,B,C,D
2013-01-03,,0.306235,1.106933,0.685079
2013-01-04,1.215972,,0.495161,
2013-01-05,,0.381593,,
2013-01-06,,,1.110488,


In [59]:
# Replace every missing value with the constant 5.
df2.fillna(5)

Unnamed: 0,A,B,C,D
2013-01-01,5.0,5.0,5.0,5.0
2013-01-02,5.0,5.0,5.0,5.0
2013-01-03,5.0,0.306235,1.106933,0.685079
2013-01-04,1.215972,5.0,0.495161,5.0
2013-01-05,5.0,0.381593,5.0,5.0
2013-01-06,5.0,5.0,1.110488,5.0


In [60]:
# Boolean mask that is True wherever a value is NaN.
pd.isna(df2)

Unnamed: 0,A,B,C,D
2013-01-01,True,True,True,True
2013-01-02,True,True,True,True
2013-01-03,True,False,False,False
2013-01-04,False,True,False,True
2013-01-05,True,False,True,True
2013-01-06,True,True,False,True


### 运算

运算(如统计计算)通常会自动排除缺失值(NaN)。

#### 统计数据

In [62]:
# Mean of each column (axis=0 is the default).
df.mean(axis=0)

A   -0.172029
B   -0.462912
C   -0.131715
D   -0.451374
dtype: float64

In [63]:
# Mean of each row.
df.mean(axis=1)

2013-01-01   -0.859178
2013-01-02   -0.596564
2013-01-03    0.301748
2013-01-04    0.240014
2013-01-05   -1.144364
2013-01-06    0.231298
Freq: D, dtype: float64

#### 对数据运用函数

DataFrame.apply(func,axis=0,...)

参数说明:
* func：传入的函数，相当于C/C++的函数指针
* axis=0则将一列数据传入函数,axis=1则每次将一行数据传入函数

In [64]:
# Display df as the reference input for the apply examples.
df

Unnamed: 0,A,B,C,D
2013-01-01,0.283674,-1.258124,-1.143281,-1.318982
2013-01-02,-0.526053,-1.250063,-0.235331,-0.374807
2013-01-03,-0.891255,0.306235,1.106933,0.685079
2013-01-04,1.215972,-0.680178,0.495161,-0.070901
2013-01-05,-1.159601,0.381593,-2.124262,-1.675186
2013-01-06,0.045088,-0.276936,1.110488,0.046553


In [65]:
# Apply np.cumsum to each column (axis=0 is the default), giving running totals down the rows.
df.apply(func=np.cumsum, axis=0)

Unnamed: 0,A,B,C,D
2013-01-01,0.283674,-1.258124,-1.143281,-1.318982
2013-01-02,-0.242379,-2.508187,-1.378612,-1.693789
2013-01-03,-1.133634,-2.201952,-0.271679,-1.00871
2013-01-04,0.082338,-2.882129,0.223482,-1.079612
2013-01-05,-1.077262,-2.500536,-1.90078,-2.754798
2013-01-06,-1.032174,-2.777472,-0.790293,-2.708246


In [66]:
# Range (max minus min) of each column; the lambda receives one column at a time.
df.apply(lambda col: col.max() - col.min(), axis=0)

A    2.375573
B    1.639718
C    3.234750
D    2.360265
dtype: float64

### 合并

#### 连接

In [67]:
# NOTE: this rebinds df, replacing the date-indexed frame used in the earlier cells
# with a 10x4 frame of random values that has default integer row and column labels.
df = pd.DataFrame(data=np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,-2.34693,-0.843893,-1.562842,0.22381
1,0.031381,0.260611,1.949308,-0.326071
2,0.064482,0.060145,0.677236,0.846692
3,-0.614188,-0.215029,2.185771,0.217924
4,1.247276,-1.131724,0.336969,-1.604712
5,-0.633525,1.425953,2.47257,0.959619
6,-2.436455,-0.976973,-1.253845,-2.385834
7,1.279726,-0.683033,1.089339,-0.736067
8,0.35608,-0.919484,2.005166,-0.367869
9,1.216503,0.668547,-1.095472,1.203103


In [68]:
# Split the frame into three row-wise chunks, then stitch them back together with concat.
pieces = [df.iloc[:3], df.iloc[3:7], df.iloc[7:]]
pd.concat(objs=pieces)

Unnamed: 0,0,1,2,3
0,-2.34693,-0.843893,-1.562842,0.22381
1,0.031381,0.260611,1.949308,-0.326071
2,0.064482,0.060145,0.677236,0.846692
3,-0.614188,-0.215029,2.185771,0.217924
4,1.247276,-1.131724,0.336969,-1.604712
5,-0.633525,1.425953,2.47257,0.959619
6,-2.436455,-0.976973,-1.253845,-2.385834
7,1.279726,-0.683033,1.089339,-0.736067
8,0.35608,-0.919484,2.005166,-0.367869
9,1.216503,0.668547,-1.095472,1.203103


#### 加入(merge)

In [70]:
# Left-hand table for the merge example: a 'key' column plus a value column 'lval'.
left = pd.DataFrame(data={'key': ['foo', 'bar'], 'lval': [1, 2]})
left

Unnamed: 0,key,lval
0,foo,1
1,bar,2


In [71]:
# Right-hand table for the merge example: the same 'key' column plus a value column 'rval'.
right = pd.DataFrame(data={'key': ['foo', 'bar'], 'rval': [4, 5]})
right

Unnamed: 0,key,rval
0,foo,4
1,bar,5


In [73]:
# Join the two tables on the shared 'key' column.
# merge defaults to how='inner', which keeps only the keys present in BOTH
# tables (an intersection, not a union); pass how='outer' for a union of keys.
# The default is spelled out here so the join type is visible to the reader.
pd.merge(left, right, how='inner', on='key')  # on: the column label(s) to join on

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


#### 添加(append)

In [74]:
# Display df before a row is appended to it.
df

Unnamed: 0,0,1,2,3
0,-2.34693,-0.843893,-1.562842,0.22381
1,0.031381,0.260611,1.949308,-0.326071
2,0.064482,0.060145,0.677236,0.846692
3,-0.614188,-0.215029,2.185771,0.217924
4,1.247276,-1.131724,0.336969,-1.604712
5,-0.633525,1.425953,2.47257,0.959619
6,-2.436455,-0.976973,-1.253845,-2.385834
7,1.279726,-0.683033,1.089339,-0.736067
8,0.35608,-0.919484,2.005166,-0.367869
9,1.216503,0.668547,-1.095472,1.203103


In [78]:
# Take the row at position 3 as a Series.
# NOTE: this rebinds s, which held the Series built in the object-creation section.
s=df.iloc[3]
s

0   -0.614188
1   -0.215029
2    2.185771
3    0.217924
Name: 3, dtype: float64

In [79]:
# Append the Series s to df as a new row.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row
# is added with pd.concat instead. s is a Series, so it is first turned into a
# one-row DataFrame (to_frame().T); ignore_index=True renumbers the result
# from 0 instead of keeping the original label of s.
pd.concat([df, s.to_frame().T], ignore_index=True)

Unnamed: 0,0,1,2,3
0,-2.34693,-0.843893,-1.562842,0.22381
1,0.031381,0.260611,1.949308,-0.326071
2,0.064482,0.060145,0.677236,0.846692
3,-0.614188,-0.215029,2.185771,0.217924
4,1.247276,-1.131724,0.336969,-1.604712
5,-0.633525,1.425953,2.47257,0.959619
6,-2.436455,-0.976973,-1.253845,-2.385834
7,1.279726,-0.683033,1.089339,-0.736067
8,0.35608,-0.919484,2.005166,-0.367869
9,1.216503,0.668547,-1.095472,1.203103


### 数据的读取与写入

#### CSV

In [None]:
# Write the frame to a CSV file; to_csv writes the index as the first column by default.
df.to_csv('foo.csv')
# Read the file back. index_col=0 restores that first column as the index
# instead of loading it as an extra "Unnamed: 0" data column.
pd.read_csv('foo.csv', index_col=0)