In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#### 一、创建对象

通过传入一个 list 来创建 Series：

In [2]:
# Build a Series from a plain list; the NaN entry makes the dtype float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

#### 通过传递一个numpy array，时间索引以及列标签来创建一个DataFrame：

In [3]:
# Six consecutive calendar days starting on 2013-01-01, used as a row index below.
dates = pd.date_range(start="20130101", periods=6, freq="D")
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# A 6x4 frame of standard-normal noise, indexed by `dates`, with columns A-D.
# NOTE(review): no random seed is set, so the values change on every run.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df

Unnamed: 0,A,B,C,D
2013-01-01,1.098533,0.795597,-0.194091,1.137165
2013-01-02,1.11214,-2.329821,2.393873,0.510313
2013-01-03,0.372095,-2.836341,1.353095,-1.41865
2013-01-04,0.759315,1.901651,-0.228759,-1.926736
2013-01-05,0.571882,-0.113175,-1.289003,0.167365
2013-01-06,-0.053887,0.145788,-1.550983,0.598632


#### 通过传递一个能够被转换成类似序列结构的字典对象来创建一个DataFrame

In [5]:
# Build a frame from a dict: scalar entries are repeated for every row and
# each column keeps its own dtype.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20181120"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["test", "train"] * 2),
        F="foo",
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2018-11-20,1.0,3,test,foo
1,1.0,2018-11-20,1.0,3,train,foo
2,1.0,2018-11-20,1.0,3,test,foo
3,1.0,2018-11-20,1.0,3,train,foo


In [6]:
# List the dtype of each column.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## 查看数据

In [7]:
# Show the first three rows.
df.head(n=3)

Unnamed: 0,A,B,C,D
2013-01-01,1.098533,0.795597,-0.194091,1.137165
2013-01-02,1.11214,-2.329821,2.393873,0.510313
2013-01-03,0.372095,-2.836341,1.353095,-1.41865


In [8]:
# Show the last three rows.
df.tail(n=3)

Unnamed: 0,A,B,C,D
2013-01-04,0.759315,1.901651,-0.228759,-1.926736
2013-01-05,0.571882,-0.113175,-1.289003,0.167365
2013-01-06,-0.053887,0.145788,-1.550983,0.598632


In [9]:
# Show the row index, the column labels and the underlying NumPy data.
# A cell renders only its last bare expression, so the first two objects are
# passed to display() explicitly (display is a builtin inside IPython/Jupyter).
display(df.index)
display(df.columns)
df.values

array([[ 1.09853303,  0.79559718, -0.19409081,  1.13716471],
       [ 1.11213971, -2.32982075,  2.39387287,  0.51031262],
       [ 0.3720955 , -2.83634109,  1.35309533, -1.41865047],
       [ 0.75931485,  1.90165119, -0.22875934, -1.92673596],
       [ 0.57188236, -0.1131748 , -1.28900266,  0.16736478],
       [-0.05388656,  0.14578781, -1.55098323,  0.59863217]])

In [10]:
# describe() gives a quick statistical summary of the data.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.643346,-0.40605,0.080689,-0.155319
std,0.448381,1.831226,1.528366,1.226383
min,-0.053887,-2.836341,-1.550983,-1.926736
25%,0.422042,-1.775659,-1.023942,-1.022147
50%,0.665599,0.016307,-0.211425,0.338839
75%,1.013728,0.633145,0.966299,0.576552
max,1.11214,1.901651,2.393873,1.137165


In [11]:
# Transpose: rows become columns and columns become rows.
df.transpose()

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,1.098533,1.11214,0.372095,0.759315,0.571882,-0.053887
B,0.795597,-2.329821,-2.836341,1.901651,-0.113175,0.145788
C,-0.194091,2.393873,1.353095,-0.228759,-1.289003,-1.550983
D,1.137165,0.510313,-1.41865,-1.926736,0.167365,0.598632


In [12]:
# Sort by axis labels: order the columns by name, descending.
df.sort_index(axis="columns", ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,1.137165,-0.194091,0.795597,1.098533
2013-01-02,0.510313,2.393873,-2.329821,1.11214
2013-01-03,-1.41865,1.353095,-2.836341,0.372095
2013-01-04,-1.926736,-0.228759,1.901651,0.759315
2013-01-05,0.167365,-1.289003,-0.113175,0.571882
2013-01-06,0.598632,-1.550983,0.145788,-0.053887


In [13]:
# Sort the rows by the values in column B (ascending).
df.sort_values("B", ascending=True)

Unnamed: 0,A,B,C,D
2013-01-03,0.372095,-2.836341,1.353095,-1.41865
2013-01-02,1.11214,-2.329821,2.393873,0.510313
2013-01-05,0.571882,-0.113175,-1.289003,0.167365
2013-01-06,-0.053887,0.145788,-1.550983,0.598632
2013-01-01,1.098533,0.795597,-0.194091,1.137165
2013-01-04,0.759315,1.901651,-0.228759,-1.926736


## 选择

In [14]:
# Select a single column, which yields a Series; equivalent to df['A'].
df.A

2013-01-01    1.098533
2013-01-02    1.112140
2013-01-03    0.372095
2013-01-04    0.759315
2013-01-05    0.571882
2013-01-06   -0.053887
Freq: D, Name: A, dtype: float64

In [15]:
# Slicing with [] selects rows, here first by position and then by index label.
# A cell renders only its last bare expression, so the first slice is passed
# to display() explicitly (display is a builtin inside IPython/Jupyter).
display(df[0:3])
df["20130101":"20130103"]

Unnamed: 0,A,B,C,D
2013-01-01,1.098533,0.795597,-0.194091,1.137165
2013-01-02,1.11214,-2.329821,2.393873,0.510313
2013-01-03,0.372095,-2.836341,1.353095,-1.41865


In [16]:
# Selection by label with .loc.
# A cell renders only its last bare expression, so every earlier selection is
# passed to display() explicitly (display is a builtin inside IPython/Jupyter).
display(df.loc[dates[0], :])                         # one row, all columns
display(df.loc[:, ["A", "B"]])                       # all rows, two columns
display(df.loc["20130102":"20130104", ["A", "B"]])   # label slice of rows, two columns
display(df.loc["20130102", ["A", "B"]])              # one row, two columns
df.loc["20130101", "A"]                              # a single scalar value

1.0985330294274545

In [17]:
# Selection by integer position with .iloc.
# A cell renders only its last bare expression, so every earlier selection is
# passed to display() explicitly (display is a builtin inside IPython/Jupyter).
display(df.iloc[3])                    # the fourth row
display(df.iloc[3:5, 0:2])             # row and column position slices
display(df.iloc[[1, 2, 4], [0, 2]])    # explicit lists of row / column positions
display(df.iloc[1:3, :])               # a slice of rows, all columns
display(df.iloc[:, 1:3])               # all rows, a slice of columns
df.iloc[1, 1]                          # a single scalar value

-2.3298207465336924

In [18]:
# Boolean indexing.
# A cell renders only its last bare expression, so the first selection is
# passed to display() explicitly (display is a builtin inside IPython/Jupyter).
display(df[df.A > 0])  # use the values of a single column to select rows
df[df > 0]             # element-wise (where-style) selection; entries failing the test show as NaN


Unnamed: 0,A,B,C,D
2013-01-01,1.098533,0.795597,,1.137165
2013-01-02,1.11214,,2.393873,0.510313
2013-01-03,0.372095,,1.353095,
2013-01-04,0.759315,1.901651,,
2013-01-05,0.571882,,,0.167365
2013-01-06,,0.145788,,0.598632


In [19]:
# isin(): keep only the rows whose "E" value is one of the listed labels.
# NOTE(review): "THERE" looks like a typo for "THREE"; left unchanged because it is data.
df2 = df.assign(E=["ONE", "ONE", "TWO", "THERE", "FOUR", "THERE"])
df2.loc[df2["E"].isin(["TWO", "FOUR"])]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.372095,-2.836341,1.353095,-1.41865,TWO
2013-01-05,0.571882,-0.113175,-1.289003,0.167365,FOUR


### 设置

In [20]:
# Add a new column from a Series. Values are matched on the index, so a date
# in df that s1 does not cover ends up as NaN.
s1 = pd.Series(
    list(range(1, 7)),
    index=pd.date_range("20130102", periods=6),
)
df["F"] = s1

In [21]:
# Set a single value by label ...
df.at[dates[0], "A"] = 0
# ... and by integer position (row 0, column 1).
df.iat[0, 1] = 0
# Overwrite a whole column with a NumPy array of matching length.
df.loc[:, "D"] = np.full(len(df), 5)
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.194091,5,
2013-01-02,1.11214,-2.329821,2.393873,5,1.0
2013-01-03,0.372095,-2.836341,1.353095,5,2.0
2013-01-04,0.759315,1.901651,-0.228759,5,3.0
2013-01-05,0.571882,-0.113175,-1.289003,5,4.0
2013-01-06,-0.053887,0.145788,-1.550983,5,5.0


In [22]:
# Work on a copy and flip the sign of every positive entry, so that no value
# in the result is greater than zero.
df2 = df.copy()
df2[df2.gt(0)] = df2.mul(-1)
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.194091,-5,
2013-01-02,-1.11214,-2.329821,-2.393873,-5,-1.0
2013-01-03,-0.372095,-2.836341,-1.353095,-5,-2.0
2013-01-04,-0.759315,-1.901651,-0.228759,-5,-3.0
2013-01-05,-0.571882,-0.113175,-1.289003,-5,-4.0
2013-01-06,-0.053887,-0.145788,-1.550983,-5,-5.0


### 缺失值处理

In [23]:
# reindex() can change, add or drop labels along an axis and returns a copy of
# the data: keep the first four dates and add a new column "E".
df1 = df.reindex(index=dates[:4], columns=[*df.columns, "E"])
# Fill "E" for the first two dates only; the remaining rows stay missing.
df1.loc[dates[0]:dates[1], "E"] = 1

In [24]:
# Display the reindexed frame.
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-0.194091,5,,1.0
2013-01-02,1.11214,-2.329821,2.393873,5,1.0,1.0
2013-01-03,0.372095,-2.836341,1.353095,5,2.0,
2013-01-04,0.759315,1.901651,-0.228759,5,3.0,


In [25]:
# Drop every row that contains at least one missing value.
df1.dropna(axis=0, how="any")

Unnamed: 0,A,B,C,D,F,E
2013-01-02,1.11214,-2.329821,2.393873,5,1.0,1.0


In [26]:
# Replace missing values with a fixed filler.
df1.fillna(5555)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,-0.194091,5,5555.0,1.0
2013-01-02,1.11214,-2.329821,2.393873,5,1.0,1.0
2013-01-03,0.372095,-2.836341,1.353095,5,2.0,5555.0
2013-01-04,0.759315,1.901651,-0.228759,5,3.0,5555.0


In [27]:
# Boolean mask that is True wherever a value is missing.
df1.isnull()

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


### 统计

In [28]:
# Descriptive statistics: the mean of every column.
df.mean(axis=0)

A    0.460258
B   -0.538650
C    0.080689
D    5.000000
F    3.000000
dtype: float64

In [29]:
# The same operation along the other axis: the mean of every row.
df.mean(axis=1)

2013-01-01    1.201477
2013-01-02    1.435238
2013-01-03    1.177770
2013-01-04    2.086441
2013-01-05    1.633941
2013-01-06    1.708184
Freq: D, dtype: float64

In [30]:
# Operations between objects of different dimensionality need alignment;
# pandas broadcasts along the specified dimension automatically.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates)
# Move the values two positions down, leaving NaN in the first two slots.
s = s.shift(2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [31]:
# Subtract s from every column, matching on the row index.
df.sub(s, axis=0)

Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,,,,,
2013-01-03,-0.627905,-3.836341,0.353095,4.0,1.0
2013-01-04,-2.240685,-1.098349,-3.228759,2.0,0.0
2013-01-05,-4.428118,-5.113175,-6.289003,0.0,-1.0
2013-01-06,,,,,


### Apply

In [32]:
# Apply a function to every column: here the running total.
df.apply(np.cumsum, axis=0)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.194091,5,
2013-01-02,1.11214,-2.329821,2.199782,10,1.0
2013-01-03,1.484235,-5.166162,3.552877,15,3.0
2013-01-04,2.24355,-3.264511,3.324118,20,6.0
2013-01-05,2.815432,-3.377685,2.035115,25,10.0
2013-01-06,2.761546,-3.231898,0.484132,30,15.0


In [33]:
# Spread (max minus min) of every column.
df.apply(lambda col: col.max() - col.min())

A    1.166026
B    4.737992
C    3.944856
D    0.000000
F    4.000000
dtype: float64

In [34]:
# Histogram-style frequencies: count how often each value occurs.
ss = pd.Series(np.random.randint(low=0, high=7, size=10))
ss.value_counts()

5    3
4    3
2    2
6    1
1    1
dtype: int64

### 合并

In [35]:
# A fresh 10x4 frame of standard-normal noise with default integer labels.
df = pd.DataFrame(data=np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,1.044136,-0.613014,-0.456056,-0.139272
1,1.768429,0.841278,0.152336,0.405401
2,0.147314,-0.012694,0.303362,-0.198198
3,1.714434,1.743655,1.262833,-0.193669
4,-0.065697,-0.076877,0.522016,0.723026
5,0.243366,-0.469125,1.074831,-1.276159
6,-0.528465,0.234744,-0.686192,0.287172
7,0.428904,-0.000389,-1.215791,0.615915
8,0.709336,1.142056,-0.956761,1.200777
9,0.55963,-0.833946,0.917351,-0.191614


In [36]:
# Cut the frame into three row chunks and glue them back together.
pieces = [df.iloc[:3], df.iloc[3:7], df.iloc[7:]]
pd.concat(pieces, axis=0)

Unnamed: 0,0,1,2,3
0,1.044136,-0.613014,-0.456056,-0.139272
1,1.768429,0.841278,0.152336,0.405401
2,0.147314,-0.012694,0.303362,-0.198198
3,1.714434,1.743655,1.262833,-0.193669
4,-0.065697,-0.076877,0.522016,0.723026
5,0.243366,-0.469125,1.074831,-1.276159
6,-0.528465,0.234744,-0.686192,0.287172
7,0.428904,-0.000389,-1.215791,0.615915
8,0.709336,1.142056,-0.956761,1.200777
9,0.55963,-0.833946,0.917351,-0.191614


In [37]:
# Append a single row to the frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the
# row (a Series) is turned back into a one-row frame and concatenated instead.
# As with ignore_index=False, the appended row keeps its original index label.
df4 = df.iloc[3]
pd.concat([df, df4.to_frame().T])


Unnamed: 0,0,1,2,3
0,1.044136,-0.613014,-0.456056,-0.139272
1,1.768429,0.841278,0.152336,0.405401
2,0.147314,-0.012694,0.303362,-0.198198
3,1.714434,1.743655,1.262833,-0.193669
4,-0.065697,-0.076877,0.522016,0.723026
5,0.243366,-0.469125,1.074831,-1.276159
6,-0.528465,0.234744,-0.686192,0.287172
7,0.428904,-0.000389,-1.215791,0.615915
8,0.709336,1.142056,-0.956761,1.200777
9,0.55963,-0.833946,0.917351,-0.191614


### 分组

In [38]:
# Demo frame for grouping: two label columns (A, B) and two numeric columns (C, D).
df = pd.DataFrame(
    dict(
        A=["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        B=["one", "one", "two", "three", "two", "two", "one", "three"],
        C=np.random.randn(8),
        D=np.random.randn(8),
    )
)
df

Unnamed: 0,A,B,C,D
0,foo,one,0.856928,1.964299
1,bar,one,0.712434,0.593566
2,foo,two,0.414131,0.848199
3,bar,three,-2.402709,0.382803
4,foo,two,0.162621,-0.384533
5,bar,two,0.155285,-0.565406
6,foo,one,0.637771,-0.409684
7,foo,three,0.224272,1.097012


In [39]:
# Group by "A" and sum each group.
# The numeric columns are selected explicitly: since pandas 2.0, sum() no
# longer drops non-numeric columns by default, so the string column "B" would
# otherwise be concatenated into the result.
df.groupby("A")[["C", "D"]].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-1.53499,0.410962
foo,2.295723,3.115293


In [40]:
# Grouping by several columns yields a hierarchical (two-level) row index.
df.groupby(by=["A", "B"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.712434,0.593566
bar,three,-2.402709,0.382803
bar,two,0.155285,-0.565406
foo,one,1.494699,1.554615
foo,three,0.224272,1.097012
foo,two,0.576752,0.463666


### 重塑（Reshape）

In [41]:
# Stack: first build the (first, second) label pairs for a two-level index.
tuples = [
    (outer, inner)
    for outer in ("bar", "baz", "foo", "qux")
    for inner in ("one", "two")
]
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [42]:
# Build a two-level MultiIndex from the label pairs and name the levels.
index = pd.MultiIndex.from_tuples(tuples,names=["first","second"])
index

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])

In [43]:
# An 8x2 frame of standard-normal noise on the two-level index.
df = pd.DataFrame(
    data=np.random.randn(8, 2),
    index=index,
    columns=list("AB"),
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.522132,-0.006155
bar,two,0.501255,-0.251191
baz,one,-0.426329,-0.2153
baz,two,0.573822,-0.229796
foo,one,-1.366699,0.211038
foo,two,1.077461,-1.705703
qux,one,1.773592,0.266516
qux,two,-1.460898,0.578016


In [44]:
# Keep only the first four rows for the stack / unstack demo.
df5 = df.iloc[:4]
df5

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.522132,-0.006155
bar,two,0.501255,-0.251191
baz,one,-0.426329,-0.2153
baz,two,0.573822,-0.229796


In [45]:
# stack() moves the column labels into a new innermost index level, giving a Series.
stacked = df5.stack()

In [46]:
# Display the stacked Series.
stacked

first  second   
bar    one     A    0.522132
               B   -0.006155
       two     A    0.501255
               B   -0.251191
baz    one     A   -0.426329
               B   -0.215300
       two     A    0.573822
               B   -0.229796
dtype: float64

In [47]:
# unstack() pivots an index level back into columns; it is called here with
# no argument and then with level positions 1 and 0.
# A cell renders only its last bare expression, so the first two results are
# passed to display() explicitly (display is a builtin inside IPython/Jupyter).
display(stacked.unstack())
display(stacked.unstack(1))
stacked.unstack(0)

Unnamed: 0_level_0,first,bar,baz
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,0.522132,-0.426329
one,B,-0.006155,-0.2153
two,A,0.501255,0.573822
two,B,-0.251191,-0.229796


In [48]:
# Pivot index level 0 ("first") into the columns.
stacked.unstack(0)

Unnamed: 0_level_0,first,bar,baz
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,0.522132,-0.426329
one,B,-0.006155,-0.2153
two,A,0.501255,0.573822
two,B,-0.251191,-0.229796


#### 透视表（有单独文档）

## 类别类型

In [52]:
# Small demo frame with an id and a raw letter grade per row.
df = pd.DataFrame(
    {
        "id": list(range(1, 7)),
        "raw_grade": list("abbaae"),
    }
)
df

Unnamed: 0,id,raw_grade
0,1,a
1,2,b
2,3,b
3,4,a
4,5,a
5,6,e


In [54]:
# Convert the raw grades to a categorical dtype and store them in a new column.
df["grade"] = df.raw_grade.astype("category")
df.grade

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): [a, b, e]

In [None]:
# 将Categorical类型数据重命名为更有意义的名称