# pandas

## 重建索引

In [1]:
import pandas as pd

In [3]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])

In [4]:
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [5]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])

In [6]:
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [7]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])

In [8]:
obj4 = obj3.reindex(range(6), method='ffill')

In [9]:
obj4

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [11]:
import numpy as np

In [13]:
frame = pd.DataFrame(np.arange(9).reshape((3,3)),index=['a', 'c','d'], columns=['Ohio', 'Texas', 'California'])

In [14]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [15]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])

In [16]:
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [17]:
states = ['Texas', 'Utah', 'California']

In [19]:
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [20]:
frame.loc[['a', 'b', 'c', 'd'], states]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


## 轴向上删除条目

In [22]:
obj5 = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])

In [23]:
obj5

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [27]:
n_obj = obj5.drop('c')

In [28]:
n_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [29]:
obj5.drop(['d','c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [30]:
data = pd.DataFrame(np.arange(16).reshape((4,4)),
                   index=['Ohio', 'Colorado', 'Utah', 'New York'],
                   columns=['one', 'two', 'three', 'fore'])

In [31]:
data

Unnamed: 0,one,two,three,fore
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [33]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,fore
Utah,8,9,10,11
New York,12,13,14,15


In [34]:
data.drop(['one', 'two'], axis=1)

Unnamed: 0,three,fore
Ohio,2,3
Colorado,6,7
Utah,10,11
New York,14,15


In [36]:
data.drop(['three', 'fore'], axis='columns')

Unnamed: 0,one,two
Ohio,0,1
Colorado,4,5
Utah,8,9
New York,12,13


In [37]:
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [38]:
obj.drop('c')

d    4.5
b    7.2
a   -5.3
dtype: float64

In [39]:
obj.drop('c', inplace=True)

In [40]:
obj

d    4.5
b    7.2
a   -5.3
dtype: float64

In [42]:
obj.drop('b',inplace=False)

d    4.5
a   -5.3
dtype: float64

inplace会清除被删除的数据

## 索引、选择、过滤

In [43]:
obj6 = pd.Series(np.arange(4.), index=['a','b','c','d'])

In [44]:
obj6

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [46]:
obj6['b']

1.0

In [48]:
obj6[1]

1.0

In [49]:
obj6[['a', 'c']]

a    0.0
c    2.0
dtype: float64

In [51]:
obj6[1:3]

b    1.0
c    2.0
dtype: float64

In [54]:
obj6

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [55]:
obj6[obj6<2]

a    0.0
b    1.0
dtype: float64

In [56]:
obj6['a': 'c']

a    0.0
b    1.0
c    2.0
dtype: float64

Series切片包含尾部，与python不同

In [57]:
obj6['b':'c']=5.

In [58]:
obj6

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [59]:
data

Unnamed: 0,one,two,three,fore
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [60]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [61]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [64]:
data[:2]

Unnamed: 0,one,two,three,fore
Ohio,0,1,2,3
Colorado,4,5,6,7


In [66]:
data[data['three']>5]

Unnamed: 0,one,two,three,fore
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


行选择语法：data[:2]

列选择：传递一个元素或一个列表选择列

In [67]:
data

Unnamed: 0,one,two,three,fore
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [68]:
data<5

Unnamed: 0,one,two,three,fore
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [69]:
data[data<5] = 1

In [70]:
data

Unnamed: 0,one,two,three,fore
Ohio,1,1,1,1
Colorado,1,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


### 使用loc和iloc选择数据

In [71]:
data

Unnamed: 0,one,two,three,fore
Ohio,1,1,1,1
Colorado,1,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [72]:
data.loc['Colorado', ['one','two']]

one    1
two    5
Name: Colorado, dtype: int32

In [73]:
data

Unnamed: 0,one,two,three,fore
Ohio,1,1,1,1
Colorado,1,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [74]:
data.iloc[2,[3,0,1]]

fore    11
one      8
two      9
Name: Utah, dtype: int32

In [75]:
data

Unnamed: 0,one,two,three,fore
Ohio,1,1,1,1
Colorado,1,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [78]:
data.iloc[2]

one       8
two       9
three    10
fore     11
Name: Utah, dtype: int32

In [79]:
data

Unnamed: 0,one,two,three,fore
Ohio,1,1,1,1
Colorado,1,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [80]:
data.iloc[[1,2],[3,0,1]]

Unnamed: 0,fore,one,two
Colorado,7,1,5
Utah,11,8,9


In [81]:
data

Unnamed: 0,one,two,three,fore
Ohio,1,1,1,1
Colorado,1,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [82]:
data[:'Utah']

Unnamed: 0,one,two,three,fore
Ohio,1,1,1,1
Colorado,1,5,6,7
Utah,8,9,10,11


In [84]:
data.loc[:'Utah','two']

Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int32

In [85]:
data.iloc[:,:3]

Unnamed: 0,one,two,three
Ohio,1,1,1
Colorado,1,5,6
Utah,8,9,10
New York,12,13,14


In [87]:
data.iloc[:,:3][data.three>5]

Unnamed: 0,one,two,three
Colorado,1,5,6
Utah,8,9,10
New York,12,13,14


## 整数索引

In [88]:
ser = pd.Series(np.arange(3.))

In [89]:
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [None]:
ser[-1]

对于整数索引会出错，因为需判断是位置索引还是标签索引是很难的

对于非整数索引，则不会出错

In [91]:
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])

In [92]:
ser2

a    0.0
b    1.0
c    2.0
dtype: float64

In [94]:
ser2[-1]

2.0

为了保持一致性，如果你有一个包含整数的轴索引，数据选择时请始终选择标签索引

为了更精确的处理，可以使用loc（用于标签）和iloc（用于整数）：

In [95]:
ser[:1]

0    0.0
dtype: float64

In [96]:
ser.loc[:1]

0    0.0
1    1.0
dtype: float64

In [97]:
ser.iloc[:1]

0    0.0
dtype: float64

## 算术和数据对齐

In [98]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], 
              index=['a','c','d','e'])

In [99]:
s2 = pd.Series([-2.1, 3.6, -1.5, 4,3.1],
              index=['a','c','e','f','g'])

In [100]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [101]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [102]:
s1+s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [105]:
df1 = pd.DataFrame(np.arange(9.).reshape((3,3)),
                  columns = list('bcd'),
                  index = ['Ohio', 'Texas', 'Colorado'])

In [106]:
df2 = pd.DataFrame(np.arange(12.).reshape((4,3)),
                  columns = list('bde'),
                  index=['Utah','Ohio','Texas','Oregon'])

In [107]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [108]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [109]:
df1+df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [110]:
df1 = pd.DataFrame({'A':[1,2]})

In [116]:
df2 = pd.DataFrame({'B':[3,4]})

In [117]:
df1

Unnamed: 0,A
0,1
1,2


In [118]:
df2

Unnamed: 0,B
0,3
1,4


In [119]:
df1-df2

Unnamed: 0,A,B
0,,
1,,


### 使用填充值的算术方法

In [120]:
df1 = pd.DataFrame(np.arange(12.).reshape((3,4)),
                  columns=list('abcd'))

In [121]:
df2 = pd.DataFrame(np.arange(20.).reshape((4,5)),
                  columns = list('abcde'))

In [122]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [123]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [125]:
df2.loc[1,'b'] = np.nan

In [126]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [127]:
df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [129]:
df1.add(df2,fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [130]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [131]:
1/df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [132]:
df1.rdiv(1)

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [134]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [135]:
df1.reindex(columns = df2.columns, fill_value = 0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


#### DataFrame和Series之间的操作

In [136]:
arr = np.arange(12.).reshape((3,4))

In [137]:
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [138]:
arr[0]

array([0., 1., 2., 3.])

In [139]:
arr-arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [141]:
frame = pd.DataFrame(np.arange(12.).reshape((4,3)),
                    columns = list('bde'),
                    index = ['Utah', 'Ohio', 'Texas', 'Oregon'])

In [142]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [143]:
series = frame.iloc[0]

In [144]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [145]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [146]:
frame-series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [147]:
series-frame

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,-3.0,-3.0,-3.0
Texas,-6.0,-6.0,-6.0
Oregon,-9.0,-9.0,-9.0


In [148]:
series2 = pd.Series(range(3),
                   index = ['b','e','f'])

In [149]:
frame+series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [150]:
series3 = frame['d']

In [151]:
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [152]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [153]:
frame+series3

Unnamed: 0,Ohio,Oregon,Texas,Utah,b,d,e
Utah,,,,,,,
Ohio,,,,,,,
Texas,,,,,,,
Oregon,,,,,,,


In [154]:
frame.sub(series3,axis='index')

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


In [156]:
frame.sub(series3,axis=0)

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


## 函数应用和映射

In [157]:
frame = pd.DataFrame(np.random.randn(4,3),
                    columns = list('bde'),
                    index = ['liu', 'bo', 'wu', 'pei'])

In [158]:
frame

Unnamed: 0,b,d,e
liu,-0.366193,0.561985,0.30225
bo,1.297623,-0.26837,1.543548
wu,-1.695091,-0.547453,-0.456125
pei,0.075755,0.723048,-0.002222


In [159]:
np.abs(frame)

Unnamed: 0,b,d,e
liu,0.366193,0.561985,0.30225
bo,1.297623,0.26837,1.543548
wu,1.695091,0.547453,0.456125
pei,0.075755,0.723048,0.002222


In [160]:
frame

Unnamed: 0,b,d,e
liu,-0.366193,0.561985,0.30225
bo,1.297623,-0.26837,1.543548
wu,-1.695091,-0.547453,-0.456125
pei,0.075755,0.723048,-0.002222


In [161]:
f = lambda x:x.max()-x.min()

In [162]:
frame.apply(f)

b    2.992714
d    1.270500
e    1.999672
dtype: float64

In [164]:
frame.apply(f,axis=0)

b    2.992714
d    1.270500
e    1.999672
dtype: float64

In [165]:
frame.apply(f,axis=1)

liu    0.928178
bo     1.811918
wu     1.238966
pei    0.725270
dtype: float64

In [166]:
def f(x):
    return pd.Series([x.min(),x.max()],index=['min','max'])

In [167]:
frame

Unnamed: 0,b,d,e
liu,-0.366193,0.561985,0.30225
bo,1.297623,-0.26837,1.543548
wu,-1.695091,-0.547453,-0.456125
pei,0.075755,0.723048,-0.002222


In [169]:
frame.apply(f, axis=1)

Unnamed: 0,min,max
liu,-0.366193,0.561985
bo,-0.26837,1.543548
wu,-1.695091,-0.456125
pei,-0.002222,0.723048


In [170]:
format = lambda x:'%.2f'%x

In [171]:
frame.applymap(format)

Unnamed: 0,b,d,e
liu,-0.37,0.56,0.3
bo,1.3,-0.27,1.54
wu,-1.7,-0.55,-0.46
pei,0.08,0.72,-0.0


In [172]:
frame['e']

liu    0.302250
bo     1.543548
wu    -0.456125
pei   -0.002222
Name: e, dtype: float64

In [173]:
frame['e'].map(format)

liu     0.30
bo      1.54
wu     -0.46
pei    -0.00
Name: e, dtype: object

## 排序和排名

In [174]:
obj8 = pd.Series(range(4),
                index=list('dabc'))

In [175]:
obj8

d    0
a    1
b    2
c    3
dtype: int64

In [176]:
obj8.sort_index

<bound method Series.sort_index of d    0
a    1
b    2
c    3
dtype: int64>

In [177]:
obj8.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [178]:
frame = pd.DataFrame(np.arange(8).reshape((2,4)),
                    columns = list('dabc'),
                    index=['three','one'])

In [179]:
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [181]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [182]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [183]:
frame.sort_index(axis=1,ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [185]:
obj9 = pd.Series([4,7,-3,2])

In [186]:
obj9

0    4
1    7
2   -3
3    2
dtype: int64

In [188]:
obj9.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [189]:
frame2 = pd.DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]})

In [192]:
frame2

Unnamed: 0,a,b
0,0,4
1,1,7
2,0,-3
3,1,2


In [194]:
frame2.sort_values('b')

Unnamed: 0,a,b
2,0,-3
3,1,2
0,0,4
1,1,7


In [195]:
frame2.sort_values(['a','b'])

Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


#### 排名

In [196]:
obj9 = pd.Series([7,-5,7,4,2,0,4])

In [197]:
obj9.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [198]:
obj9.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [200]:
obj9

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [199]:
obj9.rank(ascending = False,method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [201]:
frame3=pd.DataFrame({'b':[4.3,7,-3,2],
                    'a':[0,1,0,1],
                    'c':[-2,5,8,-2.5]})

In [202]:
frame3

Unnamed: 0,a,b,c
0,0,4.3,-2.0
1,1,7.0,5.0
2,0,-3.0,8.0
3,1,2.0,-2.5


In [204]:
frame3.rank(axis=1)

Unnamed: 0,a,b,c
0,2.0,3.0,1.0
1,1.0,3.0,2.0
2,2.0,1.0,3.0
3,2.0,3.0,1.0


## 含有重复标签的轴索引

In [206]:
obj10 = pd.Series(range(5),index=list('aabbc'))

In [207]:
obj10

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [209]:
obj10.index.is_unique

False

In [210]:
obj10['a']

a    0
a    1
dtype: int64

In [212]:
obj10['c']

4

多个会返回序列，一个会返回值

In [213]:
df = pd.DataFrame(np.random.randn(4,3),
                 index=['a','a','b','b'])

In [214]:
de

NameError: name 'de' is not defined

In [215]:
df

Unnamed: 0,0,1,2
a,-0.625491,-0.156108,0.520268
a,-1.082648,0.968033,-0.779032
b,1.537883,0.654965,0.478372
b,1.139548,1.754758,0.778323


In [216]:
df.loc['b']

Unnamed: 0,0,1,2
b,1.537883,0.654965,0.478372
b,1.139548,1.754758,0.778323


# 描述性统计的概述与计算