# Getting Started with pandas 
### pandas,NumPy,SciPy,matplotlib经常一起使用
### NumPy适合处理同质性数组
### pandas适合处理表格型，异质性数据

## 5.1 Introduction to pandas Data Structures

In [2]:
import numpy as np
import pandas as pd  ## 默认导入模式
from pandas import Series,DataFrame  ## 常用的两个工具 

## 5.1.1 Series
### 值+index索引

In [3]:
obj=pd.Series([4,7,-5,3])  # 默认索引0，1，2，3
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [5]:
# 单独得到值或索引
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [6]:
# 自定义索引
obj2=pd.Series([4,7,-5,3],index=['a','b','c','d'])
obj2

a    4
b    7
c   -5
d    3
dtype: int64

In [7]:
obj2['a']  # 用索引查看

4

In [8]:
obj2['d']=6  # 用索引改值
obj2

a    4
b    7
c   -5
d    6
dtype: int64

In [10]:
obj2[['c','a','b']]  # ['c','a','b'] is treated as a list of index labels, even though it holds strings rather than integers

c   -5
a    4
b    7
dtype: int64

In [11]:
# 这个也算是布尔值索引
obj2[obj2>0]

a    4
b    7
d    6
dtype: int64

In [12]:
# 数值运算
obj2*2

a     8
b    14
c   -10
d    12
dtype: int64

In [14]:
# A Series behaves like a fixed-length, ordered dict, so `in` tests membership against its index labels
'b' in obj2

True

In [22]:
# 可以用字典生成Series，默认的Index就是字典的key
sdata={'Ohio':35000,'Texas':71000,'Oregon':16000,'Utah':5000}
obj3=pd.Series(sdata)
obj3

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [23]:
# 字典生成Series,自定义index
states=['California','Ohio','Oregon','Texas']
obj4=pd.Series(sdata,index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

####  NaN,这是标志缺失值或NA值的方式
#### 使用isnull和notnull函数检查缺失数据

In [24]:
pd.isnull(obj4)  # obj4.isnull() 第二种方式

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [25]:
obj3

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [26]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [27]:
obj3+obj4
#  相加之后所有的Index都被保留
#  共同的index对应的value数值相加
#  非共同的index变成NaN

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

#### Series对象自身和其索引都有name属性

In [28]:
obj4.name='population'
obj4.index.name='state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [29]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [31]:
obj.index=['Bob','Steve','Jeff','Ryan']  # 按位置赋值
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

## 5.1.2 DataFrame
### 二维矩阵

In [36]:
#  Build a DataFrame from a dict of equal-length lists; each key becomes a column
data=dict(
    state=['Ohio','Ohio','Ohio','Nevada','Nevada','Nevada'],
    year=[2000,2001,2002,2001,2002,2003],
    pop=[1.5,1.7,3.6,2.4,2.9,3.2],
)
frame=pd.DataFrame(data=data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002
5,3.2,Nevada,2003


In [37]:
frame.head()  # 只看前五行

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [38]:
#  指定列的顺序
pd.DataFrame(data,columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [39]:
# 列不包含在字典中，将出现缺失值
frame2=pd.DataFrame(data,columns=['year','state','pop','debt'],
                   index=['one','two','three','four','five','six'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [40]:
frame2.columns  # 查看列名称

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [41]:
frame2['state']  # 查看某列,frame2.year

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [42]:
#  行也可以通过位置或特殊属性loc进行选取
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [44]:
#  给一列赋值，默认所有的数都是这个
frame2['debt']=16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [46]:
frame2['debt']=np.arange(6.)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [47]:
#  通过series给列赋值，先定义一个
#  然后部分的index相同
val=pd.Series([-1.2,-1.5,-1.7],index=['two','four','five'])
frame2['debt']=val  #  index相同的部分会赋值
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


#### 如果被赋值的列不存在，则会生成一个新的列

In [48]:
frame2['eastern']=frame2.state=='Ohio'
#  布尔值，判断state列是否为'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [49]:
#  del 方法可以用于移除之前新建的列
del frame2['eastern']
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [50]:
#  Nested dict of dicts:
#  the outer keys become the columns,
#  the inner keys become the row index
pop=dict(
    Nevada={2001:2.4,2002:2.9},
    Ohio={2000:1.5,2001:1.7,2002:3.6},
)
frame3=pd.DataFrame(data=pop)
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [51]:
frame3.T  # 进行转置,使用NumPy语法

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [52]:
pd.DataFrame(pop,index=[2001,2002,2003])  # 没有值的地方默认用NaN

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [53]:
#  用frame的series创建frame
#  一级关键字是列
#  二级关键字是原来的index
pdata={'Ohio':frame3['Ohio'][:-1], # [0:-1) 倒数第一个为止，也就是倒数第一个不要
      'Nevada':frame3['Nevada'][:2]}  #[0:2)
pd.DataFrame(pdata)

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7


#### index和columns有name属性

In [56]:
frame3.index.name='year';frame3.columns.name='state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


#### DataFrame的values属性会将包含在DataFrame中的数据以二维ndarray的形式返回

In [57]:
frame3.values

array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

In [59]:
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

## 5.1.3 Index Objects 索引对象
### 索引本身也是一种对象（Index对象）
### 数组或标签序列可以转换成Index对象
### 特点：不可变，标签可以重复

In [60]:
obj=pd.Series(range(3),index=['a','b','c'])
index=obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [61]:
index[1:]  # 切片

Index(['b', 'c'], dtype='object')

### 索引对象是不可变的
#### 不可变性使得在多种数据结构中分享索引更安全

In [62]:
index[1]='d'

TypeError: Index does not support mutable operations

In [66]:
labels=pd.Index(np.arange(3))  # 生成一个索引对象
labels

Int64Index([0, 1, 2], dtype='int64')

In [67]:
obj2=pd.Series([1.5,-2.5,0],index=labels)
#  生成一个序列，用上面定义的索引
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [68]:
#  可以判断
obj2.index is labels

True

In [69]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [70]:
frame3.columns  # 列名也是Index对象

Index(['Nevada', 'Ohio'], dtype='object', name='state')

In [71]:
'Ohio' in frame3.columns  # 判断是否在Index对象里

True

In [72]:
2003 in frame3.index

False

#### pandas索引对象可以包括重复标签

In [73]:
dup_labels=pd.Index(['foo','foo','bar','bar'])
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

## 5.2 Essential Functionality 主要功能
### 基础，最重要的特性

## 5.2.1 Reindexing 重置索引

In [74]:
obj=pd.Series([4.5,7.2,-5.3,3.6],index=['d','b','a','c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [77]:
obj2=obj.reindex(['a','b','c','d','e'])
#  reindex重置索引为abcde
#  改变顺序，原来有值取原来的值，没有的默认为NaN
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

#### 有时候需要自动填充一些值
#### 要用method
#### 比如method=ffill，将值向前填充

In [78]:
obj3=pd.Series(['blue','purple','yellow'],index=[0,2,4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [79]:
obj3.reindex(range(6),method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

#### reindex可以改变行，也可以改变列
#### 一个参数，默认改变行索引

In [83]:
frame=pd.DataFrame(np.arange(9).reshape((3,3)),
                  index=['a','c','d'],
                  columns=['Ohio','Texas','Calofornia'])
frame

Unnamed: 0,Ohio,Texas,Calofornia
a,0,1,2
c,3,4,5
d,6,7,8


In [85]:
frame2=frame.reindex(['a','b','c','d'])
frame2

Unnamed: 0,Ohio,Texas,Calofornia
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


#### 用columns关键字重建索引

In [86]:
states=['Texas','Utah','Calofornia']
#  按顺序删除Ohio,加入Utah
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,Calofornia
a,1,,2
c,4,,5
d,7,,8


In [87]:
# Conform rows and columns in one step with reindex; labels that are absent
# from frame ('b', 'Utah') come back as NaN. Passing a list with missing labels
# to .loc raises KeyError on current pandas, so reindex is the supported spelling.
frame.reindex(index=['a','b','c','d'],columns=states)

Unnamed: 0,Texas,Utah,Calofornia
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


## 5.2.2 Dropping Entries from an Axis 轴向上删除条目
#### entries：条目
#### 就是删除一行或一列，用drop

In [102]:
obj=pd.Series(np.arange(5.),index=['a','b','c','d','e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [103]:
new_obj=obj.drop('c')  # 生成一个新的对象
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [104]:
obj  # 原来的obj并没有改变

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [92]:
data=pd.DataFrame(np.arange(16).reshape((4,4)),
                 index=['Ohio','Colorado','Utah','New York'],
                 columns=['one','two','three','four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [93]:
data.drop(['Colorado','Ohio'])  # with no axis given, drop removes the rows carrying these index labels

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [95]:
data.drop('two',axis=1)
#  可以指定是哪个轴，axis=1,axis='columns'

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [97]:
data.drop(['two','three'],axis='columns')

Unnamed: 0,one,four
Ohio,0,3
Colorado,4,7
Utah,8,11
New York,12,15


#### 想要删除替换原来的表格
#### 直接用inplace=True
#### drop的值会被清除
#### 原来的表格会被改变

In [106]:
obj.drop('c',inplace=True)
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

## 5.2.3 Indexing,Selection,and Filtering 索引，选择，过滤

In [3]:
import numpy as np
import pandas as pd  ## 默认导入模式
from pandas import Series,DataFrame  ## 常用的两个工具 
obj=pd.Series(np.arange(4.),index=['a','b','c','d'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [4]:
obj['b'] # 将值选出来

1.0

In [5]:
# Select by integer position explicitly: a bare obj[1] on a string-labelled
# Series relies on a positional fallback that pandas has deprecated.
obj.iloc[1]

1.0

In [6]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [7]:
obj[['b','a','d']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [8]:
# Select positions 1 and 3 explicitly: a bare obj[[1,3]] on a string-labelled
# Series relies on a positional fallback that pandas has deprecated.
obj.iloc[[1,3]]

b    1.0
d    3.0
dtype: float64

In [9]:
obj[obj<2]

a    0.0
b    1.0
dtype: float64

### 普通的（按位置）切片不包含尾部，而Series按标签切片时包含尾部

In [10]:
obj['b':'c']

b    1.0
c    2.0
dtype: float64

In [12]:
obj['b':'c']=5  # 赋值
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [13]:
data=pd.DataFrame(np.arange(16).reshape((4,4)),
                 index=['Ohio','Colorado','Utah','New York'],
                 columns=['one','two','three','four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [14]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [15]:
data[['three','one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [16]:
data[:2]  # 什么都不说，默认为行，前两行 

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [17]:
data[data['three']>5]  # 布尔值索引

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [18]:
data<5  # 布尔值判断

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [20]:
data[data<5]=0  # 布尔值索引赋值
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


### 5.2.3.1 Selection with loc and iloc 使用loc和iloc选择数据
#### loc就是location吧

In [22]:
data.loc['Colorado',['two','three']]  # 选择内容 
#  第一个元素，一级索引，确定行，Colorado
#  第二个[],二级索引，确定列

two      5
three    6
Name: Colorado, dtype: int32

In [23]:
data.iloc[2,[3,0,1]]
#  用iloc,2,Utah
#  3 0 1 列

four    11
one      8
two      9
Name: Utah, dtype: int32

In [24]:
data.iloc[2]  # integer position 2, i.e. the third row ('Utah'); the whole row comes back as a Series

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [25]:
data.iloc[[1,2],[3,0,1]] 

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [26]:
data.loc[:'Utah','two']
#  行，[Ohio:Utah]
#  列选two

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32

In [27]:
data.iloc[:,:3][data.three>5]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


## 5.2.4 Integer Indexes 整数索引
### loc：根据标签（index里的内容）选取
### iloc：根据整数位置选取
### 直接用ser[...]时：如果index不是整数，整数会按位置解释（类似iloc）；如果index是整数，则按标签解释（类似loc）

In [28]:
ser=pd.Series(np.arange(3.),index=['a','b','c'])
ser

a    0.0
b    1.0
c    2.0
dtype: float64

#### 这里使用方括号，index不是整数，所以整数按位置解释，相当于iloc

In [30]:
ser[-1]  # 默认整数索引

2.0

#### 但是你的index就是3,2,1
#### 比如下例

In [31]:
ser=pd.Series(np.arange(3.),index=[3,2,1])
#  index是整数
ser

3    0.0
2    1.0
1    2.0
dtype: float64

#### 这里使用方括号，index是整数，所以整数按标签解释，相当于loc

In [32]:
ser[1]

2.0

#### index是整数时，不会回退到按位置选取

In [34]:
ser[-1]  # 报错，不会启用index

KeyError: -1

#### 为了避免这种歧义，通常使用loc和iloc明确声明你指的是标签还是位置

In [39]:
print(ser)
ser[:1]  # a plain integer slice is positional (like iloc), even though the index holds integers

3    0.0
2    1.0
1    2.0
dtype: float64


3    0.0
dtype: float64

In [40]:
ser.loc[:1]  # label-based slice: runs up to and including the label 1

3    0.0
2    1.0
1    2.0
dtype: float64

In [42]:
ser.iloc[:2]  # integer-position slice: half-open, so only positions 0 and 1

3    0.0
2    1.0
dtype: float64

In [43]:
ser.iloc[-1]  # index location 倒数第一行

2.0

## 5.2.5 Arithmetic and Data Alignment 算术运算和数据对齐

In [44]:
# Two float Series whose indexes only partly overlap ('a', 'c' and 'e' are shared)
s1=pd.Series(data=[7.3,-2.5,3.4,1.5],index=list('acde'))
s2=pd.Series(data=[-2.1,3.6,-1.5,4,3.1],index=list('acefg'))

In [45]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [46]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

#### 直接相加就是索引内容相同的相加

In [47]:
s1+s2
#  索引值相同的相加，不同的返回NaN

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

#### 两张表相加，纵轴横轴都有所不同

In [49]:
# Two float frames whose row labels and column labels only partly overlap
df1=pd.DataFrame(
    data=np.arange(9,dtype=float).reshape(3,3),
    index=['Ohio','Texas','Colorado'],
    columns=['b','c','d'],
)
df2=pd.DataFrame(
    data=np.arange(12,dtype=float).reshape(4,3),
    index=['Utah','Ohio','Texas','Oregon'],
    columns=['b','d','e'],
)

In [50]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [51]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [52]:
df1+df2
#  b,d,Texas,Ohio对应的有数，其他的都是NaN

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


#### 横轴完全不同，都是NaN

In [53]:
df1=pd.DataFrame({'A':[1,2]})
df2=pd.DataFrame({'B':[3,4]})

In [54]:
df1

Unnamed: 0,A
0,1
1,2


In [55]:
df2

Unnamed: 0,B
0,3
1,4


In [60]:
df1+df2

Unnamed: 0,A,B
0,,
1,,


### 5.2.5.1 Arithmetic methods with fill values 使用填充值的算数方法

#### 用fill_value参数

In [3]:
import numpy as np
import pandas as pd  ## 默认导入模式
from pandas import Series,DataFrame  ## 常用的两个工具 
# 3x4 float frame holding 0..11, with the (row 0, column 'b') cell blanked out
df1=pd.DataFrame(data=np.arange(12,dtype=float).reshape(3,4),columns=['a','b','c','d'])
df1.at[0,'b']=np.nan
df1

Unnamed: 0,a,b,c,d
0,0.0,,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [4]:
# 4x5 float frame holding 0..19, with the row-1 cells of columns 'b' and 'e' blanked out
df2=pd.DataFrame(data=np.arange(20,dtype=float).reshape(4,5),columns=['a','b','c','d','e'])
df2.loc[1,['b','e']]=np.nan
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [5]:
df1+df2  # 两个相加，有了更多的NaN

Unnamed: 0,a,b,c,d,e
0,0.0,,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [6]:
df1.add(df2,fill_value=0)
#  Element-wise df1 + df2 after aligning rows and columns
#  Where only one of the two frames has a value, the missing side is treated as fill_value (0)
#  Where the value is missing in both frames (row 1, column 'e'), the result stays NaN

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [8]:
df2.add(df1,fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [9]:
1/df1  #  所有的值取倒数

Unnamed: 0,a,b,c,d
0,inf,,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [10]:
df1.rdiv(1)
#  等价于1 div df1,也就是1除以df1
#  看样子r应该是翻转的意思，就是换个位置

Unnamed: 0,a,b,c,d
0,inf,,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


### 重建索引（reindex）时也可以指定fill_value
### 这里指定fill_value=0：只填充新增的列'e'，原有的NaN保持不变

In [11]:
df1.reindex(columns=df2.columns,fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


### 5.2.5.2 Operations between DataFrame and Series DataFrame和Series间的操作
#### 就是一个表加减一行的情况

In [12]:
arr=np.arange(12.).reshape((3,4))
arr

array([[  0.,   1.,   2.,   3.],
       [  4.,   5.,   6.,   7.],
       [  8.,   9.,  10.,  11.]])

In [13]:
arr[0]

array([ 0.,  1.,  2.,  3.])

In [14]:
arr-arr[0]  
#  每一行都减去了arr[0]

array([[ 0.,  0.,  0.,  0.],
       [ 4.,  4.,  4.,  4.],
       [ 8.,  8.,  8.,  8.]])

#### 再来个例子

In [18]:
# 4x3 float frame holding 0..11; `series` is its first row (label 'Utah')
frame=pd.DataFrame(
    data=np.arange(12,dtype=float).reshape(4,3),
    index=['Utah','Ohio','Texas','Oregon'],
    columns=['b','d','e'],
)
series=frame.iloc[0]
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [19]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [20]:
frame-series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [25]:
series2=pd.Series(range(3),index=['b','e','f'])
series2

b    0
e    1
f    2
dtype: int32

In [26]:
frame+series2
#  bde+bef=be
#  每行都加上series2
#  其他都是NaN

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [27]:
series3=frame['d']
series3
#  取出其中的一列

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [28]:
frame  # 这个是之前的表格

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [29]:
frame.sub(series3,axis='index')
#  Use the sub method so the axis to match on can be chosen
#  axis='index' matches series3 against the row labels (the state names)
#  so column d's values are subtracted from every column, row by row

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0
