# Pandas
## Pandas数据结构介绍

### Series 
> 一维数组，包含索引index   
> index和value分别查看索引和值

In [7]:
import pandas as pd
import numpy as np
obj = pd.Series([4,7,-5,3])
obj.values

array([ 4,  7, -5,  3], dtype=int64)

> pandas会自动生成索引，也可以自定义索引

In [None]:
obj2 = pd.Series([4,7,-5,3],index=['a','b','c','d'])
obj2

In [None]:
obj2[obj2>2]

In [None]:
np.exp(obj2) # 转换为浮点数仍然保留索引

> Series可以当成字典用在函数中  
> 所以可以拿python字典来创建series

In [None]:
'b' in obj2

In [14]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

> 改变字典的键来更改索引

In [16]:
# 改变字典的键来更改索引
states = ['California','Ohio','Oregon','Texas']
obj4 = pd.Series(sdata,index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

> isnull和notnull函数来检测缺失值  
> 同样obj.isnull方法也能实现同样操作

In [18]:
# isnull和notnull函数来检测缺失值
pd.isnull(obj4) 

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

> Series最重要的一个功能是，它会根据运算的索引标签自动对齐数据：意味着运算结果为交集，类似与数据库join

In [19]:
obj3 + obj4 

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

> 还可以给Series对象设置名字和索引名，索引名能赋值修改

In [20]:
obj4.name = "population"
obj4.index.name = 'state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

### Dataframe  
> DataFrame是一个表格型的数据结构  
> DataFrame既有行索引也有列索引，它可以被看做由Series组成的字典（共用同一个索引）

In [28]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data) # 转换为数据帧
frame.index= [1,2,3,4,5,6] # 可以像Series一样更改索引
frame.columns = ['year','state','pop'] # 指定排列方式
frame

Unnamed: 0,year,state,pop
one,Ohio,2000,1.5
two,Ohio,2001,1.7
three,Ohio,2002,3.6
four,Nevada,2001,2.4
five,Nevada,2002,2.9
six,Nevada,2003,3.2


In [47]:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'])
frame2.index = ['one', 'two', 'three', 'four', 'five', 'six']
frame2 # 如果没数据会产生空值

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


> 可以通过切片从Dataframe中提取Series或者新的Dataframe

In [48]:
frame2.year
frame2[['year','pop','debt']] # 跟numpy的切片索引很像

Unnamed: 0,year,pop,debt
one,2000,1.5,
two,2001,1.7,
three,2002,3.6,
four,2001,2.4,
five,2002,2.9,
six,2003,3.2,


In [49]:
frame2.loc['three'] # 可以通过位置索引

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

> 通过赋值可以更改列数据

In [63]:
import numpy as np
frame2.debt = 16.5
frame2['debt'] = np.arange(6.0)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [64]:
val = pd.Series([-1.2,-1.5,-1.7],index=['two','four','five'])
frame2.debt = val 
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


> del用来删除列  
> 赋值不存在的列会创建新列

In [68]:
frame2['easterm'] = frame2.state == 'Ohio' # 这里就不能用方法了，只能用索引['eastern']
frame2 # 注意：不能用frame2.eastern创建新的列，笑死了我

Unnamed: 0,year,state,pop,debt,easterm
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


> 注意：通过索引方式返回的列只是相应数据的视图而已，并不是副本。因此，对返回的Series所做的任何就地修改全都会反映到源DataFrame上。通过Series的copy方法即可指定复制列。

In [72]:
frame3 = frame2.year.copy() # 复制一列
frame3

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

### 嵌套字典

> 嵌套字典：外层字典的键作为列，内层键则作为行索引

In [75]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [77]:
frame3.T # 转置

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


> 自定义索引顺序

In [81]:
pd.DataFrame(pop, index=[2001, 2002, 2003])
frame3.index = [2001, 2002, 2003] # 两者效果一样
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,1.5


In [85]:
pdata = {
    'Ohio':frame3['Ohio'][:-1],
    'Nevada':frame3['Nevada'][:2]
}
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


> dataframe构造函数能接受二维数组、numpy的结构化数组、Series字典、字典组成的字典、列表或者元组组成的列表、另一个dataframe

> 可以给dataframe设置表名、列名、索引名

In [94]:
frame3.index.name = 'year'
frame3.columns.name = 'state' # 设置列名，相当于设置了表名
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2003,,1.5


> 跟Series一样可以用index和value返回索引和值

In [96]:
frame2.values
frame3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

### 索引对象