## 核心数据结构

In [1]:
import pandas as pd
import numpy as np

### Series

Series 是**一维带标签的数组**，数组里可以放任意的数据（整数，浮点数，字符串，Python Object）。其基本的创建函数是：

```python
s = pd.Series(data, index=index)
```

其中 index 是一个列表，用来作为数据的标签。data 可以是不同的数据类型：

* Python 字典
* ndarray 对象
* 一个标量值，如 5


#### 从 ndarray 创建

In [5]:
# Five standard-normal samples, labelled 'a' through 'e'.
s = pd.Series(data=np.random.randn(5), index=list('abcde'))
s

a    0.267876
b    0.772569
c   -0.989360
d    0.082494
e   -0.252940
dtype: float64

In [6]:
# The index holds the labels that were supplied when the Series was created.
s.index

Index([u'a', u'b', u'c', u'd', u'e'], dtype='object')

In [7]:
# No index is given, so the labels default to the integers 0..4.
s = pd.Series(data=np.random.randn(5))
s

0   -0.601604
1   -0.377704
2    1.062305
3   -1.963953
4    1.740142
dtype: float64

In [8]:
# Without an explicit index the labels are the integers 0..4.
s.index

Int64Index([0, 1, 2, 3, 4], dtype='int64')

#### 从字典创建

In [14]:
# Default handling of missing entries: a label requested in `index`
# but absent from the dict ('c' here) is filled with NaN.
d = dict(a=0., b=1., d=3)
s = pd.Series(d, index=['a', 'b', 'c', 'd'])
s

a     0
b     1
c   NaN
d     3
dtype: float64

#### 从标量创建

In [15]:
# A scalar is repeated for every label in the index.
pd.Series(3, index=list('abcde'))

a    3
b    3
c    3
d    3
e    3
dtype: int64

In [22]:
# Parenthesised call form of print runs on both Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print("Missing required dependencies {values}".format(values=['aaa', 'bbb']))

Missing required dependencies ['aaa', 'bbb']


#### Series 是类 ndarray 对象

熟悉 numpy 的同学对下面的操作应该不会陌生。我们在 numpy 简介里也介绍过下面的索引方式。

In [25]:
# Fresh five-element Series with default integer labels for the indexing demos below.
s = pd.Series(data=np.random.randn(5))
s

0    0.640332
1   -0.759048
2   -0.458432
3   -0.367096
4   -0.552213
dtype: float64

In [26]:
# Select a single element; the result is a scalar.
s[0]

0.64033226699389012

In [27]:
# Slicing keeps the labels of the selected elements.
s[:3]

0    0.640332
1   -0.759048
2   -0.458432
dtype: float64

In [28]:
# A list of keys selects several elements at once.
s[[1, 3, 4]]

1   -0.759048
3   -0.367096
4   -0.552213
dtype: float64

In [29]:
# NumPy functions apply element-wise and the result keeps the Series labels.
np.exp(s)

0    1.897111
1    0.468112
2    0.632274
3    0.692743
4    0.575675
dtype: float64

In [30]:
# Element-wise sine; the labels are preserved in the result.
np.sin(s)

0    0.597462
1   -0.688231
2   -0.442542
3   -0.358907
4   -0.524572
dtype: float64

#### Series 是类字典对象

In [32]:
# String-labelled Series used to show dict-style access.
s = pd.Series(data=np.random.randn(5), index=list('abcde'))
s

a    0.374798
b   -1.034789
c   -1.091428
d    0.789042
e    0.115345
dtype: float64

In [33]:
# Look up a value by its label, like a dict key.
s['a']

0.37479824109118348

In [34]:
# Assigning to an existing label overwrites its value.
s['e'] = 5

In [35]:
# 'e' now holds 5.0.
s

a    0.374798
b   -1.034789
c   -1.091428
d    0.789042
e    5.000000
dtype: float64

In [36]:
# Assigning to a label that does not exist yet adds a new entry.
s['g'] = 100

In [37]:
# The new label 'g' appears at the end of the Series.
s

a      0.374798
b     -1.034789
c     -1.091428
d      0.789042
e      5.000000
g    100.000000
dtype: float64

In [38]:
# Membership is tested against the labels, as with dict keys.
'e' in s

True

In [39]:
# 'f' is not one of the labels, so this is False.
'f' in s

False

In [41]:
# s['f']  # kept commented out: 'f' is not a label of s, so this lookup raises KeyError

In [43]:
# get() yields None for a missing label instead of raising.
# Parenthesised print runs on both Python 2 and Python 3.
print(s.get('f'))

None


In [44]:
# The second argument to get() is the fallback returned for a missing label.
# Parenthesised print runs on both Python 2 and Python 3.
print(s.get('f', np.nan))

nan


#### 标签对齐操作

In [50]:
# Two Series whose labels only partly overlap: 'a' and 'e' are shared,
# 'c' exists only in s1 and 'd' only in s2.
s1 = pd.Series(np.random.randn(3), index=['a', 'c', 'e'])
s2 = pd.Series(np.random.randn(3), index=['a', 'd', 'e'])
# Parenthesised print runs on both Python 2 and Python 3.
print('{0}\n\n{1}'.format(s1, s2))

a   -0.235168
c   -0.240533
e    1.944756
dtype: float64

a   -0.257899
d   -1.894610
e   -0.435829
dtype: float64


In [51]:
# Arithmetic aligns on labels; a label present in only one operand ('c', 'd') gives NaN.
s1 + s2

a   -0.493067
c         NaN
d         NaN
e    1.508927
dtype: float64

#### name 属性

In [53]:
# Give the Series a name at construction time.
s = pd.Series(data=np.random.randn(5), name='Some Thing')
s

0   -0.090608
1    1.840820
2    1.625704
3   -0.549411
4    1.106454
Name: Some Thing, dtype: float64

In [54]:
# The name is available through the `name` attribute.
s.name

'Some Thing'

### DataFrame

DataFrame 是**二维带行标签和列标签的数组**。可以把 DataFrame 想象成一个 Excel 表格或一个 SQL 数据库的表格，还可以想象成是一个 Series 对象字典。它是 Pandas 里最常用的数据结构。

创建 DataFrame 的基本格式是：

```python
df = pd.DataFrame(data, index=index, columns=columns)
```

其中 index 是行标签，columns 是列标签，data 可以是下面的数据：

* 由一维 numpy 数组，list，Series 构成的字典
* 二维 numpy 数组
* 一个 Series
* 另外的 DataFrame 对象


#### 从字典创建

In [62]:
# Two Series of different lengths; 'two' carries an extra label 'd'.
d = {
    'one': pd.Series(data=[1, 2, 3], index=list('abc')),
    'two': pd.Series(data=[1, 2, 3, 4], index=list('abcd')),
}

In [63]:
# Rows are the union of the Series labels; 'one' has no 'd', so that cell is NaN.
pd.DataFrame(d)

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [59]:
# An explicit index selects the rows and fixes their order.
pd.DataFrame(d, index=['d', 'b', 'a'])

Unnamed: 0,one,two
d,,4
b,2.0,2
a,1.0,1


In [61]:
# `columns` chooses which keys to use; 'three' is not a key of d, so that column is all NaN.
pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])

Unnamed: 0,two,three
d,4,
b,2,
a,1,


In [64]:
# Plain lists of equal length, one per column.
d = dict(one=[1, 2, 3, 4], two=[21, 22, 23, 24])

In [65]:
# With list values the rows get the default integer labels 0..3.
pd.DataFrame(d)

Unnamed: 0,one,two
0,1,21
1,2,22
2,3,23
3,4,24


In [66]:
# Supply the row labels explicitly.
pd.DataFrame(d, index=['a', 'b', 'c', 'd'])

Unnamed: 0,one,two
a,1,21
b,2,22
c,3,23
d,4,24


#### 从结构化数据中创建

In [67]:
# Each tuple is one record (row) with mixed value types.
data = [
    (1, 2.2, 'Hello'),
    (2, 3.0, 'World'),
]

In [68]:
# Each tuple becomes a row; rows and columns both get default integer labels.
pd.DataFrame(data)

Unnamed: 0,0,1,2
0,1,2.2,Hello
1,2,3.0,World


In [70]:
# Name the rows and the columns explicitly.
pd.DataFrame(data, index=['first', 'second'], columns=['A', 'B', 'C'])

Unnamed: 0,A,B,C
first,1,2.2,Hello
second,2,3.0,World


#### 从字典列表创建

In [71]:
# One dict per row; the second row has an extra key 'c'.
data = [
    dict(a=1, b=2),
    dict(a=5, b=10, c=20),
]

In [72]:
# Dict keys become columns; a key missing from a row ('c' in row 0) is filled with NaN.
pd.DataFrame(data)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [73]:
# Label the rows explicitly.
pd.DataFrame(data, index=['first', 'second'])

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [76]:
# Keep only the listed columns; 'c' is dropped.
pd.DataFrame(data, columns=['a', 'b'])

Unnamed: 0,a,b
0,1,2
1,5,10


#### 从元组字典创建

了解其创建的原理，实际应用中，会通过数据清洗的方式，把数据整理成方便 Pandas 导入且可读性好的格式。最后再通过 reindex/groupby 等方式转换成复杂数据结构。

In [78]:
# Outer tuple keys become the two-level column labels and inner tuple keys
# become the two-level row labels of the resulting DataFrame.
d = {('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
     ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
     ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
     ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
     ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}}

In [81]:
# Multi-level labels: the tuple keys yield hierarchical row and column labels.
pd.DataFrame(d)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,a,b,c,a,b
A,B,4.0,1.0,5.0,8.0,10.0
A,C,3.0,2.0,6.0,7.0,
A,D,,,,,9.0


#### 从 Series 创建

In [82]:
# A Series converts to a single-column DataFrame that reuses its labels as rows.
s = pd.Series(data=np.random.randn(5), index=list('abcde'))
pd.DataFrame(s)

Unnamed: 0,0
a,-2.905337
b,1.7782
c,2.283394
d,0.086121
e,-1.460939


In [83]:
# `index` picks a subset of the rows by label.
pd.DataFrame(s, index=['a', 'c', 'd'])

Unnamed: 0,0
a,-2.905337
c,2.283394
d,0.086121


In [85]:
# Same row subset, with the single column labelled 'A'.
pd.DataFrame(s, index=['a', 'c', 'd'], columns=['A'])

Unnamed: 0,A
a,-2.905337
c,2.283394
d,0.086121


#### 列选择/增加/删除

In [98]:
# 6x4 frame of standard-normal samples with named columns.
df = pd.DataFrame(data=np.random.randn(6, 4),
                  columns=['one', 'two', 'three', 'four'])
df

Unnamed: 0,one,two,three,four
0,-2.197815,0.558441,0.641929,1.513465
1,0.096819,-0.633099,-0.211517,0.205595
2,-0.030432,-0.299806,-0.265204,0.170584
3,2.398377,0.847184,0.207425,1.358312
4,-0.038788,-0.045007,0.18766,0.130579
5,-1.24108,0.005519,1.86163,-2.026534


In [99]:
# Indexing with a column name returns that column as a Series.
df['one']

0   -2.197815
1    0.096819
2   -0.030432
3    2.398377
4   -0.038788
5   -1.241080
Name: one, dtype: float64

In [100]:
# 'three' already exists, so this overwrites it with the element-wise sum one + two.
df['three'] = df['one'] + df['two']
df

Unnamed: 0,one,two,three,four
0,-2.197815,0.558441,-1.639373,1.513465
1,0.096819,-0.633099,-0.53628,0.205595
2,-0.030432,-0.299806,-0.330238,0.170584
3,2.398377,0.847184,3.24556,1.358312
4,-0.038788,-0.045007,-0.083796,0.130579
5,-1.24108,0.005519,-1.235561,-2.026534


In [101]:
# A comparison yields a boolean Series, stored here as the new column 'flag'.
df['flag'] = df['one'] > 0
df

Unnamed: 0,one,two,three,four,flag
0,-2.197815,0.558441,-1.639373,1.513465,False
1,0.096819,-0.633099,-0.53628,0.205595,True
2,-0.030432,-0.299806,-0.330238,0.170584,False
3,2.398377,0.847184,3.24556,1.358312,True
4,-0.038788,-0.045007,-0.083796,0.130579,False
5,-1.24108,0.005519,-1.235561,-2.026534,False


In [102]:
# `del` removes the column from the frame in place.
del df['three']
df

Unnamed: 0,one,two,four,flag
0,-2.197815,0.558441,1.513465,False
1,0.096819,-0.633099,0.205595,True
2,-0.030432,-0.299806,0.170584,False
3,2.398377,0.847184,1.358312,True
4,-0.038788,-0.045007,0.130579,False
5,-1.24108,0.005519,-2.026534,False


In [103]:
# pop() removes the column from df and returns it as a Series.
four = df.pop('four')
four

0    1.513465
1    0.205595
2    0.170584
3    1.358312
4    0.130579
5   -2.026534
Name: four, dtype: float64

In [104]:
# 'four' is no longer part of the frame after pop().
df

Unnamed: 0,one,two,flag
0,-2.197815,0.558441,False
1,0.096819,-0.633099,True
2,-0.030432,-0.299806,False
3,2.398377,0.847184,True
4,-0.038788,-0.045007,False
5,-1.24108,0.005519,False


In [105]:
# A scalar is repeated for every row of the new column.
df['five'] = 5
df

Unnamed: 0,one,two,flag,five
0,-2.197815,0.558441,False,5
1,0.096819,-0.633099,True,5
2,-0.030432,-0.299806,False,5
3,2.398377,0.847184,True,5
4,-0.038788,-0.045007,False,5
5,-1.24108,0.005519,False,5


In [106]:
# Assignment aligns on the row labels: only rows 0 and 1 get values, the rest become NaN.
df['one_trunc'] = df['one'][:2]
df

Unnamed: 0,one,two,flag,five,one_trunc
0,-2.197815,0.558441,False,5,-2.197815
1,0.096819,-0.633099,True,5,0.096819
2,-0.030432,-0.299806,False,5,
3,2.398377,0.847184,True,5,
4,-0.038788,-0.045007,False,5,
5,-1.24108,0.005519,False,5,


In [109]:
# insert() adds the column at an explicit position; 1 places 'bar' right after 'one'.
df.insert(1, 'bar', df['one'])
df

Unnamed: 0,one,bar,two,flag,five,one_trunc
0,-2.197815,-2.197815,0.558441,False,5,-2.197815
1,0.096819,0.096819,-0.633099,True,5,0.096819
2,-0.030432,-0.030432,-0.299806,False,5,
3,2.398377,2.398377,0.847184,True,5,
4,-0.038788,-0.038788,-0.045007,False,5,
5,-1.24108,-1.24108,0.005519,False,5,


#### 使用 assign() 方法来插入新列

更方便地使用 method chains（方法链）的方式来实现

In [111]:
# 6x4 frame of random integers in the range 1..4, columns A-D.
df = pd.DataFrame(data=np.random.randint(1, 5, size=(6, 4)),
                  columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,2,4,2,4
1,2,1,1,2
2,1,4,1,4
3,1,3,1,3
4,1,2,1,2
5,2,3,4,4


In [112]:
# assign() returns a frame with the extra column; df itself is left unchanged.
df.assign(Ratio = df['A'] / df['B'])

Unnamed: 0,A,B,C,D,Ratio
0,2,4,2,4,0.5
1,2,1,1,2,2.0
2,1,4,1,4,0.25
3,1,3,1,3,0.333333
4,1,2,1,2,0.5
5,2,3,4,4,0.666667


In [118]:
# Each callable receives the frame, so columns can be referenced as attributes.
# NOTE(review): CD_Ratio is computed as C - D (a difference) despite its name — confirm intent.
df.assign(AB_Ratio = lambda x: x.A / x.B, CD_Ratio = lambda x: x.C - x.D)

Unnamed: 0,A,B,C,D,AB_Ratio,CD_Ratio
0,2,4,2,4,0.5,-2
1,2,1,1,2,2.0,-1
2,1,4,1,4,0.25,-3
3,1,3,1,3,0.333333,-2
4,1,2,1,2,0.5,-1
5,2,3,4,4,0.666667,0


In [119]:
# Method chaining: the second assign() uses the AB_Ratio column created by the first.
df.assign(AB_Ratio = lambda x: x.A / x.B).assign(ABD_Ratio = lambda x: x.AB_Ratio * x.D)

Unnamed: 0,A,B,C,D,AB_Ratio,ABD_Ratio
0,2,4,2,4,0.5,2.0
1,2,1,1,2,2.0,4.0
2,1,4,1,4,0.25,1.0
3,1,3,1,3,0.333333,1.0
4,1,2,1,2,0.5,1.0
5,2,3,4,4,0.666667,2.666667
