# Getting Started with pandas

In [2]:
import numpy as np

import pandas as pd
from pandas import Series, DataFrame

## Introduction to pandas Data Structures
### Series

In [3]:
# Build a Series from a plain list; pandas supplies a default integer index.
obj = pd.Series([1, 2, 3])
obj

0    1
1    2
2    3
dtype: int64

In [4]:
# The underlying data of the Series as a NumPy array.
obj.values

array([1, 2, 3])

In [5]:
# The index object that holds the Series labels.
obj.index

RangeIndex(start=0, stop=3, step=1)

In [6]:
# Supply explicit string labels instead of the default integer index.
obj2 = pd.Series([1, 2, 3], index=list('abc'))
obj2

a    1
b    2
c    3
dtype: int64

In [7]:
# Assign to a single element by label; this modifies obj2 in place.
obj2['b'] = 6
obj2

a    1
b    6
c    3
dtype: int64

In [8]:
# Boolean-mask filtering: keep only the entries greater than 3.
obj2[obj2 > 3]

b    6
dtype: int64

In [9]:
# Element-wise multiplication; the labels are preserved.
obj2 * 2

a     2
b    12
c     6
dtype: int64

In [11]:
# `in` tests membership against the index labels, not the values.
'b' in obj2

True

In [12]:
# A label that is not in the index yields False.
'd' in obj2

False

In [17]:
# Build a Series from a dict: the dict's keys become the Series index.
# NOTE(review): the name `dict` rebinds Python's built-in `dict` for the rest of
# the session. A later cell also reads this name, so rename both together.
dict = {'apple': 1, 'banana': 2, 'cherry': 3}
obj3 = Series(dict)
obj3

apple     1
banana    2
cherry    3
dtype: int64

In [18]:
# Values whose keys match a label in `fruits` are placed at that label's position;
# labels with no matching key ('date') become NaN.
fruits = ['cherry', 'banana', 'apple', 'date']
obj4 = Series(dict, index=fruits)
obj4

cherry    3.0
banana    2.0
apple     1.0
date      NaN
dtype: float64

In [19]:
# Element-wise missing-value test: True where the value is NaN.
pd.isnull(obj4)

cherry    False
banana    False
apple     False
date       True
dtype: bool

In [21]:
# The inverse test: True where a value is present.
pd.notnull(obj4)

cherry     True
banana     True
apple      True
date      False
dtype: bool

In [22]:
# The same missing-value check is available as a Series method.
obj4.isnull()

cherry    False
banana    False
apple     False
date       True
dtype: bool

In [26]:
# Both the index and the Series itself can carry a name; both show up in the repr.
obj4.index.name = 'fruit'
obj4.name = "myName"
obj4

fruit
cherry    3.0
banana    2.0
apple     1.0
date      NaN
Name: myName, dtype: float64

In [27]:
# Assign to .index to relabel the Series in place.
obj.index = ['A','B','C']
obj

A    1
B    2
C    3
dtype: int64

### DataFrame

In [28]:
# Column-oriented dict: each key becomes a column label, each list that column's values.
data = {
    'value1': list('ABCDE'),
    'value2': [1, 2, 3, 4, 5],
    'value3': [1.1, 2.2, 3.3, 4.4, 5.5],
}

frame = pd.DataFrame(data)
frame

Unnamed: 0,value1,value2,value3
0,A,1,1.1
1,B,2,2.2
2,C,3,3.3
3,D,4,4.4
4,E,5,5.5


In [29]:
# Specify the column order explicitly.
DataFrame(data, columns=['value2','value3','value1'])

Unnamed: 0,value2,value3,value1
0,1,1.1,A
1,2,2.2,B
2,3,3.3,C
3,4,4.4,D
4,5,5.5,E


In [31]:
# A requested column that is not found in the data ('value0') is filled with NA.
frame2 = DataFrame(data,
                   columns=['value3','value2','value1','value0'],
                   index=['one','two','three','four','five'])
frame2

Unnamed: 0,value3,value2,value1,value0
one,1.1,1,A,
two,2.2,2,B,
three,3.3,3,C,
four,4.4,4,D,
five,5.5,5,E,


In [32]:
# Access a column with dict-style notation.
# The returned Series has the same index as the DataFrame.
frame2['value2']

one      1
two      2
three    3
four     4
five     5
Name: value2, dtype: int64

In [34]:
# Access a column as an attribute.
frame2.value2

one      1
two      2
three    3
four     4
five     5
Name: value2, dtype: int64

In [35]:
# A row is retrieved by its label with .loc (use .iloc for integer position).
# The original .ix indexer is deprecated and was removed in pandas 1.0.
frame2.loc['three']

value3    3.3
value2      3
value1      C
value0    NaN
Name: three, dtype: object

In [39]:
# A column can be modified by assignment.
frame2['value0'] = np.arange(5,0,-1)
frame2

Unnamed: 0,value3,value2,value1,value0
one,1.1,1,A,5
two,2.2,2,B,4
three,3.3,3,C,3
four,4.4,4,D,2
five,5.5,5,E,1


In [41]:
# When a list or array is assigned to a column, its length must match the
# length of the DataFrame.
frame2['value0'] = [1,2,3,4,5]
frame2

Unnamed: 0,value3,value2,value1,value0
one,1.1,1,A,1
two,2.2,2,B,2
three,3.3,3,C,3
four,4.4,4,D,4
five,5.5,5,E,5


In [43]:
# Overwrite the column again, this time with a NumPy array of strings.
frame2['value0'] = np.array(['a','b','c','d','e'])
frame2

Unnamed: 0,value3,value2,value1,value0
one,1.1,1,A,a
two,2.2,2,B,b
three,3.3,3,C,c
four,4.4,4,D,d
five,5.5,5,E,e


In [44]:
# Assigning a Series aligns it exactly on the DataFrame's index;
# rows with no matching label are filled with a missing value.
frame2['value0'] = Series([1,3,5], index=["one",'three','five'])
frame2

Unnamed: 0,value3,value2,value1,value0
one,1.1,1,A,1.0
two,2.2,2,B,
three,3.3,3,C,3.0
four,4.4,4,D,
five,5.5,5,E,5.0


In [45]:
# Assigning to a column that does not exist creates a new column.
frame2['value4'] = frame2.value2 > 3
frame2

Unnamed: 0,value3,value2,value1,value0,value4
one,1.1,1,A,1.0,False
two,2.2,2,B,,False
three,3.3,3,C,3.0,False
four,4.4,4,D,,True
five,5.5,5,E,5.0,True


In [47]:
# `del` removes a column.
del frame2['value4']
frame2

Unnamed: 0,value3,value2,value1,value0
one,1.1,1,A,1.0
two,2.2,2,B,
three,3.3,3,C,3.0
four,4.4,4,D,
five,5.5,5,E,5.0


In [53]:
# Nested dict passed to DataFrame: the outer keys become the column labels,
# the inner keys become the row labels.
data = {
    'c1': {'r1': 1, 'r2': 2, 'r3': 3},
    'c2': {'r1': 'A', 'r2': 'B', 'r3': 'C'},
}
pd.DataFrame(data)

Unnamed: 0,c1,c2
r1,1,A
r2,2,B
r3,3,C


In [54]:
# An explicit index selects the rows to keep; a label absent from the
# inner dicts ('r0') produces a row of missing values.
DataFrame(data, index=['r0','r1','r2'])

Unnamed: 0,c1,c2
r0,,
r1,1.0,A
r2,2.0,B


In [57]:
# Dict of Series: each Series becomes one column, keyed by its dict key.
data = {
    'c1': pd.Series([1, 2, 3], index=['r1', 'r2', 'r3']),
    'c2': pd.Series(list('ABC'), index=['r1', 'r2', 'r3']),
}
pd.DataFrame(data)

Unnamed: 0,c1,c2
r1,1,A
r2,2,B
r3,3,C


可以传给DataFrame构造器的数据

类型 | 说明
---|---
2D ndarray | A matrix of data, passing optional row and column labels
dict of arrays, lists, or tuples | Each sequence becomes a column in the DataFrame. All sequences must be the same length.
NumPy structured/record array | Treated as the “dict of arrays” case
dict of Series | Each value becomes a column. Indexes from each Series are unioned together to form the result’s row index if no explicit index is passed.
dict of dicts | Each inner dict becomes a column. Keys are unioned to form the row index as in the “dict of Series” case.
list of dicts or Series | Each item becomes a row in the DataFrame. Union of dict keys or Series indexes become the DataFrame’s column labels
List of lists or tuples | Treated as the “2D ndarray” case
Another DataFrame | The DataFrame’s indexes are used unless different ones are passed
NumPy MaskedArray | Like the “2D ndarray” case except masked values become NA/missing in the DataFrame result

In [64]:
# 2D ndarray: rows of the array become rows of the DataFrame; the optional
# `columns` and `index` arguments supply the labels.
# The data is built with np.array so the cell really demonstrates the ndarray
# case (a plain list of lists is a separate row in the table above).
data = np.array([[1, 4], [2, 5], [3, 6]])
DataFrame(data, columns=['c1','c2'], index=['r1','r2','r3'])

Unnamed: 0,c1,c2
r1,1,4
r2,2,5
r3,3,6


In [63]:
# Dict of lists: every list supplies one column; `index` adds the row labels.
data = {'c1': [1, 2, 3], 'c2': [4, 5, 6]}
pd.DataFrame(data, index=['r1', 'r2', 'r3'], columns=['c1', 'c2'])

Unnamed: 0,c1,c2
r1,1,4
r2,2,5
r3,3,6


In [67]:
# Dict of arrays: same shape as the dict-of-lists case, with NumPy arrays as columns.
data = {
    'c1': np.arange(1, 4),
    'c2': np.arange(4, 7),
}
pd.DataFrame(data, index=['r1', 'r2', 'r3'], columns=['c1', 'c2'])

Unnamed: 0,c1,c2
r1,1,4
r2,2,5
r3,3,6


In [68]:
# Dict of tuples: tuples are accepted as column data just like lists.
data = {'c1': (1, 2, 3), 'c2': (4, 5, 6)}
pd.DataFrame(data, index=['r1', 'r2', 'r3'], columns=['c1', 'c2'])

Unnamed: 0,c1,c2
r1,1,4
r2,2,5
r3,3,6


### Index Objects

In [69]:
# Keep a reference to the Series' Index object so it can be inspected on its own.
obj = pd.Series(range(3), index=list('abc'))
index = obj.index
index

Index([u'a', u'b', u'c'], dtype='object')

In [76]:
# Index objects are immutable: item assignment raises TypeError.
# Catch it so the cell demonstrates the error without halting "Run All".
try:
    index[1] = 1
except TypeError as err:
    print(err)

Pandas 中主要的 index 对象

类 | 说明
---|---
`Index` | 最泛化的Index对象，将轴标签表示为一个由Python对象组成的NumPy数组
`Int64Index` | 针对整数的特殊Index
`MultiIndex` | 层次化索引对象，表示单个轴上的多层索引
`DatetimeIndex` | 储存纳秒级时间戳
`PeriodIndex` | 针对Period数据的特殊Index

In [74]:
data = [[1, 4], [2, 5], [3, 6]]
df = pd.DataFrame(data, index=['r1', 'r2', 'r3'], columns=['c1', 'c2'])

# Membership tests work against the Index object of either axis.
('c1' in df.columns, 'r1' in df.index)

(True, True)

Index 的方法与属性

方法 | 说明
---|---
`append` | 连接另一个Index对象，产生一个新的Index
`difference` | 计算差集
`intersection` | 计算交集
`union` | 计算并集
`isin` | 计算一个布尔型数组，指示Index中各值是否包含在参数集合中
`delete` | 删除索引i处的元素，得到新的Index
`drop` | 删除传入的值，得到新的Index
`insert` | 将元素插入到索引i处，得到新的Index
`is_monotonic` | 当各元素均大于等于前一个元素时，返回True
`is_unique` | 当Index没有重复值，返回True
`unique` | 计算Index中唯一值组成的数组

In [77]:
# Two overlapping Index objects used by the set-operation examples that follow.
index1 = pd.Index(list('abc'))
index2 = pd.Index(list('bcd'))

In [78]:
# Concatenate the two indexes into a new Index; duplicate labels are kept.
index1.append(index2)

Index([u'a', u'b', u'c', u'b', u'c', u'd'], dtype='object')

In [81]:
# Labels of index1 that are not in index2.
index1.difference(index2)

Index([u'a'], dtype='object')

In [82]:
# Labels common to both indexes.
index1.intersection(index2)

Index([u'b', u'c'], dtype='object')

In [83]:
# All distinct labels from both indexes.
index1.union(index2)

Index([u'a', u'b', u'c', u'd'], dtype='object')

In [85]:
# Boolean array marking which labels of index1 appear in the given list.
index1.isin(['a','c'])

array([ True, False,  True], dtype=bool)

In [86]:
# New Index without the element at position 1.
index1.delete(1)

Index([u'a', u'c'], dtype='object')

In [87]:
# New Index without the label 'b'.
index1.drop('b')

Index([u'a', u'c'], dtype='object')

## Essential Functionality
### Reindexing

In [88]:
# A Series whose labels are deliberately out of alphabetical order.
obj = pd.Series([1, 2, 3, 4], index=list('acbd'))
obj

a    1
c    2
b    3
d    4
dtype: int64

In [90]:
# Rearrange the data to follow the new label order; a label that obj does not
# have ('e') receives fill_value instead of NaN.
obj2 = obj.reindex(['a','b','c','d','e'], fill_value=0)
obj2

a    1
b    3
c    2
d    4
e    0
dtype: int64

In [98]:
# Filling while reindexing: 'ffill' carries the last seen value forward;
# a label with nothing before it (0) falls back to fill_value.
obj3 = pd.Series(list('ABC'), index=[1, 3, 5])
obj3.reindex(range(7), method='ffill', fill_value='?')

0    ?
1    A
2    A
3    B
4    B
5    C
6    C
dtype: object

In [99]:
# Backward fill: each new label takes the next available value; a trailing
# label with nothing after it (6) falls back to fill_value.
obj3.reindex(range(7), method='bfill', fill_value='?')

0    A
1    A
2    B
3    B
4    C
5    C
6    ?
dtype: object

In [104]:
# 3x3 frame with gaps in both label sequences (r2/r4 and c2/c4 are absent).
frame = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=['r1', 'r3', 'r5'],
    columns=['c1', 'c3', 'c5'],
)
frame

Unnamed: 0,c1,c3,c5
r1,0,1,2
r3,3,4,5
r5,6,7,8


In [106]:
# Reindex the rows with forward fill: the new labels r2 and r4 copy the
# preceding row.
frame.reindex(['r1','r2','r3','r4','r5'], method='ffill')

Unnamed: 0,c1,c3,c5
r1,0,1,2
r2,0,1,2
r3,3,4,5
r4,3,4,5
r5,6,7,8


In [108]:
# Filling via `method` only applies along axis 0 (per the original note;
# verify against your pandas version). Here the new columns are left as NaN.
frame.reindex(columns=['c1','c2','c3','c4','c5'])

Unnamed: 0,c1,c2,c3,c4,c5
r1,0,,1,,2
r3,3,,4,,5
r5,6,,7,,8


In [111]:
# Reindex rows and columns in a single call. Labels absent from `frame`
# (r2, r4, c2, c4) come back as NaN.
# This replaces the `.ix` indexer, which was removed in pandas 1.0. `.loc` is
# not a drop-in substitute here because it raises KeyError on missing labels.
frame.reindex(index=['r1','r2','r3','r4','r5'],
              columns=['c1','c2','c3','c4','c5'])

Unnamed: 0,c1,c2,c3,c4,c5
r1,0.0,,1.0,,2.0
r2,,,,,
r3,3.0,,4.0,,5.0
r4,,,,,
r5,6.0,,7.0,,8.0


reindex 函数的参数

参数 | 说明
---|---
`index` | 用作索引的新序列
`method` | 插值方式
`fill_value` | 重新索引过程中，引入缺失值的替代值
`limit` | 向前或向后填充时的最大填充量
`level` | 在MultiIndex的指定级别上匹配简单索引，否则选取其子集
`copy` | 默认为True，无论如何都要复制；如果为False，则新旧相等就不复制

### Dropping entries from an axis

In [112]:
# drop returns a new Series without the given label; obj itself is unchanged.
obj = pd.Series([1, 2, 3, 4, 5], index=list('abcde'))
obj.drop('c')

a    1
b    2
d    4
e    5
dtype: int64

In [114]:
# 4x4 frame; dropping a list of row labels returns a new frame without them.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    columns=list('abcd'),
    index=['one', 'two', 'three', 'four'],
)
data.drop(['two', 'four'])

Unnamed: 0,a,b,c,d
one,0,1,2,3
three,8,9,10,11


In [115]:
# With axis=1 the labels are dropped from the columns instead of the rows.
data.drop(['b','d'], axis=1)

Unnamed: 0,a,c
one,0,2
two,4,6
three,8,10
four,12,14


### Indexing, selection, and filtering

In [128]:
# Five labelled values used by the indexing examples below.
obj = pd.Series([1, 2, 3, 4, 5], index=list('abcde'))
obj

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [123]:
# Label-based and position-based lookup of the same element.
# Use .iloc for positions: an integer key passed to [] on a string-labelled
# Series relies on a deprecated positional fallback.
obj['b'], obj.iloc[1]

(2, 2)

In [124]:
# A list of labels selects several elements at once.
obj[['b','c','d']]

b    2
c    3
d    4
dtype: int64

In [125]:
# An integer slice is positional and excludes the end position.
obj[2:4]

c    3
d    4
dtype: int64

In [126]:
# Slicing with labels differs from ordinary Python slicing: the end label
# is included in the result.
obj['b':'d']

b    2
c    3
d    4
dtype: int64

In [131]:
# 4x4 frame of consecutive integers used by the selection examples below.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    columns=list('abcd'),
    index=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,a,b,c,d
one,0,1,2,3
two,4,5,6,7
three,8,9,10,11
four,12,13,14,15


In [133]:
# A single column label returns that column as a Series.
data['b']

one       1
two       5
three     9
four     13
Name: b, dtype: int64

In [134]:
# A list of column labels returns a DataFrame with just those columns.
data[['b','d']]

Unnamed: 0,b,d
one,1,3
two,5,7
three,9,11
four,13,15


In [141]:
# Select rows with a slice.
data[:2]

Unnamed: 0,a,b,c,d
one,0,1,2,3
two,4,5,6,7


In [142]:
# Comparing a column with a scalar yields a boolean Series.
data['c'] > 6

one      False
two      False
three     True
four      True
Name: c, dtype: bool

In [143]:
# Select rows with a boolean array.
data[data['c'] > 6]

Unnamed: 0,a,b,c,d
three,8,9,10,11
four,12,13,14,15


In [144]:
# Select rows and columns by label in one step with .loc.
# The original .ix indexer is deprecated and was removed in pandas 1.0.
data.loc[['two','three'], ['b','c']]

Unnamed: 0,b,c
two,5,6
three,9,10


In [145]:
# Row at integer position 2. .iloc is the explicit positional indexer; the
# original .ix only fell back to positions because the index holds strings.
data.iloc[2]

a     8
b     9
c    10
d    11
Name: three, dtype: int64

In [146]:
# Column at integer position 2, for all rows (.iloc replaces the removed .ix).
data.iloc[:, 2]

one       2
two       6
three    10
four     14
Name: c, dtype: int64

In [148]:
# Rows where column b exceeds 7, restricted to the first two columns.
# .loc takes the boolean row mask; the positional column slice is turned into
# labels via data.columns[:2] (the removed .ix mixed both styles implicitly).
data.loc[data.b > 7, data.columns[:2]]

Unnamed: 0,a,b
three,8,9
four,12,13


DataFrame 的索引选项

类型 | 说明
---|---
`obj[val]` | 选取DataFrame的单个列或一组列。布尔型数组（过滤行），切片（行切片）
`obj.ix[val]` | 选取DataFrame的单个行或一组行
`obj.ix[:, val]` | 选取DataFrame的单个列或一组列
`obj.ix[val1, val2]` | 同时选取行和列
`reindex` | 将一个或多个轴匹配到新索引
`xs` | 根据标签选取单行或单列，返回一个Series
~~`icol`,`irow`~~ | ~~根据整数位置选取单列或单行，返回一个Series~~
`get_value`,`set_value` | 根据行标签和列标签选取单个值（已在pandas 1.0中移除，请改用`at`/`iat`）

In [154]:
# Cross-section by row label: returns that row as a Series.
data.xs('two')

a    4
b    5
c    6
d    7
Name: two, dtype: int64

In [155]:
# With axis=1 the cross-section is taken by column label.
data.xs('b', axis=1)

one       1
two       5
three     9
four     13
Name: b, dtype: int64

In [166]:
# Row at integer position 1.
data.iloc[1]

a    4
b    5
c    6
d    7
Name: two, dtype: int64

In [167]:
# Column at integer position 1, for all rows.
data.iloc[:,1]

one       1
two       5
three     9
four     13
Name: b, dtype: int64

In [171]:
# Fetch a single scalar by row label and column label.
# .at replaces get_value, which was deprecated and then removed in pandas 1.0.
data.at['two', 'b']

5

### Arithmetic and data alignment