# 10 Minutes to pandas

请参阅[官方文档](https://pandas.pydata.org/docs/user_guide/10min.html)

In [151]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [152]:
# 包导入
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 创建数据集对象

In [153]:
# A Series can be thought of as a one-dimensional array.
# The NaN entry marks a missing value; the result is stored as float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0     1
1     3
2     5
3   NaN
4     6
5     8
dtype: float64

In [154]:
# A DataFrame can be thought of as a two-dimensional array whose index can be
# chosen explicitly. Build six consecutive days to use as that index.
dates = pd.date_range(start='20160301', periods=6)
dates

DatetimeIndex(['2016-03-01', '2016-03-02', '2016-03-03', '2016-03-04',
               '2016-03-05', '2016-03-06'],
              dtype='datetime64[ns]', freq='D')

In [155]:
# 6x4 frame of random normal samples: rows labelled by `dates`, columns A-D.
# NOTE: no random seed is set in the cells above, so the values change on every run.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2016-03-01,1.05779,-0.190217,0.914144,-1.510789
2016-03-02,-0.977556,-0.388003,0.155051,0.263199
2016-03-03,1.198994,0.170433,-0.754744,1.160799
2016-03-04,-0.22574,1.081562,-0.393808,-0.214455
2016-03-05,0.797793,1.445573,-1.245718,-0.133621
2016-03-06,0.629801,-0.098653,0.329737,-0.427013


In [156]:
# Underlying data as a 2-D NumPy array (row and column labels are not included)
df.values

array([[ 1.05778983, -0.19021697,  0.91414432, -1.51078934],
       [-0.97755608, -0.38800288,  0.15505093,  0.26319861],
       [ 1.19899411,  0.1704333 , -0.75474376,  1.16079862],
       [-0.22573989,  1.08156156, -0.39380793, -0.21445454],
       [ 0.79779313,  1.44557323, -1.24571772, -0.13362084],
       [ 0.62980099, -0.09865309,  0.32973712, -0.42701299]])

In [157]:
# Build a DataFrame from a dict: each key becomes a column name and each value
# supplies that column's data.
df = pd.DataFrame({
    'A': 1,                               # scalar, repeated on every row
    'B': pd.Timestamp('20160301'),        # single timestamp, repeated on every row
    'C': range(4),                        # 0..3
    'D': np.arange(5, 9),                 # 5..8 as a NumPy array
    'E': 'text',                          # scalar string, repeated on every row
    'F': ['AA', 'BB', 'CC', 'DD'],        # one string per row
})
df

Unnamed: 0,A,B,C,D,E,F
0,1,2016-03-01,0,5,text,AA
1,1,2016-03-01,1,6,text,BB
2,1,2016-03-01,2,7,text,CC
3,1,2016-03-01,3,8,text,DD


In [158]:
# Data type of each column
df.dtypes

A             int64
B    datetime64[ns]
C             int64
D             int32
E            object
F            object
dtype: object

In [159]:
# Attribute access selects the column named 'A'
df.A

0    1
1    1
2    1
3    1
Name: A, dtype: int64

In [160]:
# A single column is returned as a pandas Series
type(df.A)

pandas.core.series.Series

## 查看数据

In [161]:
# Build the sample data set used by the cells that follow: one row per day
# starting 2016-03-01 and four columns (A-D) of random normal samples.
# NOTE: the generator is not seeded here, so the numbers differ between runs.
n_rows = 6
dates = pd.date_range(start='20160301', periods=n_rows)
df = pd.DataFrame(
    data=np.random.randn(n_rows, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2016-03-01,-1.551376,0.705075,0.069273,-0.243848
2016-03-02,0.776168,1.872046,-2.721737,0.075365
2016-03-03,-0.823614,-1.38457,0.040887,0.425011
2016-03-04,-2.065234,2.212431,-1.146514,-2.014971
2016-03-05,-0.717346,-1.066521,0.996664,1.989145
2016-03-06,-1.69287,-0.8353,-1.63352,-0.616365


In [162]:
# Dimensions as (number of rows, number of columns)
df.shape

(6, 4)

In [163]:
# First rows of the frame; with no argument, five rows are shown
df.head()

Unnamed: 0,A,B,C,D
2016-03-01,-1.551376,0.705075,0.069273,-0.243848
2016-03-02,0.776168,1.872046,-2.721737,0.075365
2016-03-03,-0.823614,-1.38457,0.040887,0.425011
2016-03-04,-2.065234,2.212431,-1.146514,-2.014971
2016-03-05,-0.717346,-1.066521,0.996664,1.989145


In [164]:
# First three rows only
df.head(3)

Unnamed: 0,A,B,C,D
2016-03-01,-1.551376,0.705075,0.069273,-0.243848
2016-03-02,0.776168,1.872046,-2.721737,0.075365
2016-03-03,-0.823614,-1.38457,0.040887,0.425011


In [165]:
# Last rows of the frame; with no argument, five rows are shown
df.tail()

Unnamed: 0,A,B,C,D
2016-03-02,0.776168,1.872046,-2.721737,0.075365
2016-03-03,-0.823614,-1.38457,0.040887,0.425011
2016-03-04,-2.065234,2.212431,-1.146514,-2.014971
2016-03-05,-0.717346,-1.066521,0.996664,1.989145
2016-03-06,-1.69287,-0.8353,-1.63352,-0.616365


In [166]:
df.tail(2)

Unnamed: 0,A,B,C,D
2016-03-05,-0.717346,-1.066521,0.996664,1.989145
2016-03-06,-1.69287,-0.8353,-1.63352,-0.616365


In [167]:
# Row labels of the frame (here a daily DatetimeIndex)
df.index

DatetimeIndex(['2016-03-01', '2016-03-02', '2016-03-03', '2016-03-04',
               '2016-03-05', '2016-03-06'],
              dtype='datetime64[ns]', freq='D')

In [168]:
# Column labels of the frame
df.columns

Index([u'A', u'B', u'C', u'D'], dtype='object')

In [169]:
# Raw values as a NumPy array, without row or column labels
df.values

array([[-1.55137581,  0.70507481,  0.06927297, -0.24384828],
       [ 0.77616818,  1.87204593, -2.72173723,  0.07536546],
       [-0.82361405, -1.38457044,  0.04088676,  0.42501123],
       [-2.06523411,  2.21243073, -1.14651394, -2.01497107],
       [-0.71734632, -1.06652057,  0.99666403,  1.98914489],
       [-1.69287009, -0.83529966, -1.6335196 , -0.61636535]])

In [170]:
# Per-column summary statistics: count, mean, std, min, quartiles and max
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-1.012379,0.250527,-0.732491,-0.064277
std,1.018201,1.566664,1.354347,1.312837
min,-2.065234,-1.38457,-2.721737,-2.014971
25%,-1.657497,-1.008715,-1.511768,-0.523236
50%,-1.187495,-0.065112,-0.552814,-0.084241
75%,-0.743913,1.580303,0.062176,0.3376
max,0.776168,2.212431,0.996664,1.989145


In [171]:
# Transpose: the dates become columns and A-D become rows
df.T

Unnamed: 0,2016-03-01 00:00:00,2016-03-02 00:00:00,2016-03-03 00:00:00,2016-03-04 00:00:00,2016-03-05 00:00:00,2016-03-06 00:00:00
A,-1.551376,0.776168,-0.823614,-2.065234,-0.717346,-1.69287
B,0.705075,1.872046,-1.38457,2.212431,-1.066521,-0.8353
C,0.069273,-2.721737,0.040887,-1.146514,0.996664,-1.63352
D,-0.243848,0.075365,0.425011,-2.014971,1.989145,-0.616365


In [172]:
# Shape after transposing: the row and column counts are swapped
df.T.shape

(4, 6)

In [173]:
# Sort by column label (axis=1) in descending order
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2016-03-01,-0.243848,0.069273,0.705075,-1.551376
2016-03-02,0.075365,-2.721737,1.872046,0.776168
2016-03-03,0.425011,0.040887,-1.38457,-0.823614
2016-03-04,-2.014971,-1.146514,2.212431,-2.065234
2016-03-05,1.989145,0.996664,-1.066521,-0.717346
2016-03-06,-0.616365,-1.63352,-0.8353,-1.69287


In [174]:
# Sort rows by the values in column 'C', smallest first
df.sort_values(by='C')

Unnamed: 0,A,B,C,D
2016-03-02,0.776168,1.872046,-2.721737,0.075365
2016-03-06,-1.69287,-0.8353,-1.63352,-0.616365
2016-03-04,-2.065234,2.212431,-1.146514,-2.014971
2016-03-03,-0.823614,-1.38457,0.040887,0.425011
2016-03-01,-1.551376,0.705075,0.069273,-0.243848
2016-03-05,-0.717346,-1.066521,0.996664,1.989145


## 数据选择

In [175]:
# Select a single column by name; the result is a Series
df['A']

2016-03-01   -1.551376
2016-03-02    0.776168
2016-03-03   -0.823614
2016-03-04   -2.065234
2016-03-05   -0.717346
2016-03-06   -1.692870
Freq: D, Name: A, dtype: float64

In [176]:
# Slicing with integers selects rows by position (the end position is excluded)
df[2:4]

Unnamed: 0,A,B,C,D
2016-03-03,-0.823614,-1.38457,0.040887,0.425011
2016-03-04,-2.065234,2.212431,-1.146514,-2.014971


In [177]:
# Slicing with date strings selects rows by label; both endpoints are included
df['20160302':'20160305']

Unnamed: 0,A,B,C,D
2016-03-02,0.776168,1.872046,-2.721737,0.075365
2016-03-03,-0.823614,-1.38457,0.040887,0.425011
2016-03-04,-2.065234,2.212431,-1.146514,-2.014971
2016-03-05,-0.717346,-1.066521,0.996664,1.989145


### 通过标签选择

In [178]:
# Select one row by its index label; the result holds that row's value per column
df.loc['20160301']

A   -1.551376
B    0.705075
C    0.069273
D   -0.243848
Name: 2016-03-01 00:00:00, dtype: float64

In [179]:
# A single row selected with .loc is returned as a Series
type(df.loc['20160301'])

pandas.core.series.Series

In [180]:
# All rows, restricted to columns 'A' and 'B' selected by label
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2016-03-01,-1.551376,0.705075
2016-03-02,0.776168,1.872046
2016-03-03,-0.823614,-1.38457
2016-03-04,-2.065234,2.212431
2016-03-05,-0.717346,-1.066521
2016-03-06,-1.69287,-0.8353


In [181]:
# Label-based row range (both endpoints included), restricted to columns 'A' and 'B'
df.loc['20160301':'20160305', ['A', 'B']]

Unnamed: 0,A,B
2016-03-01,-1.551376,0.705075
2016-03-02,0.776168,1.872046
2016-03-03,-0.823614,-1.38457
2016-03-04,-2.065234,2.212431
2016-03-05,-0.717346,-1.066521


In [182]:
# Single scalar value addressed by row label and column label
df.loc['2016-03-01', 'A']

-1.5513758074956525

In [183]:
# Label-based scalar access with .at; the row key is passed as a Timestamp
df.at[pd.Timestamp('2016-03-01'), 'A']
# NOTE (author's remark): passing the plain string, df.at['2016-03-01', 'A'],
# raised an error when this notebook was written - TODO confirm on newer pandas

-1.5513758074956525

### 通过位置选择

In [184]:
# Select the row at integer position 1 (the second row)
df.iloc[1]

A    0.776168
B    1.872046
C   -2.721737
D    0.075365
Name: 2016-03-02 00:00:00, dtype: float64

In [185]:
# Positional slice: rows 2-4 and columns 0-1 (end positions are excluded)
df.iloc[2:5, 0:2]

Unnamed: 0,A,B
2016-03-03,-0.823614,-1.38457
2016-03-04,-2.065234,2.212431
2016-03-05,-0.717346,-1.066521


In [186]:
# Rows at positions 1-4, all columns
df.iloc[1:5, :]

Unnamed: 0,A,B,C,D
2016-03-02,0.776168,1.872046,-2.721737,0.075365
2016-03-03,-0.823614,-1.38457,0.040887,0.425011
2016-03-04,-2.065234,2.212431,-1.146514,-2.014971
2016-03-05,-0.717346,-1.066521,0.996664,1.989145


In [187]:
# Scalar at row position 1, column position 1
df.iloc[1, 1]

1.87204593456861

In [188]:
# Same scalar via .iat, addressed by integer row and column position
df.iat[1, 1]

1.87204593456861

### 布尔索引

In [189]:
# Boolean indexing: keep only the rows where column 'A' is negative
df[df.A < 0]

Unnamed: 0,A,B,C,D
2016-03-01,-1.551376,0.705075,0.069273,-0.243848
2016-03-03,-0.823614,-1.38457,0.040887,0.425011
2016-03-04,-2.065234,2.212431,-1.146514,-2.014971
2016-03-05,-0.717346,-1.066521,0.996664,1.989145
2016-03-06,-1.69287,-0.8353,-1.63352,-0.616365


In [190]:
# Element-wise mask: the shape is kept, and values that are not > 0 come back as missing
df[df > 0]

Unnamed: 0,A,B,C,D
2016-03-01,,0.705075,0.069273,
2016-03-02,0.776168,1.872046,,0.075365
2016-03-03,,,0.040887,0.425011
2016-03-04,,2.212431,,
2016-03-05,,,0.996664,1.989145
2016-03-06,,,,


In [191]:
# Add a string column 'tag' labelling consecutive pairs of rows 'a', 'b', 'c'
df['tag'] = list('aabbcc')

In [192]:
# Show the frame, which now includes the 'tag' column
df

Unnamed: 0,A,B,C,D,tag
2016-03-01,-1.551376,0.705075,0.069273,-0.243848,a
2016-03-02,0.776168,1.872046,-2.721737,0.075365,a
2016-03-03,-0.823614,-1.38457,0.040887,0.425011,b
2016-03-04,-2.065234,2.212431,-1.146514,-2.014971,b
2016-03-05,-0.717346,-1.066521,0.996664,1.989145,c
2016-03-06,-1.69287,-0.8353,-1.63352,-0.616365,c


In [193]:
# Keep only the rows whose 'tag' value is one of the listed labels
df[df.tag.isin(['a', 'c'])]

Unnamed: 0,A,B,C,D,tag
2016-03-01,-1.551376,0.705075,0.069273,-0.243848,a
2016-03-02,0.776168,1.872046,-2.721737,0.075365,a
2016-03-05,-0.717346,-1.066521,0.996664,1.989145,c
2016-03-06,-1.69287,-0.8353,-1.63352,-0.616365,c


### 修改数据

In [194]:
# Show the current frame before any values are modified
df

Unnamed: 0,A,B,C,D,tag
2016-03-01,-1.551376,0.705075,0.069273,-0.243848,a
2016-03-02,0.776168,1.872046,-2.721737,0.075365,a
2016-03-03,-0.823614,-1.38457,0.040887,0.425011,b
2016-03-04,-2.065234,2.212431,-1.146514,-2.014971,b
2016-03-05,-0.717346,-1.066521,0.996664,1.989145,c
2016-03-06,-1.69287,-0.8353,-1.63352,-0.616365,c


In [195]:
# Integers 0..5 indexed by six consecutive days starting 2016-03-01
s = pd.Series(
    data=np.arange(6),
    index=pd.date_range(start='20160301', periods=6))
s

2016-03-01    0
2016-03-02    1
2016-03-03    2
2016-03-04    3
2016-03-05    4
2016-03-06    5
Freq: D, dtype: int32

In [196]:
# Add Series `s` as a new column 'E' (its date index matches the frame's index)
df['E'] = s

In [197]:
# Show the frame, which now includes column 'E'
df

Unnamed: 0,A,B,C,D,tag,E
2016-03-01,-1.551376,0.705075,0.069273,-0.243848,a,0
2016-03-02,0.776168,1.872046,-2.721737,0.075365,a,1
2016-03-03,-0.823614,-1.38457,0.040887,0.425011,b,2
2016-03-04,-2.065234,2.212431,-1.146514,-2.014971,b,3
2016-03-05,-0.717346,-1.066521,0.996664,1.989145,c,4
2016-03-06,-1.69287,-0.8353,-1.63352,-0.616365,c,5


In [198]:
# Assign a single value addressed by row label and column label
df.loc['20160301', 'A'] = 0.2
# NOTE (author's remark): plain indexing without .loc,
# df['20160301', 'A'] = 0.2, does not update this cell

In [199]:
# Show the frame: column 'A' on 2016-03-01 is now 0.2
df

Unnamed: 0,A,B,C,D,tag,E
2016-03-01,0.2,0.705075,0.069273,-0.243848,a,0
2016-03-02,0.776168,1.872046,-2.721737,0.075365,a,1
2016-03-03,-0.823614,-1.38457,0.040887,0.425011,b,2
2016-03-04,-2.065234,2.212431,-1.146514,-2.014971,b,3
2016-03-05,-0.717346,-1.066521,0.996664,1.989145,c,4
2016-03-06,-1.69287,-0.8353,-1.63352,-0.616365,c,5


In [200]:
# Assign the same cell through .at, using a Timestamp as the row label
df.at[pd.Timestamp('20160301'), 'A'] = 0.4

In [201]:
# Show the frame: column 'A' on 2016-03-01 is now 0.4
df

Unnamed: 0,A,B,C,D,tag,E
2016-03-01,0.4,0.705075,0.069273,-0.243848,a,0
2016-03-02,0.776168,1.872046,-2.721737,0.075365,a,1
2016-03-03,-0.823614,-1.38457,0.040887,0.425011,b,2
2016-03-04,-2.065234,2.212431,-1.146514,-2.014971,b,3
2016-03-05,-0.717346,-1.066521,0.996664,1.989145,c,4
2016-03-06,-1.69287,-0.8353,-1.63352,-0.616365,c,5


In [202]:
# Assign by integer position (row 0, column 0), then show the result
df.iat[0, 0] = 0.6
df

Unnamed: 0,A,B,C,D,tag,E
2016-03-01,0.6,0.705075,0.069273,-0.243848,a,0
2016-03-02,0.776168,1.872046,-2.721737,0.075365,a,1
2016-03-03,-0.823614,-1.38457,0.040887,0.425011,b,2
2016-03-04,-2.065234,2.212431,-1.146514,-2.014971,b,3
2016-03-05,-0.717346,-1.066521,0.996664,1.989145,c,4
2016-03-06,-1.69287,-0.8353,-1.63352,-0.616365,c,5


In [203]:
# Overwrite every row of column 'A' with the integers 10..15, then show the result
df.loc[:, 'A'] = np.arange(10, 16)
df

Unnamed: 0,A,B,C,D,tag,E
2016-03-01,10,0.705075,0.069273,-0.243848,a,0
2016-03-02,11,1.872046,-2.721737,0.075365,a,1
2016-03-03,12,-1.38457,0.040887,0.425011,b,2
2016-03-04,13,2.212431,-1.146514,-2.014971,b,3
2016-03-05,14,-1.066521,0.996664,1.989145,c,4
2016-03-06,15,-0.8353,-1.63352,-0.616365,c,5


In [204]:
# Work on an independent copy of columns 'B' and 'C' so that `df` itself is untouched
df2 = df.loc[:, ['B', 'C']].copy()
# Boolean-mask assignment: replace every positive entry with its negation
df2[df2 > 0] = -df2
df2

Unnamed: 0,B,C
2016-03-01,-0.705075,-0.069273
2016-03-02,-1.872046,-2.721737
2016-03-03,-1.38457,-0.040887
2016-03-04,-2.212431,-1.146514
2016-03-05,-1.066521,-0.996664
2016-03-06,-0.8353,-1.63352
