# pandas

1. Series 结构：一维的，可以把它理解成数据库表的某个 column
2. DataFrame：二维的，数据库的整张表

实际上，**我们完全可以把 DataFrame 理解成一张数据库表，这个表有索引（index），column 名称，和数据**。Series 理解成一个特殊的 DataFrame，即只有1个 column 的 DataFrame.

pandas 这个库就是对这个“数据库表”进行各种操作的。为了方便理解，我把 DataFrame 和「表」互换使用，代表相同的含义。接下来，分为几个部分进行讨论：

- 表的创建
- column name 和 index 的修改
- 数据的查询
- 查询后数据的计算

In [3]:
import pandas as pd
import numpy as np

## 表的创建

In [26]:
record1 = pd.Series({'Name': 'Alice',
                        'Class': 'Physics',
                        'Score': 85})
record2 = pd.Series({'Name': 'Jack',
                        'Class': 'Chemistry',
                        'Score': 82})
record3 = pd.Series({'Name': 'Helen',
                        'Class': 'Biology',
                        'Score': 90})

dfo = pd.DataFrame([record1, record2, record3],
                  index=['school1', 'school2', 'school1'])
dfo

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school1,Helen,Biology,90


上面的数据看起来非常像一张数据库表吧。

- Name，Class，和 Score 是 column name
- school1 和 school2 实际上就是 index

如果我们不显示指定 index，那么 pandas 就会自动生成下标从 0 开始的数值索引。

In [27]:
record1 = pd.Series({'Name': 'Alice',
                        'Class': 'Physics',
                        'Score': 85})
record2 = pd.Series({'Name': 'Jack',
                        'Class': 'Chemistry',
                        'Score': 82})

dfo = pd.DataFrame([record1, record2])
dfo

Unnamed: 0,Name,Class,Score
0,Alice,Physics,85
1,Jack,Chemistry,82


### Series 和只有1列数据的 DataFrame

In [28]:
# 可以看到，Series 并没有 column name，字典的 key 就是数据对应的 index
record1 = pd.Series({'Name': 'Alice',
                        'Class': 'Physics',
                        'Score': 85})
record1

Name       Alice
Class    Physics
Score         85
dtype: object

In [29]:
# 只有1列数据的 DataFrame
record1 = pd.Series({'Name': 'Alice',
                        'Class': 'Physics',
                        'Score': 85})
dfo = pd.DataFrame([record1])
dfo

Unnamed: 0,Name,Class,Score
0,Alice,Physics,85


In [30]:
print(record1.index)
print(dfo.index)

Index(['Name', 'Class', 'Score'], dtype='object')
RangeIndex(start=0, stop=1, step=1)


## column name 和 index 的修改

### index 相关的操作

In [66]:
# index_col 属性让 csv 的第2列作为 index，如果不指定这个属性。用 pandas 自动生成的 index
df = pd.read_csv('aa.csv', index_col=1)
df.head()

Unnamed: 0_level_0,市场类型,日期,变动类型,每10股票分红（元）,配股价（元）,每10股票送几股,每10股票配几股,前流通盘,后流通盘,前总股本,后总股本,浓缩比例,份数,行权价
股票代码,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
'000001,深圳,19900301,除权除息,0.0,3.56,0.0,1.0,,,,,,,
'600601,上海,19900927,除权除息,0.0,9.6,0.0,5.0,,,,,,,
'600601,上海,19900927,配送股上市,,,,,606.6667,666.6667,910.0,1000.0,,,
'600653,上海,19910226,除权除息,0.0,0.0,90.0,0.0,,,,,,,
'600601,上海,19910312,除权除息,0.0,0.0,40.0,0.0,,,,,,,


In [64]:
# drop参数：会把原先的索引插入成 column，然后用 pandas 自动生成的索引
df.reset_index(drop=False, inplace=True)
df.head()

Unnamed: 0,股票代码,市场类型,日期,变动类型,每10股票分红（元）,配股价（元）,每10股票送几股,每10股票配几股,前流通盘,后流通盘,前总股本,后总股本,浓缩比例,份数,行权价
0,'000001,深圳,19900301,除权除息,0.0,3.56,0.0,1.0,,,,,,,
1,'600601,上海,19900927,除权除息,0.0,9.6,0.0,5.0,,,,,,,
2,'600601,上海,19900927,配送股上市,,,,,606.6667,666.6667,910.0,1000.0,,,
3,'600653,上海,19910226,除权除息,0.0,0.0,90.0,0.0,,,,,,,
4,'600601,上海,19910312,除权除息,0.0,0.0,40.0,0.0,,,,,,,


In [75]:
# 只保留指定这些 columns 的数据，其它的剔除掉
df = df[['股票代码', '市场类型', '日期', '变动类型', '配股价（元）', '份数']]
df.head(10)

Unnamed: 0,股票代码,市场类型,日期,变动类型,配股价（元）,份数
0,'000001,深圳,19900301,除权除息,3.56,
1,'600601,上海,19900927,除权除息,9.6,
2,'600601,上海,19900927,配送股上市,,
3,'600653,上海,19910226,除权除息,0.0,
4,'600601,上海,19910312,除权除息,0.0,
5,'000001,深圳,19910502,除权除息,0.0,
6,'000002,深圳,19910601,除权除息,4.4,
7,'000002,深圳,19910608,除权除息,0.0,
8,'000004,深圳,19910628,除权除息,0.0,
9,'000001,深圳,19910817,除权除息,0.0,


In [81]:
# 创建 MultiIndex（hierarchical index）
df_copy = df.set_index(['市场类型', '变动类型'])
df_copy.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,股票代码,日期,配股价（元）,份数
市场类型,变动类型,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
深圳,除权除息,'000001,19900301,3.56,
上海,除权除息,'600601,19900927,9.6,
上海,配送股上市,'600601,19900927,,
上海,除权除息,'600653,19910226,0.0,
上海,除权除息,'600601,19910312,0.0,
深圳,除权除息,'000001,19910502,0.0,
深圳,除权除息,'000002,19910601,4.4,
深圳,除权除息,'000002,19910608,0.0,
深圳,除权除息,'000004,19910628,0.0,
深圳,除权除息,'000001,19910817,0.0,


### column name 相关的操作

In [67]:
df.head()

Unnamed: 0_level_0,市场类型,日期,变动类型,每10股票分红（元）,配股价（元）,每10股票送几股,每10股票配几股,前流通盘,后流通盘,前总股本,后总股本,浓缩比例,份数,行权价
股票代码,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
'000001,深圳,19900301,除权除息,0.0,3.56,0.0,1.0,,,,,,,
'600601,上海,19900927,除权除息,0.0,9.6,0.0,5.0,,,,,,,
'600601,上海,19900927,配送股上市,,,,,606.6667,666.6667,910.0,1000.0,,,
'600653,上海,19910226,除权除息,0.0,0.0,90.0,0.0,,,,,,,
'600601,上海,19910312,除权除息,0.0,0.0,40.0,0.0,,,,,,,


In [70]:
new_df=df.rename(mapper=str.strip, axis=1)
new_df=new_df.rename(columns={'股票代码': '代码','市场类型': '类型'})
new_df.columns

Index(['类型', '日期', '变动类型', '每10股票分红（元）', '配股价（元）', '每10股票送几股', '每10股票配几股',
       '前流通盘', '后流通盘', '前总股本', '后总股本', '浓缩比例', '份数', '行权价'],
      dtype='object')

In [71]:
# 删除列
del(new_df["份数"])
new_df.columns

Index(['类型', '日期', '变动类型', '每10股票分红（元）', '配股价（元）', '每10股票送几股', '每10股票配几股',
       '前流通盘', '后流通盘', '前总股本', '后总股本', '浓缩比例', '行权价'],
      dtype='object')

炫技一下：批量修改 columns name

In [152]:
np.arange(0, len(new_df.columns))

array([0, 1, 2, 3, 4, 5])

In [153]:
cols = ['col' + str(x) for x in np.arange(0, len(new_df.columns))]
cols

['col0', 'col1', 'col2', 'col3', 'col4', 'col5']

In [154]:
new_df.columns = cols
new_df.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5
0,'000001,深圳,19900301,除权除息,3.56,
1,'600601,上海,19900927,除权除息,9.6,
2,'600601,上海,19900927,配送股上市,,
3,'600653,上海,19910226,除权除息,0.0,
4,'600601,上海,19910312,除权除息,0.0,


## 数据的查询

In [124]:
df.head(3)

Unnamed: 0,股票代码,市场类型,日期,变动类型,配股价（元）,份数
0,'000001,深圳,19900301,除权除息,3.56,
1,'600601,上海,19900927,除权除息,9.6,
2,'600601,上海,19900927,配送股上市,,


In [125]:
dfs = df.set_index('股票代码')
dfs.head(3)

Unnamed: 0_level_0,市场类型,日期,变动类型,配股价（元）,份数
股票代码,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
'000001,深圳,19900301,除权除息,3.56,
'600601,上海,19900927,除权除息,9.6,
'600601,上海,19900927,配送股上市,,


In [89]:
dfs['市场类型'].unique()

array(['深圳', '上海'], dtype=object)

In [88]:
dfs['变动类型'].unique()

array(['除权除息', '配送股上市', '股本变动', '未知股本变动', '非流通股上市', '股份回购', '增发新股',
       '增发新股上市', '配转股上市', '可转债上市', '指数留存', '送认购权证', '非流通股缩股', '送认沽权证'],
      dtype=object)

### loc 和 iloc 都是用于查询 DataFrame 的行和列的。它们都是属性，而不是函数。

- loc 主要是基于 label，即名称（name），和 boolean 数组查询的。
- iloc 主要是基于 integer-location 查询的。

In [126]:
dfs.iloc[0:2, 0:3]

Unnamed: 0_level_0,市场类型,日期,变动类型
股票代码,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
'000001,深圳,19900301,除权除息
'600601,上海,19900927,除权除息


In [104]:
# loc 由于是属性，不能像函数传递多个参数
# 逗号就是分隔符，前面指的是 index 的值，用于选择 rows；后面指的是 column names，用于选择 columns
dfs.loc[["'000001", "'600601"], ["市场类型", "日期"]].head()

Unnamed: 0_level_0,市场类型,日期
股票代码,Unnamed: 1_level_1,Unnamed: 2_level_1
'000001,深圳,19900301
'000001,深圳,19910502
'000001,深圳,19910817
'000001,深圳,19920323
'000001,深圳,19930524


In [127]:
# 基于 boolean 数组查询
dfs.loc[dfs['配股价（元）']>4].head()

Unnamed: 0_level_0,市场类型,日期,变动类型,配股价（元）,份数
股票代码,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
'600601,上海,19900927,除权除息,9.6,
'000002,深圳,19910601,除权除息,4.4,
'600653,上海,19920601,除权除息,9.5,
'600654,上海,19920604,除权除息,6.0,
'600652,上海,19930316,除权除息,5.5,


In [128]:
dfs.reset_index(inplace=True)
dfs.head()
# dfs.drop("index", axis=1, inplace=True)

Unnamed: 0,股票代码,市场类型,日期,变动类型,配股价（元）,份数
0,'000001,深圳,19900301,除权除息,3.56,
1,'600601,上海,19900927,除权除息,9.6,
2,'600601,上海,19900927,配送股上市,,
3,'600653,上海,19910226,除权除息,0.0,
4,'600601,上海,19910312,除权除息,0.0,


In [129]:
dfs_copy = dfs.set_index(['市场类型', '变动类型'])
dfs_copy.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,股票代码,日期,配股价（元）,份数
市场类型,变动类型,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
深圳,除权除息,'000001,19900301,3.56,
上海,除权除息,'600601,19900927,9.6,
上海,配送股上市,'600601,19900927,,
上海,除权除息,'600653,19910226,0.0,
上海,除权除息,'600601,19910312,0.0,
深圳,除权除息,'000001,19910502,0.0,
深圳,除权除息,'000002,19910601,4.4,
深圳,除权除息,'000002,19910608,0.0,
深圳,除权除息,'000004,19910628,0.0,
深圳,除权除息,'000001,19910817,0.0,


In [132]:
# MultiIndex 的查询
dfs_copy.loc[ [('深圳', '除权除息'), ('上海', '配送股上市')] ].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,股票代码,日期,配股价（元）,份数
市场类型,变动类型,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
深圳,除权除息,'000001,19900301,3.56,
深圳,除权除息,'000001,19910502,0.0,
深圳,除权除息,'000002,19910601,4.4,
深圳,除权除息,'000002,19910608,0.0,
深圳,除权除息,'000004,19910628,0.0,


In [135]:
dfs_copy.loc[["深圳", "除权除息"], "日期"].head()

市场类型  变动类型
深圳    除权除息    19900301
      除权除息    19910502
      除权除息    19910601
      除权除息    19910608
      除权除息    19910628
Name: 日期, dtype: int64

### 直接查询

上面的查询使用的是 iloc 和 loc 两个函数。下面采用直接查询的方法。

操作的核心就是：**Boolean masking**

In [136]:
dfs_copy.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,股票代码,日期,配股价（元）,份数
市场类型,变动类型,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
深圳,除权除息,'000001,19900301,3.56,
上海,除权除息,'600601,19900927,9.6,
上海,配送股上市,'600601,19900927,,
上海,除权除息,'600653,19910226,0.0,
上海,除权除息,'600601,19910312,0.0,


In [142]:
dfs_copy[dfs_copy['配股价（元）'] > 4].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,股票代码,日期,配股价（元）,份数
市场类型,变动类型,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
上海,除权除息,'600601,19900927,9.6,
深圳,除权除息,'000002,19910601,4.4,
上海,除权除息,'600653,19920601,9.5,
上海,除权除息,'600654,19920604,6.0,
上海,除权除息,'600652,19930316,5.5,


In [144]:
# 获取指定的 column
dfs_copy[["股票代码","配股价（元）"]].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,股票代码,配股价（元）
市场类型,变动类型,Unnamed: 2_level_1,Unnamed: 3_level_1
深圳,除权除息,'000001,3.56
上海,除权除息,'600601,9.6
上海,配送股上市,'600601,
上海,除权除息,'600653,0.0
上海,除权除息,'600601,0.0


上面都是传递1个条件的。介绍一下传递多个条件的几种方式

In [146]:
# 注意：一定要加上 小括号
(dfs_copy['配股价（元）'] > 5) & (dfs_copy['配股价（元）'] < 9)

# 这种方式不用加 小括号
dfs_copy['配股价（元）'].gt(5) & dfs_copy['配股价（元）'].lt(9)

# 简写
dfs_copy['配股价（元）'].gt(5).lt(9)

市场类型  变动类型 
深圳    除权除息     True
上海    除权除息     True
      配送股上市    True
      除权除息     True
      除权除息     True
               ... 
      股本变动     True
      股本变动     True
      股本变动     True
      股本变动     True
      股本变动     True
Name: 配股价（元）, Length: 65865, dtype: bool

## 缺失值的表示：None 和 Nan（Not a number）

在 Python 中，会用 None 类型表示缺失值。然而，pandas 对它做了如下的转换：

- 当数据类型是数值相关类型时，用 Nan 表示 missing value
- 当数据类型是 Object 相关类型时，用 None 表示 missing value

对于数据科学家来说，它们都是用于表达缺失值的，并没有什么不同。

In [26]:
students = ['Alice', 'Jack', None]
se = pd.Series(students)
print(se.dtype)

print('------------------')

# 为什么 int 变成了 float，这是 pandas 底层用 float 表示 Nan 的，然后就把 int 转换成精度更高的 float
numbers = [1, 2, None]
nn = pd.Series(numbers)
print(nn.dtype)

object
------------------
float64


In [27]:
np.isnan(np.nan)

True

### 缺失值的处理方法

In [15]:
lldf = pd.read_csv("log.csv")
lldf.head()

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,,
2,1469974544,cheryl,intro.html,9,,
3,1469974574,cheryl,intro.html,10,,
4,1469977514,bob,intro.html,1,,


In [16]:
# 识别 None 或者 numpy.NaN
# 如果想要它也识别「空字符串 ''」 或者 「numpy.inf」，加上 pandas.options.mode.use_inf_as_na = True
mask=lldf.isnull()
mask.head(3)

Unnamed: 0,time,user,video,playback position,paused,volume
0,False,False,False,False,False,False
1,False,False,False,False,True,True
2,False,False,False,False,True,True


In [18]:
lldf.dropna().head()

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
13,1469974424,sue,advanced.html,23,False,10.0
24,1469977424,bob,intro.html,1,True,10.0


In [38]:
# 找出 paused 列的非 null 值
lldf[lldf["paused"].notnull()]

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
13,1469974424,sue,advanced.html,23,False,10.0
24,1469977424,bob,intro.html,1,True,10.0


In [40]:
# 用指定的值填充 na
lldf["paused"].fillna(True)

0     False
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8      True
9      True
10     True
11     True
12     True
13    False
14     True
15     True
16     True
17     True
18     True
19     True
20     True
21     True
22     True
23     True
24     True
25     True
26     True
27     True
28     True
29     True
30     True
31     True
32     True
Name: paused, dtype: bool

In [41]:
lldf.head()

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,,
2,1469974544,cheryl,intro.html,9,,
3,1469974574,cheryl,intro.html,10,,
4,1469977514,bob,intro.html,1,,


In [55]:
# 按照 volume 排序
lldf.sort_values(by=['volume'], ascending=False, na_position="first", inplace=True)
lldf

Unnamed: 0,time,user,video,playback position,paused,volume
1,1469974454,cheryl,intro.html,6,,
2,1469974544,cheryl,intro.html,9,,
3,1469974574,cheryl,intro.html,10,,
4,1469977514,bob,intro.html,1,,
5,1469977544,bob,intro.html,1,,
6,1469977574,bob,intro.html,1,,
7,1469977604,bob,intro.html,1,,
8,1469974604,cheryl,intro.html,11,,
9,1469974694,cheryl,intro.html,14,,
10,1469974724,cheryl,intro.html,15,,


In [58]:
# 主要：要排序以后再用 bfill 或 ffill
lldf["volume"].fillna(method='bfill', inplace=True)
lldf

Unnamed: 0,time,user,video,playback position,paused,volume
1,1469974454,cheryl,intro.html,6,,10.0
2,1469974544,cheryl,intro.html,9,,10.0
3,1469974574,cheryl,intro.html,10,,10.0
4,1469977514,bob,intro.html,1,,10.0
5,1469977544,bob,intro.html,1,,10.0
6,1469977574,bob,intro.html,1,,10.0
7,1469977604,bob,intro.html,1,,10.0
8,1469974604,cheryl,intro.html,11,,10.0
9,1469974694,cheryl,intro.html,14,,10.0
10,1469974724,cheryl,intro.html,15,,10.0


In [59]:
# 也可用正则表达式匹配，具体参照官方文档
lldf["volume"].replace([10, 5], [0, 6], inplace=True)
lldf

Unnamed: 0,time,user,video,playback position,paused,volume
1,1469974454,cheryl,intro.html,6,,0.0
2,1469974544,cheryl,intro.html,9,,0.0
3,1469974574,cheryl,intro.html,10,,0.0
4,1469977514,bob,intro.html,1,,0.0
5,1469977544,bob,intro.html,1,,0.0
6,1469977574,bob,intro.html,1,,0.0
7,1469977604,bob,intro.html,1,,0.0
8,1469974604,cheryl,intro.html,11,,0.0
9,1469974694,cheryl,intro.html,14,,0.0
10,1469974724,cheryl,intro.html,15,,0.0


当对某列数据进行统计时，比如求和，会忽视 missing value

In [62]:
lldf["volume"].replace(0.0, np.nan, inplace=True)
lldf

Unnamed: 0,time,user,video,playback position,paused,volume
1,1469974454,cheryl,intro.html,6,,
2,1469974544,cheryl,intro.html,9,,
3,1469974574,cheryl,intro.html,10,,
4,1469977514,bob,intro.html,1,,
5,1469977544,bob,intro.html,1,,
6,1469977574,bob,intro.html,1,,
7,1469977604,bob,intro.html,1,,
8,1469974604,cheryl,intro.html,11,,
9,1469974694,cheryl,intro.html,14,,
10,1469974724,cheryl,intro.html,15,,


In [63]:
np.sum(lldf["volume"])

6.0

### 文本处理方法

# 实际案例

In [80]:
census = pd.read_csv('census.csv')
# 2010到2015的人口估算数据
census_ex = census.head(3)[['POPESTIMATE2010',
                'POPESTIMATE2011',
                'POPESTIMATE2012',
                'POPESTIMATE2013',
                'POPESTIMATE2014',
                'POPESTIMATE2015']]
census_ex

Unnamed: 0,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015
0,4785161,4801108,4816089,4830533,4846411,4858979
1,54660,55253,55175,55038,55290,55347
2,183193,186659,190396,195126,199713,203709


为 census_ex 加2列：min 和 max

In [84]:
# 定义函数
def min_max(row):
    data = row[['POPESTIMATE2010',
                'POPESTIMATE2011',
                'POPESTIMATE2012',
                'POPESTIMATE2013',
                'POPESTIMATE2014',
                'POPESTIMATE2015']]
    # Create a new entry for max
    row['max'] = np.max(data)
    # Create a new entry for min
    row['min'] = np.min(data)
    return row
# 应用函数
census_ex.apply(min_max, axis='columns')

Unnamed: 0,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,max,min
0,4785161,4801108,4816089,4830533,4846411,4858979,4858979,4785161
1,54660,55253,55175,55038,55290,55347,55347,54660
2,183193,186659,190396,195126,199713,203709,203709,183193


In [85]:
# 用 lambda 来简单算一下 2010 到 2013 之间的 max
rows = ['POPESTIMATE2010', 'POPESTIMATE2011', 'POPESTIMATE2012', 'POPESTIMATE2013']
census_ex.apply(lambda x: np.max(x[rows]), axis=1).head()

0    4830533
1      55253
2     195126
dtype: int64

In [86]:
census["STNAME"].head(10)

0    Alabama
1    Alabama
2    Alabama
3    Alabama
4    Alabama
5    Alabama
6    Alabama
7    Alabama
8    Alabama
9    Alabama
Name: STNAME, dtype: object

In [89]:
# 为美国的各个州分区
def get_state_region(x):
    northeast = ['Connecticut', 'Maine', 'Massachusetts', 'New Hampshire', 
                 'Rhode Island','Vermont','New York','New Jersey','Pennsylvania']
    midwest = ['Illinois','Indiana','Michigan','Ohio','Wisconsin','Iowa',
               'Kansas','Minnesota','Missouri','Nebraska','North Dakota',
               'South Dakota']
    south = ['Delaware','Florida','Georgia','Maryland','North Carolina',
             'South Carolina','Virginia','District of Columbia','West Virginia',
             'Alabama','Kentucky','Mississippi','Tennessee','Arkansas',
             'Louisiana','Oklahoma','Texas']
    west = ['Arizona','Colorado','Idaho','Montana','Nevada','New Mexico','Utah',
            'Wyoming','Alaska','California','Hawaii','Oregon','Washington']
    
    if x in northeast:
        return "Northeast"
    elif x in midwest:
        return "Midwest"
    elif x in south:
        return "South"
    else:
        return "West"

census['state_region'] = census['STNAME'].apply(lambda x: get_state_region(x))
census[['STNAME','state_region']].head()

Unnamed: 0,STNAME,state_region
0,Alabama,South
1,Alabama,South
2,Alabama,South
3,Alabama,South
4,Alabama,South


# TODO

1、合并2个 DataFrame：https://zoiklzsl.labs.coursera.org/notebooks/resources/week-3/MergingDataFrame_ed.ipynb

concat 和 merge 函数

2、

In [31]:
df = pd.DataFrame({
    "c": [1, 1, 1, 2, 2, 2, 2],
    "type": ["m", "n", "o", "m", "m", "n", "n"],
    "b": [3, 1, 1, 3,4,5,1]
})
df

Unnamed: 0,c,type,b
0,1,m,3
1,1,n,1
2,1,o,1
3,2,m,3
4,2,m,4
5,2,n,5
6,2,n,1


In [23]:
df.groupby("type").sum()

Unnamed: 0_level_0,c,b
type,Unnamed: 1_level_1,Unnamed: 2_level_1
m,5,10
n,5,7
o,1,1


In [24]:
df.groupby("type").agg({"c": (np.sum, np.mean)})

Unnamed: 0_level_0,c,c
Unnamed: 0_level_1,sum,mean
type,Unnamed: 1_level_2,Unnamed: 2_level_2
m,5,1.666667
n,5,1.666667
o,1,1.0


In [35]:
df.groupby("type").transform(np.sum)

Unnamed: 0,c,b
0,5,10
1,5,7
2,1,1
3,5,10
4,5,10
5,5,7
6,5,7


In [39]:
df.groupby("type").transform(np.sum)

Unnamed: 0,c,b
0,5,10
1,5,7
2,1,1
3,5,10
4,5,10
5,5,7
6,5,7
