# 数据规整化：清理、转换、合并、重塑

In [1]:
%pylab inline 

from pandas import Series, DataFrame
import pandas as pd

from numpy.random import randn
import numpy as np

Populating the interactive namespace from numpy and matplotlib


## 合并数据集

### 数据库风格的DataFrame合并 (pandas.merge)

合并（merge）或连接（join）以下两个 DataFrame，考虑 'left', 'right', 'outer', 'inner' 不同合并方式

In [2]:
df1 = DataFrame({'key': list('bbacaab'), 'data1': range(7)})
df1

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,a
6,6,b


In [3]:
df2 = DataFrame({'key': list('abd'), 'data2': range(3)})
df2

Unnamed: 0,data2,key
0,0,a
1,1,b
2,2,d


### 索引上的合并 (pandas.merge)

合并以下两个 DataFrame

In [4]:
left1 = DataFrame({'key': list('abaabc'), 'value': range(6)})
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [5]:
right1 = DataFrame({'group_val': [3.5, 7]}, index=['a','b'])
right1

Unnamed: 0,group_val
a,3.5
b,7.0


### 轴向连接 (pandas.concat)

连接以下两个 DataFrame (尝试axis=0,axis=1)

In [6]:
df1 = DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
                columns=['one', 'two'])
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [7]:
df2 = DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'], 
                columns=['three', 'four'])
df2

Unnamed: 0,three,four
a,5,6
c,7,8


### 合并重叠数据

试合并以下两个 DataFrame

In [8]:
df1 = DataFrame({'a': [1., np.nan, 5., np.nan], 
                 'b': [np.nan, 2., np.nan, 6.],
                 'c': range(2, 18, 4)})
df1

Unnamed: 0,a,b,c
0,1.0,,2
1,,2.0,6
2,5.0,,10
3,,6.0,14


In [9]:
df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.], 
                 'b': [np.nan, 3., 4., 6., 8.]})
df2

Unnamed: 0,a,b
0,5.0,
1,4.0,3.0
2,,4.0
3,3.0,6.0
4,7.0,8.0


## 重塑和轴向旋转

### 将长格式旋转成宽格式

转换下面长格式的DataFrame，不同的item值形成一个列，date列中的时间值做索引。

In [10]:
ldata = DataFrame({
    'date': ['1959-03-31']*3 + ['1959-06-30']*3 + ['1959-09-30']*3 + ['1959-12-31']*3,
    'item': ['R','I','U']*4,
    'value': [2710.349, 0, 5.8, 2778.801, 2.34, 5.1, 2775.488, 2.74, 5.3, 2785.204, 2.66, 5.12]
})

ldata

Unnamed: 0,date,item,value
0,1959-03-31,R,2710.349
1,1959-03-31,I,0.0
2,1959-03-31,U,5.8
3,1959-06-30,R,2778.801
4,1959-06-30,I,2.34
5,1959-06-30,U,5.1
6,1959-09-30,R,2775.488
7,1959-09-30,I,2.74
8,1959-09-30,U,5.3
9,1959-12-31,R,2785.204


## 数据转换

### 利用函数或映射进行数据转换

将下面 DataFrame 按照食物来源添加一列表示该肉类食物来源的动物类型

In [11]:
meat_to_animal = {'bacon': 'pig', 'pulled pork': 'pig', 'pastrami': 'cow', 'corned beef': 'cow', 'honey ham': 'pig', 'nova lox': 'salmon'}

data = DataFrame({
    'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 'Bacon', 'pastrami', 'honey ham','nova lox'],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


### 离散化和面元划分

1. 将 ages 根据 bins 划分
2. 将 ages 分成等数量的四群

In [12]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 36, 60, 100]

### 侦测和过滤异常值

将下面 DataFrmae 中绝对值大于 3 值调整为等于 3

In [13]:
np.random.seed(12345)

data = DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067684,0.067924,0.025598,-0.002298
std,0.998035,0.992106,1.006835,0.996794
min,-3.428254,-3.548824,-3.184377,-3.745356
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.366626,2.653656,3.260383,3.927528


### 排列和随机采样

1. 找出 10 个字母（不重复）
2. 找出 10 个字母（可重复）

In [14]:
import string

letters = np.array(string.ascii_uppercase)
letters

array('ABCDEFGHIJKLMNOPQRSTUVWXYZ', 
      dtype='|S26')

### 计算指标、哑变量

将下面电影分类转换成 dummy matrix

In [15]:
mnames = ['movie_id', 'title', 'genres']

movies = pd.read_table('ch02/movielens/movies.dat',
                       sep='::',
                       header=None,
                       names=mnames,
                       engine='python')
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## 字符串操作

### 字符串对象方法

将下面以逗号分隔的字符串，去除空白并用::合并

In [16]:
val = 'a,b,   guido'

### 正则表达式

找出用户名、域名、后缀

In [17]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

### pandas 中矢量化的字符串函数

将下面 Series 转换成包含用户名、域名、后缀的 DataFraem

In [18]:
data = Series({
    'Dave': 'dave@google.com',
    'Steve': 'steve@gmail.com',
    'Rob': 'rob@gmail.com',
    'Wes': np.nan})
data

Dave     dave@google.com
Rob        rob@gmail.com
Steve    steve@gmail.com
Wes                  NaN
dtype: object

## 示例：USDA 食品数据库

根据食物分类和营养类型画出一张中位值图

In [19]:
import json

db = json.load(open('ch07/foods-2011-10-03.json'))
len(db)

6636