In [1]:
import pandas as pd
import numpy as np

## 载入数据到 Pandas

* 索引：将一个列或多个列读取出来构成 DataFrame，其中涉及是否从文件中读取索引以及列名
* 类型推断和数据转换：包括用户自定义的转换以及缺失值标记
* 日期解析
* 迭代：针对大文件进行逐块迭代。这是 Pandas 与 Python 原生 csv 库的最大区别
* 不规整数据问题：跳过一些行，或注释等等

### 索引及列名

In [2]:
# Page through the raw text of data/ex1.csv (IPython %more magic).
%more data/ex1.csv

In [3]:
# Load the CSV with read_csv's default options and bind it to `df`.
df = pd.read_csv('data/ex1.csv')
# A bare expression on the last line makes the notebook display the frame.
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [4]:
# Same file again, this time passing the field separator explicitly.
df = pd.read_csv('data/ex1.csv', sep=',')
# Display the frame as the cell output.
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [5]:
# Page through the raw text of data/ex2.csv (IPython %more magic).
%more data/ex2.csv

In [6]:
# The file carries no column names: header=None makes pandas number the
# columns instead of consuming the first data row as a header.
pd.read_csv(
    'data/ex2.csv',
    header=None,
)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [7]:
# Supply the column names explicitly for the header-less file.
pd.read_csv(
    'data/ex2.csv',
    header=None,
    names=['a', 'b', 'c', 'd', 'msg'],
)

Unnamed: 0,a,b,c,d,msg
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [8]:
# Name the columns, then promote the 'msg' column to the row index.
pd.read_csv(
    'data/ex2.csv',
    header=None,
    names=['a', 'b', 'c', 'd', 'msg'],
    index_col='msg',
)

Unnamed: 0_level_0,a,b,c,d
msg,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [9]:
# Hierarchical row index: passing a list to index_col builds the index
# from 'msg' (outer level) and 'a' (inner level).
pd.read_csv(
    'data/ex2.csv',
    header=None,
    names=['a', 'b', 'c', 'd', 'msg'],
    index_col=['msg', 'a'],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,b,c,d
msg,a,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


### 处理不规则的分隔符

In [10]:
# Page through the raw text of data/ex3.csv (IPython %more magic).
%more data/ex3.csv

In [11]:
# The separator is a regular expression: one or more whitespace characters.
# It is written as a raw string so the backslash reaches the regex engine
# untouched; '\s' in a normal string literal is an invalid escape sequence
# and makes Python emit a warning.
pd.read_table('data/ex3.csv', sep=r'\s+')
# Alternatives kept for comparison (single-space separator / default comma):
# pd.read_table('data/ex3.csv', sep=' ')
# pd.read_csv('data/ex3.csv')

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


### 缺失值处理

In [12]:
# Page through the raw text of data/ex5.csv (IPython %more magic).
%more data/ex5.csv

In [13]:
# Baseline read using pandas' default missing-value markers.
pd.read_csv('data/ex5.csv')

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [14]:
# A list of na_values applies to every column: any cell equal to one of
# these strings is read as missing.
pd.read_csv(
    'data/ex5.csv',
    na_values=['NA', 'NULL', 'foo'],
)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,


In [15]:
# A dict of na_values gives each column its own missing-value markers.
pd.read_csv(
    'data/ex5.csv',
    na_values={
        'message': ['foo', 'NA'],
        'something': ['two'],
    },
)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


In [16]:
# Show the help text (uncomment to use IPython's `?` introspection)
# pd.read_table?

### 逐块读取数据

In [17]:
# Preview a large file by reading only its first 10 data rows.
pd.read_csv('data/ex6.csv', nrows=10)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
5,1.81748,0.742273,0.419395,-2.251035,Q
6,-0.776764,0.935518,-0.332872,-1.875641,U
7,-0.913135,1.530624,-0.572657,0.477252,K
8,0.35848,-0.497572,-0.367016,0.507702,S
9,-1.740877,-1.160417,-1.63783,2.172201,G


In [18]:
# 统计每个 key 出现的次数
# Count how often each value of the 'key' column occurs, reading the file
# in 1000-row chunks instead of loading it in one go.
tr = pd.read_csv('data/ex6.csv', chunksize=1000)

# Give the empty accumulator an explicit dtype: an empty Series created
# without one triggers a pandas warning and, in recent versions, defaults to
# object instead of float64.
key_count = pd.Series([], dtype='float64')
for pieces in tr:
    # fill_value=0 treats a key that is missing on one side as a zero count,
    # so keys first seen in a later chunk are not turned into NaN.
    key_count = key_count.add(pieces['key'].value_counts(), fill_value=0)
key_count = key_count.sort_values(ascending=False)
# Display the ten most frequent keys.
key_count[:10]

E    368
X    364
L    346
O    343
Q    340
M    338
J    337
F    335
K    334
H    330
dtype: float64

### 保存数据到磁盘

In [19]:
# Reload data/ex5.csv into `df` as the input for the export examples.
df = pd.read_csv('data/ex5.csv')
# Display the frame as the cell output.
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [20]:
# Export with to_csv's default options, then page through the written file.
df.to_csv('data/ex5_out.csv')
%more data/ex5_out.csv

In [21]:
# Read the exported file back; the index that was written out returns as an
# extra unnamed column (see the displayed frame).
df = pd.read_csv('data/ex5_out.csv')
df

Unnamed: 0.1,Unnamed: 0,something,a,b,c,d,message
0,0,one,1,2,3.0,4,
1,1,two,5,6,,8,world
2,2,three,9,10,11.0,12,foo


In [22]:
# Do not write the row index.
# `df` is reloaded from the original file first, because the previous cell
# rebound it to the re-imported export.
df = pd.read_csv('data/ex5.csv')
df.to_csv('data/ex5_out.csv', index=False)
%more data/ex5_out.csv

In [23]:
# 不写列名称
df = pd.read_csv('data/ex5.csv')
df.to_csv('data/ex5_out.csv', index=False, header=None)
%more data/ex5_out.csv

In [24]:
# Choose the field separator: write the file pipe-delimited, without the index.
df = pd.read_csv('data/ex5.csv')
df.to_csv('data/ex5_out.csv', index=False, sep='|')
%more data/ex5_out.csv

In [25]:
# 只写出一部分列
df = pd.read_csv('data/ex5.csv')
df.to_csv('data/ex5_out.csv', index=False, columns=['a', 'b', 'message'])
%more data/ex5_out.csv

## 二进制格式

二进制的优点是容量小，读取速度快。缺点是可能在不同版本间不兼容。比如 Pandas 版本升级后，早期版本保存的二进制数据可能无法正确地读出来。

In [26]:
# Load data/ex1.csv into `df`; the next cell pickles this frame.
df = pd.read_csv('data/ex1.csv')
# Display the frame as the cell output.
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [27]:
# Serialize the frame to disk in Python's binary pickle format.
pd.to_pickle(df, 'data/ex1_pickle.bin')

In [28]:
# List the data directory; the listing should now include ex1_pickle.bin.
%ls data

 Volume in drive C has no label.
 Volume Serial Number is 78AE-8B3A

 Directory of C:\Users\CNJOHUA10\kamidox\work\pandas_tutor\data

2016-04-28  14:20 PM    <DIR>          .
2016-04-28  14:20 PM    <DIR>          ..
2015-11-02  11:58 AM                61 ex1.csv
2016-04-28  14:20 PM               942 ex1_pickle.bin
2015-11-02  11:58 AM                44 ex2.csv
2015-11-02  11:58 AM               173 ex3.csv
2015-11-02  11:58 AM                81 ex5.csv
2016-04-28  14:20 PM                40 ex5_out.csv
2015-11-02  11:58 AM           636,864 ex6.csv
2016-04-19  16:46 PM             7,943 tips.csv
               8 File(s)        646,148 bytes
               2 Dir(s)  81,779,331,072 bytes free


In [29]:
# Load the frame back from the pickle file written earlier in this notebook.
# NOTE: unpickling can execute arbitrary code, so only do this with files
# from a trusted source.
pd.read_pickle('data/ex1_pickle.bin')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [30]:
# Pickle the larger ex6 data set as well, so its on-disk size can be
# compared with the CSV in the directory listing.
pd.to_pickle(
    pd.read_csv('data/ex6.csv'),
    'data/ex6_pickle.bin',
)

In [31]:
# List the data directory again to compare ex6.csv with ex6_pickle.bin.
%ls data

 Volume in drive C has no label.
 Volume Serial Number is 78AE-8B3A

 Directory of C:\Users\CNJOHUA10\kamidox\work\pandas_tutor\data

2016-04-28  14:20 PM    <DIR>          .
2016-04-28  14:20 PM    <DIR>          ..
2015-11-02  11:58 AM                61 ex1.csv
2016-04-28  14:20 PM               942 ex1_pickle.bin
2015-11-02  11:58 AM                44 ex2.csv
2015-11-02  11:58 AM               173 ex3.csv
2015-11-02  11:58 AM                81 ex5.csv
2016-04-28  14:20 PM                40 ex5_out.csv
2015-11-02  11:58 AM           636,864 ex6.csv
2016-04-28  14:20 PM           430,870 ex6_pickle.bin
2016-04-19  16:46 PM             7,943 tips.csv
               9 File(s)      1,077,018 bytes
               2 Dir(s)  81,778,892,800 bytes free


## 其他格式

* HDF5: HDF5 是一个用 C 语言实现的库，可以高效地读取磁盘上以二进制格式存储的科学数据。
* Excel文件: pd.read_excel/pd.ExcelFile/pd.ExcelWriter
* JSON: 通过 json 模块转换为字典，再转换为 DataFrame
* SQL 数据库：通过 pd.io.sql 模块来从数据库读取数据
* NoSQL (MongoDB) 数据库：需要结合相应的数据库模块，如 pymongo 。再通过游标把数据读出来，转换为 DataFrame