# Data Loading, Storage, and File Formats

In [1]:
%pylab inline

import numpy as np
from numpy.random import randn

import pandas as pd
from pandas import Series, DataFrame

Populating the interactive namespace from numpy and matplotlib


## Reading and Writing Data in Text Format

函数 | 说明
---|---
`read_csv` | 从文件、URL、文件型对象中加载带分隔符的数据。默认分隔符为逗号
`read_table` | 从文件、URL、文件型对象中加载带分隔符的数据。默认分隔符为`\t`
`read_fwf` | 读取定宽列格式数据
`read_clipboard` | 读取剪贴板中的数据，可以看成 `read_table` 的剪贴板版

In [4]:
!cat ch06/ex1.csv

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [5]:
df = pd.read_csv('ch06/ex1.csv')

In [6]:
pd.read_table('ch06/ex1.csv', sep=',')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [7]:
# 没有标题
!cat ch06/ex2.csv

1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [8]:
# header=None: the file has no header row, so columns get default integer labels (0, 1, 2, ...).
pd.read_csv('ch06/ex2.csv', header=None)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [16]:
# 自行定义 column name
pd.read_csv('ch06/ex2.csv', names=['a','b','c','d','message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [17]:
# 将某 column 当成索引
pd.read_csv('ch06/ex2.csv', names=['a','b','c','d','message'], index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [18]:
# 将多个 column 做成层次化索引
!cat ch06/csv_mindex.csv

key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [19]:
pd.read_csv('ch06/csv_mindex.csv', index_col=['key1','key2'])

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [20]:
# 有些表格不是用固定分隔符号分隔字段
!cat ch06/ex3.txt

            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491


In [21]:
# Fields are separated by a variable amount of whitespace, so split on the regex \s+.
# Use a raw string so the backslash reaches the regex engine verbatim instead of
# relying on an invalid string escape, and read the same file that was displayed above.
pd.read_table('ch06/ex3.txt', sep=r'\s+')

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [23]:
# Parse the whitespace-aligned file displayed above as fixed-width columns;
# index_col=0 makes the first column (aaa, bbb, ...) the row index.
pd.read_fwf('ch06/ex3.txt', index_col=0)

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [25]:
# 可以用 skiprows 跳过文件某些行
!cat ch06/ex4.csv

# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [26]:
# skiprows takes 0-based line numbers: lines 0, 2 and 3 are the '#' comment lines shown above.
pd.read_csv('ch06/ex4.csv', skiprows=[0,2,3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [27]:
# 出现缺失值
!cat ch06/ex5.csv

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo

In [28]:
pd.read_csv('ch06/ex5.csv')

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [31]:
pd.read_csv('ch06/ex5.csv').isnull()

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


In [29]:
# 指定表示缺失值的字符串
pd.read_csv('ch06/ex5.csv', na_values=['NULL'])

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [30]:
# 'message'里面：'foo', 'NA' 都当成缺失值
# 'something'里面：'two' 当成缺失值
pd.read_csv('ch06/ex5.csv', 
            na_values={'message': ['foo', 'NA'], 'something': ['two']})

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


read_csv/read_table 函数的参数

参数 | 说明
---|---
`path` | 表示文件系统位置、URL、文件型对象的字符串
`sep` 或 `delimiter` | 用于对行中各字段进行拆分的字符序列或正则表达式
`header` | 用作列名的行号。默认为 0
`index_col` | 用作行索引的列编号或列名
`names` | 用于结果的列名列表，结合 header=None
`skiprows` | 需要忽略的行数（从文件开始处算起），或需要跳过的行号列表（从 0 开始）
`na_values` | 一组需要被识别为 NA（缺失值）的值
`comment` | 用于将注释信息从行尾拆分出去的字符
`parse_dates` | 尝试将数据解析为日期
`keep_date_col` | 如果连接多列解析日期，保持参与连接的列
`converters` | 由列号、列名跟函数之间的映射关系组成的字典
`dayfirst` | 当解析有歧义的日期时，将其看作是国际格式（例如 7/6/2012 -> June 7, 2012）
`date_parser` | 用于解析日期的函数
`nrows` | 需要读取的行数 
`iterator` | 返回一个 TextParser 以便逐块读取文件
`chunksize` | 文件块大小（用于迭代）
`skip_footer` | 需要忽略的行数（从文件末尾算起）；新版 pandas 中该参数名为 `skipfooter`
`verbose` | 打印各种解析器输出信息
`encoding` | 用于 unicode 的文本编码格式
`squeeze` | 如果数据经解析后仅含一列，则返回 Series
`thousands` | 千分位分隔符，如`,`或`.`