# Data Loading, Storage, and File Formats

In [2]:
%pylab inline

import sys

import numpy as np
from numpy.random import randn

import pandas as pd
from pandas import Series, DataFrame

Populating the interactive namespace from numpy and matplotlib


## Reading and Writing Data in Text Format

函数 | 说明
---|---
`read_csv` | 从文件、URL、文件型对象中加载带分隔符的数据。默认分隔符为逗号
`read_table` | 从文件、URL、文件型对象中加载带分隔符的数据。默认分隔符为`\t`
`read_fwf` | 读取定宽列格式数据
`read_clipboard` | 读取剪贴板中的数据，可以看成 `read_table` 的剪贴板版本

In [3]:
!cat ch06/ex1.csv

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [4]:
df = pd.read_csv('ch06/ex1.csv')

In [5]:
pd.read_table('ch06/ex1.csv', sep=',')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [6]:
# 没有标题
!cat ch06/ex2.csv

1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [7]:
pd.read_csv('ch06/ex2.csv', header=None)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [8]:
# 自行定义 column name
pd.read_csv('ch06/ex2.csv', names=['a','b','c','d','message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [9]:
# 将某 column 当成索引
pd.read_csv('ch06/ex2.csv', names=['a','b','c','d','message'], index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [10]:
# 将多个 column 做成层次化索引
!cat ch06/csv_mindex.csv

key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [11]:
pd.read_csv('ch06/csv_mindex.csv', index_col=['key1','key2'])

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [12]:
# 有些表格不是用固定分隔符号分隔字段
!cat ch06/ex3.txt

            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491


In [13]:
# ex3.txt (shown above) separates its fields with a variable amount of
# whitespace, so split on the regex \s+. The raw string keeps the backslash
# literal instead of relying on an unrecognised string escape.
pd.read_table('ch06/ex3.txt', sep=r'\s+')

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [14]:
# Parse the same whitespace-aligned file (ex3.txt, shown above) as fixed-width
# columns, using the first column as the row index.
pd.read_fwf('ch06/ex3.txt', index_col=0)

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [15]:
# 可以用 skiprows 跳过文件某些行
!cat ch06/ex4.csv

# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [16]:
pd.read_csv('ch06/ex4.csv', skiprows=[0,2,3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [17]:
# 出现缺失值
!cat ch06/ex5.csv

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo

In [18]:
pd.read_csv('ch06/ex5.csv')

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [19]:
pd.read_csv('ch06/ex5.csv').isnull()

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


In [20]:
# 指定表示缺失值的字符串
pd.read_csv('ch06/ex5.csv', na_values=['NULL'])

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [21]:
# 'message'里面：'foo', 'NA' 都当成缺失值
# 'something'里面：'two' 当成缺失值
pd.read_csv('ch06/ex5.csv', 
            na_values={'message': ['foo', 'NA'], 'something': ['two']})

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


read_csv/read_table 函数的参数

参数 | 说明
---|---
`path` | 表示文件系统位置、URL、文件型对象的字符串
`sep` 或 `delimiter` | 用于对行中各字段进行拆分的字符序列或正则表达式
`header` | 用作列名的行号。默认为 0
`index_col` | 用作行索引的列编号或列名
`names` | 用于结果的列名列表，结合 header=None
`skiprows` | 需要忽略的行数（从文件开始处算起），或需要跳过的行号列表（从 0 开始）
`na_values` | 一组需要被视为 NA（缺失值）的值
`comment` | 用于将注释信息从行尾拆分出去的字符
`parse_dates` | 尝试将数据解析为日期
`keep_date_col` | 如果连接多列解析日期，保持参与连接的列
`converters` | 由列号、列名跟函数之间的映射关系组成的字典
`dayfirst` | 当解析有歧义的日期时，将其看作是国际格式（7/6/2012 -> June, 7, 2012）
`date_parser` | 用于解析日期的函数
`nrows` | 需要读取的行数 
`iterator` | 返回一个 TextParser 以便逐块读取文件
`chunksize` | 文件块大小（用于迭代）
`skip_footer` | 需要忽略的行数 （文件末尾算起）
`verbose` | 打印各种解析器输出信息
`encoding` | 用于 unicode 的文本编码格式
`squeeze` | 如果数据经解析后仅含一列，则返回 Series
`thousands` | 千分位分隔符，如`,`或`.`

### Reading Text Files in Pieces

In [22]:
!wc -l ch06/ex6.csv

   10001 ch06/ex6.csv


In [23]:
# 只想读取文件的一小部分
pd.read_csv('ch06/ex6.csv', nrows=5)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


In [24]:
# 逐块读取文件，需设置 chunksize (number of rows)
# read_csv 返回的 TextFileReader 对象，可根据 chunksize 对文件进行逐块迭代
chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)
chunker

<pandas.io.parsers.TextFileReader at 0x110619350>

In [25]:
# tot accumulates, for each distinct value of the 'key' column, its count over
# all chunks. The dtype is given explicitly because the dtype of an empty
# Series built without one depends on the pandas version; float64 keeps the
# running totals numeric.
tot = Series([], dtype='float64')
# chunker is a one-pass iterator: re-create it before re-running this cell.
for piece in chunker:
    # Series.add aligns both operands on their index; fill_value=0 treats a key
    # missing on either side as 0 instead of producing NaN.
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

In [26]:
# 按照频率排序
orderByValues = tot.sort_values(ascending=False)
orderByValues[:5]

E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
dtype: float64

### Writing Data Out to Text Format

In [27]:
data = pd.read_csv('ch06/ex5.csv')
data

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [28]:
data.to_csv('ch06/out.csv')
!cat ch06/out.csv

,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [29]:
# 输出到 stdout，分隔符 |
data.to_csv(sys.stdout, sep='|')

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


In [30]:
# 输出到 stdout，缺失值标记为NULL
data.to_csv(sys.stdout, na_rep='NULL')

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo


In [31]:
# 禁用 column, row 标签
data.to_csv(sys.stdout, index=False, header=False)

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


In [32]:
# 只输出部分 column
data.to_csv(sys.stdout, columns=['a','b','c'])

,a,b,c
0,1,2,3.0
1,5,6,
2,9,10,11.0


In [33]:
# Series.to_csv
dates = pd.date_range('1/1/2000', periods=7)
ts = Series(np.arange(7), index=dates)
ts

2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
Freq: D, dtype: int64

In [34]:
# Series 输出到文件
ts.to_csv('ch06/tseries.csv')
!cat ch06/tseries.csv

2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6


In [35]:
# 由文件读取 Series
Series.from_csv('ch06/tseries.csv', parse_dates=True)

2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
dtype: int64

### Manually Working with Delimited Formats

In [36]:
!cat ch06/ex7.csv

"a","b","c"
"1","2","3"
"1","2","3","4"


In [37]:
import csv

# Hand any open file (or file-like object) to csv.reader from the standard
# library. The with-block closes the file as soon as iteration is finished.
with open('ch06/ex7.csv') as f:
    reader = csv.reader(f)

    # Iterating over the reader yields one list per row, with the surrounding
    # quotes removed. print(line) with a single argument prints the same text
    # on Python 2 and Python 3.
    for line in reader:
        print(line)

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3', '4']


In [38]:
# Read every row into a list (quotes stripped) for further clean-up.
# The with-block makes sure the file handle is closed once the rows are read.
with open('ch06/ex7.csv') as f:
    lines = list(csv.reader(f))
lines[:5]

[['a', 'b', 'c'], ['1', '2', '3'], ['1', '2', '3', '4']]

In [39]:
# Split the parsed rows into the header row and the data rows.
header = lines[0]
values = lines[1:]

# zip(*values) transposes the rows into columns, and pairing those columns with
# the header gives a column-name -> values mapping. zip stops at the shortest
# input, so the extra 4th field of the last row is dropped.
data_dict = dict(zip(header, zip(*values)))
data_dict

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

In [40]:
DataFrame(data_dict)

Unnamed: 0,a,b,c
0,1,2,3
1,1,2,3


csv.Dialect 的属性

参数 | 说明
---|---
`delimiter` | 分隔字段的字符，默认为 `,`
`lineterminator` | 行结束符，默认为 `\r\n`
`quotechar` | 特殊字符的字段的引用符号，默认为 `"`
`quoting` | 引用约定
`skipinitialspace` | 忽略分隔符后面的空白，默认为 False
`doublequote` | 如何处理字段内的引用符。如果为 True，则双写
`escapechar` | 用于对分隔符进行转义的字符串

In [41]:
# Subclassing csv.Dialect is how a new delimited format is described.
class my_dialect(csv.Dialect):
    """Semicolon-delimited format in which every field is double-quoted."""
    delimiter = ';'
    quotechar = '"'
    quoting = csv.QUOTE_ALL
    lineterminator = '\n'

#reader = csv.reader(f, dialect=my_dialect)

In [42]:
# Write the rows out by hand using the custom dialect.
with open('ch06/mydata.csv', 'w') as f:
    writer = csv.writer(f, dialect=my_dialect)
    # writerows emits the header row followed by the three data rows, in order.
    writer.writerows([
        ('one', 'two', 'three'),
        ('1', '2', '3'),
        ('4', '5', '6'),
        ('7', '8', '9'),
    ])
    
!cat ch06/mydata.csv

"one";"two";"three"
"1";"2";"3"
"4";"5";"6"
"7";"8";"9"


### JSON Data

In [43]:
obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [
     {"name": "Scott", "age": 25, "pet": "Zuko"},
    {"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""

In [44]:
# 通过 Python 标准库 json
import json

In [45]:
# json.loads 将 JSON 字符串转换成 Python 形式
result = json.loads(obj)
result

{u'name': u'Wes',
 u'pet': None,
 u'places_lived': [u'United States', u'Spain', u'Germany'],
 u'siblings': [{u'age': 25, u'name': u'Scott', u'pet': u'Zuko'},
  {u'age': 33, u'name': u'Katie', u'pet': u'Cisco'}]}

In [46]:
# json.dumps 将 Python 对象转换成 JSON 格式
asjson = json.dumps(result)
asjson

'{"pet": null, "siblings": [{"pet": "Zuko", "age": 25, "name": "Scott"}, {"pet": "Cisco", "age": 33, "name": "Katie"}], "name": "Wes", "places_lived": ["United States", "Spain", "Germany"]}'

In [47]:
# 向 DataFrame 传入一组 JSON 对象
siblings = DataFrame(result['siblings'], columns=['name', 'age'])
siblings

Unnamed: 0,name,age
0,Scott,25
1,Katie,33


### XML and HTML: Web Scraping

In [48]:
from contextlib import closing

from lxml.html import parse
try:
    from urllib2 import urlopen  # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3

# Fetch the page and parse it into an element tree. closing() releases the
# HTTP response once lxml has read it, and works on both Python 2 and 3.
with closing(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options')) as response:
    parsed = parse(response)
doc = parsed.getroot()

In [49]:
# 使用文档根节点的 findall 方法及一个 XPath 
links = doc.findall('.//a')
links[15:20]

[<Element a at 0x110671050>,
 <Element a at 0x1106710a8>,
 <Element a at 0x110671100>,
 <Element a at 0x110671158>,
 <Element a at 0x1106711b0>]

In [50]:
lnk = links[28]

# 使用对象 get 方法，取得 URL
lnk.get('href')

'/quote/NFLX?p=NFLX'

In [51]:
# 使用对象 text_content 方法，取得文本
lnk.text_content()

'NFLX'

In [52]:
# Collect the href of every <a> element with a list comprehension.
# The loop variable is deliberately not named lnk: on Python 2 a list
# comprehension's variable leaks into the enclosing scope and would overwrite
# the lnk element selected earlier in the notebook.
urls = [anchor.get('href') for anchor in doc.findall('.//a')]
urls[-5:]

['http://info.yahoo.com/relevantads/',
 'http://info.yahoo.com/legal/us/yahoo/utos/utos-173.html',
 'http://twitter.com/YahooFinance',
 'http://facebook.com/yahoofinance',
 'http://yahoofinance.tumblr.com']

In [53]:
tables = doc.findall('.//table')

# calls = tables[9]
# puts = tables[13]

# something's wrong...

## Binary Data Formats

In [54]:
frame = pd.read_csv('ch06/ex1.csv')
frame

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [55]:
# 将数据以 pickle 形式保存到磁盘
frame.to_pickle('ch06/frame_pickle')

In [56]:
# 将数据由 pickle 形式读取
pd.read_pickle('ch06/frame_pickle')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


### Using HDF5 Format

In [57]:
# 类似字典的 HDFStore 类，通过 PyTables 存储 pandas 对象
store = pd.HDFStore('mydata.h5')

store['obj1'] = frame
store['obj1_col'] = frame['a']

In [58]:
store

<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5
/obj1                frame        (shape->[3,5])
/obj1_col            series       (shape->[3])  

In [59]:
# 通过字典方式获取对象
store['obj1']

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [60]:
!rm -f mydata.h5

### Reading Microsoft Excel Files

In [61]:
xls = pd.read_excel('ch06/ex1.xlsx')
xls

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


## Interacting with HTML and Web APIs

In [62]:
# TBC

## Interacting with Databases

In [63]:
# TBC