# 6章 データの読み込み、書き出しとファイル形式
## 6.1 テキスト形式のデータの読み書き

In [1]:
# Load ex1.csv with the default settings; the column names
# (a, b, c, d, message) come from the file's header row.
df = pd.read_csv('pydata-book/ch06/ex1.csv')
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [2]:
# read_table with an explicit comma separator yields the same table as read_csv.
pd.read_table('pydata-book/ch06/ex1.csv', sep=',')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [3]:
# header=None: no row is used as a header, so the columns get integer labels.
pd.read_csv('pydata-book/ch06/ex2.csv', header=None)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [4]:
# Supply the column names explicitly through names=.
pd.read_csv('pydata-book/ch06/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [5]:
# Name the columns and use the 'message' column as the row index.
names=['a', 'b', 'c', 'd', 'message']
pd.read_csv('pydata-book/ch06/ex2.csv', names=names, index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [6]:
# Passing a list of columns to index_col builds a hierarchical (two-level)
# index from key1 and key2.
parsed = pd.read_csv('pydata-book/ch06/csv_mindex.csv', index_col=['key1', 'key2'])
parsed

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [10]:
# Use a run of one or more whitespace characters as the delimiter -> r'\s+'
# (a raw string, so the backslash reaches the regex engine untouched and
# no invalid-escape warning is raised on newer Python versions).
result = pd.read_table('pydata-book/ch06/ex3.txt', sep=r'\s+')
result

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [11]:
# Load ex5.csv; some cells in this file are missing (shown blank in the output).
result = pd.read_csv('pydata-book/ch06/ex5.csv')
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [12]:
# The set of strings to be treated as missing values can be given explicitly.
result = pd.read_csv('pydata-book/ch06/ex5.csv', na_values=['NULL'])
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [13]:
# Strings that mark missing values can differ per column; pass them as a
# dict mapping column name -> list of marker strings.
sentinel = {'message':['foo', 'NA'], 'something':['two']}
pd.read_csv('pydata-book/ch06/ex5.csv', na_values=sentinel)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


### 6.1.1 テキストファイルを少しずつ読む

In [14]:
# Specify how many rows to read.
pd.read_csv('pydata-book/ch06/ex6.csv', nrows=5)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


In [15]:
# Read the file piece by piece: with chunksize set, read_csv returns a
# TextFileReader to iterate over instead of a DataFrame.
chunker = pd.read_csv('pydata-book/ch06/ex6.csv', chunksize=1000)
chunker

<pandas.io.parsers.TextFileReader at 0x106f2e2d0>

In [16]:
# Repeat the aggregation for every chunk produced according to chunksize.
tot = Series([])  # running total of occurrences per key; starts empty
for piece in chunker:
    # fill_value=0 lets a key present on only one side be added as if the
    # other side were 0.
    tot = tot.add(piece['key'].value_counts(), fill_value=0)
# Sort the totals in descending order.
# NOTE(review): Series.order was dropped from later pandas releases in favour
# of sort_values -- confirm which pandas version this notebook targets.
tot = tot.order(ascending=False)
tot

E    368
X    364
L    346
O    343
Q    340
M    338
J    337
F    335
K    334
H    330
V    328
I    327
U    326
P    324
D    320
A    320
R    318
Y    314
G    308
S    308
N    306
W    305
T    304
B    302
Z    288
C    286
4    171
6    166
7    164
8    162
3    162
5    157
2    152
0    151
9    150
1    146
dtype: float64

### 6.1.2 テキスト形式でのデータの書き出し

In [17]:
# Reload ex5.csv as the sample frame for the to_csv examples that follow.
data = pd.read_csv('pydata-book/ch06/ex5.csv')
data

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [18]:
# Passing a file name would write to that file; sys.stdout is used as the
# destination here so the result can be checked by eye.
data.to_csv(sys.stdout)

,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [19]:
# Use '|' as the field delimiter instead of a comma.
data.to_csv(sys.stdout, sep='|')

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


In [20]:
# Write the string NULL in place of missing values.
data.to_csv(sys.stdout, na_rep='NULL')

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo


In [21]:
# Leave out both the row index and the header row.
data.to_csv(sys.stdout, index=False, header=False)

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


In [24]:
# Write only columns a, b and c, without the row index.
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])

a,b,c
1,2,3.0
5,6,
9,10,11.0


### 6.1.3 区切り文字で区切られた形式を手で操作する

In [26]:
# The rows do not all have the same number of columns, so this raises an error.
#pd.read_csv('pydata-book/ch06/ex7.csv')

In [27]:
import csv

# Walk through ex7.csv row by row with the csv module. The context manager
# closes the file handle once reading is done (or if it fails part-way),
# instead of leaving it open for the rest of the session.
with open('pydata-book/ch06/ex7.csv') as f:
    reader = csv.reader(f)
    for line in reader:
        # Each row is a list of strings; the parenthesised single-argument
        # form prints identically on Python 2 and Python 3.
        print(line)

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3', '4']


### 6.1.4 JSONデータ

In [28]:
# Sample data in JSON format
obj = """
{"name":"Wes", 
"places_lived":["United States", "Spain", "Germany"], 
"pet":null, 
"siblings": [{"name":"Scott", "age":25, "pet":"Zuko"}, 
 {"name":"Katie", "age":33, "pet":"Cisco"}]
 }
"""

In [29]:
import json

In [30]:
# Parse the JSON text into Python objects (dicts, lists, strings, None).
result = json.loads(obj)
result

{u'name': u'Wes',
 u'pet': None,
 u'places_lived': [u'United States', u'Spain', u'Germany'],
 u'siblings': [{u'age': 25, u'name': u'Scott', u'pet': u'Zuko'},
  {u'age': 33, u'name': u'Katie', u'pet': u'Cisco'}]}

In [31]:
# Build a DataFrame from the list of sibling dicts, keeping only the
# 'name' and 'age' fields as columns.
siblings = DataFrame(result['siblings'], columns=['name', 'age'])
siblings

Unnamed: 0,name,age
0,Scott,25
1,Katie,33


## 6.3 HTMLやWebAPIを用いた読み書き

In [32]:
import requests

In [33]:
# Query the Twitter search endpoint for "python pandas".
# NOTE: in the recorded run this request came back with HTTP 410 and an error
# body saying the Twitter REST API v1 is no longer active.
url = 'http://search.twitter.com/search.json?q=python%20pandas'
resp = requests.get(url)
resp

<Response [410]>

In [34]:
import json
# Decode the response body as JSON and list its top-level keys.
data = json.loads(resp.text)
data.keys()

[u'errors']

In [35]:
data

{u'errors': [{u'code': 64,
   u'message': u'The Twitter REST API v1 is no longer active. Please migrate to API v1.1. https://dev.twitter.com/docs/api/1.1/overview.'}]}