# データの標準的な読み書き

以下メソッドで可能
- 読み込み : pd.read_csv
- 保存 : (DataFrame).to_csv

In [1]:
import pandas as pd

In [3]:
# Show the raw contents of the sample CSV file via the shell
!cat lec25.csv

q,r,s,t,apple
2,3,4,5,pear
a,s,d,f,rabbit
5,2,5,7,dog


In [5]:
# Read the CSV file; by default the first line is used as the header row
dframe = pd.read_csv('lec25.csv')
dframe

Unnamed: 0,q,r,s,t,apple
0,2,3,4,5,pear
1,a,s,d,f,rabbit
2,5,2,5,7,dog


In [7]:
# When the file has no header row, pass header=None so the first line is kept as data
dframe = pd.read_csv('lec25.csv',header=None)
dframe

Unnamed: 0,0,1,2,3,4
0,q,r,s,t,apple
1,2,3,4,5,pear
2,a,s,d,f,rabbit
3,5,2,5,7,dog


In [9]:
# Specify the field delimiter explicitly with sep (here a comma)
dframe = pd.read_table('lec25.csv',sep=',',header=None)
dframe

Unnamed: 0,0,1,2,3,4
0,q,r,s,t,apple
1,2,3,4,5,pear
2,a,s,d,f,rabbit
3,5,2,5,7,dog


In [10]:
# Limit how many rows are read with nrows; the result is displayed but not assigned
pd.read_csv('lec25.csv',header=None,nrows=2)

Unnamed: 0,0,1,2,3,4
0,q,r,s,t,apple
1,2,3,4,5,pear


In [12]:
# dframe still holds the full table, since the nrows=2 read was not assigned to it
dframe

Unnamed: 0,0,1,2,3,4
0,q,r,s,t,apple
1,2,3,4,5,pear
2,a,s,d,f,rabbit
3,5,2,5,7,dog


In [14]:
# Save the DataFrame to a CSV file
dframe.to_csv('mytextdata.csv')

In [16]:
# Show the saved file; the row index was written as the unnamed first column
!cat mytextdata.csv

,0,1,2,3,4
0,q,r,s,t,apple
1,2,3,4,5,pear
2,a,s,d,f,rabbit
3,5,2,5,7,dog


In [17]:
# Write the CSV text to standard output instead of a file
import sys
dframe.to_csv(sys.stdout)

,0,1,2,3,4
0,q,r,s,t,apple
1,2,3,4,5,pear
2,a,s,d,f,rabbit
3,5,2,5,7,dog


In [18]:
# The output delimiter can be changed with sep
dframe.to_csv(sys.stdout,sep='_')

_0_1_2_3_4
0_q_r_s_t_apple
1_2_3_4_5_pear
2_a_s_d_f_rabbit
3_5_2_5_7_dog


In [19]:
# Pass columns to select which columns are written
dframe.to_csv(sys.stdout,columns=[0,2])

,0,2
0,q,s
1,2,4
2,a,d
3,5,5


# JSONの使用法

JSON : データの送受信などでよく使われる形式。JavaScript用に作られたオブジェクトの標準形式だが、さまざまなデータベースでも使用される。

In [31]:
# JSON example

# A sample JSON (JavaScript Object Notation) document, held as a Python string.
json_obj = """
{   "zoo_animal": "Lion",
    "food": ["Meat", "Veggies", "Honey"],
    "fur": "Golden",
    "clothes": null, 
    "diet": [{"zoo_animal": "Gazelle", "food":"grass", "fur": "Brown"}]
}
"""

In [32]:
# Parse the JSON text into Python objects
import json
data = json.loads(json_obj)

In [33]:
# Display the parsed result
data

{'zoo_animal': 'Lion',
 'food': ['Meat', 'Veggies', 'Honey'],
 'fur': 'Golden',
 'clothes': None,
 'diet': [{'zoo_animal': 'Gazelle', 'food': 'grass', 'fur': 'Brown'}]}

In [34]:
# The top-level JSON object becomes a Python dict
type(data)

dict

In [35]:
# The parsed JSON can be accessed like a dictionary, by key
data['diet']

[{'zoo_animal': 'Gazelle', 'food': 'grass', 'fur': 'Brown'}]

In [36]:
# Use json.dumps to serialize the object back into a JSON string
json.dumps(data)

'{"zoo_animal": "Lion", "food": ["Meat", "Veggies", "Honey"], "fur": "Golden", "clothes": null, "diet": [{"zoo_animal": "Gazelle", "food": "grass", "fur": "Brown"}]}'

In [37]:
# Write the object to a file as JSON; the with block closes the file
# so the contents are flushed to disk before the file is read back.
with open('data.json','w') as f:
    json.dump(data, f)

In [39]:
# Show the contents of the JSON file that was written
!cat data.json

{"zoo_animal": "Lion", "food": ["Meat", "Veggies", "Honey"], "fur": "Golden", "clothes": null, "diet": [{"zoo_animal": "Gazelle", "food": "grass", "fur": "Brown"}]}

In [42]:
# Read the JSON file back; the with block closes the file handle afterwards
with open('data.json') as f:
    data_from_file = json.load(f)
data_from_file

{'zoo_animal': 'Lion',
 'food': ['Meat', 'Veggies', 'Honey'],
 'fur': 'Golden',
 'clothes': None,
 'diet': [{'zoo_animal': 'Gazelle', 'food': 'grass', 'fur': 'Brown'}]}

# HTMLから読み込むパターン

beautifulsoup4などを使ってWeb上からスクレイピングして、それをデータ化するパターン

In [43]:
import pandas as pd

In [49]:
# Fetch the page at this URL and parse its HTML tables.
# NOTE: the page formerly at http://www.fdic.gov/bank/individual/failed/banklist.html
# is no longer available, so the current FDIC failed-bank list page is used instead.
url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/"
# pd.read_html returns a list with one DataFrame per <table> element found
dframe_list = pd.read_html(url)

In [51]:
# Take the first table from the list
dframe = dframe_list[0]
dframe

Unnamed: 0,Bank NameBank,CityCity,StateSt,CertCert,Acquiring InstitutionAI,Closing DateClosing,FundFund
0,Almena State Bank,Almena,KS,15426,Equity Bank,"October 23, 2020",10538
1,First City Bank of Florida,Fort Walton Beach,FL,16748,"United Fidelity Bank, fsb","October 16, 2020",10537
2,The First State Bank,Barboursville,WV,14361,"MVB Bank, Inc.","April 3, 2020",10536
3,Ericson State Bank,Ericson,NE,18265,Farmers and Merchants Bank,"February 14, 2020",10535
4,City National Bank of New Jersey,Newark,NJ,21111,Industrial Bank,"November 1, 2019",10534
...,...,...,...,...,...,...,...
558,"Superior Bank, FSB",Hinsdale,IL,32646,"Superior Federal, FSB","July 27, 2001",6004
559,Malta National Bank,Malta,OH,6629,North Valley Bank,"May 3, 2001",4648
560,First Alliance Bank & Trust Co.,Manchester,NH,34264,Southern New Hampshire Bank & Trust,"February 2, 2001",4647
561,National State Bank of Metropolis,Metropolis,IL,3815,Banterra Bank of Marion,"December 14, 2000",4646


In [52]:
# Since this is a table, a column can be pulled out by its name
dframe["CityCity"]

0                 Almena
1      Fort Walton Beach
2          Barboursville
3                Ericson
4                 Newark
             ...        
558             Hinsdale
559                Malta
560           Manchester
561           Metropolis
562             Honolulu
Name: CityCity, Length: 563, dtype: object