## 1 读写文本格式的数据

### 1.1 read_csv/table

In [None]:
import numpy as np
import pandas as pd

In [None]:
pd.read_csv('ch06/f1.csv')

In [None]:
pd.read_table('ch06/f1.csv', sep=',')

In [None]:
pd.read_csv('ch06/f2_无标题行.csv', header=None)

In [None]:
pd.read_csv('ch06/f2_无标题行.csv', names=['a','b','c','d','message'])

In [None]:
# 将message列做成DataFrame的索引
pd.read_csv('ch06/f2_无标题行.csv', names=['a','b','c','d','message'], index_col='message')

In [None]:
pd.read_csv('ch06/f3.csv', index_col=['key1', 'key2'])

In [None]:
list(open('ch06/f4.txt'))

In [None]:
pd.read_table('ch06/f4.txt', sep='\s+')  # 利用正则表达式\s+匹配任何空白字符，包括空格、制表符、换页符等等

In [None]:
pd.read_table('ch06/f4.txt', sep='\s+', skiprows=[1,2])

### 1.2 创建文件并写入DataFrame(只能用.to_csv)/ List数据..

In [None]:
# 方法一：利用pandas、os
import pandas as pd
import numpy as np
import os

path = 'ch06/f5_os.csv'
df = pd.DataFrame(np.random.randn(10000,4), columns=['one','two','three','four'])
df2 = pd.DataFrame((chr(np.random.randint(65,90)) for i in range(10000)), columns=['key'])
df = pd.concat([df, df2], axis=1)
# pd.concat([df,df2], axis=1)

df.to_csv(path, index=False, mode='a')
# 文件不存在则写入header，否则则将df写入
# if not os.path.exists(path):
#    df.to_csv(path, header=['one','two','three','four','key'], index=False, mode='a')
# else:
#    df.to_csv(path, header=False, index=False, mode='a')

In [None]:
# 方法二：利用csv
import numpy as np
import csv

path2 = 'ch06/f6_csv.csv'
file = open(path2, 'a+', encoding='utf-8', newline='')

csv_writer = csv.writer(file)
csv_writer.writerow([f'one','two','three'])
csv_writer.writerows([[1,'luke','96'],[2,'jack','85'],[3,'nick','84']])

file.close()

In [None]:
# 逐行读取文件
pd.read_csv('ch06/f5_os.csv', nrows=5)

In [None]:
# 逐块读取文件
chunker = pd.read_csv('ch06/f5_os.csv', chunksize=1000)

tot = pd.Series([])
for piece in chunker:
    # piece is a DataFrame,lenth is chunksize=1000,and piece[key] is a Series ,key is int ,value is the key column
#     print(piece['key'].value_counts())
    # "piece[key] value_counts" is a Series ,key is the key column, and value is the key count
    tot = tot.add(piece['key'].value_counts(), fill_value=0)
#     chunkercount+=1

tot = tot.sort_values(ascending=False)
tot[:10]

### 1.3 将数据写出到文本格式 df.to_csv('sys.stdout', args)

In [None]:
import sys
import pandas as pd
import numpy as np

In [None]:
data = pd.DataFrame(np.random.randn(4,3), columns=['one','two','three'])
data.to_csv('ch06/f7.csv')

In [None]:
pd.read_csv('ch06/f7.csv')

In [None]:
data.to_csv(sys.stdout)

In [None]:
data.to_csv(sys.stdout, sep='|')  # 对data在文件中的数据做|匹配后重定向后打印出来，不改变原数据

In [None]:
data2 = pd.read_csv('ch06/f8.csv')
# data2.to_csv(sys.stdout)
data2

In [None]:
data2.to_csv(sys.stdout, na_rep='NULL')

In [None]:
data2.to_csv(sys.stdout, index=False, header=False)

In [None]:
data2.to_csv(sys.stdout, index=False, columns=['a','b','c'])

In [None]:
# Series写入文本
dates = pd.date_range('1/1/2023', periods=7)
ts = pd.Series(np.arange(7), index=dates)
ts.to_csv('ch06/f9_tseries.csv')

In [None]:
# pd.read_csv('ch06/f9_tseries.csv')

In [None]:
ts.to_csv(sys.stdout)

### 1.4 处理分隔符格式

In [None]:
data2 = pd.read_csv('ch06/f10.csv')
data2.to_csv(sys.stdout)

In [None]:
import csv
f = open('ch06/f10.csv')
reader = csv.reader(f)
for line in reader:
    print(line)

In [None]:
import csv
# 1.读取文件到一个多行列表；此次利用with打开文件，则with外的其他地方读不到f
with open('ch06/f10.csv') as f:
    lines = list(csv.reader(f))

In [None]:
# 2.将⾏分为标题⾏和数据⾏
header, values = lines[0], lines

In [None]:
# 3.字典构造式和zip（*value），将行转置换为列
data_dict = {h: v for h, v in zip(header, zip(*values))}

In [None]:
# 4_1. 定义csv.Dialect的一个子类
class my_dialect(csv.Dialect):
    lineterminator = '\n'
    delimiter = ';'
    quotechar = '"'
    quoting = csv.QUOTE_MINIMAL
f2 = open('ch06/f10.csv')
reader = csv.reader(f2, dialect=my_dialect)

In [None]:
# 4_2. 直接参数写明，不写子类
f2 = open('ch06/f10.csv')
reader = csv.reader(f2, delimiter='|')

In [None]:
for line in reader:
    print(line)

In [None]:
with open('mydata.csv', 'w') as f:
    writer = csv.writer(f, dialect=my_dialect)
    writer.writerow(('one', 'two', 'three'))
    writer.writerow(('1', '2', '3'))
    writer.writerow(('4', '5', '6'))
    writer.writerow(('7', '8', '9'))

### 1.5 JSON数据

In [None]:
import json
import pandas as pd

In [None]:
obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
              {"name": "Katie", "age": 33, "pet": ["Sixes", "Stache", "Cisco"]}]
}"""
# siblings兄弟姐妹

result = json.loads(obj)
result

In [None]:
# json.dumps则将Python对象转换成JSON格式
json.dumps(result)

In [None]:
# JSON对象转换为DataFrame：向DataFrame构造器传入一个字典的列表
siblings = pd.DataFrame(result['siblings'], columns=['name', 'age'])
siblings

In [None]:
tables = pd.read_html('ch06/fdic_failed_bank_list.html')

In [None]:
len(tables)

In [None]:
failures = tables[0]
failures.head()

In [None]:
# 按年份计算倒闭的银⾏数
close_timestamps = pd.to_datetime(failures['Closing Date'])
close_timestamps.dt.year.value_counts()

### 1.6 利用lxml.objectify解析XML

In [None]:
from lxml import objectify
import pandas as pd

In [None]:
path = 'ch06/Performance_MNR.xml'
parsed = objectify.parse(open(path))
root = parsed.getroot()

In [None]:
# root.INDICATOR返回⼀个⽤于产⽣各个<INDICATOR>XML元素的⽣成器。
data = []
skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ', 'DESIRED_CHANGE', 'DECIMAL_PLACES']

for elt in root.INDICATOR_UNIT:
    el_data = {}
    for child in elt.getchildren():
        if child.tag in skip_fields:
            continue
        el_data[child.tag] = child.pyval
    data.append(el_data)

In [None]:
perf = pd.DataFrame(data)
perf.head()

In [None]:
from io import StringIO
tag = '<a href="http://www.google.com">Google</a>'
root = objectify.parse(StringIO(tag)).getroot()

In [None]:
root

In [None]:
root.get('href')

In [None]:
root.text

## 2 二进制数据格式

### 2.1 使用HDF5格式

In [None]:
import pandas as pd
import numpy as np

In [None]:
frame = pd.read_csv('ch06/f1.csv')
frame

In [None]:
frame.to_pickle('ch06/frame_pickle')

In [None]:
pd.read_pickle('ch06/frame_pickle')

In [None]:
frame = pd.DataFrame({'a': np.random.randn(100)})

In [None]:
store = pd.HDFStore('ch06/mydata.h5')

In [None]:
# 添加数据：way1
store['obj1'] = frame

In [None]:
store['obj1_col'] = frame['a']

In [None]:
store

In [None]:
# 查询/ 读取数据
store['obj1']

In [None]:
# way2
store.put('obj2', frame, format='table')

In [None]:
store.select('obj2', where=['index >= 10 and index <= 15'])

In [None]:
store.close()

In [None]:
# way3
frame.to_hdf('mydata.h5', 'obj3', format='table')

In [None]:
pd.read_hdf('mydata.h5','obj3', where=['index < 5']) 

### 2.2 读取Microsoft Excel文件

In [2]:
import pandas as pd

In [4]:
# ExcelFile + read_excel：读取excel文件
xlsx = pd.ExcelFile('ch06/ex1.xlsx')
pd.read_excel(xlsx, 'Sheet1')

Unnamed: 0.1,Unnamed: 0,a,b,c,d,message
0,0,1,2,3,4,hello
1,1,5,6,7,8,world
2,2,9,10,11,12,foo


In [5]:
frame = pd.read_excel('ch06/ex1.xlsx', 'Sheet1')
frame

Unnamed: 0.1,Unnamed: 0,a,b,c,d,message
0,0,1,2,3,4,hello
1,1,5,6,7,8,world
2,2,9,10,11,12,foo


In [6]:
# ExcelWriter + to_excel：写入Excel文件
writer = pd.ExcelWriter('ch06/ex2.xlsx')
frame.to_excel(writer, 'Sheet1')  # frame.to_excel('examples/ex2.xlsx')

In [7]:
writer.save()

### 2.3 Web APIs交互

In [8]:
import requests

In [9]:
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'

In [11]:
resp = requests.get(url)
resp

<Response [200]>

In [12]:
data = resp.json()

In [13]:
data[0]['title']

'ENH: Allow ArrowDtype(pa.string()) to be compatable with str accessor'

In [14]:
issues = pd.DataFrame(data, columns=['number', 'title ', 'labels', 'state'])
issues

Unnamed: 0,number,title,labels,state
0,51207,,"[{'id': 57522093, 'node_id': 'MDU6TGFiZWw1NzUy...",open
1,51206,,[],open
2,51205,,[],open
3,51204,,"[{'id': 1280988427, 'node_id': 'MDU6TGFiZWwxMj...",open
4,51203,,"[{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...",open
5,51202,,"[{'id': 134699, 'node_id': 'MDU6TGFiZWwxMzQ2OT...",open
6,51197,,[],open
7,51196,,"[{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...",open
8,51194,,"[{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ=...",open
9,51192,,"[{'id': 49254273, 'node_id': 'MDU6TGFiZWw0OTI1...",open


### 2.4 数据库交互