In [2]:
import pandas as pd
import numpy as np
# Do not print the dimensions footer when a DataFrame is displayed.
pd.set_option("display.show_dimensions", False)
# Format every displayed float with "{:4.2g}" (minimum width 4, 2 significant digits).
pd.set_option("display.float_format", "{:4.2g}".format)

## 文件的输入输出

输入输出函数

|函数名      | 说明            |
|:-------------|:----------------------|
|read_csv()   | 从csv文件读取数据        |
|read_excel()  | 从Excel文件读取数据       |
|HDFStore()   | 使用HDF5文件读写数据       |
|read_sql()   | 从SQL数据库的查询结果载入数据 |
|read_pickle()|  读入Pickle序列化后的数据   |

### CSV文件

### CSV文件

read_csv()的参数有: sep, header, skiprows, na_values, parse_dates, 
    encoding, usecols, chunksize等

In [16]:
# Read the CSV in pieces of 100 rows; df_list collects one DataFrame per piece.
chunk_reader = pd.read_csv(
    u"data/aqi/上海市_201406.csv",
    encoding="utf-8-sig",                                  # file encoding
    chunksize=100,                                         # rows read per iteration
    usecols=[u"时间", u"监测点", "AQI", "PM2.5", "PM10"],  # only load these columns
    na_values=["-", "—"],                                  # strings that mean "missing"
    parse_dates=[0],                                       # first column holds the timestamps
)

df_list = []
for df in chunk_reader:
    df_list.append(df)  # per-chunk processing would go here

In [17]:
len(df_list)  # number of chunks: 7000/100 = 70

70

In [20]:
# Count of non-missing values in each column of the first chunk.
df_list[0].count()

时间       100
监测点       90
AQI      100
PM2.5    100
PM10      98
dtype: int64

In [21]:
 # Data type of each column in the first chunk.
 df_list[0].dtypes

时间       datetime64[ns]
监测点              object
AQI               int64
PM2.5             int64
PM10            float64
dtype: object

In [22]:
# `df` is the loop variable left over from the chunked read (the last chunk yielded).
# Show the Python type of the value at row label 0 in the monitoring-station column.
print(type(df.loc[0, u"监测点"]))

<type 'unicode'>


### HDF5文件

创建HDFStore对象, complib指定压缩库, complevel指定压缩级别

In [4]:
# Open the HDF5 store "a.hdf5" using the blosc compression library at level 9.
store = pd.HDFStore("a.hdf5", complib="blosc", complevel=9)
store  # last expression of the cell: display the store summary

<class 'pandas.io.pytables.HDFStore'>
File path: a.hdf5
/dataframes/df1                    frame        (shape->[100000,4])                                                 
/dataframes/df2                    frame        (shape->[10000,3])                                                  
/dataframes/df_dynamic1            frame_table  (typ->appendable,nrows->100000,ncols->4,indexers->[index],dc->[A,B])
/series/s1                         series       (shape->[1000])                                                     

HDFStore对象支持字典接口, 可以用[]存取元素, 也支持get(), keys()等方法

In [5]:
# Build random sample objects: two DataFrames and one Series.
df1 = pd.DataFrame(np.random.rand(100000, 4), columns=list("ABCD"))
df2 = pd.DataFrame(
    np.random.randint(0, 10000, (10000, 3)),
    columns=["One", "Two", "Three"],
)
s1 = pd.Series(np.random.rand(1000))

# `store` is the HDFStore object created in the earlier cell; write each
# object under its own path inside the file.
store.put("dataframes/df1", df1)
store.put("dataframes/df2", df2)
store.put("series/s1", s1)

# List the stored keys, then check that df1 reads back unchanged.
print(store.keys())
print(df1.equals(store.get("dataframes/df1")))

['/dataframes/df1', '/dataframes/df2', '/dataframes/df_dynamic1', '/series/s1']
True


 > http://pytables.github.io/usersguide/libref/hierarchy_classes.html

> `pytables`项目介绍

get_node()获得HDFStore对象的根节点, 然后调用_f_walknodes()遍历其包含的所有节点

In [6]:
# Fetch the node at path "//" (displayed below as the RootGroup) and show it.
root = store.get_node("//")
root


/ (RootGroup) ''
  children := ['series' (Group), 'dataframes' (Group)]

In [7]:
# Print every node reachable from the root of the store.
root = store.get_node("//")
for node in root._f_walknodes():
    print(node)

/dataframes (Group) u''
/series (Group) u''
/dataframes/df1 (Group) u''
/dataframes/df2 (Group) u''
/series/s1 (Group) u''
/series/s1/index (CArray(1000,), shuffle, blosc(9)) ''
/series/s1/values (CArray(1000,), shuffle, blosc(9)) ''
/dataframes/df1/axis0 (CArray(4,), shuffle, blosc(9)) ''
/dataframes/df1/axis1 (CArray(100000,), shuffle, blosc(9)) ''
/dataframes/df1/block0_items (CArray(4,), shuffle, blosc(9)) ''
/dataframes/df1/block0_values (CArray(100000, 4), shuffle, blosc(9)) ''
/dataframes/df2/axis0 (CArray(3,), shuffle, blosc(9)) ''
/dataframes/df2/axis1 (CArray(10000,), shuffle, blosc(9)) ''
/dataframes/df2/block0_items (CArray(3,), shuffle, blosc(9)) ''
/dataframes/df2/block0_values (CArray(10000, 3), shuffle, blosc(9)) ''


append() 在原始目录中添加新的表格Table对象

In [7]:
# append=False overwrites any existing df_dynamic1 instead of extending it.
store.append('dataframes/df_dynamic1', df1, append=False) 

# 100 more rows with the same four columns.
df3 = pd.DataFrame(np.random.rand(100, 4), columns=list("ABCD"))

# Default append behaviour: df3's rows are added after the rows already stored.
store.append('dataframes/df_dynamic1', df3) #❷
store['dataframes/df_dynamic1'].shape

(100100, 4)

In [8]:
# Walk the node tree again now that the appendable table exists.
root = store.get_node("//")
for node in root._f_walknodes():
    print(node)

# Two extra lines appear compared with the previous walk:
#   /dataframes/df_dynamic1 (Group) u''
#   /dataframes/df_dynamic1/table (Table(100100,), shuffle, blosc(9)) ''

/dataframes (Group) ''
/series (Group) ''
/dataframes/df1 (Group) u''
/dataframes/df2 (Group) u''
/dataframes/df_dynamic1 (Group) u''
/series/s1 (Group) u''
/series/s1/index (CArray(1000,), shuffle, blosc(9)) ''
/series/s1/values (CArray(1000,), shuffle, blosc(9)) ''
/dataframes/df1/axis0 (CArray(4,), shuffle, blosc(9)) ''
/dataframes/df1/axis1 (CArray(100000,), shuffle, blosc(9)) ''
/dataframes/df1/block0_items (CArray(4,), shuffle, blosc(9)) ''
/dataframes/df1/block0_values (CArray(100000, 4), shuffle, blosc(9)) ''
/dataframes/df2/axis0 (CArray(3,), shuffle, blosc(9)) ''
/dataframes/df2/axis1 (CArray(10000,), shuffle, blosc(9)) ''
/dataframes/df2/block0_items (CArray(3,), shuffle, blosc(9)) ''
/dataframes/df2/block0_values (CArray(10000, 3), shuffle, blosc(9)) ''
/dataframes/df_dynamic1/table (Table(100100,), shuffle, blosc(9)) ''


In [9]:
# Which keys are currently stored in `store`?
print(store.keys())

['/dataframes/df1', '/dataframes/df2', '/dataframes/df_dynamic1', '/series/s1']


select()对表格进行查询

In [10]:
# Query the table for rows whose index label lies strictly between 97 and 102.
print(store.select('dataframes/df_dynamic1', where='index > 97 & index < 102'))

       A    B     C      D
98  0.45 0.72   0.3   0.38
99  0.73 0.34 0.064 0.0063
100 0.35 0.13  0.17  0.034
101 0.32 0.75   0.6   0.76
98  0.24 0.25  0.54  0.031
99  0.59 0.87  0.18   0.42


append()的另一个参数 data_columns 表示在创建新的表格时 指定索引列

In [13]:
# Overwrite the table (append=False) with data_columns=True, then filter the
# stored rows by the values of columns A and B.
store.append('dataframes/df_dynamic1', df1, append=False, data_columns=True)
print(store.select('dataframes/df_dynamic1', where='A > 0.99 & B < 0.01'))

         A      B     C    D
21223    1 0.0037  0.22 0.95
26626 0.99  0.001  0.37 0.84
32961 0.99 0.0094  0.77 0.48
45890 0.99 0.0059  0.23 0.83
66037 0.99 0.0033  0.52    1
66611 0.99 0.0078 0.046  0.2
93322    1 0.0079   0.6 0.77


> **WARNING**

> 由于所有从CSV文件读入`DataFrame`对象的行索引都为缺省值，因此HDF5文件中的数据的行索引并不是唯一的。

循环读入data/aqi下的csv文件到HDF5文件

* HDF5文件不支持Unicode字符串
* HDF5文件中的每列数据只能对应一种类型
* 需要指定HDF5文件中字符串的最大长度

In [19]:
def read_aqi_files(fn_pattern):
    """Yield one cleaned DataFrame for every CSV file matching ``fn_pattern``.

    Each file is opened in binary mode and a leading UTF-8 byte-order mark is
    skipped when present. Only the columns listed in ``cols`` are loaded, the
    first column is parsed as dates, "-" and "—" are read as missing values,
    and the measurement columns are read as floats. Non-ASCII and dotted
    column names are then replaced by the English names in ``names_map`` and
    every row containing a missing value is dropped.

    Parameters
    ----------
    fn_pattern : str
        Glob pattern selecting the CSV files to read.

    Yields
    ------
    pandas.DataFrame
        The cleaned table of one file; files are visited in sorted name order.
    """
    from glob import glob

    # First three bytes of a UTF-8 file that starts with a byte-order mark.
    UTF8_BOM = b"\xEF\xBB\xBF"

    # Columns to load from each file.
    cols = "时间,城市,监测点,质量等级,AQI,PM2.5,PM10,CO,NO2,O3,SO2".split(",")

    # column name -> dtype mapping handed to read_csv's dtype argument.
    float_dtypes = {col: float for col in "AQI,PM2.5,PM10,CO,NO2,O3,SO2".split(",")}

    # Replacement (English) names for the renamed columns.
    names_map = {"时间": "Time",
                 "监测点": "Position",
                 "质量等级": "Level",
                 "城市": "City",
                 "PM2.5": "PM2_5"}

    # sorted() makes the file order independent of the file system's listing order.
    for fn in sorted(glob(fn_pattern)):
        with open(fn, "rb") as f:
            # Consume the BOM if the file has one; otherwise rewind so the
            # parser starts at the first byte.
            if f.read(3) != UTF8_BOM:
                f.seek(0, 0)

            df = pd.read_csv(f,
                             parse_dates=[0],       # column 0 holds the timestamps
                             na_values=["-", "—"],  # markers used for missing data
                             usecols=cols,          # columns to extract
                             dtype=float_dtypes)    # per-column dtypes

        # rename(columns=...) relabels the columns; rename_axis() is not used
        # because in current pandas it sets the axis *name* rather than
        # renaming the labels.
        df = df.rename(columns=names_map)
        df = df.dropna()  # drop every row that contains a missing value
        yield df


# Create the HDFStore object that gathers every CSV file into one table.
store = pd.HDFStore("data/aqi/aqi.hdf5",
                    complib="blosc", complevel=9)

# Length reserved in the store for each string column.
string_size = {"City": 12, "Position": 30, "Level": 12}

try:
    for idx, df in enumerate(read_aqi_files(u"data/aqi/*.csv")):
        # append is False only for the first file (idx == 0), so a table left
        # over from an earlier run is replaced; later files extend the table.
        store.append('aqi', df, append=idx != 0,
                     min_itemsize=string_size, data_columns=True)  #❸
finally:
    # Close the HDF5 file even when reading or appending fails part-way.
    store.close()

In [27]:
# Reopen the HDFStore file that was just created.
store = pd.HDFStore("data/aqi/aqi.hdf5")

In [28]:
# Show every key in the store.
print(store.keys())

['/aqi']


In [30]:
# Show every node in the store.
root = store.get_node("//")
# Use the lower-case _f_walknodes() spelling, matching the other cells of this
# notebook; _f_walkNodes() is the old camelCase PyTables name.
for node in root._f_walknodes():
    print(node)

/aqi (Group) ''
/aqi/table (Table(337250,), shuffle, blosc(9)) ''


In [31]:
# Read the complete 'aqi' table back and report how many rows it has.
df_aqi = store.select("aqi")
print(len(df_aqi))

337250


读取PM2.5值大于500 的行

In [14]:
# Let the store filter the rows: keep only those with PM2_5 above 500.
df_polluted = store.select("aqi", where="PM2_5 > 500")
print(len(df_polluted))

87


### 读写数据库

In [15]:
from sqlalchemy import create_engine
# SQLAlchemy engine for the SQLite database file data/aqi/aqi.db.
engine = create_engine('sqlite:///data/aqi/aqi.db')

In [16]:
# Start from a clean table. IF EXISTS turns the statement into a no-op when
# the table is absent, so no blanket try/except is needed and genuine
# database errors are no longer silently swallowed.
engine.execute("DROP TABLE IF EXISTS aqi")

In [17]:
# Text columns whose values are decoded with "utf8" before being written.
str_cols = ["Position", "City", "Level"]

for df in read_aqi_files("data/aqi/*.csv"):
    for col in str_cols:
        df[col] = df[col].str.decode("utf8")
    # Add this file's rows to the 'aqi' table; the row index is not written.
    df.to_sql(name="aqi", con=engine, if_exists="append", index=False)

In [18]:
# Load the 'aqi' table from the database into a DataFrame.
df_aqi = pd.read_sql("aqi", engine)

In [19]:
# Same filter as the HDF5 query, expressed as SQL and run by the database.
df_polluted = pd.read_sql("select * from aqi where PM2_5 > 500", engine)
print(len(df_polluted))

87


### 使用Pickle序列化

In [21]:
# Round-trip the DataFrame through a pickle file and check that the copy
# read back equals the original.
# NOTE(review): only unpickle files you trust; loading a pickle can execute code.
df_aqi.to_pickle("data/aqi/aqi.pickle")
df_aqi2 = pd.read_pickle("data/aqi/aqi.pickle")
df_aqi.equals(df_aqi2)

True