# 演示pandas的数据读取

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Build a 5x5 demo frame (values 0..24) and save it as the CSV file that the
# following cells read back.
colunms_nams = ['a', 'b', 'c', 'd', 'e']
demo_data = np.arange(25).reshape((5, 5))
demo_pd = pd.DataFrame(demo_data, columns=colunms_nams)
# Raw string: '\d' is not a valid escape sequence, so the non-raw literal
# triggers a SyntaxWarning on recent Python versions. The path text itself
# is unchanged (Windows-style relative path).
# index=True also writes the row index as an unnamed first column.
demo_pd.to_csv(r'..\dataset\demo_csv_data.csv', sep=',', header=True, index=True)
print(demo_pd)

    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
4  20  21  22  23  24


下图为用 Excel 打开 demo CSV 数据的效果：
![jupyter](../dataset/demo_csv_data_Excel.png)

In [2]:
# Read the CSV with default settings: the first line of the file becomes the
# column names and a new 0..n-1 row index is generated.
# Raw string keeps the backslashes literal instead of mixing '\d' (an invalid
# escape sequence) with '\\'; the resulting path text is unchanged.
demo_pd = pd.read_csv(r'..\dataset\demo_csv_data.csv', sep=',')
demo_pd

# Note: the index column stored in the file is read back as an ordinary data
# column (shown as "Unnamed: 0").

Unnamed: 0.1,Unnamed: 0,a,b,c,d,e
0,0,0,1,2,3,4
1,1,5,6,7,8,9
2,2,10,11,12,13,14
3,3,15,16,17,18,19
4,4,20,21,22,23,24


In [3]:
# Explicitly use the first CSV column as the row index (index_col=0).
# Raw string: avoids the invalid '\d' escape sequence; same path text.
demo_pd = pd.read_csv(r'..\dataset\demo_csv_data.csv', sep=',', index_col=0)
demo_pd

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [4]:
# header=0 takes the first line of the file as the column names.
# skiprows=[2, 4] drops the file lines numbered 2 and 4 (0-based, the header
# being line 0), i.e. the records labelled 1 and 3 -- it does not refer to
# the row labels 2 and 4.
# Raw string: avoids the invalid '\d' escape sequence; same path text.
demo_pd = pd.read_csv(r'..\dataset\demo_csv_data.csv', sep=',', index_col=0, header=0, skiprows=[2, 4])
demo_pd

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
2,10,11,12,13,14
4,20,21,22,23,24


In [5]:
# Select columns by name with usecols and skip file lines 2 and 4.
# No index_col is given, so the remaining rows get a new 0..n-1 index.
# Raw string: avoids the invalid '\d' escape sequence; same path text.
demo_pd = pd.read_csv(r'..\dataset\demo_csv_data.csv', sep=',', usecols=['a', 'b', 'e'], skiprows=[2, 4])
demo_pd

Unnamed: 0,a,b,e
0,0,1,4
1,10,11,14
2,20,21,24


In [6]:
# Select columns by position instead of by column name.

# First read the whole file, using the first column as the row index.
# Raw strings below avoid the invalid '\d' escape sequence; same path text.
demo_pd = pd.read_csv(r'..\dataset\demo_csv_data.csv', sep=',', index_col=0)

col_names = demo_pd.columns              # all column names of the frame
col_index = [0, 1, 4]                    # positions of the wanted columns
col_names_need = col_names[col_index]    # column names at those positions

# Option 1: read the file again, restricted to those column names.
demo_pd_2 = pd.read_csv(r'..\dataset\demo_csv_data.csv', sep=',', usecols=col_names_need)
# Option 2: slice the frame that is already in memory.
demo_pd_3 = demo_pd[col_names_need]

print('demo_pd_2')
print(demo_pd_2)
print('\ndemo_pd_3')
print(demo_pd_3)

demo_pd_2
    a   b   e
0   0   1   4
1   5   6   9
2  10  11  14
3  15  16  19
4  20  21  24

demo_pd_3
    a   b   e
0   0   1   4
1   5   6   9
2  10  11  14
3  15  16  19
4  20  21  24


## 读取含有NA的数据

In [7]:
# Build a demo frame that contains missing values and save it as CSV.
colunms_nams = ['a', 'b', 'c', 'd', 'e']
demo_data_na = np.arange(25).reshape((5, 5))
# Build the frame from demo_data_na (not from another cell's array) so this
# cell runs on its own.
demo_pd_na = pd.DataFrame(demo_data_na, columns=colunms_nams)
# Cast up front: the integer columns cannot hold NaN / '' without a dtype
# change, and relying on silent upcasting during assignment is deprecated.
demo_pd_na = demo_pd_na.astype({'b': float, 'c': float, 'd': object})
# Use .loc instead of chained indexing (df['b'][2] = ...), which may assign
# to a temporary copy and leave the frame unchanged.
demo_pd_na.loc[2, 'b'] = np.nan
demo_pd_na.loc[1, 'c'] = np.nan
demo_pd_na.loc[4, 'd'] = ''  # empty string rather than NaN
# Raw string: avoids the invalid '\d' escape sequence; same path text.
demo_pd_na.to_csv(r'..\dataset\demo_csv_data_na.csv', sep=',', header=True, index=True)

demo_pd_na
# In memory the '' in column 'd' is an empty string, not NaN.

Unnamed: 0,a,b,c,d,e
0,0,1.0,2.0,3.0,4
1,5,6.0,,8.0,9
2,10,,12.0,13.0,14
3,15,16.0,17.0,18.0,19
4,20,21.0,22.0,,24


In [8]:
# na_values='' asks read_csv to treat empty fields as NaN. pandas already
# counts '' among its default NA markers, so this only makes it explicit.
# Raw string: avoids the invalid '\d' escape sequence; same path text.
demo_pd_na = pd.read_csv(r'..\dataset\demo_csv_data_na.csv', sep=',', index_col=0, na_values='')

demo_pd_na

Unnamed: 0,a,b,c,d,e
0,0,1.0,2.0,3.0,4
1,5,6.0,,8.0,9
2,10,,12.0,13.0,14
3,15,16.0,17.0,18.0,19
4,20,21.0,22.0,,24
