In [1]:
from io import StringIO

import numpy as np
import pandas as pd

## 3.1 從無到有建立DataFrame

In [3]:
# Raw column values: one list per future column
fname = ['Paul', 'John', 'Richard', 'George']
lname = ['McCartney', 'Lennon', 'Starkey', 'Harrison']
birth = [1942, 1940, 1940, 1943]

# Pair each column label with its list of values
people = dict(first=fname, last=lname, birth=birth)

# Turn the label -> values mapping into a DataFrame and show it
beatles = pd.DataFrame(data=people)
print(beatles)

     first       last  birth
0     Paul  McCartney   1942
1     John     Lennon   1940
2  Richard    Starkey   1940
3   George   Harrison   1943


In [None]:
# Inspect the default row index (beatles was built without an explicit index=)
beatles.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# Custom row labels: pass them through index= instead of taking the default
print(pd.DataFrame(data=people, index=list('abcd')))

     first       last  birth
a     Paul  McCartney   1942
b     John     Lennon   1940
c  Richard    Starkey   1940
d   George   Harrison   1943


In [None]:
# A list of dicts also works: each dict is one row, its keys name the columns
print(
    pd.DataFrame(
        data=[
            dict(first='Paul', last='McCartney', birth=1942),
            dict(first='John', last='Lennon', birth=1940),
            dict(first='Richard', last='Starkey', birth=1940),
            dict(first='George', last='Harrison', birth=1943),
        ]
    )
)

     first       last  birth
0     Paul  McCartney   1942
1     John     Lennon   1940
2  Richard    Starkey   1940
3   George   Harrison   1943


In [8]:
# Same list of row dicts, but columns= fixes the column order of the result
print(
    pd.DataFrame(
        data=[
            dict(first='Paul', last='McCartney', birth=1942),
            dict(first='John', last='Lennon', birth=1940),
            dict(first='Richard', last='Starkey', birth=1940),
            dict(first='George', last='Harrison', birth=1943),
        ],
        columns=['last', 'first', 'birth'],
    )
)

        last    first  birth
0  McCartney     Paul   1942
1     Lennon     John   1940
2    Starkey  Richard   1940
3   Harrison   George   1943


## 3.2 存取CSV 檔案

In [9]:
# Write the DataFrame as CSV into an in-memory text buffer.
# StringIO comes from the import cell at the top of the notebook, so all
# dependencies are declared in one place.
fout = StringIO()
beatles.to_csv(fout)  # the row index is written too, as a leading unnamed column

# Show the CSV text now held in the buffer
print(fout.getvalue())

,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943



In [10]:
# Load a DataFrame back from the CSV text in the buffer.
# Without index_col the saved index comes back as an ordinary 'Unnamed: 0' column.
fout.seek(0) # rewind to the start of the buffer before reading
print(pd.read_csv(fout))

   Unnamed: 0    first       last  birth
0           0     Paul  McCartney   1942
1           1     John     Lennon   1940
2           2  Richard    Starkey   1940
3           3   George   Harrison   1943


In [11]:
# Use index_col=0 so the first CSV column is used as the row index
fout.seek(0)  # rewind again: the previous read left the position at the end
print(pd.read_csv(fout, index_col=0))

     first       last  birth
0     Paul  McCartney   1942
1     John     Lennon   1940
2  Richard    Starkey   1940
3   George   Harrison   1943


In [12]:
# Alternatively, leave the index out of the CSV altogether with index=False
fout = StringIO()  # fresh buffer; replaces the one used by the cells above
beatles.to_csv(fout, index=False)
print(fout.getvalue())

first,last,birth
Paul,McCartney,1942
John,Lennon,1940
Richard,Starkey,1940
George,Harrison,1943



## 3.3 讀取大型的CSV 檔案

In [15]:
# Cap how much data is loaded: nrows= reads only the first 1000 rows
diamonds = pd.read_csv('../../data/diamonds.csv', nrows=1000)

print(diamonds)

     carat      cut color clarity  depth  table  price     x     y     z
0     0.23    Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43
1     0.21  Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31
2     0.23     Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31
3     0.29  Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63
4     0.31     Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75
..     ...      ...   ...     ...    ...    ...    ...   ...   ...   ...
995   0.54    Ideal     D    VVS2   61.4   52.0   2897  5.30  5.34  3.26
996   0.72    Ideal     E     SI1   62.5   55.0   2897  5.69  5.74  3.57
997   0.72     Good     F     VS1   59.4   61.0   2897  5.82  5.89  3.48
998   0.74  Premium     D     VS2   61.8   58.0   2897  5.81  5.77  3.58
999   1.12  Premium     J     SI2   60.6   59.0   2898  6.68  6.61  4.03

[1000 rows x 10 columns]


In [16]:
# Check how much memory the frame takes: the last line of the info() report
# gives the memory usage, alongside per-column dtypes and non-null counts.
# NOTE(review): the trailing '+' in that figure presumably means the object
# columns are not fully measured -- confirm against memory_usage='deep'.
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    1000 non-null   float64
 1   cut      1000 non-null   object 
 2   color    1000 non-null   object 
 3   clarity  1000 non-null   object 
 4   depth    1000 non-null   float64
 5   table    1000 non-null   float64
 6   price    1000 non-null   int64  
 7   x        1000 non-null   float64
 8   y        1000 non-null   float64
 9   z        1000 non-null   float64
dtypes: float64(6), int64(1), object(3)
memory usage: 78.3+ KB


In [17]:
# Pick narrower dtypes at load time: float32 for the float columns, int16 for price
diamonds2 = pd.read_csv(
    '../../data/diamonds.csv',
    nrows=1000,
    dtype={
        **dict.fromkeys(['carat', 'depth', 'table', 'x', 'y', 'z'], np.float32),
        'price': np.int16,
    },
)

diamonds2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    1000 non-null   float32
 1   cut      1000 non-null   object 
 2   color    1000 non-null   object 
 3   clarity  1000 non-null   object 
 4   depth    1000 non-null   float32
 5   table    1000 non-null   float32
 6   price    1000 non-null   int16  
 7   x        1000 non-null   float32
 8   y        1000 non-null   float32
 9   z        1000 non-null   float32
dtypes: float32(6), int16(1), object(3)
memory usage: 49.0+ KB


In [19]:
# Summary statistics of the default-dtype load, to compare against the downcast load
print(diamonds.describe())

             carat        depth        table       price            x  \
count  1000.000000  1000.000000  1000.000000  1000.00000  1000.000000   
mean      0.689280    61.722800    57.734700  2476.54000     5.605940   
std       0.195291     1.758879     2.467946   839.57562     0.625173   
min       0.200000    53.000000    52.000000   326.00000     3.790000   
25%       0.700000    60.900000    56.000000  2777.00000     5.640000   
50%       0.710000    61.800000    57.000000  2818.00000     5.770000   
75%       0.790000    62.600000    59.000000  2856.00000     5.920000   
max       1.270000    69.500000    70.000000  2898.00000     7.120000   

                 y            z  
count  1000.000000  1000.000000  
mean      5.599180     3.457530  
std       0.611974     0.389819  
min       3.750000     2.270000  
25%       5.630000     3.450000  
50%       5.760000     3.550000  
75%       5.910000     3.640000  
max       7.050000     4.330000  


In [21]:
# Same summary for the float32/int16 load; compare with the float64 figures above
print(diamonds2.describe())

             carat        depth        table       price            x  \
count  1000.000000  1000.000000  1000.000000  1000.00000  1000.000000   
mean      0.689280    61.722801    57.734699  2476.54000     5.605940   
std       0.195291     1.758879     2.467946   839.57562     0.625173   
min       0.200000    53.000000    52.000000   326.00000     3.790000   
25%       0.700000    60.900002    56.000000  2777.00000     5.640000   
50%       0.710000    61.799999    57.000000  2818.00000     5.770000   
75%       0.790000    62.599998    59.000000  2856.00000     5.920000   
max       1.270000    69.500000    70.000000  2898.00000     7.120000   

                 y            z  
count  1000.000000  1000.000000  
mean      5.599180     3.457530  
std       0.611974     0.389819  
min       3.750000     2.270000  
25%       5.630000     3.450000  
50%       5.760000     3.550000  
75%       5.910000     3.640000  
max       7.050000     4.330000  


In [None]:
# How many rows carry each distinct value of the `cut` column
diamonds2['cut'].value_counts()

cut
Ideal        333
Premium      290
Very Good    226
Good          89
Fair          62
Name: count, dtype: int64

In [23]:
# How many rows carry each distinct value of the `color` column
diamonds2['color'].value_counts()

color
E    240
F    226
G    139
D    129
H    125
I     95
J     46
Name: count, dtype: int64

In [24]:
# How many rows carry each distinct value of the `clarity` column
diamonds2['clarity'].value_counts()

clarity
SI1     306
VS2     218
VS1     159
SI2     154
VVS2     62
VVS1     58
I1       29
IF       14
Name: count, dtype: int64

In [25]:
# Same narrowed numeric dtypes, plus the three string columns read as categorical
diamonds3 = pd.read_csv(
    '../../data/diamonds.csv',
    nrows=1000,
    dtype={
        **dict.fromkeys(['carat', 'depth', 'table', 'x', 'y', 'z'], np.float32),
        'price': np.int16,
        **dict.fromkeys(['cut', 'color', 'clarity'], 'category'),
    },
)

diamonds3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    1000 non-null   float32 
 1   cut      1000 non-null   category
 2   color    1000 non-null   category
 3   clarity  1000 non-null   category
 4   depth    1000 non-null   float32 
 5   table    1000 non-null   float32 
 6   price    1000 non-null   int16   
 7   x        1000 non-null   float32 
 8   y        1000 non-null   float32 
 9   z        1000 non-null   float32 
dtypes: category(3), float32(6), int16(1)
memory usage: 29.4 KB
