# 4.2 預處理資料

In [112]:
# 取得中文字型
!wget 'https://github.com/flyingpath/electron-hand-dicom/raw/master/TaipeiSansTCBeta-Regular.ttf'

--2022-11-17 08:23:39--  https://github.com/flyingpath/electron-hand-dicom/raw/master/TaipeiSansTCBeta-Regular.ttf
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/flyingpath/electron-hand-dicom/master/TaipeiSansTCBeta-Regular.ttf [following]
--2022-11-17 08:23:39--  https://raw.githubusercontent.com/flyingpath/electron-hand-dicom/master/TaipeiSansTCBeta-Regular.ttf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20659344 (20M) [application/octet-stream]
Saving to: ‘TaipeiSansTCBeta-Regular.ttf.1’


2022-11-17 08:23:39 (143 MB/s) - ‘TaipeiSansTCBeta-Regular.ttf.1’ saved [20659344/20659344]



In [113]:
# Common setup shared by the rest of the notebook

# Hide warnings that are not needed
# NOTE(review): this blanket filter suppresses every warning, including
# deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

# Import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import font_manager as fm

# Register the font file with matplotlib
fm.fontManager.addfont('./TaipeiSansTCBeta-Regular.ttf')

# Function used to render data frames
from IPython.display import display

# Adjust display options
# Floating-point print precision for NumPy
np.set_printoptions(suppress=True, precision=4)

# Floating-point display format for pandas
pd.options.display.float_format = '{:.4f}'.format

# Show every column of a data frame
pd.set_option("display.max_columns",None)

# Default font size for figures
plt.rcParams["font.size"] = 14
# Default font family for figures
plt.rcParams['font.family'] = 'Taipei Sans TC Beta'

# Random seed
random_seed = 123

In [114]:
# Additional library
import seaborn as sns

# Load the sample data
df_titanic = sns.load_dataset("titanic")

# Map each original column name to its Chinese label.
# Renaming by name (instead of assigning a positional list to .columns) means
# a change in the dataset's column order cannot silently mislabel a column.
column_map = {
    'survived': '生還',
    'pclass': '艙等',
    'sex': '性別',
    'age': '年齡',
    'sibsp': '手足與配偶數',
    'parch': '父母與子女數',
    'fare': '票價',
    'embarked': '乘船港代碼',
    'class': '艙等名',
    'who': '男女兒童',
    'adult_male': '成人男子',
    'deck': '甲板',
    'embark_town': '乘船港',
    'alive': '生還與否',
    'alone': '單身',
}

# errors='raise' turns a missing source column into a KeyError instead of
# leaving it untranslated.
df_titanic = df_titanic.rename(columns=column_map, errors='raise')

# List of the Chinese column names, kept under its original name.
columns_t = list(column_map.values())

#### 確認資料

In [115]:
# Show the first rows of the renamed data frame
display(df_titanic.head())

Unnamed: 0,生還,艙等,性別,年齡,手足與配偶數,父母與子女數,票價,乘船港代碼,艙等名,男女兒童,成人男子,甲板,乘船港,生還與否,單身
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [116]:
# Print the shape of the data frame as (rows, columns)
print(df_titanic.shape)

(891, 15)


### 4.2.1 刪除多餘的欄位

In [117]:
# Remove redundant columns, producing one intermediate frame per step

# '艙等名' carries the same information as '艙等'
df1 = df_titanic.drop(columns=['艙等名'])

# '乘船港' carries the same information as '乘船港代碼'
df2 = df1.drop(columns=['乘船港'])

# '生還與否' carries the same information as '生還'
df3 = df2.drop(columns=['生還與否'])

# Check the result
display(df3.head())

Unnamed: 0,生還,艙等,性別,年齡,手足與配偶數,父母與子女數,票價,乘船港代碼,男女兒童,成人男子,甲板,單身
0,0,3,male,22.0,1,0,7.25,S,man,True,,False
1,1,1,female,38.0,1,0,71.2833,C,woman,False,C,False
2,1,3,female,26.0,0,0,7.925,S,woman,False,,True
3,1,1,female,35.0,1,0,53.1,S,woman,False,C,False
4,0,3,male,35.0,0,0,8.05,S,man,True,,True


### 4.2.2 處理缺失值

In [118]:
# Check missing values: number of nulls in each column
display(df3.isnull().sum())

生還          0
艙等          0
性別          0
年齡        177
手足與配偶數      0
父母與子女數      0
票價          0
乘船港代碼       2
男女兒童        0
成人男子        0
甲板        688
單身          0
dtype: int64

In [119]:
# Frequency of each observed value in the '甲板' column
display(df3['甲板'].value_counts())

C    59
B    47
D    33
E    32
A    15
F    13
G     4
Name: 甲板, dtype: int64

#### 決定策略

**乘船港代碼**: 缺失列數很少，只有 2 列  
-> 逐列刪除      

**年齡**: 數值資料，缺失列數很多，有 177 列  
-> 以資料的平均值代替   

**甲板**: 標籤值資料，缺失列數非常多，有 688 列  
-> 將所有缺失值一律填入代表「缺失」的虛擬碼 'N'  

In [120]:
# '乘船港代碼': very few missing rows (only 2)
# -> drop those rows with dropna
df4 = df3.dropna(subset=['乘船港代碼'])

# '年齡': numeric data with many missing rows (177)
# -> replace the missing entries with the mean of the remaining ages
age_average = df4['年齡'].mean()
df5 = df4.fillna({'年齡': age_average})

# '甲板': label data with a very large number of missing rows (688)
# -> fill every missing entry with the dummy code 'N'
#
# The column is categorical, so 'N' must be registered as a category before
# fillna can use it. add_categories returns a new Series; its `inplace`
# argument was deprecated and then removed in pandas 2.0, so the result is
# assigned explicitly. Working on a copy also leaves df5 untouched, which
# keeps this cell re-runnable.
df6 = df5.copy()
df6['甲板'] = df5['甲板'].cat.add_categories('N').fillna('N')




In [121]:
# Check the result: null counts per column, then the first rows
display(df6.isnull().sum())

display(df6.head())

生還        0
艙等        0
性別        0
年齡        0
手足與配偶數    0
父母與子女數    0
票價        0
乘船港代碼     0
男女兒童      0
成人男子      0
甲板        0
單身        0
dtype: int64

Unnamed: 0,生還,艙等,性別,年齡,手足與配偶數,父母與子女數,票價,乘船港代碼,男女兒童,成人男子,甲板,單身
0,0,3,male,22.0,1,0,7.25,S,man,True,N,False
1,1,1,female,38.0,1,0,71.2833,C,woman,False,C,False
2,1,3,female,26.0,0,0,7.925,S,woman,False,N,True
3,1,1,female,35.0,1,0,53.1,S,woman,False,C,False
4,0,3,male,35.0,0,0,8.05,S,man,True,N,True


### 4.2.3 數值化二元資料

**性別**: male / female  
**成人男子**: True / False  
**單身**: True / False  
    
將上述欄位各自數值化為 1 / 0（male 與 True 對應 1，female 與 False 對應 0）

#### 數值化「性別」

In [122]:
# Frequency of each value in the '性別' column
display(df6['性別'].value_counts())

male      577
female    312
Name: 性別, dtype: int64

In [123]:
# Lookup table: sex label -> binary code
mf_map = dict(male=1, female=0)

# Build df7 as a copy of df6 whose '性別' column holds the mapped codes
df7 = df6.assign(**{'性別': df6['性別'].map(mf_map)})

# Check the result
display(df7.head())

Unnamed: 0,生還,艙等,性別,年齡,手足與配偶數,父母與子女數,票價,乘船港代碼,男女兒童,成人男子,甲板,單身
0,0,3,1,22.0,1,0,7.25,S,man,True,N,False
1,1,1,0,38.0,1,0,71.2833,C,woman,False,C,False
2,1,3,0,26.0,0,0,7.925,S,woman,False,N,True
3,1,1,0,35.0,1,0,53.1,S,woman,False,C,False
4,0,3,1,35.0,0,0,8.05,S,man,True,N,True


#### 數值化「成人男子」與「單身」

In [124]:
# Frequency of each value in the '成人男子' column
display(df7['成人男子'].value_counts())

True     537
False    352
Name: 成人男子, dtype: int64

In [125]:
# Lookup table: boolean -> binary code
tf_map = {True: 1, False: 0}

# df8: copy of df7 with '成人男子' mapped to 1 / 0
df8 = df7.assign(**{'成人男子': df7['成人男子'].map(tf_map)})

# df9: copy of df8 with '單身' mapped to 1 / 0
df9 = df8.assign(**{'單身': df8['單身'].map(tf_map)})

# Check the result
display(df9.head())

Unnamed: 0,生還,艙等,性別,年齡,手足與配偶數,父母與子女數,票價,乘船港代碼,男女兒童,成人男子,甲板,單身
0,0,3,1,22.0,1,0,7.25,S,man,1,N,0
1,1,1,0,38.0,1,0,71.2833,C,woman,0,C,0
2,1,3,0,26.0,0,0,7.925,S,woman,0,N,1
3,1,1,0,35.0,1,0,53.1,S,woman,0,C,0
4,0,3,1,35.0,0,0,8.05,S,man,1,N,1


### 4.2.4 數值化多元資料
One-Hot 編碼

In [126]:
# Before conversion: first 10 rows of the '男女兒童' column
display(df9[['男女兒童']].head(10))

Unnamed: 0,男女兒童
0,man
1,woman
2,woman
3,woman
4,man
5,man
6,man
7,child
8,woman
9,child


In [127]:
# Example use of the get_dummies function
# dtype='uint8' keeps the indicator columns as 0/1 integers; since pandas 2.0
# the default dtype is bool, which would display True/False instead.
w = pd.get_dummies(df9['男女兒童'], prefix='男女兒童', dtype='uint8')
display(w.head(10))

Unnamed: 0,男女兒童_child,男女兒童_man,男女兒童_woman
0,0,1,0
1,0,0,1
2,0,0,1
3,0,0,1
4,0,1,0
5,0,1,0
6,0,1,0
7,1,0,0
8,0,0,1
9,1,0,0


In [128]:
# Helper that expands a categorical column into one-hot columns via get_dummies
def enc(df, column):
    """One-hot encode a single column of a data frame.

    Args:
        df: Source data frame; it is not modified.
        column: Name of the column to expand.

    Returns:
        A new data frame with ``column`` removed and one 0/1 indicator
        column per distinct value appended on the right, each named
        ``<column>_<value>``.
    """
    # Pin the dtype to 'uint8' (the default before pandas 2.0) so the
    # indicators stay numeric 0/1; newer pandas would otherwise return bool.
    df_dummy = pd.get_dummies(df[column], prefix=column, dtype='uint8')
    # Remove the original column
    df_drop = df.drop(columns=[column])
    # Join the remaining columns with the generated indicator columns
    return pd.concat([df_drop, df_dummy], axis=1)

#### 男女兒童
man / woman / child

In [129]:
# Check the column values: frequency of each value in '男女兒童'
display(df9['男女兒童'].value_counts())

man      537
woman    269
child     83
Name: 男女兒童, dtype: int64

In [130]:
# One-hot encoding

# '男女兒童' column
df10 = enc(df9, '男女兒童')

# Check the result
display(df10.head())

Unnamed: 0,生還,艙等,性別,年齡,手足與配偶數,父母與子女數,票價,乘船港代碼,成人男子,甲板,單身,男女兒童_child,男女兒童_man,男女兒童_woman
0,0,3,1,22.0,1,0,7.25,S,1,N,0,0,1,0
1,1,1,0,38.0,1,0,71.2833,C,0,C,0,0,0,1
2,1,3,0,26.0,0,0,7.925,S,0,N,1,0,0,1
3,1,1,0,35.0,1,0,53.1,S,0,C,0,0,0,1
4,0,3,1,35.0,0,0,8.05,S,1,N,1,0,1,0


#### 乘船港代碼與甲板

In [131]:
# One-hot encoding

# '乘船港代碼' column
df11 = enc(df10, '乘船港代碼')

# '甲板' column
df12 = enc(df11, '甲板')

# Check the result
display(df12.head())

Unnamed: 0,生還,艙等,性別,年齡,手足與配偶數,父母與子女數,票價,成人男子,單身,男女兒童_child,男女兒童_man,男女兒童_woman,乘船港代碼_C,乘船港代碼_Q,乘船港代碼_S,甲板_A,甲板_B,甲板_C,甲板_D,甲板_E,甲板_F,甲板_G,甲板_N
0,0,3,1,22.0,1,0,7.25,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1
1,1,1,0,38.0,1,0,71.2833,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0
2,1,3,0,26.0,0,0,7.925,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1
3,1,1,0,35.0,1,0,53.1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0
4,0,3,1,35.0,0,0,8.05,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1


### 4.2.5 資料標準化

In [132]:
# Standardization of the continuous columns
from sklearn.preprocessing import StandardScaler

# Columns passed through the scaler
scaled_columns = ['年齡', '票價']

# NOTE(review): the scaler is fitted on every row of df12; if a train/test
# split happens later, confirm that fitting on the full data is intended.
stdsc = StandardScaler()
df13 = df12.copy()
df13[scaled_columns] = stdsc.fit_transform(df12[scaled_columns])

# Check the result
display(df13.head())

Unnamed: 0,生還,艙等,性別,年齡,手足與配偶數,父母與子女數,票價,成人男子,單身,男女兒童_child,男女兒童_man,男女兒童_woman,乘船港代碼_C,乘船港代碼_Q,乘船港代碼_S,甲板_A,甲板_B,甲板_C,甲板_D,甲板_E,甲板_F,甲板_G,甲板_N
0,0,3,1,-0.5896,1,0,-0.5002,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1
1,1,1,0,0.6448,1,0,0.7889,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0
2,1,3,0,-0.281,0,0,-0.4866,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1
3,1,1,0,0.4134,1,0,0.4229,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0
4,0,3,1,0.4134,0,0,-0.4841,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1
