In [1]:
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv('titanic.csv',encoding='cp932')

In [3]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [4]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [5]:
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [6]:
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [7]:
df_label = df[[ 'sex',  'sibsp', 'parch', 
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone']]

df_label = df_label.astype({'class':str, 'deck':str}).fillna('欠損値')

In [8]:
df_label.head(1)

Unnamed: 0,sex,sibsp,parch,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,male,1,0,S,Third,man,True,,Southampton,no,False


In [10]:
main_col_label_dic = {}
for col in df_label.columns.tolist():
    cnt = 0
    col_label_dic = {}
    for val in np.unique(df_label[col]):
        col_label_dic.setdefault(val, cnt)
        cnt += 1
        main_col_label_dic[col] = col_label_dic
main_col_label_dic

{'sex': {'female': 0, 'male': 1},
 'sibsp': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 8: 6},
 'parch': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6},
 'embarked': {'C': 0, 'Q': 1, 'S': 2, '欠損値': 3},
 'class': {'First': 0, 'Second': 1, 'Third': 2},
 'who': {'child': 0, 'man': 1, 'woman': 2},
 'adult_male': {False: 0, True: 1},
 'deck': {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'nan': 7},
 'embark_town': {'Cherbourg': 0, 'Queenstown': 1, 'Southampton': 2, '欠損値': 3},
 'alive': {'no': 0, 'yes': 1},
 'alone': {False: 0, True: 1}}

### ラベルエンコーディング

In [12]:
df_label_enc = df_label.copy()
# 列毎にラベルエンコーディングを実施
# map()の引数に辞書dict（{key: value}）を指定すると、keyと一致する要素がvalueに置き換えられる。
for col in df_label_enc.columns.tolist():
    df_label_enc[col] = df_label_enc[col].map(main_col_label_dic[col])

In [13]:
df_label_enc.head()

Unnamed: 0,sex,sibsp,parch,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,1,1,0,2,2,1,1,7,2,0,0
1,0,1,0,0,0,2,0,2,0,1,0
2,0,0,0,2,2,2,0,7,2,1,1
3,0,1,0,2,0,2,0,2,2,1,0
4,1,0,0,2,2,1,1,7,2,0,1


In [14]:
df_label_enc.shape 

(891, 11)

In [15]:
# 数値型の列と結合してまとめる
df_merge = pd.merge(df, df_label_enc, left_index=True, right_index=True, suffixes=['','_label'])
display(df_merge.shape, df_merge.head())

(891, 26)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,sibsp_label,parch_label,embarked_label,class_label,who_label,adult_male_label,deck_label,embark_town_label,alive_label,alone_label
0,0,3,male,22.0,1,0,7.25,S,Third,man,...,1,0,2,2,1,1,7,2,0,0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,1,0,0,0,2,0,2,0,1,0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,...,0,0,2,2,2,0,7,2,1,1
3,1,1,female,35.0,1,0,53.1,S,First,woman,...,1,0,2,0,2,0,2,2,1,0
4,0,3,male,35.0,0,0,8.05,S,Third,man,...,0,0,2,2,1,1,7,2,0,1


In [16]:
gc.collect()

25