In [1]:
# Colab-only helper for attaching the user's Google Drive to this runtime.
from google.colab import drive
# Mount Drive at /content/drive; the CSV loaded further down is read from
# a path under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [5]:
# Read the Titanic CSV from the mounted Drive and discard the PassengerId
# column in a single chained expression, then preview the first rows.
df = (
    pd.read_csv('/content/drive/MyDrive/ybigta/titanic.csv')
    .drop(columns='PassengerId')
)
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# 결측값 대체
# 지표변수
# Feature split
# scaling
# label encoding

**1. 결측값 대체**

In [7]:
# Count the missing values in every column (isna is the same check as isnull).
df.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [10]:
# Show the rows whose Embarked value is missing.
df.loc[df['Embarked'].isna()]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [14]:
# Frequency of each Embarked value, used to pick the fill value.
df.loc[:, 'Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [22]:
# Replace the missing Embarked values with 'S', the most frequent value.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the selection df['Embarked'] is a chained in-place operation, which warns on
# recent pandas and no longer updates df once Copy-on-Write is enabled.
df['Embarked'] = df['Embarked'].fillna('S')

In [23]:
# Re-display the two rows (positions 61 and 829) that previously had no
# Embarked value, to confirm the fill.
df.iloc[[61, 829], :]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,S
829,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,S


In [24]:
# Replace the missing Age values with the column mean.
# Assign the result back rather than using fillna(inplace=True) on the
# selection df['Age']: the chained in-place form warns on recent pandas and
# no longer updates df once Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [25]:
# Recount missing values per column after the Embarked and Age fills.
df.isna().sum(axis=0)

Survived      0
Pclass        0
Name          0
Sex           0
Age           0
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      0
dtype: int64

In [26]:
# 'Cabin'은 결측치가 너무 많아서 대체하지 않음

**2. 지표 변수**

In [27]:
# Summary statistics of Age, used to choose the age-band boundaries.
df.loc[:, 'Age'].describe()

count    891.000000
mean      29.699118
std       13.002015
min        0.420000
25%       22.000000
50%       29.699118
75%       35.000000
max       80.000000
Name: Age, dtype: float64

In [28]:
def cut_off(x):
  """Map an age to a coarse age-band label.

  Bands (lower bound inclusive, upper bound exclusive):
    x < 10        -> 'baby'
    10 <= x < 20  -> 'teenager'
    20 <= x < 50  -> 'adult'
    x >= 50       -> 'elder'

  Args:
    x: Age as a number, or a missing value (None / NaN).

  Returns:
    The band label as a string, or NaN when ``x`` is missing.
  """
  # A NaN fails every comparison below, so without this guard a missing age
  # would fall through to the final branch and be labelled 'elder'.
  if pd.isna(x):
    return float('nan')
  if x < 10:
    return 'baby'
  if x < 20:
    return 'teenager'
  if x < 50:
    return 'adult'
  return 'elder'

In [29]:
# Label every passenger's age with its band via cut_off and store the result
# in a new column, then preview the frame.
df['Age_cut_off'] = df['Age'].map(cut_off)
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_cut_off
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,adult
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,adult
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,adult
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,adult
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,adult


**3. Feature Split**

In [38]:
# The names shown in this notebook have the layout
#   "<family name>, <title>. <remaining names>"
# Split on those delimiters (first comma, then first period) instead of on
# blanks:
#   * a family name or title containing a space would otherwise shift every
#     field by one token;
#   * a name with fewer than three blank-separated tokens raised IndexError;
#   * passing the split limit positionally (str.split(" ", 2)) is rejected by
#     pandas >= 2.0, where `n` is keyword-only.
# Names that do not match the layout yield NaN in all three columns.
#
# The existing column/variable names are kept for compatibility. Note that
# `first_name` receives the leading (family-name) part including its comma,
# `honorific` the title including its period, and `sur_name` the remainder.
name_parts = df['Name'].str.extract(
    r'^(?P<first_name>[^,]*,)\s*(?P<honorific>[^.]*\.)\s*(?P<sur_name>.*)$'
)

first_name = name_parts['first_name'].tolist()
honorific = name_parts['honorific'].tolist()
sur_name = name_parts['sur_name'].tolist()

df['first_name'] = first_name
df['honorific'] = honorific
df['sur_name'] = sur_name

In [39]:
# Preview the first five rows, now including the name-derived columns.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_cut_off,first_name,honorific,sur_name
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,adult,"Braund,",Mr.,Owen Harris
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,adult,"Cumings,",Mrs.,John Bradley (Florence Briggs Thayer)
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,adult,"Heikkinen,",Miss.,Laina
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,adult,"Futrelle,",Mrs.,Jacques Heath (Lily May Peel)
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,adult,"Allen,",Mr.,William Henry


**4. 스케일링**

In [40]:
from sklearn.preprocessing import StandardScaler
# Create the (not yet fitted) scaler instance; it is fitted on the Fare
# column in the following cell.
sd_scaler = StandardScaler()

In [42]:
# Fit the scaler on Fare and keep the transformed values under their own name.
# `sd_scaler` must stay bound to the StandardScaler: rebinding it to the
# output array made this cell raise AttributeError when run a second time and
# threw away the fitted scaler needed to transform further data.
# fit_transform expects a 2-D input, hence the reshape to a single column.
scaled_fare = sd_scaler.fit_transform(df['Fare'].values.reshape(-1, 1))
# Flatten the (n, 1) result back to one value per row before storing it.
df['sd_scaled_Fare'] = scaled_fare.ravel()
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_cut_off,first_name,honorific,sur_name,sd_scaled_Fare
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,adult,"Braund,",Mr.,Owen Harris,-0.502445
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,adult,"Cumings,",Mrs.,John Bradley (Florence Briggs Thayer),0.786845
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,adult,"Heikkinen,",Miss.,Laina,-0.488854
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,adult,"Futrelle,",Mrs.,Jacques Heath (Lily May Peel),0.42073
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,adult,"Allen,",Mr.,William Henry,-0.486337


**5. 라벨 인코딩**

In [43]:
from sklearn.preprocessing import LabelEncoder
# Create the label encoder used in the following cell to turn string
# categories into integer codes.
encoder = LabelEncoder()

In [44]:
# Encode each categorical column with its own fitted encoder. Refitting one
# shared LabelEncoder for the second column overwrote the mapping learned for
# Age_cut_off, so those codes could no longer be decoded or applied to other
# data. The produced codes are the same as before, and `encoder` still ends
# up fitted on Embarked.
age_encoder = LabelEncoder()
encoded_age = age_encoder.fit_transform(df['Age_cut_off'])
encoded_embarked = encoder.fit_transform(df['Embarked'])

df['encoded_age'] = encoded_age
df['encoded_embarked'] = encoded_embarked

In [45]:
# Preview the first five rows, now including the encoded columns.
df.head(5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_cut_off,first_name,honorific,sur_name,sd_scaled_Fare,encoded_age,encoded_embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,adult,"Braund,",Mr.,Owen Harris,-0.502445,0,2
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,adult,"Cumings,",Mrs.,John Bradley (Florence Briggs Thayer),0.786845,0,0
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,adult,"Heikkinen,",Miss.,Laina,-0.488854,0,2
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,adult,"Futrelle,",Mrs.,Jacques Heath (Lily May Peel),0.42073,0,2
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,adult,"Allen,",Mr.,William Henry,-0.486337,0,2
