## 가볍게 타이타닉 생존자 예측을 진행해보자

In [90]:
from tensorflow import keras
from sklearn.model_selection import train_test_split

import datawig
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import get_dummies
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import * 
from sklearn.model_selection import GridSearchCV

import warnings    # 경고메시지 제외
warnings.filterwarnings(action='ignore')

In [23]:
data = pd.read_csv("/content/titanic.csv")

In [24]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Embarked는 2개의 결측만 가지므로 해당 행을 제거

In [29]:
data.dropna(axis = 0, subset = ["Embarked"], inplace = True)

## x와 y분할

In [30]:
target = "Survived"
x = data.drop(target, axis = 1)
y = data[target]

In [31]:
x.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

## 개인 식별값을 가지는 열들과 너무 많은 결측치를 갖는 Cabin 제거

In [32]:
x.drop(["PassengerId", "Ticket", "Name"], axis = 1, inplace = True)
x.drop(["Cabin"], axis = 1, inplace = True)

### 범주형 변수 더미화

In [33]:
columns = ["Sex", "Embarked","Pclass"]

In [34]:
x = pd.get_dummies(x, drop_first=True, columns=columns)

### Age 결측치는 딥러닝을 이용하여 채워줌

In [49]:
imputer = datawig.SimpleImputer(input_columns=(set(x.columns) - set("Age")),
                                output_column="Age")
imputer.fit(train_df=x, num_epochs=50)
null_train = x[x["Age"].isnull()]
null_imputed = imputer.predict(null_train)
imputed_train = pd.DataFrame(null_imputed)

In [63]:
imputed_train.head()
imputed_train["Age"] = imputed_train["Age_imputed"]
imputed_train.drop(["Age_imputed"], axis = 1, inplace = True)

In [65]:
imputed_train["Age"] = np.round(imputed_train["Age"])

In [73]:
imputed_train.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Pclass_2,Pclass_3
5,30.0,0,0,8.4583,1,1,0,0,1
17,30.0,0,0,13.0,1,0,1,1,0
19,30.0,0,0,7.225,0,0,0,0,1
26,30.0,0,0,7.225,1,0,0,0,1
28,30.0,0,0,7.8792,0,1,0,0,1


In [75]:
imputed_train.index

Int64Index([  5,  17,  19,  26,  28,  29,  31,  32,  36,  42,
            ...
            832, 837, 839, 846, 849, 859, 863, 868, 878, 888],
           dtype='int64', length=177)

In [76]:
for idx in imputed_train.index:
  x.loc[idx] = imputed_train.loc[idx]

In [77]:
x.isna().sum()

Age           0
SibSp         0
Parch         0
Fare          0
Sex_male      0
Embarked_Q    0
Embarked_S    0
Pclass_2      0
Pclass_3      0
dtype: int64

### 결측치, 범주형 변수 처리 완료, train data, val data 분할

In [78]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.15)

In [79]:
from sklearn.preprocessing import MinMaxScaler

### SVC, KNN을 위한 스케일링

In [99]:
scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)
x_val_s = scaler.transform(x_val)

## 모델1. 로지스틱회귀

In [80]:
model1 = LogisticRegression()
model1.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [83]:
pred1 = model1.predict(x_val)

In [86]:
print(confusion_matrix(y_val, pred1))
print(classification_report(y_val, pred1))

[[70  9]
 [19 36]]
              precision    recall  f1-score   support

           0       0.79      0.89      0.83        79
           1       0.80      0.65      0.72        55

    accuracy                           0.79       134
   macro avg       0.79      0.77      0.78       134
weighted avg       0.79      0.79      0.79       134



## 모델2. KNNClassifier

In [87]:
from sklearn.neighbors import KNeighborsClassifier


In [100]:
model2 = KNeighborsClassifier()
model2.fit(x_train_s,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [101]:
pred2 = model2.predict(x_val_s)

In [102]:
print(confusion_matrix(y_val, pred2))
print(classification_report(y_val, pred2))

[[65 14]
 [14 41]]
              precision    recall  f1-score   support

           0       0.82      0.82      0.82        79
           1       0.75      0.75      0.75        55

    accuracy                           0.79       134
   macro avg       0.78      0.78      0.78       134
weighted avg       0.79      0.79      0.79       134



## 모델3. Decision Tree

In [92]:
from   sklearn.tree import  DecisionTreeClassifier

In [94]:
model3 = DecisionTreeClassifier()
model3.fit(x_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [95]:
pred3 = model3.predict(x_val)
print(confusion_matrix(y_val, pred3))
print(classification_report(y_val, pred3))

[[64 15]
 [16 39]]
              precision    recall  f1-score   support

           0       0.80      0.81      0.81        79
           1       0.72      0.71      0.72        55

    accuracy                           0.77       134
   macro avg       0.76      0.76      0.76       134
weighted avg       0.77      0.77      0.77       134



## 모델4. SVC

In [96]:
from sklearn.svm import SVC

In [103]:
model4 = SVC()
model4.fit(x_train_s, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [105]:
pred4 = model4.predict(x_val_s)
print(confusion_matrix(y_val, pred4))
print(classification_report(y_val, pred4))

[[74  5]
 [23 32]]
              precision    recall  f1-score   support

           0       0.76      0.94      0.84        79
           1       0.86      0.58      0.70        55

    accuracy                           0.79       134
   macro avg       0.81      0.76      0.77       134
weighted avg       0.80      0.79      0.78       134



## 모델5. 랜덤포레스트

In [106]:
from sklearn.ensemble import RandomForestClassifier

In [107]:
model5 = RandomForestClassifier()
model5.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [108]:
pred5 = model5.predict(x_val)
print(confusion_matrix(y_val, pred5))
print(classification_report(y_val, pred5))

[[66 13]
 [13 42]]
              precision    recall  f1-score   support

           0       0.84      0.84      0.84        79
           1       0.76      0.76      0.76        55

    accuracy                           0.81       134
   macro avg       0.80      0.80      0.80       134
weighted avg       0.81      0.81      0.81       134



## 모델6. KNNClassifier + 튜닝

In [111]:
params = {"n_neighbors":range(1,10)}

model = KNeighborsClassifier()

model6 = GridSearchCV(model,
                      params,
                      cv = 10)
model6.fit(x_train_s, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': range(1, 10)}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=False, scoring=None, verbose=0)

In [112]:
pred6 = model6.predict(x_val_s)
print(confusion_matrix(y_val, pred6))
print(classification_report(y_val, pred6))

[[68 11]
 [16 39]]
              precision    recall  f1-score   support

           0       0.81      0.86      0.83        79
           1       0.78      0.71      0.74        55

    accuracy                           0.80       134
   macro avg       0.79      0.78      0.79       134
weighted avg       0.80      0.80      0.80       134



## 모델7 Decision Tree + 튜닝

In [113]:
params = {'min_samples_leaf' : range(10,100,10), 'max_depth' : range(1,10,2)}

model = DecisionTreeClassifier()
model7 = GridSearchCV(model,
                      params,
                      cv = 10)
model7.fit(x_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': range(1, 10, 2),
                      

In [114]:
pred7 = model7.predict(x_val)
print(confusion_matrix(y_val, pred7))
print(classification_report(y_val, pred7))

[[73  6]
 [18 37]]
              precision    recall  f1-score   support

           0       0.80      0.92      0.86        79
           1       0.86      0.67      0.76        55

    accuracy                           0.82       134
   macro avg       0.83      0.80      0.81       134
weighted avg       0.83      0.82      0.82       134



## 모델8 SVC + 튜닝

In [115]:
params = {'C' : range(11,15), 'gamma' : np.linspace(0.005, 0.02, 5)}

model = SVC()
model8 = GridSearchCV(model,
                      params,
                      cv = 10
)

model8.fit(x_train_s, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': range(11, 15),
                         'gamma': array([0.005  , 0.00875, 0.0125 , 0.01625, 0.02   ])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [117]:
pred8 = model8.predict(x_val_s)
print(confusion_matrix(y_val, pred8))
print(classification_report(y_val, pred8))

[[67 12]
 [19 36]]
              precision    recall  f1-score   support

           0       0.78      0.85      0.81        79
           1       0.75      0.65      0.70        55

    accuracy                           0.77       134
   macro avg       0.76      0.75      0.76       134
weighted avg       0.77      0.77      0.77       134



## 모델9. randomforest + 튜닝

In [118]:
params = {'n_estimators':range(2,6),'min_samples_leaf' : range(1,50,10), 'max_depth' : range(12,17,2)}

model = RandomForestClassifier()
model9 = GridSearchCV(model,
                      params,
                      cv = 10)
model9.fit(x_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rand

In [120]:
pred9 = model9.predict(x_val)
print(confusion_matrix(y_val, pred9))
print(classification_report(y_val, pred9))

[[74  5]
 [18 37]]
              precision    recall  f1-score   support

           0       0.80      0.94      0.87        79
           1       0.88      0.67      0.76        55

    accuracy                           0.83       134
   macro avg       0.84      0.80      0.81       134
weighted avg       0.84      0.83      0.82       134



## 모델10. 딥러닝

In [121]:
keras.backend.clear_session()

il = keras.layers.Input(shape = (9,))
hl = keras.layers.Dense(1024,activation = "relu")(il)
hl = keras.layers.Dense(512,activation = "relu")(hl)
hl = keras.layers.Dense(256,activation = "relu")(hl)
hl = keras.layers.Dense(128,activation = "relu")(hl)
ol = keras.layers.Dense(1, activation = "sigmoid")(hl)

model = keras.models.Model(il,ol)

model.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ['accuracy'])

In [123]:
from keras.callbacks import EarlyStopping
es = EarlyStopping(monitor = 'val_loss',
                  patience = 5,
                  min_delta = 0,
                  restore_best_weights=True)

In [124]:
model.fit(x_train_s, y_train, epochs = 500, verbose = 1, callbacks=[es], validation_split=0.2)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500


<keras.callbacks.History at 0x7f35ab9d7b10>

In [127]:
pred10 = model.predict(x_val_s)
pred10 = np.round(pred10)
print(confusion_matrix(y_val, pred10))
print(classification_report(y_val, pred10))

[[76  3]
 [23 32]]
              precision    recall  f1-score   support

           0       0.77      0.96      0.85        79
           1       0.91      0.58      0.71        55

    accuracy                           0.81       134
   macro avg       0.84      0.77      0.78       134
weighted avg       0.83      0.81      0.80       134

