In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
import os
ROOT = "/content/gdrive/MyDrive/DACON_TITANIC"
train_pth = os.path.join(ROOT, 'train.csv')
test_pth = os.path.join(ROOT, 'test.csv')
submission_pth = os.path.join(ROOT, 'submission.csv')

In [292]:
train_df = pd.read_csv(train_pth)
test_df = pd.read_csv(test_pth)
ss_df = pd.read_csv(submission_pth)

# 1. 전처리

In [293]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [294]:
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [295]:
train_df.groupby('Survived').count() # Slightly Imbalanced

Unnamed: 0_level_0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,549,549,549,549,424,549,549,549,549,68,549
1,342,342,342,342,290,342,342,342,342,136,340


In [296]:
# Age는 수치형, Cabin은 범주형, Embarked는 범주형
# Cabin은 결측치가 너무 많아 제거
# Embarked, Age는 채워넣기
# Age 결측에 대한 변수를 추가
train_df = train_df.drop('Cabin', axis = 1)
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])
train_df['Fare'] = train_df['Fare'].fillna(train_df['Fare'].mean())
age_Null = []
for age in train_df['Age']:
  if np.isnan(age):
    age_Null.append(1)
  else:
    age_Null.append(0)
train_df['Age_NULL'] = age_Null

In [297]:
test_df = test_df.drop('Cabin', axis = 1)
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())

# train_df의 mode로 채움 : data leakage 방지
test_df['Embarked'] = test_df['Embarked'].fillna(train_df['Embarked'].mode()[0])
age_Null = []
for age in test_df['Age']:
  if np.isnan(age):
    age_Null.append(1)
  else:
    age_Null.append(0)
test_df['Age_NULL'] = age_Null

In [298]:
target = train_df['Survived']
train_df = train_df.drop('Survived', axis = 1)

dtypes = train_df.dtypes
cat_cols = []
num_cols = []
for dtype_idx in range(len(dtypes)):
  if dtypes.iloc[dtype_idx] == 'object':
    cat_cols.append(dtypes.index[dtype_idx])
  else:
    num_cols.append(dtypes.index[dtype_idx])

In [299]:
#Name으로부터는 Mr. Mrs. Miss 등의 정보를 추출함. '.' 앞에 이러한 단어가 오는 것으로 추정

def extract(full):
  for item in full.split():
    if '.' in item:
      return item
  return

train_df['NAME_TAG'] = train_df['Name'].apply(extract)
test_df['NAME_TAG'] = test_df['Name'].apply(extract)
tags = train_df['Name'].apply(extract).unique().tolist()

common_tag = {'Mr.' : 1, 'Mrs.' : 2 , 'Miss.' : 3, 'Master.' : 4}
train_df['NAME_TAG'] = train_df['NAME_TAG'].map(common_tag)
train_df['NAME_TAG'] = train_df['NAME_TAG'].fillna(0)

test_df['NAME_TAG'] = test_df['NAME_TAG'].map(common_tag)
test_df['NAME_TAG'] = test_df['NAME_TAG'].fillna(0)

In [300]:
train_df = train_df.drop('Name', axis = 1)
train_df = train_df.drop('PassengerId', axis = 1)
train_df = train_df.drop('Ticket', axis = 1)
test_df = test_df.drop('Name', axis = 1)
test_df = test_df.drop('PassengerId', axis = 1)
test_df = test_df.drop('Ticket', axis = 1)

In [301]:
cat_cols = cat_cols + ['Pclass', 'SibSp', 'Parch', 'Age_NULL', 'NAME_TAG']
cat_cols.remove('Name')
cat_cols.remove('Ticket')
cat_cols

['Sex', 'Embarked', 'Pclass', 'SibSp', 'Parch', 'Age_NULL', 'NAME_TAG']

In [302]:
# Parch : 0,1,2, else로 인코딩
# SibSp : 0,1, else로 인코딩
# Pclass : 1,2,3으로 인코딩
# Embarked : S,C,Q로 인코딩
# NAME_TAG : 0,1,2,3,4로 인코딩
# Sex : male, female로 인코딩
# Age_NULL : 인코딩 필요 없음

encoding_map = dict()

for c in cat_cols:
    a = train_df.groupby(c)[c].count().sort_values(ascending = False)/len(train_df)*100
    a = a[a>1]
    
    encoding_map[c] = []
    
    for idx in a.index:
        encoding_map[c].append(idx)

for c in cat_cols:
    for idx in encoding_map[c]:
        train_df[c+'_'+str(idx)] = np.where(train_df[c] == idx, 1, 0)
        test_df[c+'_'+str(idx)] = np.where(test_df[c] == idx, 1, 0)

train_df = train_df.drop(['Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch', 'Age_NULL', 'NAME_TAG'], axis = 1)
test_df =  test_df.drop(['Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch', 'Age_NULL', 'NAME_TAG'], axis = 1)
display(train_df)
display(test_df)

Unnamed: 0,Age,Fare,Sex_male,Sex_female,Embarked_S,Embarked_C,Embarked_Q,Pclass_3,Pclass_1,Pclass_2,SibSp_0,SibSp_1,SibSp_2,SibSp_4,SibSp_3,Parch_0,Parch_1,Parch_2,Age_NULL_0,Age_NULL_1,NAME_TAG_1.0,NAME_TAG_3.0,NAME_TAG_2.0,NAME_TAG_4.0,NAME_TAG_0.0
0,22.0,7.2500,1,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0
1,38.0,71.2833,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0
2,26.0,7.9250,0,1,1,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0
3,35.0,53.1000,0,1,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0
4,35.0,8.0500,1,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,27.0,13.0000,1,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1
887,19.0,30.0000,0,1,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0
888,,23.4500,0,1,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0
889,26.0,30.0000,1,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0


Unnamed: 0,Age,Fare,Sex_male,Sex_female,Embarked_S,Embarked_C,Embarked_Q,Pclass_3,Pclass_1,Pclass_2,SibSp_0,SibSp_1,SibSp_2,SibSp_4,SibSp_3,Parch_0,Parch_1,Parch_2,Age_NULL_0,Age_NULL_1,NAME_TAG_1.0,NAME_TAG_3.0,NAME_TAG_2.0,NAME_TAG_4.0,NAME_TAG_0.0
0,34.5,7.8292,1,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0
1,47.0,7.0000,0,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0
2,62.0,9.6875,1,0,0,0,1,0,0,1,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0
3,27.0,8.6625,1,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0
4,22.0,12.2875,0,1,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,,8.0500,1,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0
414,39.0,108.9000,0,1,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1
415,38.5,7.2500,1,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0
416,,8.0500,1,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0


In [303]:
# Normalize Fare
def minmaxscaling(series):
  max = series.max()
  min = series.min()
  return series.map(lambda x : (x-min)/max)

train_df['Fare'] = minmaxscaling(train_df['Fare'])
test_df['Fare'] = minmaxscaling(test_df['Fare'])

In [304]:
import xgboost
age_regressor = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75, colsample_bytree=1, max_depth=7)
age_train_df = train_df[~train_df['Age'].isnull()]
age_target = age_train_df['Age']
age_feature = age_train_df.drop('Age', axis = 1)
age_regressor.fit(age_feature, age_target)



XGBRegressor(learning_rate=0.08, max_depth=7, subsample=0.75)

In [305]:
train_input = train_df[train_df['Age'].isnull()].drop('Age', axis = 1)
test_input = test_df[test_df['Age'].isnull()].drop('Age', axis = 1)

train_df['Age'].iloc[np.where(train_df['Age'].isnull()==True)] =  age_regressor.predict(train_input)
test_df['Age'].iloc[np.where(test_df['Age'].isnull()==True)] =  age_regressor.predict(test_input)

In [306]:
train_df['Age'] = minmaxscaling(train_df['Age'])
test_df['Age'] = minmaxscaling(test_df['Age'])

#2. 모델

In [307]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report, plot_confusion_matrix
from sklearn.model_selection import GridSearchCV

In [308]:
X_train, X_val, y_train, y_val = train_test_split(train_df, target, test_size = 0.2, random_state = 42, stratify = target)


## 2.1. 트리

In [309]:
# Pruned model
dt_clf = DecisionTreeClassifier(criterion = 'gini', max_depth = 4, max_leaf_nodes = 24) # gini is better than entropy
dt_clf.fit(X_train, y_train)
tree_pred = dt_clf.predict(X_val)
print('f1', f1_score(y_val, tree_pred))
print('accuracy', accuracy_score(y_val, tree_pred))
print('AUC', roc_auc_score(y_val, tree_pred))

f1 0.7596899224806202
accuracy 0.8268156424581006
AUC 0.8050724637681159


In [310]:
# GridSearch
grid_dt = DecisionTreeClassifier()
criterion = ['gini', 'entropy']
max_depth = [3,4,5,6]
max_leaf = [16,20,24,28]
parameter_grid = {'criterion': criterion,
                    'max_depth': max_depth,
                    'max_leaf_nodes': max_leaf}
gs = GridSearchCV(estimator= grid_dt, param_grid= parameter_grid, scoring= 'f1') 
gs.fit(X_train, y_train)
print('GridSearch 최적 parameter: {}'.format(gs.best_params_),
      'GridSearch 최고 Validation Score: {:.3f}'.format(gs.best_score_), sep = '\n')

GridSearch 최적 parameter: {'criterion': 'gini', 'max_depth': 6, 'max_leaf_nodes': 20}
GridSearch 최고 Validation Score: 0.741


In [311]:
# 최적모델 활용
dt_clf = DecisionTreeClassifier(criterion = 'gini', max_depth = 5, max_leaf_nodes = 16)
dt_clf.fit(X_train, y_train)
tree_pred = dt_clf.predict(X_val)
print('f1', f1_score(y_val, tree_pred))
print('accuracy', accuracy_score(y_val, tree_pred))
print('AUC', roc_auc_score(y_val, tree_pred))

f1 0.7575757575757576
accuracy 0.8212290502793296
AUC 0.8032279314888011


## 2.2 Logistic Regression

In [312]:
from sklearn.linear_model import LogisticRegression

logistic_clf = LogisticRegression() 
logistic_clf.fit(X_train, y_train) 
logistic_pred = logistic_clf.predict(X_val)
print("--Vanilla Logistic Regression--")
print("")
print('Accuracy:: {:.2f}'.format(accuracy_score(y_val, logistic_pred)),
      'F1_Score:: {:.2f}'.format(f1_score(y_val, logistic_pred)),
      'ROC_AUC_Score:: {:.2f}'.format(roc_auc_score(y_val, logistic_pred)), sep = '\n')

--Vanilla Logistic Regression--

Accuracy:: 0.83
F1_Score:: 0.77
ROC_AUC_Score:: 0.81


## 2.3 SVM

In [313]:
from sklearn import svm

svm_clf = svm.SVC(probability = True) 
svm_clf.fit(X_train, y_train) 
svm_pred = svm_clf.predict(X_val)
print("--Vanilla SVM--")
print("")
print('Accuracy:: {:.2f}'.format(accuracy_score(y_val, svm_pred)),
      'F1_Score:: {:.2f}'.format(f1_score(y_val, svm_pred)),
      'ROC_AUC_Score:: {:.2f}'.format(roc_auc_score(y_val, svm_pred)), sep = '\n')

--Vanilla SVM--

Accuracy:: 0.82
F1_Score:: 0.73
ROC_AUC_Score:: 0.79


## 2.4 XGB

In [314]:
from xgboost import XGBClassifier
xgb_clf = XGBClassifier() 
xgb_clf.fit(X_train, y_train) 
xgb_pred = xgb_clf.predict(X_val)
print("--Vanilla XGB--")
print("")
print('Accuracy:: {:.2f}'.format(accuracy_score(y_val, xgb_pred)),
      'F1_Score:: {:.2f}'.format(f1_score(y_val, xgb_pred)),
      'ROC_AUC_Score:: {:.2f}'.format(roc_auc_score(y_val, xgb_pred)), sep = '\n')

--Vanilla XGB--

Accuracy:: 0.80
F1_Score:: 0.72
ROC_AUC_Score:: 0.77


# 3. Ensemble with softvoting

In [315]:
tree_prob = dt_clf.predict_proba(X_val)
log_prob = logistic_clf.predict_proba(X_val)
svm_prob = svm_clf.predict_proba(X_val)
xgb_prob = xgb_clf.predict_proba(X_val)

In [316]:
final_prob = (tree_prob + log_prob + svm_prob + xgb_prob)/4
pred = []
for item in final_prob:
  pred.append(np.argmax(item))
print("--Ensemble--")
print("")
print('Accuracy:: {:.2f}'.format(accuracy_score(y_val, pred)),
      'F1_Score:: {:.2f}'.format(f1_score(y_val, pred)),
      'ROC_AUC_Score:: {:.2f}'.format(roc_auc_score(y_val, pred)), sep = '\n')

--Ensemble--

Accuracy:: 0.83
F1_Score:: 0.77
ROC_AUC_Score:: 0.81


# 4. Inference

In [317]:
test_df[test_df['Fare'].isnull()]

Unnamed: 0,Age,Fare,Sex_male,Sex_female,Embarked_S,Embarked_C,Embarked_Q,Pclass_3,Pclass_1,Pclass_2,SibSp_0,SibSp_1,SibSp_2,SibSp_4,SibSp_3,Parch_0,Parch_1,Parch_2,Age_NULL_0,Age_NULL_1,NAME_TAG_1.0,NAME_TAG_3.0,NAME_TAG_2.0,NAME_TAG_4.0,NAME_TAG_0.0


In [319]:
tree_prob = dt_clf.predict_proba(test_df)
log_prob = logistic_clf.predict_proba(test_df)
svm_prob = svm_clf.predict_proba(test_df)
xgb_prob = xgb_clf.predict_proba(test_df)
final_prob = (tree_prob + log_prob + svm_prob + xgb_prob)/4
pred = []
for item in final_prob:
  pred.append(np.argmax(item))
ss_df['Survived'] = pred
ss_df.to_csv('submission.csv', index = False)