### Module import, Read data

In [1]:
# Numeric and tabular libraries used by the cells below.
import numpy as np
import pandas as pd
# Colab helper used to mount Google Drive, where the notebook's data files live.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
cd '/content/drive/MyDrive/Colab Notebooks'

/content/drive/MyDrive/Colab Notebooks


In [None]:
# Load the Titanic train/test CSVs (paths are relative to the working directory).
train = pd.read_csv('./titanic/train.csv')
test = pd.read_csv('./titanic/test.csv')

# Split the training frame into the target column and the feature columns.
train_y = train['Survived']
train_x = train.drop(columns=['Survived'])

# Copy the test frame so the raw `test` stays intact (its PassengerId is needed for submissions).
test_x = test.copy()

### Feature engineering

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns removed before modeling.
rmCols= ['PassengerId','Name', 'Ticket', 'Cabin']

# Rebuild the feature frames from the raw `train` / `test` frames on every run.
# Dropping from the already-processed train_x / test_x made this cell fail with
# a KeyError when it was executed a second time.
train_x = train.drop(columns=['Survived'] + rmCols)
test_x = test.drop(columns=rmCols)

for c in ['Sex', 'Embarked']:
  # Fit on the training data only; missing values become their own 'NA' class.
  le = LabelEncoder()
  le.fit(train[c].fillna('NA'))

  # Encode from the raw columns so re-running never re-encodes integer codes.
  train_x[c] = le.transform(train[c].fillna('NA'))
  test_x[c] = le.transform(test[c].fillna('NA'))

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
train_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    int64  
 2   Age       714 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  891 non-null    int64  
dtypes: float64(2), int64(5)
memory usage: 48.9 KB
None


### Modeling  

TODO 

---
* XGBClassifier
  * 개념/동작방식
  * params `n_estimators`, `random_state`의 개념

* visualize
  * https://subinium.github.io/MLwithPython-2-4/
  * [TF 블로그](https://tensorflow.blog/%ED%8C%8C%EC%9D%B4%EC%8D%AC-%EB%A8%B8%EC%8B%A0%EB%9F%AC%EB%8B%9D/2-4-%EB%B6%84%EB%A5%98-%EC%98%88%EC%B8%A1%EC%9D%98-%EB%B6%88%ED%99%95%EC%8B%A4%EC%84%B1-%EC%B6%94%EC%A0%95/) 참고해서 시각화하기



In [None]:
from xgboost import XGBClassifier

# Train a small, seeded gradient-boosted model on the full training set.
model = XGBClassifier(n_estimators=20, random_state=71, use_label_encoder=False)
model.fit(train_x, train_y)

# Column 1 of predict_proba is the probability of the positive class (Survived == 1).
pred = model.predict_proba(test_x)[:, 1]
# Turn probabilities into 0/1 labels with a 0.5 threshold.
pred_label = (pred > 0.5).astype(int)

# Write the submission file: one row per test passenger.
submission = pd.DataFrame(
    {
        'PassengerId': test['PassengerId'],
        'Survived': pred_label,
    }
)
submission.to_csv('./titanic/submission_first.csv', index=False)


### 모델 검증

Todo 

--- 
* pyplot 이용해 검증결과 시각화

In [None]:
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import KFold

# Per-fold validation scores.
scores_accuracy = []
scores_logloss = []

# 4-fold cross-validation with a fixed seed so the splits are reproducible.
kf = KFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_x):

  # Split features and target into the training part and the held-out part of this fold.
  tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
  tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

  model = XGBClassifier(n_estimators = 20, random_state=71, use_label_encoder =False)
  model.fit(tr_x,tr_y)

  # Score the held-out fold: logloss on probabilities, accuracy on 0.5-thresholded labels.
  va_pred = model.predict_proba(va_x)[:,1]
  logloss= log_loss(va_y, va_pred)
  accuracy = accuracy_score(va_y, va_pred > 0.5)

  print(f'logloss: {logloss:.4f}, accuracy: {accuracy:.4f}')
  scores_logloss.append(logloss)
  scores_accuracy.append(accuracy)

# Average over the folds and report it; previously the means were computed
# but never displayed, so the overall CV score was not visible.
logloss = np.mean(scores_logloss)
accuracy = np.mean(scores_accuracy)
print(f'mean logloss: {logloss:.4f}, mean accuracy: {accuracy:.4f}')

  

logloss: 0.3972, accuracy: 0.8341
logloss: 0.4337, accuracy: 0.8072
logloss: 0.4485, accuracy: 0.8027
logloss: 0.4285, accuracy: 0.8153



### Model tuning

Research

---
* max_depth, min_child_weight의 역할 조사
* max_depth: 결정 트리에서 최대 깊이
* over,underfitting을 방지하기 위한 변수

**hyper parameter 튜닝에 대한 아주 좋은 설명!! 꼭 읽어보기**
https://statkclee.github.io/model/model-python-xgboost-hyper.html

In [None]:
import itertools

# Candidate values for the two tree-complexity hyperparameters.
param_space = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1.0, 2.0, 4.0],
}

# Every (max_depth, min_child_weight) pair in the grid.
param_combinations = itertools.product(
    param_space['max_depth'],
    param_space['min_child_weight'],
)

# The splitter is seeded with an integer, so each call to split() yields the
# same folds; one instance can therefore be shared by all combinations.
kf = KFold(n_splits=4, shuffle=True, random_state=71)

params = []
scores = []

for max_depth, min_child_weight in param_combinations:
  # Validation logloss of this combination on each fold.
  score_folds = []
  for tr_idx, va_idx in kf.split(train_x):
    model = XGBClassifier(
        n_estimators=20,
        random_state=71,
        use_label_encoder=False,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
    )
    model.fit(train_x.iloc[tr_idx], train_y.iloc[tr_idx])

    va_pred = model.predict_proba(train_x.iloc[va_idx])[:, 1]
    score_folds.append(log_loss(train_y.iloc[va_idx], va_pred))

  # Record the combination together with its mean logloss across folds.
  params.append((max_depth, min_child_weight))
  scores.append(np.mean(score_folds))

# Lower logloss is better: take the combination with the smallest mean score.
best_idx = np.argsort(scores)[0]
best_param = params[best_idx]

print(f'max_depth: {best_param[0]}, min_child_weight:{best_param[1]}')

max_depth: 5, min_child_weight:2.0


### 로지스틱 회귀용 특징 작성

Todo 

---


In [None]:
from sklearn.preprocessing import OneHotEncoder

# Start again from the raw frames: drop the target, the identifier and the
# free-text columns that are not used as features.
drop_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train_x2 = train.drop(columns=['Survived'] + drop_cols)
test_x2 = test.drop(columns=drop_cols)

# One-hot encode the categorical columns; missing values become an 'NA' category.
# The encoder is left at its default (sparse) output and densified with
# .toarray() below, because the `sparse=` keyword was renamed to
# `sparse_output=` in newer scikit-learn releases and neither spelling works
# on every version.
cat_cols = ['Sex', 'Embarked', 'Pclass']
ohe = OneHotEncoder(categories='auto')
ohe.fit(train_x2[cat_cols].fillna('NA'))

# Build readable column names of the form "<column>_<category>".
ohe_columns = []
for i, c in enumerate(cat_cols):
  ohe_columns += [f'{c}_{v}' for v in ohe.categories_[i]]

# Keep the source index on the encoded frames so the column-wise concat
# below aligns rows explicitly rather than relying on a default RangeIndex.
ohe_train_x2 = pd.DataFrame(
    ohe.transform(train_x2[cat_cols].fillna('NA')).toarray(),
    columns=ohe_columns,
    index=train_x2.index,
)
ohe_test_x2 = pd.DataFrame(
    ohe.transform(test_x2[cat_cols].fillna('NA')).toarray(),
    columns=ohe_columns,
    index=test_x2.index,
)

# Replace the original categorical columns with their one-hot expansion.
train_x2 = pd.concat([train_x2.drop(columns=cat_cols), ohe_train_x2], axis=1)
test_x2 = pd.concat([test_x2.drop(columns=cat_cols), ohe_test_x2], axis=1)

# Impute missing numeric values with the training-set mean for both frames.
# Assign the result back instead of using `df[col].fillna(..., inplace=True)`:
# that form operates on a temporary column object, is deprecated, and leaves
# the NaNs in place when pandas copy-on-write is active.
num_cols = ['Age', 'SibSp', 'Parch', 'Fare']
for col in num_cols:
  fill_value = train_x2[col].mean()
  train_x2[col] = train_x2[col].fillna(fill_value)
  test_x2[col] = test_x2[col].fillna(fill_value)

# Log-transform Fare (log(1 + x)).
train_x2['Fare'] = np.log1p(train_x2['Fare'])
test_x2['Fare'] = np.log1p(test_x2['Fare'])

print(train_x2.head(10))
print(test_x2.head(10))


         Age  SibSp  Parch      Fare  ...  Embarked_S  Pclass_1  Pclass_2  Pclass_3
0  22.000000      1      0  2.110213  ...         1.0       0.0       0.0       1.0
1  38.000000      1      0  4.280593  ...         0.0       1.0       0.0       0.0
2  26.000000      0      0  2.188856  ...         1.0       0.0       0.0       1.0
3  35.000000      1      0  3.990834  ...         1.0       1.0       0.0       0.0
4  35.000000      0      0  2.202765  ...         1.0       0.0       0.0       1.0
5  29.699118      0      0  2.246893  ...         0.0       0.0       0.0       1.0
6  54.000000      0      0  3.967694  ...         1.0       1.0       0.0       0.0
7   2.000000      3      1  3.094446  ...         1.0       0.0       0.0       1.0
8  27.000000      0      2  2.495954  ...         1.0       0.0       0.0       1.0
9  14.000000      1      0  3.436268  ...         0.0       0.0       1.0       0.0

[10 rows x 13 columns]
    Age  SibSp  Parch      Fare  ...  Embarked_S  Pc

In [None]:
from sklearn.linear_model import LogisticRegression

# XGBoost on the label-encoded features.
model_xgb = XGBClassifier(n_estimators=20, random_state=71, use_label_encoder=False)
model_xgb.fit(train_x, train_y)
pred_xgb = model_xgb.predict_proba(test_x)[:, 1]

# Logistic regression on the one-hot / imputed features.
model_lr = LogisticRegression(solver='lbfgs', max_iter=300)
model_lr.fit(train_x2, train_y)
pred_lr = model_lr.predict_proba(test_x2)[:, 1]

# Weighted average of the two probability vectors (80% XGBoost, 20% logistic regression),
# then a 0.5 threshold to obtain 0/1 labels.
pred = pred_xgb * 0.8 + pred_lr * 0.2
pred_label = (pred > 0.5).astype(int)

# Write the ensemble submission: PassengerId from the raw test frame plus the predicted label.
submission = test[['PassengerId']].assign(Survived=pred_label)
submission.to_csv('submission_first_ensemble.csv', index=False)