In [12]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb

# load data
df_train = pd.read_csv("./preprocessed_dataset/train.csv")
df_test = pd.read_csv("./preprocessed_dataset/test.csv")
submission = pd.read_csv("./dataset/gender_submission.csv")

# feature: Name, Pclass, Sex, Age, Family
train_target = df_train["Survived"].to_numpy()
del df_train["PassengerId"]
del df_train["Survived"]
del df_train["SibSp"]
del df_train["Parch"]
del df_train["Ticket"]
del df_train["Fare"]
del df_train["Cabin"]
del df_train["Embarked"]
train_input = df_train.to_numpy()

del df_test["PassengerId"]
del df_test["SibSp"]
del df_test["Parch"]
del df_test["Ticket"]
del df_test["Fare"]
del df_test["Cabin"]
del df_test["Embarked"]
test_input = df_test.to_numpy()

## Random Forests

In [3]:
params = {
    'n_estimators':range(100, 2100, 100),
    'max_depth' : range(1, 21), 
    'min_samples_leaf' : [4, 6, 8, 10],
    'min_samples_split' : [4, 6, 8, 10]
}
train_input, val_input, train_target, val_target = train_test_split(
    train_input, train_target, test_size=0.2, random_state=42
)

model = RandomForestClassifier()
grid_cv = GridSearchCV(model , param_grid=params , cv=2, n_jobs=2, verbose=2)
grid_cv.fit(train_input , train_target)
estimator = grid_cv.best_estimator_
pred = estimator.predict(val_input)

print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))
accuracy_score(val_target , pred)

Fitting 2 folds for each of 6400 candidates, totalling 12800 fits
최적 하이퍼 파라미터:
 {'max_depth': 16, 'min_samples_leaf': 6, 'min_samples_split': 6, 'n_estimators': 100}
최고 예측 정확도: 0.8315


0.8156424581005587

In [8]:
model = RandomForestClassifier(n_estimators=100,  max_depth=16,  min_samples_split=6, min_samples_leaf=6)
model.fit(train_input, train_target)

predictions=model.predict(test_input)
submission['Survived']=predictions
submission.to_csv('Result.csv', index=False)

## XGBoost

In [9]:
params = {
    'n_estimators' :range(10, 210, 10),
    'max_depth' : range(2, 21),
    'learning_rate' : [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}
train_input, val_input, train_target, val_target = train_test_split(
    train_input, train_target, test_size=0.2, random_state=42
)

model = xgb.XGBClassifier()
grid_cv = GridSearchCV(model , param_grid=params , cv=2, n_jobs=2, verbose=2)
grid_cv.fit(train_input , train_target)
estimator = grid_cv.best_estimator_
pred = estimator.predict(val_input)

print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))
accuracy_score(val_target , pred)

Fitting 2 folds for each of 3800 candidates, totalling 7600 fits
최적 하이퍼 파라미터:
 {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 30}
최고 예측 정확도: 0.8488


0.8767123287671232

In [11]:
model = xgb.XGBClassifier(n_estimators=30, max_depth=2, learning_rate=0.1)
model.fit(train_input, train_target)

predictions=model.predict(test_input)
submission['Survived']=predictions
submission.to_csv('Result.csv', index=False)