In [84]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate

In [85]:
train  = pd.read_csv('./titanic/train.csv')
test = pd.read_csv('./titanic/test.csv')

In [87]:
train_x = train[['Pclass', 'SibSp','Age', 'Sex']]
train_y = train['Survived']

test_x = test[['Pclass', 'SibSp', 'Age', 'Sex']]

In [91]:
train_x[train_x.isnull().any(axis=1)]
# Pclass별 성별 중위수를 구해서 대처
grouped = train_x.groupby(['Pclass','Sex'])

In [102]:
grouped['Age'].median()

Pclass  Sex   
1       female    35.0
        male      40.0
2       female    28.0
        male      30.0
3       female    21.5
        male      25.0
Name: Age, dtype: float64

In [103]:
train_x['Age'] = train_x['Age'].fillna(grouped['Age'].transform('median'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_x['Age'] = train_x['Age'].fillna(grouped['Age'].transform('median'))


In [105]:
train['Sex'] = train['Sex'].map({'male' : 0, 'female' : 1})
test['Sex'] = test['Sex'].map({'male' : 0, 'female' : 1})
test_x['Age'] = test_x['Age'].fillna(test_x.groupby(['Pclass','Sex'])['Age'].transform('median'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x['Age'] = test_x['Age'].fillna(test_x.groupby(['Pclass','Sex'])['Age'].transform('median'))


In [27]:
# 모델 비교 - 교차 검증
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [33]:
def cross_val(model, cv=10):
    scores = cross_validate(model,train_x,train_y,return_train_score=True, cv=cv)
    print(np.mean(scores['train_score']), np.mean(scores['test_score']) )

In [34]:
cross_val(RandomForestClassifier() )
cross_val(XGBClassifier() )
cross_val(LGBMClassifier() )
cross_val(KNeighborsClassifier() )
cross_val(LogisticRegression() )

0.9072207122642831 0.8014107365792759
0.8931295357112836 0.8215605493133582
0.8812824368541816 0.8182022471910111
0.8448686336592974 0.7901622971285893
0.7898737861961825 0.7867041198501873


In [35]:
model = XGBClassifier()
model.fit(train_x,train_y)

In [57]:
from scipy.stats import uniform, randint
uni =  uniform(0.0001,0.001)
uni.rvs(10)



array([0.00015206, 0.00107015, 0.00039609, 0.00104887, 0.00089861,
       0.00090685, 0.00065303, 0.00091484, 0.00100482, 0.00057045])

In [58]:
# 하이퍼 파라메터 튜닝
from sklearn.model_selection import GridSearchCV
params = {
    'n_estimators': range(100,200,10),
    'max_depth' : range(1,10),
    'learning_rate':uni.rvs(10)
}

In [59]:
gsc = GridSearchCV(XGBClassifier(), n_jobs=-1, param_grid = params)
gsc.fit(train_x,train_y)

In [64]:
gsc.best_params_

{'learning_rate': 0.0008626355304508852, 'max_depth': 9, 'n_estimators': 100}

In [61]:
model = gsc.best_estimator_

In [62]:
test_y_pred = model.predict(test_x)
submission = pd.read_csv("./titanic/submission.csv")
submission['Survived'] = test_y_pred
submission.to_csv('./titanic/temp.csv', index = False)