In [1]:
import numpy as np 
import pandas as pd

import easydict
args = easydict.EasyDict()

#path
args.default_path = 'C:/Users/rladb/workspace/data/kagglecomp/'
args.train_csv =args.default_path +'train.csv'
args.test_csv = args.default_path +'test.csv'
args.submi_csv = args.default_path +'submission.csv'
args.submi_csv_0306 = args.default_path + 'submi0306'
#변수
SEED = 35

In [2]:
o_train = pd.read_csv(args.train_csv)
o_test = pd.read_csv(args.test_csv)

In [3]:
X = o_train.drop(['survived'], axis=1)
y = o_train['survived']

from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, stratify=y, random_state=SEED)
X_tr.shape, X_te.shape, y_tr.shape, y_te.shape

((687, 11), (229, 11), (687,), (229,))

In [4]:
train = X_tr.copy()
test = X_te.copy()
o_te = o_test.copy()

dfs = [train, test, o_te]

In [5]:
#fill, map, encode
age_median = o_train['age'].median()
fare_median = o_train['fare'].median()
embarked_mode = o_train['embarked'].mode().values[0]

for df in dfs:
    df['title'] = df.name.str.extract(' ([A-Za-z]+)\.', expand=False)
    df['title'] = df['title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	    'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')
    df['title'] = df['title'].replace(['Mlle', 'Ms'], 'Miss')
    df['title'] = df['title'].replace('Mme', 'Mrs')
    df['age'].fillna(age_median, inplace=True)
    df['fare'].fillna(fare_median, inplace=True)
    df['embarked'].fillna(embarked_mode, inplace=True)
    df['gender'] = df['gender'].map({'male': 0, 'female': 1}).astype(bool)
    df['title'] = df['title'].map({"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Other": 5})
    df['embarked_C']=(df['embarked']=='C')
    df['embarked_S']=(df['embarked']=='S')
    df['embarked_Q']=(df['embarked']=='Q')

In [6]:
#drop
dropcols = ['ticket', 'cabin', 'name', 'embarked']
for df in dfs:
    df.drop(dropcols, axis=1, inplace=True)

train.drop('passengerid', axis=1, inplace=True)
test.drop('passengerid', axis=1, inplace=True)
o_te.set_index(['passengerid'], inplace=True)

In [7]:
from sklearn.ensemble import RandomForestClassifier
RFChp = {
    "random_state": SEED,
    "max_features": "sqrt", # None 을 줄경우 전체피쳐 사용 
    "n_estimators": 100, # 트리개수 
    "max_depth": 10,
    "min_samples_split": 10,
    "min_samples_leaf": 3,
}
RFC = RandomForestClassifier(**RFChp).fit(train, y_tr)

RFC.score(train, y_tr), RFC.score(test, y_te)

(0.883551673944687, 0.868995633187773)

In [8]:
from xgboost import XGBClassifier
XGBhp = {
    "random_state": SEED,
    "max_depth": 2,
    "n_estimators": 100, # 수행할 부스팅 단계 수
    "learning_rate": 0.01,
    "min_child_weight": 2,
    "gamma": 1,
}

XGB = XGBClassifier(**XGBhp).fit(train, y_tr)

XGB.score(train, y_tr), XGB.score(test, y_te)

(0.8573508005822417, 0.8777292576419214)

In [9]:
from catboost import CatBoostClassifier
CAThp = {
    "random_state": SEED,
    "max_depth" : 2,
    "n_estimators" : 100,
    "verbose": 0 # 부스팅 단계 출력 안보이게 하기
}

CAT = CatBoostClassifier(**CAThp).fit(train, y_tr)

CAT.score(train, y_tr), CAT.score(test, y_te)

(0.8719068413391557, 0.868995633187773)

In [10]:
from sklearn.ensemble import VotingClassifier
VOTestimators = [
    ("RFC", RandomForestClassifier(**RFChp)),
    ("XGB", XGBClassifier(**XGBhp)),
    ("CAT", CatBoostClassifier(**CAThp))
]
VOThp = {
    "estimators" : VOTestimators,
    "voting" : "soft"
}
VOT = VotingClassifier(**VOThp).fit(train, y_tr)

VOT.score(train, y_tr), VOT.score(test, y_te)

(0.8733624454148472, 0.8646288209606987)

In [11]:
from sklearn.ensemble import StackingClassifier
SCestimators = [
    ("RFC", RandomForestClassifier(**RFChp)),
    ("XGB", XGBClassifier(**XGBhp)),
    ("CAT", CatBoostClassifier(**CAThp))
]
SChp = {
    "estimators" : SCestimators,
    "final_estimator" : VotingClassifier(**VOThp)
}

SC = StackingClassifier(**SChp,n_jobs=-1).fit(train, y_tr)

SC.score(train, y_tr), SC.score(test, y_te)

(0.8806404657933042, 0.868995633187773)

In [14]:
test_pred_SC = SC.predict_proba(o_te)[:,1]

In [15]:
submi = pd.read_csv(args.submi_csv)

In [16]:
submi['survived'] = test_pred_SC
submi.to_csv(args.submi_csv_0306+'_00.csv', header=True, index=False)