In [44]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RFC, StackingClassifier
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict
from sklearn.linear_model import LogisticRegression

In [3]:
titanic = pd.read_csv('train.csv')
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [27]:
data = titanic.copy()
data = data.dropna(subset=['Embarked'])
data['Embarked'] = data['Embarked'].map({'S':0, 'C':2, 'Q':1})
data['Sex'] = data['Sex'].map({'male':0, 'female':1})
data['Age'] = data['Age'].fillna(data['Age'].mode()[0])

In [25]:
data.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,1.0,-0.005028,-0.03533,-0.043136,0.033351,-0.057686,-0.001657,0.012703,-0.013166
Survived,-0.005028,1.0,-0.335549,0.541585,-0.057833,-0.03404,0.083151,0.25529,0.169718
Pclass,-0.03533,-0.335549,1.0,-0.127741,-0.352812,0.081656,0.016824,-0.548193,-0.164681
Sex,-0.043136,0.541585,-0.127741,1.0,-0.078802,0.116348,0.247508,0.179958,0.11032
Age,0.033351,-0.057833,-0.352812,-0.078802,1.0,-0.231639,-0.154027,0.104553,0.002171
SibSp,-0.057686,-0.03404,0.081656,0.116348,-0.231639,1.0,0.414542,0.160887,-0.0689
Parch,-0.001657,0.083151,0.016824,0.247508,-0.154027,0.414542,1.0,0.217532,-0.040449
Fare,0.012703,0.25529,-0.548193,0.179958,0.104553,0.160887,0.217532,1.0,0.226311
Embarked,-0.013166,0.169718,-0.164681,0.11032,0.002171,-0.0689,-0.040449,0.226311,1.0


In [28]:
label = data['Survived']
data = data[['Pclass', 'Sex', 'Fare', 'Embarked']]

In [30]:
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.4, stratify=label)

In [33]:
params_svm = {
    'C': [1, 10, 100, 1000, 1e4, 1e5, 1e6],
    'gamma': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
}
params_rfc = {
    'n_estimators': [100, 200]
}
params_dnn = {
    'random_state': [None]
}
clf_svm = GridSearchCV(SVC(), params_svm, cv=5)
clf_rfc = GridSearchCV(RFC(), params_rfc, cv=5)
clf_mlp = GridSearchCV(MLP(), params_dnn, cv=5)
clf_svm.fit(X_train, y_train)
clf_rfc.fit(X_train, y_train)
clf_mlp.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'n_estimators': [100, 200]})

In [39]:
params_dnn = {
    'random_state': [None]
}
clf_mlp = GridSearchCV(MLP(), params_dnn, cv=5)
clf_mlp.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=MLPClassifier(),
             param_grid={'random_state': [None]})

In [40]:
res_svm = pd.DataFrame(clf_svm.cv_results_)
res_rfc = pd.DataFrame(clf_rfc.cv_results_)
res_mlp = pd.DataFrame(clf_mlp.cv_results_)

In [46]:
estimators = [
    ('SVM', SVC(C=1e6, gamma=1e-9)),
    ('RFC', RFC()),
    ('MLP', MLP())
]
# モデルの設定
clf = StackingClassifier(estimators, LogisticRegression())
# スタッキングによる学習
clf.fit(X_train, y_train)

StackingClassifier(estimators=[('SVM', SVC(C=1000000.0, gamma=1e-09)),
                               ('RFC', RandomForestClassifier()),
                               ('MLP', MLPClassifier())],
                   final_estimator=LogisticRegression())

In [47]:
clf.score(X_test, y_test)

0.8117977528089888