In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RFC, VotingClassifier
from sklearn.neural_network import MLPClassifier as MLP

In [2]:
df = pd.read_csv('train.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [3]:
df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [4]:
data = df.copy()
data['Sex'] = data['Sex'].map({'male':0, 'female':1})
data['Embarked'] = data['Embarked'].map({'S':0, 'C':2, 'Q':1})
data.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,1.0,-0.005007,-0.035144,-0.042939,0.036847,-0.057527,-0.001652,0.012658,-0.013166
Survived,-0.005007,1.0,-0.338481,0.543351,-0.077221,-0.035322,0.081629,0.257307,0.169718
Pclass,-0.035144,-0.338481,1.0,-0.1319,-0.369226,0.083081,0.018443,-0.5495,-0.164681
Sex,-0.042939,0.543351,-0.1319,1.0,-0.093254,0.114631,0.245489,0.182333,0.11032
Age,0.036847,-0.077221,-0.369226,-0.093254,1.0,-0.308247,-0.189119,0.096067,0.032565
SibSp,-0.057527,-0.035322,0.083081,0.114631,-0.308247,1.0,0.414838,0.159651,-0.0689
Parch,-0.001652,0.081629,0.018443,0.245489,-0.189119,0.414838,1.0,0.216225,-0.040449
Fare,0.012658,0.257307,-0.5495,0.182333,0.096067,0.159651,0.216225,1.0,0.226311
Embarked,-0.013166,0.169718,-0.164681,0.11032,0.032565,-0.0689,-0.040449,0.226311,1.0


In [5]:
data = data.dropna(axis=0, subset=['Embarked'])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    int64  
 5   Age          712 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Cabin        202 non-null    object 
 11  Embarked     889 non-null    float64
dtypes: float64(3), int64(6), object(3)
memory usage: 90.3+ KB


In [6]:
data.corr()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,1.0,-0.005028,-0.03533,-0.043136,0.033681,-0.057686,-0.001657,0.012703,-0.013166
Survived,-0.005028,1.0,-0.335549,0.541585,-0.082446,-0.03404,0.083151,0.25529,0.169718
Pclass,-0.03533,-0.335549,1.0,-0.127741,-0.365902,0.081656,0.016824,-0.548193,-0.164681
Sex,-0.043136,0.541585,-0.127741,1.0,-0.099037,0.116348,0.247508,0.179958,0.11032
Age,0.033681,-0.082446,-0.365902,-0.099037,1.0,-0.307351,-0.187896,0.093143,0.032565
SibSp,-0.057686,-0.03404,0.081656,0.116348,-0.307351,1.0,0.414542,0.160887,-0.0689
Parch,-0.001657,0.083151,0.016824,0.247508,-0.187896,0.414542,1.0,0.217532,-0.040449
Fare,0.012703,0.25529,-0.548193,0.179958,0.093143,0.160887,0.217532,1.0,0.226311
Embarked,-0.013166,0.169718,-0.164681,0.11032,0.032565,-0.0689,-0.040449,0.226311,1.0


In [7]:
label = data['Survived']
data = data.drop(labels=['PassengerId', 'Survived', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket', 'Cabin'], axis=1)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.4, stratify=label, random_state=10)

In [9]:
models = {
    'SVM': SVC(C=1000, gamma=1e-5),
    'RandomForest': RFC(max_depth=5, n_estimators=20),
    'MachineLearning': MLP(hidden_layer_sizes=10)
}
scores = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    scores[(model_name, 'train_score')] = model.score(X_train, y_train)
    scores[(model_name, 'test_score')] = model.score(X_test, y_test)




In [10]:
pd.Series(scores).unstack()

Unnamed: 0,test_score,train_score
MachineLearning,0.780899,0.778612
RandomForest,0.811798,0.853659
SVM,0.77809,0.78424


In [11]:
models.items()

dict_items([('SVM', SVC(C=1000, gamma=1e-05)), ('RandomForest', RandomForestClassifier(max_depth=5, n_estimators=20)), ('MachineLearning', MLPClassifier(hidden_layer_sizes=10))])

In [17]:
clf_vote = VotingClassifier(estimators=[('SVM', SVC(C=1000, gamma=1e-5)), ('RandomForest', RFC(max_depth=5, n_estimators=20)), (('MachineLearning'), MLP(hidden_layer_sizes=10))], voting='hard', weights=[2,1,1])
clf_vote.fit(X_train, y_train)
clf_vote.score(X_test, y_test)



0.7865168539325843