In [182]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import warnings
warnings.simplefilter('ignore')

In [166]:
# loading the trainning and the test sets
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [167]:
display(df_train.head())
display(df_test.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [168]:
# Droping out unnecessary features: 'Name', 'Ticket' and 'Cabin'
train = df_train.drop(['Name', 'Ticket', 'Cabin'], axis=1)
test = df_test.drop(['Name', 'Ticket', 'Cabin'], axis=1)
display(train.head())
display(test.head())

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [169]:
# Parsing categorical features:
train = pd.get_dummies(train)
test = pd.get_dummies(test)
display(train.head())
display(test.head())

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,0,1,0,0,1
1,2,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,1,3,26.0,0,0,7.925,1,0,0,0,1
3,4,1,1,35.0,1,0,53.1,1,0,0,0,1
4,5,0,3,35.0,0,0,8.05,0,1,0,0,1


Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,892,3,34.5,0,0,7.8292,0,1,0,1,0
1,893,3,47.0,1,0,7.0,1,0,0,0,1
2,894,2,62.0,0,0,9.6875,0,1,0,1,0
3,895,3,27.0,0,0,8.6625,0,1,0,0,1
4,896,3,22.0,1,1,12.2875,1,0,0,0,1


In [170]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Age            177
SibSp            0
Parch            0
Fare             0
Sex_female       0
Sex_male         0
Embarked_C       0
Embarked_Q       0
Embarked_S       0
dtype: int64

### Model Evaluation: Baseline

In [240]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.impute import MissingIndicator
from sklearn.pipeline import FeatureUnion
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier

# Creating features and target to build the DecisionTreeModel
X_train = train.drop('Survived', axis=1)
y_train = train['Survived']

# Feature Union
features = []
features.append(('na_indicator', MissingIndicator()))
features.append(('imp_mean', SimpleImputer(strategy='mean')))
transformers = FeatureUnion(transformer_list=features)

# Building models
models = []
models.append(('tree', Pipeline([('imputation', transformers),('tree', DecisionTreeClassifier(max_depth=3, random_state=0))])))
models.append(('rf', Pipeline([('imputation', transformers),('rf', RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1))])))
models.append(('gb', Pipeline([('imputation', transformers),('gb', GradientBoostingClassifier(random_state=0))])))
models.append(('ab', Pipeline([('imputation', transformers),('ab', ExtraTreesClassifier(random_state=0))])))
models.append(('ex', Pipeline([('imputation', transformers),('ex', AdaBoostClassifier(random_state=0))])))

for name, model in models:
    kfold = KFold(n_splits=10, random_state = 0)
    cv_results = cross_val_score(model, X_train, y_train, cv = kfold, scoring='accuracy')
    msg = "{}: {} ({})".format(name, cv_results.mean(), cv_results.std())
    print(msg)

tree: 0.8159425717852684 (0.036960220198097256)
rf: 0.812621722846442 (0.04295344604478196)
gb: 0.8226841448189763 (0.03429797647421347)
ab: 0.8002621722846441 (0.03410915934913142)
ex: 0.784756554307116 (0.08517187447147517)


In [241]:
X_train.isnull().sum()

PassengerId      0
Pclass           0
Age            177
SibSp            0
Parch            0
Fare             0
Sex_female       0
Sex_male         0
Embarked_C       0
Embarked_Q       0
Embarked_S       0
dtype: int64

In [243]:
# Placing a columns indicating the value was missing
X_train['age_na_ind'] = np.where(X_train['Age'].isnull(), 1, 0)
age_mean = X_train['Age'].mean()
X_train['Age'] = X_train.Age.fillna(age_mean)

model = GradientBoostingClassifier(random_state=0)
model.fit(X_train, y_train)
predictions = model.predict(X_train)
accuracy_score(y_train, predictions)

0.8978675645342312

In [244]:
test.isnull().sum()

PassengerId    0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
Sex_female     0
Sex_male       0
Embarked_C     0
Embarked_Q     0
Embarked_S     0
age_na_ind     0
dtype: int64

In [227]:
test['age_na_ind'] = np.where(test['Age'].isnull(), 1, 0)
fare_mean = X_train['Fare'].mean()
test['Fare'] = test.Fare.fillna(fare_mean)
test['Age'] = test.Age.fillna(age_mean)

In [228]:
test.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,age_na_ind
0,892,3,34.5,0,0,7.8292,0,1,0,1,0,0
1,893,3,47.0,1,0,7.0,1,0,0,0,1,0
2,894,2,62.0,0,0,9.6875,0,1,0,1,0,0
3,895,3,27.0,0,0,8.6625,0,1,0,0,1,0
4,896,3,22.0,1,1,12.2875,1,0,0,0,1,0


In [230]:
# Predicting and preparing for submission
y_pred = tree.predict(test)
submission = pd.DataFrame()
submission['PassengerId'] = test['PassengerId']
submission['Survived'] = y_pred

In [231]:
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [232]:
submission.shape

(418, 2)

In [233]:
# Exporting to a csv file to submit in the kaggle
submission.to_csv('new_submission.csv', index=False)