In [3]:
import numpy as np
import pandas as pd
import os
import glob
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
g = glob.glob('data/*.csv')
g

['data/submission_minmax.csv',
 'data/gender_submission.csv',
 'data/test.csv',
 'data/submission_fit_pipe.csv',
 'data/submission.csv',
 'data/train.csv']

In [5]:
df = pd.read_csv('data/train.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Exploratory Data Analysis

### 1. Checking for missing values

#### Age

In [6]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

High number of missing values for "Age", "Cabin", and 2 missing values for "Embarked".

In [7]:
# Age
df_age_missing = df[df.Age.isnull()]
df_age_missing.count()

PassengerId    177
Survived       177
Pclass         177
Name           177
Sex            177
Age              0
SibSp          177
Parch          177
Ticket         177
Fare           177
Cabin           19
Embarked       177
dtype: int64

In [8]:
df_age_missing.Pclass.value_counts()

3    136
1     30
2     11
Name: Pclass, dtype: int64

Mainly third class passengers who have missing ages.

In [9]:
df_age_missing.Sex.value_counts()

male      124
female     53
Name: Sex, dtype: int64

In [10]:
# Impute Age based on gender and class mean for further accuracy

means = {}
for g in ['male','female']:
    for c in [1,2,3]:
        means[f'{g}_{c}'] = df['Age'][(df.Sex == f'{g}') & (df.Pclass == c)].mean()
means

{'male_1': 41.28138613861386,
 'male_2': 30.74070707070707,
 'male_3': 26.507588932806325,
 'female_1': 34.61176470588235,
 'female_2': 28.722972972972972,
 'female_3': 21.75}

In [11]:
def impute_age(row, means):
    if pd.notnull(row['Age']):
        return row['Age']
    else:
        g = row['Sex']
        c = row['Pclass']

        return means[f'{g}_{c}']

In [12]:
df['Age'] = df.apply(impute_age, axis = 1, means = means)

In [13]:
df['Age'].isnull().sum()

0

#### Cabin

In [14]:
# not very informative if you look at the raw counts
df.Cabin.value_counts()

C23 C25 C27    4
G6             4
B96 B98        4
C22 C26        3
F33            3
              ..
C45            1
C118           1
A36            1
A32            1
A7             1
Name: Cabin, Length: 147, dtype: int64

In [15]:
# clearer picture with just the first letter
df.Cabin.str[0].value_counts()

C    59
B    47
D    33
E    32
A    15
F    13
G     4
T     1
Name: Cabin, dtype: int64

In [17]:
df['cabin_class'] = df.Cabin.str[0]

In [16]:
pd.crosstab(columns=df.Survived, index=[df.Cabin.str[0], df.Pclass]).reset_index()

Survived,Cabin,Pclass,0,1
0,A,1,8,7
1,B,1,12,35
2,C,1,24,35
3,D,1,7,22
4,D,2,1,3
5,E,1,7,18
6,E,2,1,3
7,E,3,0,3
8,F,2,1,7
9,F,3,4,1


## Creating Family Size variable

Those with larger families may have had harder time getting on lifeboats

In [21]:
df['family_size'] = df.Parch + df.SibSp + 1

# Train ML model

In [133]:
from sklearn.svm import LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [108]:
X = df[['Sex','Age','Pclass', 'family_size', 'cabin_class']]
y = df['Survived']

In [109]:
X.isnull().sum()

Sex              0
Age              0
Pclass           0
family_size      0
cabin_class    687
dtype: int64

In [110]:
ms_imputer = SimpleImputer(strategy='constant', fill_value='0')

In [111]:
ct2 = ColumnTransformer([("ohe", OneHotEncoder(sparse=False), [0,2,4])], remainder='passthrough')
transformation_pipeline = Pipeline([('mising_imputer', ms_imputer),('ohe', ct2),('scaler', MinMaxScaler())])
transformation_pipeline
simple_pipe = Pipeline([('mising_imputer', ms_imputer),('ohe', ct2)])

In [112]:
X = simple_pipe.fit_transform(X)
X

array([[0.0, 1.0, 0.0, ..., 0.0, 22.0, 2],
       [1.0, 0.0, 1.0, ..., 0.0, 38.0, 2],
       [1.0, 0.0, 0.0, ..., 0.0, 26.0, 1],
       ...,
       [1.0, 0.0, 0.0, ..., 0.0, 21.75, 4],
       [0.0, 1.0, 1.0, ..., 0.0, 26.0, 1],
       [0.0, 1.0, 0.0, ..., 0.0, 32.0, 1]], dtype=object)

In [113]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=88)

In [114]:
models = [KNeighborsClassifier(), RandomForestClassifier(), LogisticRegression(), LinearSVC(), NuSVC()]

In [131]:
best_model = None
best_acc = 0

for m in models:
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    if acc > best_acc:
        best_model = m
        best_acc = acc
        print(f'Best model is now {m} with acc: {best_acc}')

Best model is now KNeighborsClassifier() with acc: 0.8044692737430168
Best model is now NuSVC() with acc: 0.8324022346368715


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [138]:
# tuning randomforest classifier
# with gridsearch
params = {
    'n_estimators':[n for n in range(10,60,10)],
    'min_samples_split':[n for n in range(4,11)],
    'min_samples_leaf':[n for n in range(2, 5)]
}

In [139]:
gridsearchcv = GridSearchCV(estimator=RandomForestClassifier(), param_grid=params, cv=5, n_jobs=3)
gridsearchcv.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=3,
             param_grid={'min_samples_leaf': [2, 3, 4],
                         'min_samples_split': [4, 5, 6, 7, 8, 9, 10],
                         'n_estimators': [10, 20, 30, 40, 50]})

In [140]:
gridsearchcv.best_params_

{'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 10}

In [141]:
# test using test set
m = gridsearchcv.best_estimator_
y_pred = m.predict(X_test)
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1-score: {f1_score(y_test, y_pred)}")

Precision: 0.8035714285714286
Recall: 0.7258064516129032
F1-score: 0.7627118644067797


In [142]:
confusion_matrix(y_test, y_pred)

array([[106,  11],
       [ 17,  45]])

In [152]:
# train on all data
m.fit(X, y)

RandomForestClassifier(min_samples_leaf=4, min_samples_split=8, n_estimators=10)

# Creation of submission

In [153]:
df_sub = pd.read_csv('data/test.csv')

In [154]:
df_sub.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [155]:
df_sub['Age'] = df_sub.apply(impute_age, axis = 1, means = means)
df_sub['family_size'] = df_sub.Parch + df_sub.SibSp + 1
df_sub['cabin_class'] = df_sub.Cabin.str[0]
test_X = df_sub[['Sex','Age','Pclass', 'family_size', 'cabin_class']]

In [156]:
transformation_pipeline

Pipeline(steps=[('mising_imputer',
                 SimpleImputer(fill_value='0', strategy='constant')),
                ('ohe',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohe',
                                                  OneHotEncoder(sparse=False),
                                                  [0, 2, 4])])),
                ('scaler', MinMaxScaler())])

In [157]:
test_X = simple_pipe.transform(test_X)
test_X

array([[0.0, 1.0, 0.0, ..., 0.0, 34.5, 1],
       [1.0, 0.0, 0.0, ..., 0.0, 47.0, 2],
       [0.0, 1.0, 0.0, ..., 0.0, 62.0, 1],
       ...,
       [0.0, 1.0, 0.0, ..., 0.0, 38.5, 1],
       [0.0, 1.0, 0.0, ..., 0.0, 26.507588932806325, 1],
       [0.0, 1.0, 0.0, ..., 0.0, 26.507588932806325, 3]], dtype=object)

In [158]:
test_y_pred = m.predict(test_X)

In [159]:
submission = pd.DataFrame({
    'PassengerId':df_sub['PassengerId'],
    'Survived':test_y_pred
})
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [160]:
submission.Survived.value_counts()

0    280
1    138
Name: Survived, dtype: int64

In [161]:
submission.to_csv('data/submission_noscale_better_CV_alltrain.csv', index=False)