In [20]:
import pandas as pd
import numpy as np
import seaborn as sns

# logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# for feature engineering
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

# Importing and preprocessing of the data sets
(required for Feature Engineering)

In [64]:
df = pd.read_csv('../data/train.csv', index_col=0)
df_test = pd.read_csv('../data/test.csv', index_col=0)

## Cleaning training data

In [67]:
titles = df['Name'].str.extract(',\s((\w|\s)+)\.')
titles.value_counts()

0             1
Mr            r    517
Miss          s    182
Mrs           s    125
Master        r     40
Dr            r      7
Rev           v      6
Major         r      2
Col           l      2
Mlle          e      2
Sir           r      1
Ms            s      1
Capt          t      1
Mme           e      1
Lady          y      1
Jonkheer      r      1
Don           n      1
the Countess  s      1
dtype: int64

Because there are a lot of different titles, I want to reduce them to five. This includes Mr, Miss, Mrs and Master, since there are already a lot observations. This means I have to match the other titles into the existing groups. I map them like this:
* `Mlle` and `Ms` to `Miss` 
* `Mme` to `Mrs`
* `Dr`, `Rev`, `Col`, `Major`, `Capt`, `Jonkheer`, `Lady` and `Sir` to `rest`

In [68]:
df['title'] = titles[0].replace({
     'Mlle': 'Miss', 
     'Ms': 'Miss', 
     'Mme': 'Mrs', 
     'Dr': 'rest', 
     'Don': 'rest',
     'Dona': 'rest', # found in test.csv
     'Rev': 'rest', 
     'Col': 'rest', 
     'Major': 'rest', 
     'Capt': 'rest', 
     'Jonkheer': 'rest', 
     'Lady': 'rest', 
     'Sir': 'rest',
     'the Countess' : 'rest'
})
df['title'].value_counts()

Mr        517
Miss      185
Mrs       126
Master     40
rest       23
Name: title, dtype: int64

In [71]:
df['family_size'] = df['SibSp'] + df['Parch']
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title,family_size
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,1
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,1
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,0
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,1
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,0


In [78]:
df.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,family_size
count,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,0.904602
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,1.613459
min,0.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104,0.0
50%,0.0,3.0,28.0,0.0,0.0,14.4542,0.0
75%,1.0,3.0,38.0,1.0,0.0,31.0,1.0
max,1.0,3.0,80.0,8.0,6.0,512.3292,10.0


## Cleaning of test data

In [79]:
titles_test = df_test['Name'].str.extract(',\s((\w|\s)+)\.')
titles_test.value_counts()

0       1
Mr      r    240
Miss    s     78
Mrs     s     72
Master  r     21
Col     l      2
Rev     v      2
Dona    a      1
Dr      r      1
Ms      s      1
dtype: int64

In [80]:
df_test['title'] = titles_test[0].replace({
     'Mlle': 'Miss', 
     'Ms': 'Miss', 
     'Mme': 'Mrs', 
     'Dr': 'rest', 
     'Don': 'rest',
     'Dona': 'rest', # found in test.csv
     'Rev': 'rest', 
     'Col': 'rest', 
     'Major': 'rest', 
     'Capt': 'rest', 
     'Jonkheer': 'rest', 
     'Lady': 'rest', 
     'Sir': 'rest',
     'the Countess' : 'rest'
    
})
df_test['title'].value_counts()

Mr        240
Miss       79
Mrs        72
Master     21
rest        6
Name: title, dtype: int64

In [82]:
df_test['family_size'] = df_test['SibSp'] + df_test['Parch']
df_test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title,family_size
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,Mr,0
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,Mrs,1
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Mr,0
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,Mr,0
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,Mrs,2


In [83]:
df_test.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,family_size
count,418.0,332.0,418.0,418.0,417.0,418.0
mean,2.26555,30.27259,0.447368,0.392344,35.627188,0.839713
std,0.841838,14.181209,0.89676,0.981429,55.907576,1.519072
min,1.0,0.17,0.0,0.0,0.0,0.0
25%,1.0,21.0,0.0,0.0,7.8958,0.0
50%,3.0,27.0,0.0,0.0,14.4542,0.0
75%,3.0,39.0,1.0,0.0,31.5,1.0
max,3.0,76.0,8.0,9.0,512.3292,10.0


## Checking for Missing Values in both the train and test dataset

In [86]:
print('TRAIN DATA: --------')
print(df.isna().sum())
print('\nTEST DATA: -------')
print(df_test.isna().sum())

TRAIN DATA: --------
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
title            0
family_size      0
dtype: int64

TEST DATA: -------
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
title            0
family_size      0
dtype: int64


This gives me information about which columns I need to impute. Since I plan on using `Age`, `Fare` and `Embarked`, which all have missing values, I can now decide on a strategy of how to impute those columns.

# Feature Engineering
Creating training data from the whole of `train.csv` and _test data_ from `test.csv`. I wont be creating a test/validation dataset from the `train.csv` like we used to, because I now use `cross_validation` to fit the model(s).

In [29]:
X_train = df[['Pclass', 'Sex', 'Age', 'Fare', 'title', 'family_size', 'Embarked']]
y_train = df['Survived']
X_test = df_test[['Pclass', 'Sex', 'Age', 'Fare', 'title', 'family_size', 'Embarked']]

### Creating Pipelines

In [87]:
impute_and_bin_age = make_pipeline(
    SimpleImputer(strategy = 'mean'),
    KBinsDiscretizer(n_bins = 5, encode = 'onehot-dense', strategy = 'quantile')
)   # TODO: set impute strategy to median because of outliers seen in plotting (not yet inlcuded in this notebook)
    # Also play around with bin strategy

impute_and_bin_fare = make_pipeline(
    SimpleImputer(strategy = 'most_frequent'),
    KBinsDiscretizer(n_bins = 3, encode = 'onehot-dense', strategy = 'kmeans')
)

impute_and_encode_embarked = make_pipeline(
    SimpleImputer(strategy = 'most_frequent'),
    OneHotEncoder()
)   # since there are only three missing values, we can just add those three to the port at which most passengers got on

# TODO: Create a custom binning function for family_size

In [31]:
feat_eng = ColumnTransformer(
    transformers = [
        ('age_transformation', impute_and_bin_age, ['Age']),
        ('familiy_scale', MinMaxScaler(), ['family_size']),  # also checked the min(0) and max(10) for this column in both datasets: identical
        ('hot_titles', OneHotEncoder(), ['title', 'Sex']),
        ('impute_fare', impute_and_bin_fare, ['Fare']),
        ('embarked', impute_and_encode_embarked, ['Embarked'])
    ],
    remainder = 'passthrough'
)

In [33]:
feat_eng.fit(X_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('age_transformation',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer()),
                                                 ('kbinsdiscretizer',
                                                  KBinsDiscretizer(encode='onehot-dense'))]),
                                 ['Age']),
                                ('familiy_scale', MinMaxScaler(),
                                 ['family_size']),
                                ('hot_titles', OneHotEncoder(),
                                 ['title', 'Sex']),
                                ('impute_fare',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('kbinsdiscretizer',
                                                  

In [34]:
X_train_trans = feat_eng.transform(X_train)
X_test_trans = feat_eng.transform(X_test)

## Cross Validation

In [61]:
# model = LogisticRegression(max_iter=10000)
# model = RandomForestClassifier(n_estimators=100, max_depth=5, n_jobs=1)

In [130]:
results = pd.DataFrame(
    columns=['fit_time', 'score_time', 'test_score', 'train_score']
)

for x in range(1,11):
    validation_model = RandomForestClassifier(n_estimators=100, max_depth=x, n_jobs=-1) # seems to perform best on 5 and 6
#     validation_model = LogisticRegression(max_iter=x * 1000)  # no difference in accuracy
    score = cross_validate(
        estimator=validation_model,    # the model you want to evaluate 
        X=X_train_trans,               # the training input data 
        y=y_train,                     # the training output data  
        cv=5,                          # number of cross validation datasets 
        scoring='accuracy',            # evaluation metric 
        return_train_score=True,       # return both the score on the training and the cross validated data
        n_jobs=1                       # n_jobs = -1 for using all your processores
    )
    print('\nMax depth of ', x)
    print(pd.DataFrame(score).mean())


Max depth of  1
fit_time       0.056963
score_time     0.008170
test_score     0.786737
train_score    0.786755
dtype: float64

Max depth of  2
fit_time       0.052785
score_time     0.008360
test_score     0.786737
train_score    0.787317
dtype: float64

Max depth of  3
fit_time       0.055295
score_time     0.008871
test_score     0.802448
train_score    0.815656
dtype: float64

Max depth of  4
fit_time       0.055954
score_time     0.009056
test_score     0.828253
train_score    0.835578
dtype: float64

Max depth of  5
fit_time       0.057839
score_time     0.009347
test_score     0.832747
train_score    0.845962
dtype: float64

Max depth of  6
fit_time       0.060384
score_time     0.009928
test_score     0.831662
train_score    0.856064
dtype: float64

Max depth of  7
fit_time       0.064096
score_time     0.009613
test_score     0.824951
train_score    0.867286
dtype: float64

Max depth of  8
fit_time       0.061676
score_time     0.009622
test_score     0.821581
train_score    

## Summary:
1. Logistic Regression
    * There is no difference in modifying the max_iter value
    * accuracy stays at `0.8305`
2. Random Forest Classifier
    * has best accuracy with a depth of 5 and 6, which is around `0.8327`

In [47]:
model = RandomForestClassifier(n_estimators=100, max_depth=5, n_jobs=-1)

model.fit(X_train_trans, y_train)
y_pred = model.predict(X_test_trans)

In [48]:
X_test['Survived'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['Survived'] = y_pred


In [49]:
X_test['Survived'].to_csv('../data/my_predictions.csv')