In [1]:
import pandas as pd
import numpy as np

In [4]:
trainset = pd.read_csv('datasets/titanic/train.csv')

In [5]:
trainset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [17]:
mv_cols = (trainset.isnull().sum())
print(mv_cols[mv_cols > 0])

Age    177
dtype: int64


In [7]:
testset = pd.read_csv('datasets/titanic/test.csv')

In [8]:
testset.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [18]:
mv_cols = (testset.isnull().sum())
print(mv_cols[mv_cols > 0])

Age    86
dtype: int64


##### We'll do the following for imputation:
- Age: train a regressor
- Fare: impute w a single val
- Cabin: impute as NA and then create a new feature 'has_cabin' based on it
- Embarked: impute w single val

In [10]:
trainset[trainset['Embarked'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [11]:
trainset['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [12]:
trainset['Embarked'] = trainset['Embarked'].fillna('C')

In [13]:
trainset['Cabin'] = trainset['Cabin'].fillna('NA')
testset['Cabin'] = testset['Cabin'].fillna('NA')

In [16]:
testset['Fare'] = testset['Fare'].fillna(7.5)

### Feature Engineering

In [19]:
trainset['Has_cabin'] = trainset['Cabin'].apply(lambda x: 0 if x == 'NA' else 1)
testset['Has_cabin'] = testset['Cabin'].apply(lambda x: 0 if x == 'NA' else 1)

In [20]:
trainset['FamilySize'] = trainset['SibSp'] + trainset['Parch'] + 1
testset['FamilySize'] = testset['SibSp'] + testset['Parch'] + 1

In [21]:
trainset['Is_alone'] = trainset['FamilySize'].apply(lambda x: 1 if x == 1 else 0)
testset['Is_alone'] = testset['FamilySize'].apply(lambda x: 1 if x == 1 else 0)

In [24]:
trainset['Title'] = trainset['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
testset['Title'] = testset['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

In [25]:
trainset['Title'] = trainset['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 
                                               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
testset['Title'] = testset['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 
                                               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

In [26]:
trainset['Title'] = trainset['Title'].replace('Mlle','Miss')
trainset['Title'] = trainset['Title'].replace('Ms','Miss')
trainset['Title'] = trainset['Title'].replace('Mme','Mrs')

In [27]:
testset['Title'] = testset['Title'].replace('Mlle','Miss')
testset['Title'] = testset['Title'].replace('Ms','Miss')
testset['Title'] = testset['Title'].replace('Mme','Mrs')

In [29]:
title_mapping = {'Mr':1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}

In [30]:
trainset['Title'] = trainset['Title'].map(title_mapping) ## this could've been achieved w a labelencoder
testset['Title'] = testset['Title'].map(title_mapping)

In [32]:
trainset['Title'] = trainset['Title'].fillna(0)
testset['Title'] = testset['Title'].fillna(0)

In [33]:
trainset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Has_cabin,FamilySize,Is_alone,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,2,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,2,0,3
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,1,1,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,2,0,3
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,1,1,1


In [34]:
testset.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Has_cabin,FamilySize,Is_alone,Title
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,1,1,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0,2,0,3
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,1,1,1
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,1,1,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0,3,0,3


### Preparing for regressor training (Age)

In [46]:
features4Training = ['Pclass','Sex','Age','Fare','Embarked','Has_cabin','FamilySize','Is_alone','Title']

##### Splitting train and test sets in 2: those with Age and those without

In [47]:
trainsetAgeNull = trainset[trainset['Age'].isnull()]
trainsetAgeNotNull = trainset[trainset['Age'].notnull()]
testsetAgeNull = testset[testset['Age'].isnull()]
testsetAgeNotNull = testset[testset['Age'].notnull()]

In [48]:
trainsetAgeNull = trainsetAgeNull[features4Training]
trainsetAgeNotNull = trainsetAgeNotNull[features4Training]
testsetAgeNull = testsetAgeNull[features4Training]
testsetAgeNotNull = testsetAgeNotNull[features4Training]

In [49]:
trainsetAge = trainsetAgeNotNull.append(testsetAgeNotNull)

In [50]:
trainsetAge = pd.get_dummies(trainsetAge)

In [51]:
trainsetAge.head()

Unnamed: 0,Pclass,Age,Fare,Has_cabin,FamilySize,Is_alone,Title,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,7.25,0,2,0,1,0,1,0,0,1
1,1,38.0,71.2833,1,2,0,3,1,0,1,0,0
2,3,26.0,7.925,0,1,1,2,1,0,0,0,1
3,1,35.0,53.1,1,2,0,3,1,0,0,0,1
4,3,35.0,8.05,0,1,1,1,0,1,0,0,1


#### Regresssor for Age

In [52]:
X = trainsetAge.drop(['Age'], inplace=False, axis=1)
y = trainsetAge['Age']

In [53]:
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor

In [54]:
kFold = KFold(n_splits=10, random_state=0, shuffle=True)

In [56]:
maes = []
r2s = []
for i, (train_idx, test_idx) in enumerate(kFold.split(X, y)):
    print('Fold:', i)
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    modelGB_age = GradientBoostingRegressor(random_state=0)
    modelGB_age.fit(X_train, y_train)
    preds = modelGB_age.predict(X_test)
    
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    print('MAE:', mae, 'R2:', r2)
    maes.append(mae)
    r2s.append(r2)
    print()
print('Avg MAE:', np.mean(maes))
print('Avg R2:', np.mean(r2s))

Fold: 0
MAE: 8.25414759105658 R2: 0.34856450685026263

Fold: 1
MAE: 8.316736164537012 R2: 0.3415580082068448

Fold: 2
MAE: 8.935574940580164 R2: 0.33468162345902763

Fold: 3
MAE: 8.385165867106773 R2: 0.43163854966049076

Fold: 4
MAE: 8.338396111748189 R2: 0.35461795319849754

Fold: 5
MAE: 7.631661221427318 R2: 0.4632167101283916

Fold: 6
MAE: 8.16030632670174 R2: 0.5419102127122761

Fold: 7
MAE: 10.279637244346002 R2: 0.36661498022192585

Fold: 8
MAE: 8.935706009825262 R2: 0.4074701356689947

Fold: 9
MAE: 8.11361493375698 R2: 0.5163783474117509

Avg MAE: 8.535094641108603
Avg R2: 0.41066510275184626


##### Predict on test set

In [62]:
testsetAge = trainsetAgeNull.drop(['Age'], inplace=False, axis=1)

In [64]:
testsetAge = pd.get_dummies(testsetAge)

In [67]:
preds = modelGB_age.predict(testsetAge)

In [70]:
trainsetAgeImputed = pd.DataFrame({'Pclass': trainsetAgeNull['Pclass'],
                                   'Sex': trainsetAgeNull['Sex'],
                                   'Age': trainsetAgeNull['Age'],
                                   'Fare': trainsetAgeNull['Fare'],
                                   'Embarked': trainsetAgeNull['Embarked'],
                                   'Has_cabin': trainsetAgeNull['Has_cabin'],
                                   'FamilySize': trainsetAgeNull['FamilySize'],
                                   'Is_alone': trainsetAgeNull['Is_alone'],
                                   'Title': trainsetAgeNull['Title'],
                                   'Age_imputed': preds
                                  })

##### Let's do the same with the other test set

In [73]:
testsetAge = testsetAgeNull.drop(['Age'], inplace=False, axis=1)
testsetAge = pd.get_dummies(testsetAge)
preds = modelGB_age.predict(testsetAge)

In [74]:
testsetAgeImputed = pd.DataFrame({'Pclass': testsetAgeNull['Pclass'],
                                  'Sex': testsetAgeNull['Sex'],
                                  'Age': testsetAgeNull['Age'],
                                  'Fare': testsetAgeNull['Fare'],
                                  'Embarked': testsetAgeNull['Embarked'],
                                  'Has_cabin': testsetAgeNull['Has_cabin'],
                                  'FamilySize': testsetAgeNull['FamilySize'],
                                  'Is_alone': testsetAgeNull['Is_alone'],
                                  'Title': testsetAgeNull['Title'],
                                  'Age_imputed': preds
                                 })

In [75]:
testsetAgeImputed.head()

Unnamed: 0,Age,Age_imputed,Embarked,FamilySize,Fare,Has_cabin,Is_alone,Pclass,Sex,Title
10,,28.796705,S,1,7.8958,0,1,3,male,1
22,,46.071919,S,1,31.6833,0,1,1,female,3
29,,25.672337,C,3,21.6792,0,0,3,male,1
33,,29.908774,S,4,23.45,0,0,3,female,3
36,,23.641844,S,1,8.05,0,1,3,female,2


In [76]:
trainsetAgeImputed.head()

Unnamed: 0,Age,Age_imputed,Embarked,FamilySize,Fare,Has_cabin,Is_alone,Pclass,Sex,Title
5,,30.318199,Q,1,8.4583,0,1,3,male,1
17,,32.244762,S,1,13.0,0,1,2,male,1
19,,32.5593,C,1,7.225,0,1,3,female,3
26,,26.448083,C,1,7.225,0,1,3,male,1
28,,23.624418,Q,1,7.8792,0,1,3,female,2


##### Put back the target var into the train set

In [77]:
trainsetAgeNotNull.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Has_cabin,FamilySize,Is_alone,Title
0,3,male,22.0,7.25,S,0,2,0,1
1,1,female,38.0,71.2833,C,1,2,0,3
2,3,female,26.0,7.925,S,0,1,1,2
3,1,female,35.0,53.1,S,1,2,0,3
4,3,male,35.0,8.05,S,0,1,1,1


In [78]:
trainsetAgeNotNull = pd.DataFrame({'Pclass': trainsetAgeNotNull['Pclass'],
                                  'Sex': trainsetAgeNotNull['Sex'],
                                  'Age': trainsetAgeNotNull['Age'],
                                  'Fare': trainsetAgeNotNull['Fare'],
                                  'Embarked': trainsetAgeNotNull['Embarked'],
                                  'Has_cabin': trainsetAgeNotNull['Has_cabin'],
                                  'FamilySize': trainsetAgeNotNull['FamilySize'],
                                  'Is_alone': trainsetAgeNotNull['Is_alone'],
                                  'Title': trainsetAgeNotNull['Title'],
                                  'Age_imputed': trainsetAgeNotNull['Age']
                                 })

In [81]:
trainsetAgeNotNull.head()

Unnamed: 0,Age,Age_imputed,Embarked,FamilySize,Fare,Has_cabin,Is_alone,Pclass,Sex,Title
0,22.0,22.0,S,2,7.25,0,0,3,male,1
1,38.0,38.0,C,2,71.2833,1,0,1,female,3
2,26.0,26.0,S,1,7.925,0,1,3,female,2
3,35.0,35.0,S,2,53.1,1,0,1,female,3
4,35.0,35.0,S,1,8.05,0,1,3,male,1


In [120]:
fullTrainset = trainsetAgeNotNull.append(trainsetAgeImputed)

In [121]:
fullTrainset.sort_index(inplace=True)
trainset.sort_index(inplace=True)

In [122]:
fullTrainset['Survived'] = trainset['Survived']

In [123]:
fullTrainset.head()

Unnamed: 0,Age,Age_imputed,Embarked,FamilySize,Fare,Has_cabin,Is_alone,Pclass,Sex,Title,Survived
0,22.0,22.0,S,2,7.25,0,0,3,male,1,0
1,38.0,38.0,C,2,71.2833,1,0,1,female,3,1
2,26.0,26.0,S,1,7.925,0,1,3,female,2,1
3,35.0,35.0,S,2,53.1,1,0,1,female,3,1
4,35.0,35.0,S,1,8.05,0,1,3,male,1,0


##### Put back the id in the test set

In [90]:
testsetAgeNotNull = pd.DataFrame({'Pclass': testsetAgeNotNull['Pclass'],
                                  'Sex': testsetAgeNotNull['Sex'],
                                  'Age': testsetAgeNotNull['Age'],
                                  'Fare': testsetAgeNotNull['Fare'],
                                  'Embarked': testsetAgeNotNull['Embarked'],
                                  'Has_cabin': testsetAgeNotNull['Has_cabin'],
                                  'FamilySize': testsetAgeNotNull['FamilySize'],
                                  'Is_alone': testsetAgeNotNull['Is_alone'],
                                  'Title': testsetAgeNotNull['Title'],
                                  'Age_imputed': testsetAgeNotNull['Age']
                                 })

In [124]:
fullTestset = testsetAgeNotNull.append(testsetAgeImputed)

In [125]:
fullTestset.sort_index(inplace=True)
testset.sort_index(inplace=True)

In [126]:
fullTestset['PassengerId'] = testset['PassengerId']

In [127]:
fullTestset.head()

Unnamed: 0,Age,Age_imputed,Embarked,FamilySize,Fare,Has_cabin,Is_alone,Pclass,Sex,Title,PassengerId
0,34.5,34.5,Q,1,7.8292,0,1,3,male,1,892
1,47.0,47.0,S,2,7.0,0,0,3,female,3,893
2,62.0,62.0,Q,1,9.6875,0,1,2,male,1,894
3,27.0,27.0,S,1,8.6625,0,1,3,male,1,895
4,22.0,22.0,S,3,12.2875,0,0,3,female,3,896


##### Drop Age but keep Age_imputed

In [128]:
fullTrainset.drop(['Age'], axis=1, inplace=True)
fullTestset.drop(['Age'], axis=1, inplace=True)

#### Create a Age category col

In [101]:
def get_age_category(age):
    if age <= 3:
        return 0
    elif age > 3 and age <= 14:
        return 1
    elif age > 14 and age <= 24:
        return 2
    elif age > 24 and age <= 34:
        return 3
    elif age > 34 and age <= 44:
        return 4
    elif age > 44 and age <= 54:
        return 5
    elif age > 54 and age <= 64:
        return 6
    elif age > 64:
        return 7   

In [129]:
fullTrainset['Age_category'] = fullTrainset['Age_imputed'].apply(get_age_category)

In [130]:
fullTestset['Age_category'] = fullTestset['Age_imputed'].apply(get_age_category)

## Modelling

In [131]:
fullTrainset.drop(['Age_imputed'], axis=1, inplace=True)
fullTestset.drop(['Age_imputed'], axis=1, inplace=True)

#### Divide train and test set in 2 pieces: children and non children

In [132]:
fullTrainset.head()

Unnamed: 0,Embarked,FamilySize,Fare,Has_cabin,Is_alone,Pclass,Sex,Title,Survived,Age_category
0,S,2,7.25,0,0,3,male,1,0,2
1,C,2,71.2833,1,0,1,female,3,1,4
2,S,1,7.925,0,1,3,female,2,1,3
3,S,2,53.1,1,0,1,female,3,1,4
4,S,1,8.05,0,1,3,male,1,0,4


In [133]:
fullTestset.head()

Unnamed: 0,Embarked,FamilySize,Fare,Has_cabin,Is_alone,Pclass,Sex,Title,PassengerId,Age_category
0,Q,1,7.8292,0,1,3,male,1,892,4
1,S,2,7.0,0,0,3,female,3,893,5
2,Q,1,9.6875,0,1,2,male,1,894,6
3,S,1,8.6625,0,1,3,male,1,895,3
4,S,3,12.2875,0,0,3,female,3,896,2


In [157]:
trainset_children = fullTrainset[fullTrainset['Age_category']<=1]
trainset_nonChildren = fullTrainset[fullTrainset['Age_category']>1]
testset_children = fullTestset[fullTestset['Age_category']<=1]
testset_nonChildren = fullTestset[fullTestset['Age_category']>1]

##### Let's train first for children

In [140]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score

In [142]:
kFold = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)

In [196]:
X = pd.get_dummies(trainset_children.drop(['Survived'], axis=1, inplace=False))
y = trainset_children['Survived']

In [197]:
X.head()

Unnamed: 0,FamilySize,Fare,Has_cabin,Is_alone,Pclass,Title,Age_category,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
7,5,21.075,0,0,3,4,0,0,0,1,0,1
9,2,30.0708,0,0,2,3,1,1,0,0,1,0
10,3,16.7,1,0,3,2,1,0,0,1,1,0
14,1,7.8542,0,1,3,2,1,0,0,1,1,0
16,6,29.125,0,0,3,4,0,0,1,0,0,1


In [198]:
accs = []
f1s = []
for i, (train_idx, test_idx) in enumerate(kFold.split(X, y)):
    print('Fold:', i)
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    modelGB = GradientBoostingClassifier(random_state=0)    
    modelGB.fit(X_train, y_train)
    preds = modelGB.predict(X_test)
    
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    accs.append(acc)
    f1s.append(f1)
    print('Acc:', acc, 'F1:', f1)
    print()
print('Mean acc:', np.mean(accs))
print('Mean F1:', np.mean(f1s))

Fold: 0
Acc: 0.7777777777777778 F1: 0.8333333333333333

Fold: 1
Acc: 0.8888888888888888 F1: 0.9090909090909091

Fold: 2
Acc: 0.7777777777777778 F1: 0.8000000000000002

Fold: 3
Acc: 1.0 F1: 1.0

Fold: 4
Acc: 0.7777777777777778 F1: 0.7499999999999999

Fold: 5
Acc: 1.0 F1: 1.0

Fold: 6
Acc: 0.8888888888888888 F1: 0.9090909090909091

Fold: 7
Acc: 0.75 F1: 0.75

Fold: 8
Acc: 0.875 F1: 0.888888888888889

Fold: 9
Acc: 0.625 F1: 0.7272727272727272

Mean acc: 0.836111111111111
Mean F1: 0.8567676767676767


##### Grid search

In [146]:
from sklearn.model_selection import GridSearchCV

In [144]:
modelGB.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'presort': 'deprecated',
 'random_state': 0,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [145]:
params = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': list(range(1, 15, 5)),
    'max_leaf_nodes': list(range(5, 20, 5)),
    'learning_rate': np.arange(0.1,1,0.1),
    'warm_start': [True, False]
}

In [147]:
grid = GridSearchCV(modelGB, param_grid=params, scoring='accuracy', cv=5)
grid.fit(X, y)
print(grid.best_params_)

{'max_depth': 6, 'n_estimators': 50, 'learning_rate': 0.1, 'warm_start': True, 'max_leaf_nodes': 5}


In [199]:
accs = []
f1s = []
for i, (train_idx, test_idx) in enumerate(kFold.split(X, y)):
    print('Fold:', i)
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    modelGB = GradientBoostingClassifier(n_estimators=1000, max_depth=4, learning_rate=0.1)    
    modelGB.fit(X_train, y_train)
    preds = modelGB.predict(X_test)
    
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    accs.append(acc)
    f1s.append(f1)
    print('Acc:', acc, 'F1:', f1)
    print()
print('Mean acc:', np.mean(accs))
print('Mean F1:', np.mean(f1s))

Fold: 0
Acc: 0.7777777777777778 F1: 0.8333333333333333

Fold: 1
Acc: 0.8888888888888888 F1: 0.9090909090909091

Fold: 2
Acc: 0.6666666666666666 F1: 0.7272727272727272

Fold: 3
Acc: 1.0 F1: 1.0

Fold: 4
Acc: 0.7777777777777778 F1: 0.7499999999999999

Fold: 5
Acc: 1.0 F1: 1.0

Fold: 6
Acc: 0.8888888888888888 F1: 0.9090909090909091

Fold: 7
Acc: 0.75 F1: 0.75

Fold: 8
Acc: 0.875 F1: 0.888888888888889

Fold: 9
Acc: 0.75 F1: 0.8333333333333333

Mean acc: 0.8375
Mean F1: 0.8601010101010103


##### Let's predict on the testset

In [200]:
test = pd.get_dummies(testset_children.drop(['PassengerId'], axis=1, inplace=False))
ids = testset_children['PassengerId']

In [201]:
test.head()

Unnamed: 0,FamilySize,Fare,Has_cabin,Is_alone,Pclass,Title,Age_category,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
5,1,9.225,0,1,3,1,1,0,0,1,0,1
21,2,3.1708,0,0,3,4,1,0,0,1,0,1
55,6,29.125,0,0,3,4,1,0,1,0,0,1
64,5,262.375,1,0,1,4,1,1,0,0,0,1
80,3,15.2458,0,0,3,4,1,1,0,0,0,1


In [202]:
preds = modelGB.predict(test)

In [203]:
outputChildren = pd.DataFrame({'PassengerId': ids, 'Survived': preds})

In [204]:
outputChildren.head()

Unnamed: 0,PassengerId,Survived
5,897,0
21,913,1
55,947,0
64,956,1
80,972,1


##### Let's train for non-children

In [181]:
X = pd.get_dummies(trainset_nonChildren.drop(['Survived'], axis=1, inplace=False))
y = trainset_nonChildren['Survived']

In [182]:
accs = []
f1s = []
for i, (train_idx, test_idx) in enumerate(kFold.split(X, y)):
    print('Fold:', i)
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    modelGB = GradientBoostingClassifier(random_state=0)    
    modelGB.fit(X_train, y_train)
    preds = modelGB.predict(X_test)
    
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    accs.append(acc)
    f1s.append(f1)
    print('Acc:', acc, 'F1:', f1)
    print()
print('Mean acc:', np.mean(accs))
print('Mean F1:', np.mean(f1s))

Fold: 0
Acc: 0.8518518518518519 F1: 0.7777777777777777

Fold: 1
Acc: 0.7777777777777778 F1: 0.689655172413793

Fold: 2
Acc: 0.8148148148148148 F1: 0.7457627118644068

Fold: 3
Acc: 0.8271604938271605 F1: 0.7307692307692307

Fold: 4
Acc: 0.8 F1: 0.6923076923076923

Fold: 5
Acc: 0.8625 F1: 0.8070175438596492

Fold: 6
Acc: 0.775 F1: 0.6785714285714286

Fold: 7
Acc: 0.85 F1: 0.7692307692307693

Fold: 8
Acc: 0.875 F1: 0.8148148148148148

Fold: 9
Acc: 0.7875 F1: 0.7017543859649122

Mean acc: 0.8221604938271605
Mean F1: 0.7407661527574476


##### Grid Search

In [184]:
modelGB.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.05,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_iter_no_change': None,
 'presort': 'deprecated',
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [185]:
params = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': list(range(1, 15, 5)),
    'max_leaf_nodes': list(range(5, 20, 5)),
    'learning_rate': np.arange(0.1,1,0.1),
    'warm_start': [True, False]
}

In [186]:
grid = GridSearchCV(modelGB, param_grid=params, scoring='accuracy', cv=5)
grid.fit(X, y)
print(grid.best_params_)

{'max_depth': 1, 'n_estimators': 150, 'learning_rate': 0.9, 'warm_start': True, 'max_leaf_nodes': 5}


In [187]:
accs = []
f1s = []
for i, (train_idx, test_idx) in enumerate(kFold.split(X, y)):
    print('Fold:', i)
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    modelGB = GradientBoostingClassifier(random_state=0,
                                         max_depth=1,
                                         n_estimators=150,
                                         learning_rate=0.9,
                                         warm_start=True,
                                         max_leaf_nodes=5
                                        )    
    modelGB.fit(X_train, y_train)
    preds = modelGB.predict(X_test)
    
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    accs.append(acc)
    f1s.append(f1)
    print('Acc:', acc, 'F1:', f1)
    print()
print('Mean acc:', np.mean(accs))
print('Mean F1:', np.mean(f1s))

Fold: 0
Acc: 0.8148148148148148 F1: 0.736842105263158

Fold: 1
Acc: 0.7777777777777778 F1: 0.7

Fold: 2
Acc: 0.8271604938271605 F1: 0.7741935483870969

Fold: 3
Acc: 0.8395061728395061 F1: 0.7636363636363636

Fold: 4
Acc: 0.8375 F1: 0.7796610169491527

Fold: 5
Acc: 0.85 F1: 0.7999999999999999

Fold: 6
Acc: 0.8125 F1: 0.7272727272727274

Fold: 7
Acc: 0.825 F1: 0.7307692307692308

Fold: 8
Acc: 0.85 F1: 0.7777777777777777

Fold: 9
Acc: 0.8125 F1: 0.736842105263158

Mean acc: 0.8246759259259259
Mean F1: 0.7526994875318666


##### Let's predict on the testset

In [192]:
test = pd.get_dummies(testset_nonChildren.drop(['PassengerId'], axis=1, inplace=False))
ids = testset_nonChildren['PassengerId']

In [193]:
test.head()

Unnamed: 0,FamilySize,Fare,Has_cabin,Is_alone,Pclass,Title,Age_category,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
0,1,7.8292,0,1,3,1,4,0,1,0,0,1
1,2,7.0,0,0,3,3,5,0,0,1,1,0
2,1,9.6875,0,1,2,1,6,0,1,0,0,1
3,1,8.6625,0,1,3,1,3,0,0,1,0,1
4,3,12.2875,0,0,3,3,2,0,0,1,1,0


In [194]:
preds = modelGB.predict(test)

In [195]:
outputNonChildren = pd.DataFrame({'PassengerId': ids, 'Survived': preds})

In [205]:
outputNonChildren.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [207]:
fullOutput = outputChildren.append(outputNonChildren)
fullOutput.sort_index(inplace=True)

In [208]:
fullOutput.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [210]:
fullOutput.shape

(418, 2)

In [211]:
testset.shape

(418, 15)

In [212]:
fullOutput.to_csv('results/titanic_2models1dataset.csv', index=False)

## At this point you should submit your preds to Kaggle

##### It achieved a score of 0.75119 :-(