In [1]:
import pandas as pd
import numpy as np

In [4]:
# Load the training CSV; the path is relative to the notebook's working directory.
trainset = pd.read_csv('datasets/titanic/train.csv')

In [5]:
# Preview the first rows of the training frame to check which columns were loaded.
trainset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [17]:
# Count missing values per training column and print only the columns that have any.
# The result depends on run order: executed after the fillna cells further down,
# it reports only the columns that are still unimputed.
mv_cols = trainset.isna().sum()
print(mv_cols[mv_cols.gt(0)])

Age    177
dtype: int64


In [7]:
# Load the test CSV; the path is relative to the notebook's working directory.
testset = pd.read_csv('datasets/titanic/test.csv')

In [8]:
# Preview the first rows of the test frame (same columns as train, minus 'Survived').
testset.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [18]:
# Count missing values per test column and print only the columns that have any.
# Note: this rebinds `mv_cols`, replacing the training-set counts computed earlier.
mv_cols = testset.isna().sum()
print(mv_cols[mv_cols.gt(0)])

Age    86
dtype: int64


##### We'll do the following for imputation:
- Age: train a regressor to predict the missing values
- Fare: impute with a single value
- Cabin: impute as 'NA' and then create a new feature 'Has_cabin' based on it
- Embarked: impute with a single value

In [10]:
# Display the training rows that have no port of embarkation recorded.
trainset.loc[trainset['Embarked'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [11]:
# Frequency of each embarkation port in the training frame (missing values are not counted).
trainset['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [12]:
# Fill the missing training 'Embarked' values with the constant 'C'.
# NOTE(review): 'C' is not the most frequent port in the value_counts output ('S' is);
# presumably it was chosen on purpose for the two 1st-class, Fare 80.0 rows — confirm.
# Only trainset is filled here; testset['Embarked'] is left untouched by this cell.
trainset['Embarked'] = trainset['Embarked'].fillna('C')

In [13]:
# Mark unknown cabins with the sentinel string 'NA' in both frames so a
# presence flag can be derived from it later.
for frame in (trainset, testset):
    frame['Cabin'] = frame['Cabin'].fillna('NA')

In [16]:
# Fill missing test-set fares with a single constant value.
# NOTE(review): 7.5 is hard-coded and its derivation is not shown in this notebook —
# confirm where it comes from (e.g. a median over comparable passengers).
testset['Fare'] = testset['Fare'].fillna(7.5)

### Feature Engineering

In [19]:
# Binary flag: 1 when a cabin value is present, 0 when it is the 'NA' sentinel.
for frame in (trainset, testset):
    frame['Has_cabin'] = frame['Cabin'].ne('NA').astype('int64')

In [20]:
# Family size = siblings/spouses + parents/children + the passenger themself.
for frame in (trainset, testset):
    frame['FamilySize'] = frame['SibSp'].add(frame['Parch']).add(1)

In [21]:
# Binary flag: 1 when the passenger has no relatives aboard (FamilySize == 1), else 0.
for frame in (trainset, testset):
    frame['Is_alone'] = frame['FamilySize'].eq(1).astype('int64')

In [24]:
# Extract the honorific from each name: the run of letters that follows a space
# and is terminated by a period (e.g. "Braund, Mr. Owen Harris" -> "Mr").
# The pattern is a raw string so that `\.` reaches the regex engine as-is; in a
# normal string literal `\.` is an invalid escape sequence that triggers a
# DeprecationWarning/SyntaxWarning on current Python versions.
title_pattern = r' ([A-Za-z]+)\.'
trainset['Title'] = trainset['Name'].str.extract(title_pattern, expand=False)
testset['Title'] = testset['Name'].str.extract(title_pattern, expand=False)

In [25]:
# Collapse the honorifics listed below into a single 'Rare' category in both frames.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
for frame in (trainset, testset):
    frame['Title'] = frame['Title'].replace(rare_titles, 'Rare')

In [26]:
# Normalise spelling variants of the training titles onto 'Miss' / 'Mrs'.
# None of the replacement values is itself a key, so one dict-based replace
# is equivalent to applying the three substitutions one after another.
trainset['Title'] = trainset['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

In [27]:
# Normalise spelling variants of the test titles onto 'Miss' / 'Mrs'.
# None of the replacement values is itself a key, so one dict-based replace
# is equivalent to applying the three substitutions one after another.
testset['Title'] = testset['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

In [29]:
# Integer code for each normalised title; titles outside this table become NaN
# when mapped and are handled by a later fillna.
title_mapping = dict(Mr=1, Miss=2, Mrs=3, Master=4, Rare=5)

In [30]:
# Replace each title string with its integer code from title_mapping.
# (The original author notes a label encoder could have been used instead.)
for frame in (trainset, testset):
    frame['Title'] = frame['Title'].map(title_mapping)

In [32]:
# Any title that had no entry in title_mapping is NaN after the map; code it as 0.
for frame in (trainset, testset):
    frame['Title'] = frame['Title'].fillna(0)

In [33]:
# Check the training frame after feature engineering (Has_cabin, FamilySize, Is_alone, Title).
trainset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Has_cabin,FamilySize,Is_alone,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,2,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,2,0,3
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,1,1,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,2,0,3
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,1,1,1


In [34]:
# Check the test frame after feature engineering (Has_cabin, FamilySize, Is_alone, Title).
testset.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Has_cabin,FamilySize,Is_alone,Title
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,1,1,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0,2,0,3
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,1,1,1
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,1,1,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0,3,0,3


### Preparing for regressor training (Age)

In [46]:
# Columns used for the Age model: raw columns first, then the engineered ones.
# 'Age' is the regression target; it stays in the list so the null/non-null
# subsets keep it, and it is dropped from the feature matrix before fitting.
features4Training = (
    ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']
    + ['Has_cabin', 'FamilySize', 'Is_alone', 'Title']
)

##### Splitting train and test sets in 2: those with Age and those without

In [47]:
# Partition each frame by whether Age is known: the rows with a known Age
# train the regressor, the rows without are the ones to be imputed.
trainAgeMissing = trainset['Age'].isna()
testAgeMissing = testset['Age'].isna()

trainsetAgeNull, trainsetAgeNotNull = trainset[trainAgeMissing], trainset[~trainAgeMissing]
testsetAgeNull, testsetAgeNotNull = testset[testAgeMissing], testset[~testAgeMissing]

In [48]:
# Restrict all four Age subsets to the columns listed in features4Training.
trainsetAgeNull, trainsetAgeNotNull, testsetAgeNull, testsetAgeNotNull = (
    subset[features4Training]
    for subset in (trainsetAgeNull, trainsetAgeNotNull, testsetAgeNull, testsetAgeNotNull)
)

In [49]:
# Stack the known-Age rows of train and test into one training table for the
# Age regressor. pd.concat replaces DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0. Row labels are kept as-is (no
# ignore_index), matching what append produced, so index labels from the two
# source frames may repeat; use positional (.iloc) access downstream.
trainsetAge = pd.concat([trainsetAgeNotNull, testsetAgeNotNull])

In [50]:
# One-hot encode the string-valued columns (the displayed result shows Sex and
# Embarked expanded into Sex_* / Embarked_* indicator columns); numeric columns pass through.
# Note: this rebinds trainsetAge, so the un-encoded frame is no longer available under this name.
trainsetAge = pd.get_dummies(trainsetAge)

In [51]:
# Inspect the encoded training table for the Age regressor.
trainsetAge.head()

Unnamed: 0,Pclass,Age,Fare,Has_cabin,FamilySize,Is_alone,Title,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,7.25,0,2,0,1,0,1,0,0,1
1,1,38.0,71.2833,1,2,0,3,1,0,1,0,0
2,3,26.0,7.925,0,1,1,2,1,0,0,0,1
3,1,35.0,53.1,1,2,0,3,1,0,0,0,1
4,3,35.0,8.05,0,1,1,1,0,1,0,0,1


#### Regressor for Age

In [52]:
# Target is the known Age; the feature matrix is every other column.
y = trainsetAge['Age']
X = trainsetAge.drop(columns=['Age'])

In [53]:
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor

In [54]:
# 10-fold cross-validation splitter; rows are shuffled with a fixed seed so the folds are reproducible.
kFold = KFold(n_splits=10, random_state=0, shuffle=True)

In [56]:
# Cross-validate a gradient-boosting regressor for Age: fit one model per fold,
# score it on the held-out fold, then report the mean MAE and R2.
# After the loop, modelGB_age refers to the model fitted on the LAST fold's
# training portion only (not on all of X).
maes, r2s = [], []
for i, (train_idx, test_idx) in enumerate(kFold.split(X, y)):
    print('Fold:', i)
    # Fold indices are positional, hence .iloc rather than label-based access.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    modelGB_age = GradientBoostingRegressor(random_state=0).fit(X_train, y_train)
    preds = modelGB_age.predict(X_test)

    mae, r2 = mean_absolute_error(y_test, preds), r2_score(y_test, preds)
    print('MAE:', mae, 'R2:', r2)
    maes.append(mae)
    r2s.append(r2)
    print()
print('Avg MAE:', np.mean(maes))
print('Avg R2:', np.mean(r2s))

Fold: 0
MAE: 8.25414759105658 R2: 0.34856450685026263

Fold: 1
MAE: 8.316736164537012 R2: 0.3415580082068448

Fold: 2
MAE: 8.935574940580164 R2: 0.33468162345902763

Fold: 3
MAE: 8.385165867106773 R2: 0.43163854966049076

Fold: 4
MAE: 8.338396111748189 R2: 0.35461795319849754

Fold: 5
MAE: 7.631661221427318 R2: 0.4632167101283916

Fold: 6
MAE: 8.16030632670174 R2: 0.5419102127122761

Fold: 7
MAE: 10.279637244346002 R2: 0.36661498022192585

Fold: 8
MAE: 8.935706009825262 R2: 0.4074701356689947

Fold: 9
MAE: 8.11361493375698 R2: 0.5163783474117509

Avg MAE: 8.535094641108603
Avg R2: 0.41066510275184626
