## Titanic Data Preprocessing

### Feature extraction, NaN removal, and categorical variable manipulation

In [35]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [36]:
cd Dropbox/Portfolio/DataScience-Portfolio/Titanic

[Errno 2] No such file or directory: 'Dropbox/Portfolio/DataScience-Portfolio/Titanic'
/Users/Capgemini/Dropbox/Portfolio/DataScience-Portfolio/Titanic


In [37]:
# load data and test set
titanic = pd.read_csv('train.csv')
titanic_test = pd.read_csv('test.csv')

In [38]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


### Remove features not useful for predictions: Ticket and PassengerId

In [39]:
# Ticket has too many different values, I doubt that it will have predictive power
titanic.drop(['Ticket', 'PassengerId'], axis = 1, inplace = True)
titanic_test.drop(['Ticket','PassengerId'], axis = 1, inplace = True)

In [40]:
titanic.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

### Age

* Make additional column indicating if age is unknown
* Make additional column imputing Age with mean(Age)
* Make additional column with the title of the person (Mr, Mrs, Miss, etc) and use to impute Age

In [41]:
# Additional columnt to indicate if age is unknonw
def isNaN(num):
    return num != num # checks if cell is NaN

def is_age_unknown(age):
    if not isNaN(age):
        return 0
    else:
        return 1

In [42]:
# create age unkown binary column 
titanic['Age_Unknown'] = titanic['Age'].apply(is_age_unknown)
titanic_test['Age_Unknown'] = titanic_test['Age'].apply(is_age_unknown)

In [43]:
# age filled with mean: copy column Age and fill NaN with mean(age)
titanic['Age_mean'] = titanic.Age
titanic['Age_mean'] = titanic['Age_mean'].fillna(titanic.Age.mean())

titanic_test['Age_mean'] = titanic_test.Age
titanic_test['Age_mean'] = titanic_test['Age_mean'].fillna(titanic.Age.mean())

In [44]:
# function to extract title from Name feature
def get_title(passenger):
    if re.search('Mrs', passenger):
        return 'Mrs'
    elif re.search('Mr', passenger):
        return 'Mr'
    elif re.search('Miss', passenger):
        return 'Miss'
    elif re.search('Master', passenger):
        return 'Master'
    else:
        return 'Other'

In [45]:
# extract title  
titanic['Title'] = titanic['Name'].apply(get_title)
titanic_test['Title'] = titanic_test['Name'].apply(get_title)

In [46]:
# fill missing age, with median from title segregation: funtion
def fill_age(passenger):
    
    # determine age by group 
    temp = titanic.groupby(titanic.Title).median()
    
    age, title = passenger
    
    if age == age:
        return age
    else:
        if title == 'Mr':
            return temp.Age['Mr']
        elif title == 'Miss':
            return temp.Age['Miss']
        elif title == ['Mrs']:
            return temp.Age['Mrs']
        elif title == 'Master':
            return temp.Age['Master']
        else:
            return temp.Age['Other']        

In [47]:
# fill age according to title
titanic['Age'] = titanic[['Age', 'Title']].apply(fill_age, axis = 1)
titanic_test['Age'] = titanic_test[['Age', 'Title']].apply(fill_age, axis = 1)

In [48]:
# Remove column Name, it is not useful for predictions and we extracted the title already
titanic.drop('Name', axis = 1, inplace = True)
titanic_test.drop('Name', axis = 1, inplace = True)

In [49]:
titanic.isnull().sum()

Survived         0
Pclass           0
Sex              0
Age              0
SibSp            0
Parch            0
Fare             0
Cabin          687
Embarked         2
Age_Unknown      0
Age_mean         0
Title            0
dtype: int64

### Extract Deck from Cabin and add 'Unknown' where NA

In [50]:
# get the first letter (the number of the cabin is irrelevant as the letter specifies the location in the boat)
def get_cabin(cabin):
    if not isNaN(cabin):
        return cabin[0]
    else:
        return 'Unknown'

In [51]:
titanic['Deck'] = titanic['Cabin'].apply(get_cabin)
titanic_test['Deck'] = titanic_test['Cabin'].apply(get_cabin)

In [52]:
# drop old variable Cabin
titanic.drop('Cabin', axis = 1, inplace = True)
titanic_test.drop('Cabin', axis = 1, inplace = True)

### Impute Embarked with the most frequent port (S)

In [53]:
titanic["Embarked"].fillna("S", inplace = True)
titanic_test['Embarked'].fillna("S", inplace = True)

### Check that all NaN are removed

In [54]:
titanic.isnull().sum()

Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
Age_Unknown    0
Age_mean       0
Title          0
Deck           0
dtype: int64

In [55]:
titanic_test.isnull().sum()

Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           1
Embarked       0
Age_Unknown    0
Age_mean       0
Title          0
Deck           0
dtype: int64

In [56]:
# there is a null value in Fare for titanic_test, I will replace it with the mean fare for titanic train
titanic_test['Fare'].fillna(titanic.Fare.mean(), inplace = True)

### Make column 'Child'  and 'Minor' indicating whether passenger is under 16 or 12

In [57]:
# function
def is_minor(age):
    
    if age < 16:
        return 1
    else:
        return 0

In [58]:
titanic['Minor'] = titanic['Age'].apply(is_minor)
titanic_test['Minor'] = titanic_test['Age'].apply(is_minor)

In [59]:
# is child
def is_child(age):
    if age < 12:
        return 1
    else:
        return 0

titanic['Child'] = titanic['Age'].apply(is_child)
titanic_test['Child'] = titanic_test['Age'].apply(is_child)

In [60]:
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age_Unknown,Age_mean,Title,Deck,Minor,Child
0,0,3,male,22,1,0,7.25,S,0,22,Mr,Unknown,0,0
1,1,1,female,38,1,0,71.2833,C,0,38,Mrs,C,0,0
2,1,3,female,26,0,0,7.925,S,0,26,Miss,Unknown,0,0
3,1,1,female,35,1,0,53.1,S,0,35,Mrs,C,0,0
4,0,3,male,35,0,0,8.05,S,0,35,Mr,Unknown,0,0


### Make additional column: travel alone

In [61]:
def travel_alone(df):
    df['Alone'] = df.Parch + df.SibSp
    df['Alone'].loc[df['Alone'] > 0] = 0
    df['Alone'].loc[df['Alone'] == 0] = 1
    
    return df

In [62]:
titanic = travel_alone(titanic)
titanic_test = travel_alone(titanic_test)

titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age_Unknown,Age_mean,Title,Deck,Minor,Child,Alone
0,0,3,male,22,1,0,7.25,S,0,22,Mr,Unknown,0,0,1
1,1,1,female,38,1,0,71.2833,C,0,38,Mrs,C,0,0,1
2,1,3,female,26,0,0,7.925,S,0,26,Miss,Unknown,0,0,1
3,1,1,female,35,1,0,53.1,S,0,35,Mrs,C,0,0,1
4,0,3,male,35,0,0,8.05,S,0,35,Mr,Unknown,0,0,1


### Create variable man travelling with spouse

In [63]:
def man_with_spouse(passenger):
    sex, sibsp = passenger
    if sex == 'male' and sibsp > 0:
        return 1
    else:
        return 0

In [64]:
# create var man with spouse
titanic['man_w_spouse'] = titanic[['Sex', 'SibSp']].apply(man_with_spouse, axis = 1)
titanic_test['man_w_spouse'] = titanic_test[['Sex', 'SibSp']].apply(man_with_spouse, axis = 1)

### Create variable mother travelling with children

In [65]:
def woman_with_child(passenger):
    age, sex, parch = passenger
    if age > 20 and sex == 'female' and parch > 0:
        return 1
    else:
        return 0

In [66]:
# create var woman with child
titanic['woman_w_child'] = titanic[['Age', 'Sex', 'Parch']].apply(woman_with_child, axis = 1)
titanic_test['woman_w_child'] = titanic_test[['Age', 'Sex', 'Parch']].apply(woman_with_child, axis = 1)

In [67]:
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Age_Unknown,Age_mean,Title,Deck,Minor,Child,Alone,man_w_spouse,woman_w_child
0,0,3,male,22,1,0,7.25,S,0,22,Mr,Unknown,0,0,1,1,0
1,1,1,female,38,1,0,71.2833,C,0,38,Mrs,C,0,0,1,0,0
2,1,3,female,26,0,0,7.925,S,0,26,Miss,Unknown,0,0,1,0,0
3,1,1,female,35,1,0,53.1,S,0,35,Mrs,C,0,0,1,0,0
4,0,3,male,35,0,0,8.05,S,0,35,Mr,Unknown,0,0,1,0,0


### Make dummy variables from categorical variables

* Sex
* Embarked
* Person

In [68]:
# Sex is binary so I code male = 1, female = 0
titanic['Sex'].loc[titanic['Sex'] == 'male'] = 1
titanic['Sex'].loc[titanic['Sex'] == 'female'] = 0

titanic_test['Sex'].loc[titanic_test['Sex'] == 'male'] = 1
titanic_test['Sex'].loc[titanic_test['Sex'] == 'female'] = 0

In [69]:
# create dummies function
def make_dummies(df):
    embarked_dummies = pd.get_dummies(df['Embarked'])
    deck_dummies = pd.get_dummies(df['Deck'])
    title_dummies = pd.get_dummies(df['Title'])

    #Drop one column from dummies df as it can be inferred from the remaining ones
    #embarked_dummies.drop('Q', axis = 1, inplace = True)
    #title_dummies.drop('Other', axis = 1, inplace = True)
    #deck_dummies.drop('Unknown', axis = 1, inplace = True)
    
    # Remove from the original dataset
    df.drop(['Embarked', 'Deck', 'Title'], axis = 1, inplace = True)

    # concatenate all 3 dataframes
    df = pd.concat([df, embarked_dummies, deck_dummies, title_dummies], axis = 1)
    
    return df

In [70]:
titanic = make_dummies(titanic)
titanic.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Age_Unknown', 'Age_mean', 'Minor', 'Child', 'Alone', 'man_w_spouse',
       'woman_w_child', 'C', 'Q', 'S', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T',
       'Unknown', 'Master', 'Miss', 'Mr', 'Mrs', 'Other'],
      dtype='object')

In [72]:
titanic_test = make_dummies(titanic_test)
titanic_test.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Age_Unknown',
       'Age_mean', 'Minor', 'Child', 'Alone', 'man_w_spouse', 'woman_w_child',
       'C', 'Q', 'S', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'Unknown', 'Master',
       'Miss', 'Mr', 'Mrs', 'Other'],
      dtype='object')

In [73]:
titanic.drop('T', axis =1, inplace=True)

In [74]:
# Fare
def is_low_fare(fare):
    if fare < 20:
        return 1
    else:
        return 0

titanic['low_fare'] = titanic['Fare'].apply(is_low_fare)
titanic_test['low_fare'] = titanic_test['Fare'].apply(is_low_fare)

In [75]:
# save to csv to use in following notebook
titanic.to_csv('titanic_train_ready3.csv', header = True, index = False)
titanic_test.to_csv('titanic_test_ready3.csv', header = True, index = False)

In [76]:
titanic.shape

(891, 31)

In [77]:
titanic_test.shape

(418, 30)

In [78]:
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Age_Unknown,Age_mean,Minor,...,E,F,G,Unknown,Master,Miss,Mr,Mrs,Other,low_fare
0,0,3,1,22,1,0,7.25,0,22,0,...,0,0,0,1,0,0,1,0,0,1
1,1,1,0,38,1,0,71.2833,0,38,0,...,0,0,0,0,0,0,0,1,0,0
2,1,3,0,26,0,0,7.925,0,26,0,...,0,0,0,1,0,1,0,0,0,1
3,1,1,0,35,1,0,53.1,0,35,0,...,0,0,0,0,0,0,0,1,0,0
4,0,3,1,35,0,0,8.05,0,35,0,...,0,0,0,1,0,0,1,0,0,1
