# Kaggle Data Competition: Titanic - Machine Learning from Disaster

### Environment Setup

In [None]:
# Dataset: https://www.kaggle.com/c/titanic/data
# Initial Kaggle Setup
import pandas as pd
import warnings

# Ignoring warning sign
%matplotlib inline
warnings.filterwarnings('ignore')

# data import
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
gender_submission = pd.read_csv('data/gender_submission.csv')

train.info()

### DataSet Information

|Variable|Definition|Key|
|------|---|---|
|survival|Survival|0 = No, 1 = Yes|
|pclass|Ticket class|1 = 1st, 2 = 2nd, 3 = 3rd|
|Sex|Sex||
|Age|Age in years||
|sibsp|# of siblings / spouses aboard the Titanic||
|parch|# of parents / children aboard the Titanic||
|ticket|Ticket number||
|fare|Passenger fare||
|cabin|Cabin number||
|embarked|Port of Embarkation|C = Cherbourg, Q = Queenstown, S = Southampton|

### Variable Notes
pclass: A proxy for socio-economic status (SES)
1st = Upper
2nd = Middle
3rd = Lower
age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5
sibsp: The dataset defines family relations in this way...
Sibling = brother, sister, stepbrother, stepsister
Spouse = husband, wife (mistresses and fiancés were ignored)
parch: The dataset defines family relations in this way...
Parent = mother, father
Child = daughter, son, stepdaughter, stepson
Some children travelled only with a nanny, therefore parch=0 for them.


In [3]:
# Encode the categorical Sex column as integer category codes.
# Each frame infers its categories from its own values.
for frame in (train, test):
    frame['Sex_clean'] = frame['Sex'].astype('category').cat.codes

In [4]:
# Missing-value check for the port of embarkation.
train['Embarked'].isnull().sum()
# 2

test['Embarked'].isnull().sum()
# 0
train['Embarked'].value_counts()
# S    644 <- Pick this to fill NaN
# C    168
# Q     77

# Fill the NaN values with the most frequent port.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: that chained in-place form does not update `train` when
# pandas Copy-on-Write is active.
train['Embarked'] = train['Embarked'].fillna('S')

# Transition of categorical data into numeric value.
# Both frames share one category list taken from train, so a given port maps
# to the same integer code in train and test. A value absent from the list
# (or NaN) encodes as -1.
embarked_dtype = pd.CategoricalDtype(
    categories=sorted(train['Embarked'].dropna().unique())
)
train['Embarked_clean'] = train['Embarked'].astype(embarked_dtype).cat.codes
test['Embarked_clean'] = test['Embarked'].astype(embarked_dtype).cat.codes

In [11]:
# Family = the passenger plus siblings/spouses plus parents/children aboard.
# Solo flags passengers whose family size is exactly one, to check whether
# travelling alone matters.
for frame in (train, test):
    frame['Family'] = frame['SibSp'] + frame['Parch'] + 1
    frame['Solo'] = frame['Family'].eq(1)

In [12]:
# Split fares into 5 quantile bins. The bin edges are computed from the
# training fares only and then reused for the test set; calling qcut on each
# frame separately would give different edges, so the same Fare_clean code
# would describe different fare ranges in train and test.
fare_bins, fare_edges = pd.qcut(train['Fare'], 5, retbins=True)
train['FareBin'] = fare_bins

# Open the outermost edges so test fares below/above the training range still
# fall into the first/last bin instead of becoming NaN.
fare_edges[0], fare_edges[-1] = float('-inf'), float('inf')
test['FareBin'] = pd.cut(test['Fare'], bins=fare_edges)

train['FareBin'].value_counts()
# (7.854, 10.5]        184
# (21.679, 39.688]     180
# (-0.001, 7.854]      179
# (39.688, 512.329]    176
# (10.5, 21.679]       172
# Name: FareBin, dtype: int64

# FareBin is already categorical, so its codes (0 = cheapest bin ... 4 = most
# expensive) can be read directly. A missing Fare encodes as -1.
train['Fare_clean'] = train['FareBin'].cat.codes
test['Fare_clean'] = test['FareBin'].cat.codes

train['Fare_clean'].value_counts()
# 1    184
# 3    180
# 0    179
# 4    176
# 2    172
# Name: Fare_clean, dtype: int64

1    184
3    180
0    179
4    176
2    172
Name: Fare_clean, dtype: int64

In [23]:
# Title extraction and categorization for minimizing complexity

# Infrequent titles that are collapsed into a single 'Other' bucket.
RARE_TITLES = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
# Cleansing and integration of expected typos / variant spellings.
TITLE_ALIASES = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}


def extract_title(names):
    """Return the normalized title for each passenger name.

    The title is the run of letters that follows a space and ends with a
    period (e.g. 'Mr' in ', Mr. '). Titles listed in RARE_TITLES become
    'Other'; the variants in TITLE_ALIASES are merged into 'Miss' / 'Mrs'.

    Parameters
    ----------
    names : pd.Series
        Series of passenger name strings.

    Returns
    -------
    pd.Series
        Series of title strings aligned with `names`; NaN where no title
        pattern is found.
    """
    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    titles = names.str.extract(r' ([A-Za-z]+)\.', expand=False)
    return titles.replace(RARE_TITLES, 'Other').replace(TITLE_ALIASES)


# The same extraction is applied to both datasets.
train['Title'] = extract_title(train['Name'])
test['Title'] = extract_title(test['Name'])

# Before merging the variants, train also contained Mlle (2), Ms (1), Mme (1).
train['Title'].value_counts()
# Mr        517
# Miss      185
# Mrs       126
# Master     40
# Other      23
# Name: Title, dtype: int64

test['Title'].value_counts()
# Mr        240
# Miss       79
# Mrs        72
# Master     21
# Other       6
# Name: Title, dtype: int64


Mr        240
Miss       79
Mrs        72
Master     21
Other       6
Name: Title, dtype: int64

In [25]:
# Transition of categorical data into numeric value
train['Title_clean'] = train['Title'].astype('category').cat.codes
test['Title_clean'] = test['Title'].astype('category').cat.codes