# Predict survival on the Titanic


## Import the Libraries and Data

In [None]:
import numpy as np
import pandas as pd
import os
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split

# Load the Titanic train/test splits from the local ./input directory.
df_train = pd.read_csv('./input/train.csv', sep=',')
df_test = pd.read_csv('./input/test.csv', sep=',')

# Stack train on top of test so imputation/encoding statistics can be computed
# over all passengers. DataFrame.append was removed in pandas 2.0; pd.concat is
# the drop-in replacement and, like append, keeps each frame's original index
# (later cells slice df_data positionally and rely on index alignment when
# copying columns back into df_train / df_test).
df_data = pd.concat([df_train, df_test])

# Train the data without preprocessing

Recall `train_baseline.py`, the script we first used to predict survival.

Trained this way, without any preprocessing, the AUC score is around 0.78.

In [None]:
# Baseline: train on the raw numeric columns only, with no preprocessing.
numeric_dtypes = ['float64', 'int64']
train = df_train.select_dtypes(include=numeric_dtypes)
test = df_test.select_dtypes(include=numeric_dtypes)

# Separate the label from the features.
target = ['Survived']
y_train = train['Survived']
x_train = train.drop(target, axis=1)

# Hold out 10% of the rows for validation (fixed seed for repeatability).
X_train, X_valid, Y_train, Y_valid = train_test_split(
    x_train, y_train, test_size=0.1, random_state=2018
)

# Binary objective, AUC metric; every other LightGBM setting is left at its default.
myparams = {'objective': 'binary', 'metric': {'auc'}, 'learning_rate': 0.05}
dtrain = lgb.Dataset(X_train, label=Y_train)
model = lgb.train(params=myparams, train_set=dtrain)

# Score the held-out rows and report ROC AUC.
my_pred = model.predict(X_valid)
auc_score = roc_auc_score(Y_valid, my_pred)
print(auc_score)


Let's do some data preprocessing and see whether we can get a higher AUC score.

# Data Preprocessing and Feature Engineering

In data preprocessing, we usually do things like:
- dealing with outliers
- encoding categorical variables (e.g. one-hot encoding)
- embedding text data

In feature engineering, we usually do things like:
- feature extraction (creating new features)
- feature selection (e.g. by feature correlation)

Fill the missing (NaN) values in Fare using the median fare of the passenger's port of embarkation.

In [None]:
# Impute missing fares with the median fare of the passenger's embarkation port.
# The grouped medians are indexed by port label (in sorted order), so each port
# must be looked up by its label. The previous positional lookup,
# median()[embarked.index(port)], read the medians in sorted order instead of
# the order of `embarked`, pairing each port with another port's median.
embarked = ['S', 'C', 'Q']
fare_median_by_port = df_data.groupby('Embarked')['Fare'].median()
for port in embarked:
    fare_to_impute = fare_median_by_port[port]
    df_data.loc[(df_data['Fare'].isnull()) & (df_data['Embarked'] == port), 'Fare'] = fare_to_impute

# Copy the imputed fares back to each split. df_data is df_train stacked on
# df_test, so its first len(df_train) rows are the training rows; the slices
# keep their original index and therefore align with df_train / df_test.
n_train = len(df_train)
df_train["Fare"] = df_data['Fare'].iloc[:n_train]
df_test["Fare"] = df_data['Fare'].iloc[n_train:]

Fill in missing Fare value in training set based on mean fare for that Pclass 

In [None]:
# Fill any still-missing training fares with the mean fare of the passenger's
# Pclass, rounded to 4 decimals. This is a single vectorised fillna rather than
# the element-wise `df_train["Fare"][x] = ...` chained assignment, which raises
# SettingWithCopyWarning and does not write through to df_train when pandas
# Copy-on-Write is enabled.
train_class_mean_fare = df_train.groupby("Pclass")["Fare"].transform("mean").round(4)
df_train["Fare"] = df_train["Fare"].fillna(train_class_mean_fare)

Fill in missing Fare value in test set based on mean fare for that Pclass         

In [None]:
# Fill any still-missing test fares with the mean fare of the passenger's
# Pclass (computed on the test set), rounded to 4 decimals. This is a single
# vectorised fillna rather than the element-wise `df_test["Fare"][x] = ...`
# chained assignment, which raises SettingWithCopyWarning and does not write
# through to df_test when pandas Copy-on-Write is enabled.
test_class_mean_fare = df_test.groupby("Pclass")["Fare"].transform("mean").round(4)
df_test["Fare"] = df_test["Fare"].fillna(test_class_mean_fare)

Map Fare values into groups of numerical values

In [None]:
# Bin fares into quartile bands labelled 1-4. The quartile edges are computed
# once on the combined data and the resulting band is copied to both splits, so
# a given fare always maps to the same FareBand in df_train and df_test
# (separate qcut calls per frame would each derive their own quartile edges).
df_data["FareBand"] = pd.qcut(df_data['Fare'], 4, labels=[1, 2, 3, 4]).astype('int')

# df_data is df_train stacked on df_test; the slices keep their original index
# and therefore align with df_train / df_test on assignment.
n_train = len(df_train)
df_train["FareBand"] = df_data["FareBand"].iloc[:n_train]
df_test["FareBand"] = df_data["FareBand"].iloc[n_train:]

Map each Embarked value to a numerical value

In [None]:
# Encode the embarkation port as a numeric code: S -> 1, C -> 2, Q -> 3.
# Values not present in the mapping (including missing ones) become NaN.
embarked_mapping = dict(S=1, C=2, Q=3)
embarked_codes = df_data["Embarked"].map(embarked_mapping)
df_data["Embarked"] = embarked_codes

# Copy the encoded column back to each split; the first 891 rows of df_data
# are taken as the training rows and the remainder as the test rows.
df_train["Embarked"] = embarked_codes[:891]
df_test["Embarked"] = embarked_codes[891:]

Fill the missing (NaN) values in Embarked using the median Embarked code of the passenger's FareBand.

In [None]:
# Impute missing Embarked codes with the median code of the passenger's fare band.
# Filling one band never changes another band's median, so the grouped medians
# are computed once up front.
fareband = [1, 2, 3, 4]
embarked_median_by_band = df_data.groupby('FareBand')['Embarked'].median()
for fare in fareband:
    embark_to_impute = embarked_median_by_band[fare]
    needs_fill = df_data['Embarked'].isnull() & (df_data['FareBand'] == fare)
    df_data.loc[needs_fill, 'Embarked'] = embark_to_impute

# Copy the imputed Embarked column back to each split (first 891 rows -> train).
df_train["Embarked"] = df_data['Embarked'].iloc[:891]
df_test["Embarked"] = df_data['Embarked'].iloc[891:]

convert categories to Columns

In [None]:
# One-hot encode Sex for each split, then append the indicator columns
# (named "Sex_<value>") to the corresponding frame.
dummies = pd.get_dummies(df_train[['Sex']], prefix_sep='_')
testdummies = pd.get_dummies(df_test[['Sex']], prefix_sep='_')

df_train = pd.concat([df_train, dummies], axis=1)
df_test = pd.concat([df_test, testdummies], axis=1)

Map each gender value to a numerical value

In [None]:
# Encode Sex as an integer: female -> 0, male -> 1.
gender_mapping = {"female": 0, "male": 1}
df_data["Sex"] = df_data['Sex'].map(gender_mapping).astype('int')

# Copy the encoded Sex column back to each split (first 891 rows -> train).
df_train["Sex"] = df_data["Sex"].iloc[:891]
df_test["Sex"] = df_data["Sex"].iloc[891:]

Get titles

In [None]:

# Extract the honorific from each name: the run of letters that follows a space
# and ends at a period. The pattern is a raw string so that `\.` is not parsed
# as an (invalid) Python string escape.
df_data["Title"] = df_data.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

# Unify rare titles into six groups (Mr, Miss, Mrs, Master, Millitary, Honor).
# None of the target groups is itself a key, so one mapping is equivalent to
# applying the replacements one after another. The "Millitary" spelling is
# kept on purpose: later cells reference it (e.g. the Title_Millitary column).
title_groups = {
    'Mlle': 'Miss',
    'Mme': 'Mrs', 'Dona': 'Mrs', 'Ms': 'Mrs',
    'Jonkheer': 'Mr', 'Don': 'Mr',
    'Capt': 'Millitary', 'Major': 'Millitary', 'Col': 'Millitary',
    'Rev': 'Millitary', 'Dr': 'Millitary',
    'Lady': 'Honor', 'Countess': 'Honor', 'Sir': 'Honor',
}
df_data["Title"] = df_data["Title"].replace(title_groups)

# Copy Title back to each split. df_data is df_train stacked on df_test; the
# slices keep their original index and align with df_train / df_test.
n_train = len(df_train)
df_train["Title"] = df_data['Title'].iloc[:n_train]
df_test["Title"] = df_data['Title'].iloc[n_train:]



Convert Title categories to Columns

In [None]:
# One-hot encode Title for the training split.
titledummies = pd.get_dummies(df_train[['Title']], prefix_sep='_')

# Encode the test split, then force it onto the training indicator columns.
# Encoding each frame independently yields one column per title *present in
# that frame*, so a title that occurs in only one split would leave train and
# test with different feature columns. Titles absent from the test split become
# all-zero columns; test-only titles (unseen during training) are dropped.
ttitledummies = (
    pd.get_dummies(df_test[['Title']], prefix_sep='_')
    .reindex(columns=titledummies.columns, fill_value=0)
)

df_train = pd.concat([df_train, titledummies], axis=1)
df_test = pd.concat([df_test, ttitledummies], axis=1)

Mapping titles

In [None]:
# Ordinal code for each unified title: Mr=1, Miss=2, Mrs=3, Master=4,
# Millitary=5, Honor=6.
title_mapping = {
    name: code
    for code, name in enumerate(["Mr", "Miss", "Mrs", "Master", "Millitary", "Honor"], start=1)
}
# astype(int) raises if any title is missing from the mapping (it would map to NaN).
df_data["TitleCat"] = df_data['Title'].map(title_mapping).astype(int)

# Copy the code back to each split (first 891 rows -> train).
df_train["TitleCat"] = df_data["TitleCat"].iloc[:891]
df_test["TitleCat"] = df_data["TitleCat"].iloc[891:]

In [None]:
# Impute missing ages with the median age of passengers sharing the same title.
# Filling one title's rows never changes another title's median, so the grouped
# medians are computed once up front.
titles = ['Master', 'Miss', 'Mr', 'Mrs', 'Millitary', 'Honor']
median_age_by_title = df_data.groupby('Title')['Age'].median()
for title in titles:
    age_to_impute = median_age_by_title[title]
    missing_age = df_data['Age'].isnull() & (df_data['Title'] == title)
    df_data.loc[missing_age, 'Age'] = age_to_impute

# Copy the imputed Age column back to each split (first 891 rows -> train).
df_train["Age"] = df_data['Age'].iloc[:891]
df_test["Age"] = df_data['Age'].iloc[891:]

Visualise Age Data 

In [None]:
# Side-by-side histograms of Age for the training and test splits.
fig, axes = plt.subplots(1, 2, figsize=(15, 4))
axis1, axis2 = axes

# Left panel: training ages, truncated to whole years.
axis1.set_title('Training Age values - Titanic')
train_age_years = df_train['Age'].dropna().astype(int)
train_age_years.hist(bins=70, ax=axis1)

# Right panel: test ages as stored.
axis2.set_title('Test Age values - Titanic')
df_test['Age'].hist(bins=70, ax=axis2)

# Age density per Survived value, one shaded KDE curve per class, with the
# x-axis clipped to [0, oldest training passenger].
oldest_age = df_train['Age'].max()
facet = sns.FacetGrid(df_train, hue="Survived", palette='seismic', aspect=4)
facet.map(sns.kdeplot, 'Age', shade=True)
facet.set(xlim=(0, oldest_age))
facet.add_legend()

The plot above shows how survival varies with age. So, what is the story?

In [None]:
# Alone = 1 when the passenger has no siblings/spouse and no parents/children
# aboard (i.e. a party of exactly one), otherwise 0.
df_train["Alone"] = ((df_train['SibSp'] + df_train['Parch'] + 1) == 1).astype(int)
df_test["Alone"] = ((df_test['SibSp'] + df_test['Parch'] + 1) == 1).astype(int)

In [None]:
# Family Size = siblings/spouses + parents/children + the passenger themselves.
df_train["Family Size"] = df_train['SibSp'].add(df_train['Parch']).add(1)
df_test["Family Size"] = df_test['SibSp'].add(df_test['Parch']).add(1)

In [None]:
# HadCabin = 1 when a Cabin value is recorded for the passenger, else 0.
cabin_recorded = df_data["Cabin"].notna()
df_data["HadCabin"] = cabin_recorded.astype('int')

# Copy the flag back to each split (first 891 rows -> train).
df_train["HadCabin"] = df_data["HadCabin"].iloc[:891]
df_test["HadCabin"] = df_data["HadCabin"].iloc[891:]

In [None]:
# Extract the deck: the first letter found in the Cabin string. Passengers
# without a recorded cabin get the placeholder "N".
df_data["Deck"] = df_data.Cabin.str.extract('([A-Za-z])', expand=False)
df_data["Deck"] = df_data["Deck"].fillna("N")

# Map each deck letter to an ordinal code. Any letter missing from the mapping
# becomes NaN, so the mapping must cover every deck in the data: Titanic cabin
# codes also use decks F, G and T, which previously fell through to NaN.
deck_mapping = {"N": 0, "A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "T": 8}
df_data['Deck'] = df_data['Deck'].map(deck_mapping)

# Copy Deck back to each split. df_data is df_train stacked on df_test; the
# slices keep their original index and align with df_train / df_test.
n_train = len(df_train)
df_train["Deck"] = df_data["Deck"].iloc[:n_train]
df_test["Deck"] = df_data["Deck"].iloc[n_train:]

In [None]:
# Display summary statistics of the training frame as a sanity check on the
# columns added above.
df_train.describe()

Now, we did some data preprocessing and feature engineering.

Let's train on this data again and see how the AUC score changes.

In [None]:
# Re-evaluate with the engineered features.
# Three candidate feature sets; only REVISED_NUMERIC_COLUMNS is used below. The
# trailing numbers are the original author's annotations (presumably the score
# obtained with each set -- unverified).
NUMERIC_COLUMNS = [
    'Alone', 'Family Size', 'Sex', 'Pclass', 'Fare', 'FareBand', 'Age', 'TitleCat', 'Embarked',
]  # 72
ORIGINAL_NUMERIC_COLUMNS = [
    'Pclass', 'Age', 'SibSp', 'Parch',
    'Sex_female', 'Sex_male',
    'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Millitary',
    'Embarked',
]  # 83
REVISED_NUMERIC_COLUMNS = [
    'Pclass', 'Age', 'SibSp', 'Parch', 'Alone',
    'Sex_female', 'Sex_male',
    'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Millitary',
    'Embarked',
]  # 84

# Feature matrix and label; any remaining NaN is replaced by the sentinel -1000.
data_to_train = df_train[REVISED_NUMERIC_COLUMNS].fillna(-1000)
X = data_to_train
y = df_train['Survived']

# Same 90/10 split and seed as the baseline cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=2018
)

# Binary objective, AUC metric; every other LightGBM setting is left at its default.
myparams = dict(objective='binary', metric={'auc'})
dtrain = lgb.Dataset(X_train, label=y_train)
model = lgb.train(params=myparams, train_set=dtrain)

# Score the held-out rows and report ROC AUC.
my_pred = model.predict(X_test)
auc_score = roc_auc_score(y_test, my_pred)
print(auc_score)

# Parameter Tuning using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Base LightGBM parameters. learning_rate and num_leaves are overwritten at the
# end of this cell with the values chosen by the grid search.
# Scores ~0.784 (without tuning and early stopping)
params = {'boosting_type': 'gbdt',
          'objective': 'binary',
          'num_leaves': 12,
          'learning_rate': 0.05,
          'metric': 'auc'}

# Hyper-parameter values to search (5 learning rates x 6 leaf counts).
gridParams = {
    'learning_rate': [0.005, 0.01, 0.05, 0.1, 0.3],
    'num_leaves': [3, 4, 6, 8, 12, 16],
}

# Create classifier to use. Note that parameters have to be input manually,
# not as a dict!
mdl = lgb.LGBMClassifier()

# To view the default model params (only displayed when run as the last
# expression of a cell):
mdl.get_params().keys()

# 4-fold cross-validated grid search. scoring='roc_auc' makes the search
# optimise the same metric this notebook reports; without it GridSearchCV
# falls back to the classifier's default score (accuracy).
grid = GridSearchCV(mdl, gridParams, scoring='roc_auc', verbose=1, cv=4, n_jobs=-1)

# Run the grid. No `verbose` keyword is forwarded to fit(): it only controlled
# evaluation-set logging (no eval_set is supplied here), and recent LightGBM
# releases no longer accept it, which makes every fit fail.
grid.fit(X_train, y_train)

# Print the best parameters found and their mean cross-validated score.
print(grid.best_params_)
print(grid.best_score_)

# Using parameters already set above, replace in the best from the grid search.
params['learning_rate'] = grid.best_params_['learning_rate']
params['num_leaves'] = grid.best_params_['num_leaves']

print('Fitting with params: ')
print(params)

In [None]:
# Retrain with the tuned `params` on the same training split ...
dtrain = lgb.Dataset(X_train, label=y_train)
model = lgb.train(params=params, train_set=dtrain)

# ... and report ROC AUC on the same held-out rows as before.
my_pred = model.predict(X_test)
auc_score = roc_auc_score(y_test, my_pred)
print(auc_score)