In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Code structure Roadmap**

1. EDA
   * Identifying most important features based on their relationship with the survived column
        * Visualizations
   * Which variables have null values?
2. Feature engineering
   * Apply all changes to both train and test data
   * Replace/drop null values (Imputation)
   * Outliers
   * Create new features
        * Binning
        * Extract features
   * Convert string categoricals
3. Model Selection
   * For each selected model:
        * Hyperparameter tuning
        * Cross-validation
4. Final model
   * Apply selected model to whole training data
   * Submit final results


In [None]:
# Import visualizations libraries
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Import training and test data from the Kaggle Titanic input directory
train_csv = '/kaggle/input/titanic/train.csv'
test_csv = '/kaggle/input/titanic/test.csv'
titanic_train, titanic_test = pd.read_csv(train_csv), pd.read_csv(test_csv)

In [None]:
# Checking head: preview the first rows of the training data
titanic_train.head()

In [None]:
# Summary statistics of the training data (pandas `describe` defaults)
titanic_train.describe()

In [None]:
# Column dtypes and non-null counts; used to spot columns with missing values
titanic_train.info()

In [None]:
# Heatmap of the null mask: one row per passenger, one column per feature.
# Explicit fig/ax keeps the plot independent of pyplot's "current figure" state.
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(titanic_train.isnull(), yticklabels=False, cmap='viridis', ax=ax)
ax.set_title('Missing values by column (training data)');

In [None]:
# Style and palette are set globally here, so they also apply to every later plot.
sns.set_style('whitegrid')
sns.set_palette('RdBu_r')

# Class balance of the target column.
ax = sns.countplot(data=titanic_train, x='Survived')
ax.set_title('Passenger counts by survival outcome');

In [None]:
# Survival outcome counts for each sex.
ax = sns.countplot(data=titanic_train, x='Sex', hue='Survived')
ax.set_title('Survival by sex');

In [None]:
# Stacked histogram of passenger age, split by survival outcome.
# Missing ages are dropped up front so NaN values never reach the binning step.
x1 = titanic_train.loc[titanic_train['Survived'] == 1, 'Age'].dropna().tolist()
x2 = titanic_train.loc[titanic_train['Survived'] == 0, 'Age'].dropna().tolist()
colors = ['#E69F00', '#56B4E9']
names = ['Survived', 'Did not survive']

fig, ax = plt.subplots()
ax.hist([x1, x2], stacked=True, color=colors, label=names, bins=30)
ax.set(title='Age distribution by survival outcome', xlabel='Age', ylabel='Passengers')
ax.legend();

In [None]:
# Survival outcome counts for each passenger class.
ax = sns.countplot(data=titanic_train, x='Pclass', hue='Survived')
ax.set_title('Survival by passenger class');

In [None]:
# Stacked histogram of passenger age, split by passenger class.
# Missing ages are dropped up front so NaN values never reach the binning step.
x1 = titanic_train.loc[titanic_train['Pclass'] == 1, 'Age'].dropna().tolist()
x2 = titanic_train.loc[titanic_train['Pclass'] == 2, 'Age'].dropna().tolist()
x3 = titanic_train.loc[titanic_train['Pclass'] == 3, 'Age'].dropna().tolist()
colors = ['#E69F00', '#56B4E9', '#009E73']
# Legend labels (the stray leading space in the third label is removed)
names = ['First Class', 'Second Class', 'Third Class']

fig, ax = plt.subplots(figsize=(15, 5))
ax.hist([x1, x2, x3], stacked=True, color=colors, label=names, bins=30)
ax.set(title='Age distribution by passenger class', xlabel='Age', ylabel='Passengers')
ax.legend();

In [None]:
# Median age within each passenger class, ordered by class
(
    titanic_train
    .groupby('Pclass', as_index=False)['Age']
    .median()
    .sort_values('Pclass')
)

In [None]:
# Stacked histogram of ticket fare, split by survival outcome.
# Any missing fares are dropped so NaN values never reach the binning step.
x1 = titanic_train.loc[titanic_train['Survived'] == 1, 'Fare'].dropna().tolist()
x2 = titanic_train.loc[titanic_train['Survived'] == 0, 'Fare'].dropna().tolist()
colors = ['#E69F00', '#56B4E9']
names = ['Survived', 'Did not survive']

fig, ax = plt.subplots(figsize=(10, 5))
ax.hist([x1, x2], stacked=True, color=colors, label=names, bins=30)
ax.set(title='Fare distribution by survival outcome', xlabel='Fare', ylabel='Passengers')
ax.legend();

In [None]:
# Stacked histogram of ticket fare, split by passenger class.
# Any missing fares are dropped so NaN values never reach the binning step.
x1 = titanic_train.loc[titanic_train['Pclass'] == 1, 'Fare'].dropna().tolist()
x2 = titanic_train.loc[titanic_train['Pclass'] == 2, 'Fare'].dropna().tolist()
x3 = titanic_train.loc[titanic_train['Pclass'] == 3, 'Fare'].dropna().tolist()
colors = ['#E69F00', '#56B4E9', '#009E73']
# Legend labels (the stray leading space in the third label is removed)
names = ['First Class', 'Second Class', 'Third Class']

fig, ax = plt.subplots(figsize=(15, 5))
ax.hist([x1, x2, x3], stacked=True, color=colors, label=names, bins=50)
ax.set(title='Fare distribution by passenger class', xlabel='Fare', ylabel='Passengers')
ax.legend();

In [None]:
# Survival outcome counts for each SibSp value.
ax = sns.countplot(data=titanic_train, x='SibSp', hue='Survived')
ax.set_title('Survival by SibSp');

In [None]:
# Survival outcome counts for each Parch value.
ax = sns.countplot(data=titanic_train, x='Parch', hue='Survived')
ax.set_title('Survival by Parch');

In [None]:
# Survival outcome counts for each embarkation port.
ax = sns.countplot(data=titanic_train, x='Embarked', hue='Survived')
ax.set_title('Survival by port of embarkation');

In [None]:
# Passenger class mix at each embarkation port.
ax = sns.countplot(data=titanic_train, x='Embarked', hue='Pclass')
ax.set_title('Passenger class by port of embarkation');

In [None]:
# Sex mix at each embarkation port.
ax = sns.countplot(data=titanic_train, x='Embarked', hue='Sex')
ax.set_title('Sex by port of embarkation');

In [None]:
# List the column names of the training data
titanic_train.columns

Key observations from EDA:
* Should bin 'Age' and 'Fare'. Both of them seem to have cases which might lead to overfitting.
* Consider dropping 'Embarked' column. Not sure if it would add any value.
* Drop 'PassengerId'
* Should drop 'Cabin'. Too many null values
* Would need to fill NULL values in 'Age'
* Only two NULL values in 'Embarked'. Fill or just drop those two?

In [None]:
# Preview the training data before feature engineering starts
titanic_train.head()

In [None]:
# Preview the test data before feature engineering starts
titanic_test.head()

In [None]:
# Dropping columns not required
unused_columns = ['PassengerId', 'Ticket', 'Cabin']

combine = [titanic_train, titanic_test]
print("Before", titanic_train.shape, titanic_test.shape, combine[0].shape, combine[1].shape)

# drop() returns new frames, so `combine` is rebuilt to point at them
titanic_train = titanic_train.drop(columns=unused_columns)
titanic_test = titanic_test.drop(columns=unused_columns)
combine = [titanic_train, titanic_test]

print("After", titanic_train.shape, titanic_test.shape, combine[0].shape, combine[1].shape)

In [None]:
# Extract the title from each name: the run of letters that follows a space and
# ends with a period. The pattern is a raw string so `\.` reaches the regex
# engine as an escaped dot instead of being an invalid Python string escape.
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

titanic_train.head()

In [None]:
# Title counts broken down by sex (training data); margins=True adds totals
pd.crosstab(titanic_train['Title'], titanic_train['Sex'],margins=True)

In [None]:
# Title counts broken down by survival outcome (training data); margins=True adds totals
pd.crosstab(titanic_train['Title'], titanic_train['Survived'],margins=True)

In [None]:
# Title counts broken down by sex for the test data.
# The test frame is tabulated against its own column: pairing test titles with
# titanic_train['Survived'] would only line rows up by index label, i.e. it
# would cross unrelated passengers.
pd.crosstab(titanic_test['Title'], titanic_test['Sex'], margins=True)

In [None]:
# Collapse infrequent titles into 'Rare' and fold spelling variants into the
# common 'Miss' / 'Mrs' titles, for both train and test.
rare_titles = ['Lady', 'Countess','Capt', 'Col',
               'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

for dataset in combine:
    dataset['Title'] = (
        dataset['Title']
        .replace(rare_titles, 'Rare')
        .replace(title_aliases)
    )

sns.countplot(x='Title', hue='Survived', data=titanic_train)

In [None]:
# Mean of Survived (i.e. survival rate) per title, ordered by title
(
    titanic_train
    .groupby('Title', as_index=False)['Survived']
    .mean()
    .sort_values('Title')
)

In [None]:
# Title frequencies in the test data.
ax = sns.countplot(data=titanic_test, x='Title')
ax.set_title('Title counts (test data)');

In [None]:
# Most frequent embarkation port in the training data (nulls ignored)
freq_port = titanic_train['Embarked'].mode(dropna=True).iloc[0]
freq_port

In [None]:
# Replacing NULL values in 'Embarked' with the most probable port
for dataset in combine:
    embarked = dataset['Embarked']
    # keep existing values, substitute freq_port only where the value is missing
    dataset['Embarked'] = embarked.where(embarked.notna(), freq_port)

titanic_test.info()

In [None]:
# Get dummies for categorical columns which are not ordinal

# Training data: one indicator frame per categorical column, with the first
# level of each dropped (drop_first=True), appended to the right of the frame.
sex, title, embark = (
    pd.get_dummies(titanic_train[column], drop_first=True)
    for column in ('Sex', 'Title', 'Embarked')
)
titanic_train = pd.concat([titanic_train, sex, title, embark], axis=1)

titanic_train.head()

In [None]:
# Test data: same indicator encoding as the training frame.
# NOTE(review): the levels are derived from the test frame alone, so the
# resulting columns match the training columns only if both frames contain the
# same categories — verify before modelling.
sex, title, embark = (
    pd.get_dummies(titanic_test[column], drop_first=True)
    for column in ('Sex', 'Title', 'Embarked')
)
titanic_test = pd.concat([titanic_test, sex, title, embark], axis=1)

titanic_test.head()

In [None]:
# Dropping columns not needed anymore: the raw text/categorical columns that
# the Title extraction and indicator columns were built from
encoded_sources = ['Name', 'Sex', 'Embarked','Title']
titanic_train = titanic_train.drop(columns=encoded_sources)
titanic_test = titanic_test.drop(columns=encoded_sources)
# drop() returns new frames, so `combine` is rebuilt to point at them
combine = [titanic_train, titanic_test]

In [None]:
# Preview the training data after encoding and dropping the source columns
titanic_train.head()

In [None]:
# Creating new family column: SibSp + Parch + 1
for dataset in combine:
    dataset['Family'] = dataset['SibSp'].add(dataset['Parch']).add(1)

# Mean of Survived per family size, highest first
(
    titanic_train
    .groupby('Family', as_index=False)['Survived']
    .mean()
    .sort_values('Survived', ascending=False)
)

In [None]:
# Survival outcome counts for each family size.
ax = sns.countplot(data=titanic_train, x='Family', hue='Survived')
ax.set_title('Survival by family size');

In [None]:
# Merging anything more than 8 into 8 (family sizes are capped at 8)
for dataset in combine:
    dataset['Family'] = dataset['Family'].clip(upper=8)

sns.countplot(x='Family', hue='Survived', data=titanic_train)

In [None]:
# Median fare within each passenger class, third class first
(
    titanic_train
    .groupby('Pclass', as_index=False)['Fare']
    .median()
    .sort_values('Pclass', ascending=False)
)

In [None]:
# Filling NULL Values in Fare
def impute_Fare(cols):
    """Return the fare, or a class-based default when the fare is missing.

    Parameters
    ----------
    cols : pandas.Series or sequence of length 2
        ``(Fare, Pclass)`` in that order, e.g. one row of
        ``df[['Fare', 'Pclass']]`` as passed by ``DataFrame.apply(axis=1)``.

    Returns
    -------
    The original fare when it is not null; otherwise 60.2875 for Pclass 1,
    14.25 for Pclass 2 and 8.05 for any other class.
    """
    # Unpack by iteration instead of cols[0] / cols[1]: integer keys on a
    # label-indexed Series rely on a deprecated positional fallback.
    fare, pclass = cols

    if not pd.isnull(fare):
        return fare

    # Hard-coded fallbacks. NOTE(review): these look like the per-class median
    # fares of the training data — re-check them if the data changes.
    if pclass == 1:
        return 60.2875
    elif pclass == 2:
        return 14.2500
    else:
        return 8.0500

# Apply the row-wise fare imputation to both frames
for frame in (titanic_train, titanic_test):
    frame['Fare'] = frame[['Fare', 'Pclass']].apply(impute_Fare, axis=1)

titanic_test.info()

In [None]:
# Filling NULL Ages using median age based on sex and Pclass

# guess_ages[male flag, Pclass - 1] holds the truncated median age of that
# group in the training data (median() skips missing ages).
guess_ages = np.zeros((2, 3))

for male_flag in (0, 1):
    for pclass in (1, 2, 3):
        in_group = (titanic_train['male'] == male_flag) & (titanic_train['Pclass'] == pclass)
        group_median = titanic_train.loc[in_group, 'Age'].median()
        guess_ages[male_flag, pclass - 1] = int(group_median)

guess_ages

In [None]:
# Fill missing ages in both frames with the per-(sex, class) guesses, then
# store Age as integers.
combine = [titanic_train, titanic_test]
for dataset in combine:
    # ndenumerate walks guess_ages row by row, yielding ((i, j), value)
    for (i, j), fallback_age in np.ndenumerate(guess_ages):
        needs_fill = (
            dataset['Age'].isnull()
            & (dataset['male'] == i)
            & (dataset['Pclass'] == j + 1)
        )
        dataset.loc[needs_fill, 'Age'] = fallback_age

    dataset['Age'] = dataset['Age'].astype(int)

titanic_train.info()

In [None]:
# Creating Age bands: <=16 -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, >64 -> 4
for dataset in combine:
    # Masks are built from a snapshot so already-written band codes are never re-tested
    age = dataset['Age'].copy()
    dataset.loc[age <= 16, 'Age'] = 0
    for band, (low, high) in enumerate([(16, 32), (32, 48), (48, 64)], start=1):
        dataset.loc[(age > low) & (age <= high), 'Age'] = band
    dataset.loc[age > 64, 'Age'] = 4

sns.countplot(x='Age', hue='Survived', data=titanic_train)

In [None]:
# Temporary quartile bands of Fare, and the mean of Survived within each band
titanic_train['FareBand'] = pd.qcut(titanic_train['Fare'], q=4)
(
    titanic_train
    .groupby('FareBand', as_index=False)['Survived']
    .mean()
    .sort_values('FareBand')
)

In [None]:
# Remove the temporary FareBand column and rebuild `combine` for the new frame
titanic_train = titanic_train.drop(columns=['FareBand'])
combine = [titanic_train, titanic_test]

# Band edges: <=7.91 -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, >31 -> 3
fare_edges = [7.91, 14.454, 31]

for dataset in combine:
    # Masks are built from a snapshot so already-written band codes are never re-tested
    fare = dataset['Fare'].copy()
    dataset.loc[fare <= fare_edges[0], 'Fare'] = 0
    for band, (low, high) in enumerate(zip(fare_edges, fare_edges[1:]), start=1):
        dataset.loc[(fare > low) & (fare <= high), 'Fare'] = band
    dataset.loc[fare > fare_edges[-1], 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

sns.countplot(x='Fare', hue='Survived', data=titanic_train)

In [None]:
# Dropping columns not needed anymore: both are already summed into 'Family'
merged_into_family = ['SibSp', 'Parch']
titanic_train = titanic_train.drop(columns=merged_into_family)
titanic_test = titanic_test.drop(columns=merged_into_family)

print(titanic_train.shape, titanic_test.shape)

In [None]:
# Preview the final engineered training features
titanic_train.head()

In [None]:
# Preview the final engineered test features
titanic_test.head()

Models to try:
* Logistic regression
* SVM
* PCA + SVM (There are four additional columns due to dummies)
* Random forest

In [None]:
# Creating X and y: the target is 'Survived', the features are every other column
y = titanic_train['Survived']
X = titanic_train.drop(columns=['Survived'])

In [None]:
# Creating PCA components to try alternatively
# NOTE(review): the scaler and PCA below are fit on the full X up front, so the
# cross-validation runs that later use X_pca score folds whose rows already
# influenced the scaling and the components.

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Rescale each feature column before PCA
X_scaled = scaler.fit_transform(X)

from sklearn.decomposition import PCA
# A float n_components asks PCA to keep enough components to explain that
# fraction (here 95%) of the variance
pca_st = PCA(n_components = 0.95)
pca_st.fit(X_scaled)
X_pca = pca_st.transform(X_scaled)

# (rows, retained components)
X_pca.shape

In [None]:
# Logistic Regression: 5-fold cross-validation with default settings
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

logr = LogisticRegression()
scores = cross_val_score(estimator=logr, X=X, y=y, cv=5)
scores

In [None]:
# Logistic Regression with PCA: same 5-fold check on the PCA-transformed features
from sklearn.model_selection import cross_val_score

logr_pca = LogisticRegression()
scores = cross_val_score(estimator=logr_pca, X=X_pca, y=y, cv=5)
scores

In [None]:
import warnings
# NOTE: this silences every warning for the rest of the session, including
# scikit-learn's warnings about failed or non-converged fits.
warnings.filterwarnings("ignore")

# Logistic Regression with hyperparameter tuning
from sklearn.pipeline import Pipeline
from sklearn import linear_model,decomposition
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler

# Create a scaler object
sc = StandardScaler()

# Create a pca object
pca = decomposition.PCA()

# Create a logistic regression object.
# The grid below searches both 'l1' and 'l2' penalties. The default lbfgs
# solver rejects 'l1', so every 'l1' candidate would fail to fit (silently,
# given the warning filter above); liblinear supports both penalties.
logistic = linear_model.LogisticRegression(solver='liblinear')

# Create a pipeline of three steps. First, standardize the data.
# Second, transform the data with PCA.
# Third, train a logistic regression on the data.
pipe = Pipeline(steps=[('sc', sc),
                       ('pca', pca),
                       ('logistic', logistic)])

# Create Parameter Space
# Candidate component counts: every integer from 1 to the number of features in X
n_components = list(range(1,X.shape[1]+1,1))
# Create a list of values of the regularization parameter
C = np.logspace(-4, 4, 50)
# Create a list of options for the regularization penalty
penalty = ['l1', 'l2']
# Create a dictionary of all the parameter options
# Note how you can access the parameters of steps of a pipeline by using '__'
parameters = dict(pca__n_components=n_components,
                  logistic__C=C,
                  logistic__penalty=penalty)

# Conduct Parameter Optimization With Pipeline
# Create a grid search object
clf = GridSearchCV(pipe, parameters)

# Fit the grid search
clf.fit(X, y)
# View The Best Parameters
print('Best Penalty:', clf.best_estimator_.get_params()['logistic__penalty'])
print('Best C:', clf.best_estimator_.get_params()['logistic__C'])
print('Best Number Of Components:', clf.best_estimator_.get_params()['pca__n_components'])
print(); print(clf.best_estimator_.get_params()['logistic'])

# Use Cross Validation To Evaluate Model
# Passing the grid search itself re-runs the whole search inside each outer fold
CV_Result = cross_val_score(clf, X, y, cv=5, n_jobs=-1)
print(); print(CV_Result)
print(); print(CV_Result.mean())
print(); print(CV_Result.std())

Logistic Regression results:
* The plain logistic regression performs as well as the tuned model
* PCA doesn't seem to help, since the best number of components included all the variables

In [None]:
# SVM: 5-fold cross-validation with default settings
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

sup_vec = SVC()
scores = cross_val_score(estimator=sup_vec, X=X, y=y, cv=5)
print(scores.mean())
scores

In [None]:
# SVM with PCA: same 5-fold check on the PCA-transformed features
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

sup_vec_pca = SVC()
scores = cross_val_score(estimator=sup_vec_pca, X=X_pca, y=y, cv=5)
print(scores.mean())
scores

In [None]:
# SVM with Hyperparameter tuning
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVC

# RBF-kernel grid over the regularisation strength C and the kernel width gamma
param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']}

sup_vec_ht = GridSearchCV(SVC(),param_grid,refit=True,verbose=3, n_jobs = -1)
sup_vec_ht.fit(X, y)

# Printed explicitly: a bare expression in the middle of a cell is never displayed
print('Best parameters:', sup_vec_ht.best_params_)
print('Best estimator:', sup_vec_ht.best_estimator_)

# Use Cross Validation To Evaluate Model
# Passing the grid search itself re-runs the whole search inside each outer fold
CV_svm_Result = cross_val_score(sup_vec_ht, X, y, cv=5, n_jobs=-1)
print(); print(CV_svm_Result)
print(); print(CV_svm_Result.mean())
print(); print(CV_svm_Result.std())

SVM results: the default SVM performs better than the SVM with hyperparameter tuning

In [None]:
# Random forest: 5-fold cross-validation with 100 trees
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Seeded so that re-running the notebook reproduces the same scores
rfc = RandomForestClassifier(n_estimators=100, random_state=101)

scores = cross_val_score(rfc, X, y, cv=5)
scores

In [None]:
# Random forest with PCA: same 5-fold check on the PCA-transformed features
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Seeded so that re-running the notebook reproduces the same scores
rfc_pca = RandomForestClassifier(n_estimators=100, random_state=101)

scores = cross_val_score(rfc_pca, X_pca, y, cv=5)
scores

# Random Forest with Hyperparameter tuning (stage 1: randomized search)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score

# 'auto' is no longer listed under max_features: current scikit-learn rejects
# it, and for a classifier it was equivalent to 'sqrt', which is still searched.
random_search = {'criterion': ['entropy', 'gini'],
               'max_depth': list(np.linspace(10, 1200, 10, dtype = int)) + [None],
               'max_features': ['sqrt','log2', None],
               'min_samples_leaf': [4, 6, 8, 12],
               'min_samples_split': [5, 7, 10, 14],
               'n_estimators': list(np.linspace(151, 1200, 10, dtype = int))}

# The estimator is seeded as well as the search, so the run is reproducible
rfc_rs = RandomForestClassifier(random_state=101)
rfc_rs_ht = RandomizedSearchCV(estimator = rfc_rs, param_distributions = random_search, n_iter = 80,
                                  cv = 4, verbose= 3, random_state= 101, n_jobs = -1)
rfc_rs_ht.fit(X, y)

# Printed explicitly: a bare expression in the middle of a cell is never displayed
print('Best parameters:', rfc_rs_ht.best_params_)
print('Best estimator:', rfc_rs_ht.best_estimator_)

# Passing the search itself re-runs the whole search inside each outer fold
CV_rfc_rs_ht = cross_val_score(rfc_rs_ht, X, y, cv=5, n_jobs=-1)
print(); print(CV_rfc_rs_ht)
print(); print(CV_rfc_rs_ht.mean())
print(); print(CV_rfc_rs_ht.std())

# Random Forest tuning (stage 2): grid search in a small neighbourhood around
# the best parameters found by the randomized search (rfc_rs_ht).
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score

best_rs = rfc_rs_ht.best_params_

grid_search = {
    # Kept fixed at the randomized-search winner
    'criterion': [best_rs['criterion']],
    'max_depth': [best_rs['max_depth']],
    'max_features': [best_rs['max_features']],
    # Varied around the randomized-search winner
    'min_samples_leaf': [best_rs['min_samples_leaf'] - 2,
                         best_rs['min_samples_leaf'],
                         best_rs['min_samples_leaf'] + 2],
    'min_samples_split': [best_rs['min_samples_split'] - 3,
                          best_rs['min_samples_split'],
                          best_rs['min_samples_split'] + 3],
    'n_estimators': [best_rs['n_estimators'] - 150,
                     best_rs['n_estimators'] - 100,
                     best_rs['n_estimators'],
                     best_rs['n_estimators'] + 100,
                     best_rs['n_estimators'] + 150]
}

# Seeded so that re-running the notebook reproduces the same search result
rfc_gs = RandomForestClassifier(random_state=101)
rfc_gs_ht = GridSearchCV(estimator = rfc_gs, param_grid = grid_search, cv = 4, verbose= 3, n_jobs = -1)
rfc_gs_ht.fit(X, y)

# Printed explicitly: a bare expression in the middle of a cell is never displayed
print('Best parameters:', rfc_gs_ht.best_params_)
print('Best estimator:', rfc_gs_ht.best_estimator_)

# Use Cross Validation To Evaluate Model
# Passing the grid search itself re-runs the whole search inside each outer fold
CV_rfc_gs_ht = cross_val_score(rfc_gs_ht, X, y, cv=5, n_jobs=-1)
print(); print(CV_rfc_gs_ht)
print(); print(CV_rfc_gs_ht.mean())
print(); print(CV_rfc_gs_ht.std())

Random forest with hyperparameter tuning works best

In [None]:
# predicting using the final selected model
# NOTE(review): the model fitted here is `sup_vec`, the default SVC; confirm
# this is the intended final choice given the model-comparison notes.
sup_vec.fit(X, y)

# Select the test features by the training column names so that the columns
# reach the model in exactly the order it was fitted on; a column missing from
# the test frame raises a KeyError here instead of producing wrong predictions.
test_features = titanic_test[X.columns]
predictions = sup_vec.predict(test_features)

# importing again to get the passengerid column
# (it was dropped from titanic_test during feature engineering)
test_import = pd.read_csv('/kaggle/input/titanic/test.csv')

# creating submission file
submission = pd.DataFrame({
        "PassengerId": test_import["PassengerId"],
        "Survived": predictions
    })

submission.to_csv('lv_submission_svm.csv', index=False)
print("Your submission was successfully saved!")