# Titanic - Machine Learning from Disaster

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition CSV files into DataFrames (training file first, then test file)
train_df = pd.read_csv("/kaggle/input/titanic/train.csv")
test_df = pd.read_csv("/kaggle/input/titanic/test.csv")
# Preview the first three training rows
train_df.head(3)

In [None]:
# Print a summary of the training frame (columns, non-null counts, dtypes) to spot missing values
train_df.info()

In [None]:
# Show the column labels of the training frame
train_df.columns

891 observations for 11 variables

Variables with null values: Age, Cabin and Embarked

Replace null values in Age with the mean.

Drop rows with null values in Embarked.

Keep null values in Cabin.

In [None]:
# Missing values: mean-impute the Age and Fare columns, then discard rows without an Embarked value
for numeric_col in ('Age', 'Fare'):
    column_mean = train_df[numeric_col].mean()
    train_df[numeric_col] = train_df[numeric_col].fillna(column_mean)
train_df.dropna(subset=['Embarked'], inplace=True)
# Re-print the summary to confirm the remaining null counts
train_df.info()

In [None]:
# Split the columns into a numerical frame and a categorical frame; both keep the
# Survived label so each can be analysed against the outcome
numerical_columns = ['Survived', 'Age', 'SibSp', 'Parch', 'Fare']
categorical_columns = ['Survived', 'Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked']
train_num = train_df[numerical_columns]
train_cat = train_df[categorical_columns]

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numerical columns
train_num.describe()

## Visualizing the data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Pairwise correlation matrix of the numerical columns (Survived included)
numerical_correlations = train_num.corr()
print(numerical_correlations)

In [None]:
# Distribution and correlation plots for the numerical variables, coloured by the Survived label
sns.pairplot(data=train_num, hue='Survived')
plt.show()

In [None]:
# One distribution plot per categorical column, titled with the column name
for column_name in train_cat.columns:
    column_values = train_cat[column_name]
    sns.displot(column_values)
    plt.title(column_name)
    plt.show()

In [None]:
# How does gender relate to survival? Count passengers for every (Survived, Sex) pair
# and draw the counts as bars grouped by Sex
gender_counts = train_df.groupby(['Survived', 'Sex'])['PassengerId'].count()
gender_df = gender_counts.reset_index()
sns.barplot(data=gender_df, x='Sex', y='PassengerId', hue='Survived')

In [None]:
# How does passenger class relate to survival? Count passengers for every
# (Survived, Pclass) pair and draw the counts as bars grouped by Pclass
class_counts = train_df.groupby(['Survived', 'Pclass'])['PassengerId'].count()
class_df = class_counts.reset_index()
sns.barplot(data=class_df, x='Pclass', y='PassengerId', hue='Survived')

In [None]:
# How does the port of embarkation relate to survival? Count passengers for every
# (Survived, Embarked) pair and draw the counts as bars grouped by Embarked
port_counts = train_df.groupby(['Survived', 'Embarked'])['PassengerId'].count()
port_df = port_counts.reset_index()
sns.barplot(data=port_df, x='Embarked', y='PassengerId', hue='Survived')

## Feature Engineering

In [None]:
# Look deeper into the Ticket variable: how often does each distinct ticket value occur?
train_df['Ticket'].value_counts()

In [None]:
# Look deeper into the Cabin variable: how often does each distinct cabin value occur?
# (value_counts() leaves missing values out by default)
train_df['Cabin'].value_counts()

In [None]:
# Derive two new variables from Cabin so their effect on survival can be checked:
#   cabin_num - number of space-separated entries in the Cabin string (0 when Cabin is missing)
#   cabin_let - first character of the Cabin string ('n' when Cabin is missing)
cabin_entries = train_df['Cabin'].str.split(' ')
train_df['cabin_num'] = cabin_entries.str.len().fillna(0).astype('int64')
train_df['cabin_let'] = train_df['Cabin'].fillna('n').str[0]

In [None]:
# How does the number of cabins relate to survival? Count passengers for every
# (Survived, cabin_num) pair and draw the counts as bars grouped by cabin_num
cabin_num_counts = train_df.groupby(['Survived', 'cabin_num'])['PassengerId'].count()
cabin_num_df = cabin_num_counts.reset_index()
sns.barplot(data=cabin_num_df, x='cabin_num', y='PassengerId', hue='Survived')

In [None]:
# How does the cabin letter relate to survival? Count passengers for every
# (Survived, cabin_let) pair and draw the counts as bars grouped by cabin_let
cabin_let_counts = train_df.groupby(['Survived', 'cabin_let'])['PassengerId'].count()
cabin_let_df = cabin_let_counts.reset_index()
sns.barplot(data=cabin_let_df, x='cabin_let', y='PassengerId', hue='Survived')

In [None]:
# Remove the passenger(s) whose cabin letter is 'T'. The model's feature list further down
# has no Cabin_let_T column, so this category must not reach the one-hot encoding.
# NOTE(review): this replaces a hard-coded `drop(index=339)`; row 339 is presumed to be the
# only 'T' cabin in the training data - confirm against the cabin_let plot above.
# Selecting by value instead of by row label also makes the cell safe to re-run
# (dropping an already-removed label raises KeyError).
cabin_t_rows = train_df.index[train_df['cabin_let'] == 'T']
train_df.drop(index=cabin_t_rows, inplace=True)

In [None]:
# Preview the training frame again, now including the engineered cabin_num / cabin_let columns
train_df.head(3)

In [None]:
# List the columns available after feature engineering
train_df.columns

In [None]:
# Keep only the label and the columns that will feed the models
model_columns = [
    'Survived', 'Pclass', 'Sex', 'Age', 'SibSp',
    'Parch', 'Fare', 'Embarked', 'cabin_num', 'cabin_let',
]
features = train_df[model_columns]
features.head(3)

In [None]:
# Use one-hot encoding to create dummy variables for the categorical variables.
# The second positional argument of pd.get_dummies is `prefix`, not `columns`; the original
# call only worked because the frame has exactly three object columns, matching the three
# prefixes. Both are now spelled out so the encoded columns and their names are explicit
# (note the column is `cabin_let` while its dummy prefix is `Cabin_let`).
features_one_hot = pd.get_dummies(
    features,
    columns=['Sex', 'Embarked', 'cabin_let'],
    prefix=['Sex', 'Embarked', 'Cabin_let'],
)
features_one_hot.head()

In [None]:
# Convert every column (label, numeric features and dummy columns) to 64-bit floats
features_one_hot = features_one_hot.astype(np.float64)

In [None]:
# Check the dtypes and non-null counts of the encoded training features
features_one_hot.info()

In [None]:
# List the encoded column names (the model's feature list below is picked from these)
features_one_hot.columns

In [None]:
# Print a summary of the test frame to see which of its columns contain nulls
test_df.info()

In [None]:
# We apply the same transformations to the test data

# Dealing with nulls: fill with the TRAINING-set means rather than the test-set means, so
# the test rows are preprocessed with statistics learned from the training data only
test_df['Age'] = test_df['Age'].fillna(train_df['Age'].mean())
test_df['Fare'] = test_df['Fare'].fillna(train_df['Fare'].mean())

# Create the same cabin variables as for the training data:
#   cabin_num - number of space-separated entries in the Cabin string (0 when Cabin is missing)
#   cabin_let - first character of the Cabin string ('n' when Cabin is missing)
test_df['cabin_num'] = test_df['Cabin'].apply(lambda x: 0 if pd.isna(x) else len(x.split(' ')))
test_df['cabin_let'] = test_df['Cabin'].apply(lambda x: str(x)[0])

test_features = test_df[['Pclass', 'Sex', 'Age', 'SibSp','Parch', 'Fare', 'Embarked', 'cabin_num', 'cabin_let']]

# Use one-hot encoding to create dummy variables for the categorical variables.
# `columns` and `prefix` are passed by keyword: the second positional argument of
# pd.get_dummies is `prefix`, so the original positional list was silently used as prefixes.
test_features_one_hot = pd.get_dummies(
    test_features,
    columns=['Sex', 'Embarked', 'cabin_let'],
    prefix=['Sex', 'Embarked', 'Cabin_let'],
)
test_features_one_hot.info()

# Model Creation

In [None]:
# Import required libraries
from sklearn import preprocessing                    # Preprocessing to standarsize our data
from sklearn.model_selection import GridSearchCV     # To test parameters of classification algorithms and find the best one
from sklearn.model_selection import cross_val_score  # Cross-validated scoring of a single estimator
from sklearn.linear_model import LogisticRegression  # Logistic Regression classification algorithm
from sklearn.svm import SVC                          # Support Vector Machine classification algorithm
from sklearn.tree import DecisionTreeClassifier      # Decision Tree classification algorithm
from sklearn.neighbors import KNeighborsClassifier   # K Nearest Neighbors classification algorithm
from sklearn.neural_network import MLPClassifier     # Neural network classification algorithm

In [None]:
# Build the model inputs: X_train holds the predictor columns, y_train the Survived labels
predictor_columns = [
    'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'cabin_num',
    'Sex_female', 'Sex_male',
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
    'Cabin_let_A', 'Cabin_let_B', 'Cabin_let_C', 'Cabin_let_D',
    'Cabin_let_E', 'Cabin_let_F', 'Cabin_let_G', 'Cabin_let_n',
]
X_train = features_one_hot[predictor_columns]
y_train = features_one_hot['Survived'].to_numpy()
y_train

In [None]:
# Standardise the training predictors; the fitted scaler stays available as `transform`
transform = preprocessing.StandardScaler()
X_train = transform.fit_transform(X_train)
# Peek at the first five standardised rows
X_train[:5]

In [None]:
# Standardize X_test data with the scaler that was fitted on the training data.
# Calling fit() again here would re-estimate the means/variances from the test set, so the
# test features would be on a different scale from the one the models are trained on;
# only transform() is applied. The test columns must match the training columns by
# name and order.
X_test = transform.transform(test_features_one_hot)
X_test[0:5]

## Logistic regression

In [None]:
# Tune a logistic regression with an exhaustive grid search (10-fold cross-validation,
# all CPU cores), then predict survival for the test passengers with the tuned search object.

# Candidate settings: regularisation strength C, the L2 (ridge) penalty, and the solver
parameters = {
    "C": [0.01, 0.1, 1, 10, 100],
    'penalty': ['l2'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
}

logreg = LogisticRegression()
logreg_cv = GridSearchCV(estimator=logreg, param_grid=parameters, cv=10, n_jobs=-1)
logreg_cv.fit(X_train, y_train)

# Predict survival
lr_yhat = logreg_cv.predict(X_test)

In [None]:
# Report the winning parameter combination and its mean cross-validated accuracy in percent
logreg_best_params = logreg_cv.best_params_
logreg_accuracy_pct = logreg_cv.best_score_*100
print('Best Hyperparameters: %s' % logreg_best_params)
print("accuracy:", logreg_accuracy_pct)

## Support vector

In [None]:
# Grid-search a support vector classifier over kernel type and regularisation strength C
# (gamma fixed to 'scale') with 10-fold cross-validation on all CPU cores
parameters = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [100, 10, 1.0, 0.1, 0.001],
    'gamma': ['scale'],
}
svm = SVC(random_state=1)
svm_cv = GridSearchCV(estimator=svm, param_grid=parameters, cv=10, n_jobs=-1)
svm_cv.fit(X_train, y_train)

# Predict survival
svm_yhat = svm_cv.predict(X_test)

In [None]:
# Report the winning parameter combination and its mean cross-validated accuracy in percent
svm_best_params = svm_cv.best_params_
svm_accuracy_pct = svm_cv.best_score_*100
print('Best Hyperparameters: %s' % svm_best_params)
print("accuracy :", svm_accuracy_pct)

## Decision Tree

In [None]:
# Grid-search a decision tree with 10-fold cross-validation.
# 'max_features': the former 'auto' option was only an alias of 'sqrt' for classifiers and is
# no longer accepted by recent scikit-learn releases, so it is dropped; keeping it would make
# half of the candidate fits fail (or, on old versions, duplicate the 'sqrt' half).
parameters = {'criterion': ['gini', 'entropy'],
              'splitter': ['best', 'random'],
              'max_depth': [2*n for n in range(1,10)],
              'max_features': ['sqrt'],
              'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10]
             }
dtree = DecisionTreeClassifier(random_state=1)
# n_jobs=-1 runs the fits on all cores, consistent with the other grid searches in this notebook
dtree_cv = GridSearchCV(dtree, parameters, cv=10, n_jobs=-1)
dtree_cv.fit(X_train, y_train)
# predict survival
dtree_yhat = dtree_cv.predict(X_test)

In [None]:
# Report the winning parameter combination and its mean cross-validated accuracy in percent
dtree_best_params = dtree_cv.best_params_
dtree_accuracy_pct = dtree_cv.best_score_*100
print('Best Hyperparameters: %s' % dtree_best_params)
print("accuracy :", dtree_accuracy_pct)

## K nearest neighbors

In [None]:
# Grid-search a k-nearest-neighbours classifier with 10-fold cross-validation.
# The grid is reduced to the axes that actually change the model:
#   - KNeighborsClassifier's default metric is Minkowski, where p=1 is the Manhattan distance
#     and p=2 the Euclidean distance, so listing 'euclidean'/'manhattan'/'minkowski' next to
#     'p' evaluated the same two distances three times each;
#   - 'algorithm' only chooses how neighbours are searched for, not which model is fitted,
#     so it is left at its default.
# This cuts the search from 960 to 80 candidates without removing any distinct model.
parameters = {'n_neighbors': range(1, 21),
              'weights': ['uniform', 'distance'],
              'p': [1, 2]
             }

knn = KNeighborsClassifier()
# n_jobs=-1 runs the fits on all cores, consistent with the other grid searches in this notebook
knn_cv = GridSearchCV(knn, parameters, cv=10, n_jobs=-1)
knn_cv.fit(X_train, y_train)
# predict survival
knn_yhat = knn_cv.predict(X_test)

In [None]:
# Report the winning parameter combination and its mean cross-validated accuracy in percent
knn_best_params = knn_cv.best_params_
knn_accuracy_pct = knn_cv.best_score_*100
print('Best Hyperparameters: %s' % knn_best_params)
print("accuracy :", knn_accuracy_pct)

## Multi-layer Perceptron classifier

In [None]:
# Train a multi-layer perceptron with default architecture (fixed seed, up to 1500
# iterations); fit() returns the estimator itself, so construction and training are chained
nn = MLPClassifier(max_iter=1500, random_state=1).fit(X_train, y_train)
# Predict survival
nn_yhat = nn.predict(X_test)

In [None]:
# No hyper-parameter search was run for the MLP, so these are simply the settings the
# network was built with (not "best" parameters)
print('Model parameters: %s' % nn.get_params())
# Score with 10-fold cross-validation, in percent, so the figure is comparable to the
# grid-searched models above. Accuracy measured on the very rows the network was fitted
# on (nn.score(X_train, y_train)) is optimistic and was reported on a 0-1 scale.
# cross_val_score fits clones of `nn`, so the already-fitted network is left untouched.
nn_cv_scores = cross_val_score(nn, X_train, y_train, cv=10, n_jobs=-1)
print("accuracy :", nn_cv_scores.mean()*100)

In [None]:
# Prepare submission files: pair every test PassengerId with each model's predicted label

passenger_ids = test_df['PassengerId']
submission_lr = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': lr_yhat})
submission_sv = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': svm_yhat})
submission_dt = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': dtree_yhat})
submission_kn = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': knn_yhat})
submission_nn = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': nn_yhat})

In [None]:
# The labels were cast to float64 before training, so the predictions come back as floats;
# store them as integers in every submission frame
for submission in (submission_lr, submission_sv, submission_dt, submission_kn, submission_nn):
    submission['Survived'] = submission['Survived'].astype(int)

In [None]:
# Write one CSV per model into the working directory, without the DataFrame index column
submission_files = {
    'submission_lr.csv': submission_lr,
    'submission_sv.csv': submission_sv,
    'submission_dt.csv': submission_dt,
    'submission_kn.csv': submission_kn,
    'submission_nn.csv': submission_nn,
}
for csv_name, submission in submission_files.items():
    submission.to_csv(csv_name, index=False)

In [None]:
# Sanity-check one submission frame: row count, column names and dtypes
submission_sv.info()