In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file found under the Kaggle input directory,
# then print one path per line.
input_file_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Prepare the Data

In [None]:
# Read the Titanic train and test CSV files from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("/kaggle/input/titanic/train.csv",
                     "/kaggle/input/titanic/test.csv")
)

# All cleaning is done on a deep copy so the raw training frame stays untouched.
train_data_copy = train_data.copy(deep=True)

In [None]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
from pandas.api.types import CategoricalDtype

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Preview only the first rows instead of rendering the whole training frame.
train_data.head()

In [None]:
# Preview only the first rows instead of rendering the whole test frame.
test_data.head()

check for null values, datatypes and missing values in the columns

In [None]:
# Column dtypes and non-null counts for both raw frames, separated by a rule.
print('Train Data Set')
train_data.info()
print('_' * 40, 'Test Data Set', sep='\n')
test_data.info()

# 2. Data Pre-Processing

**Some observations from the above dataset:**

* Age column has several null values in the train and test dataset. These values must be replaced with the median values. We will not remove these rows since age is an important factor and removing almost 20% of the data from the dataset would not be advisable.
* Similarly, the Fare column in the test data has missing values, and we will replace those NaN values with the median value as well.
* The Age has a datatype of float which should be converted to int for both the dfs.
* Ticket column has the ticket number related to the individual, which is not helpful in determining the prediction. Hence we can remove that column.
* Embarked column has the Station code related to the individual from which he began his journey, which is not helpful in determining the prediction. Hence we can remove that column.
* Column Cabin has too many null values, filling those values could lead to incorrect results, hence we will drop that column as well
* PassengerId is a unique id associated with each traveller which does not determine the prediction of his/her survival. Hence we can drop that column as well.

In [None]:
# Drop the columns that are not used as features. PassengerId is kept on the
# test frame because it is needed later to build the submission file.
train_data_copy = train_data_copy.drop(columns=['Ticket', 'Embarked', 'Cabin', 'PassengerId'])
test_data = test_data.drop(columns=['Ticket', 'Embarked', 'Cabin'])

# Fill missing ages with each frame's own median.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated in pandas 2.x and
# leaves the frame unchanged under Copy-on-Write, which would make the
# astype(int) below fail on the remaining NaN values.
train_data_copy['Age'] = train_data_copy['Age'].fillna(train_data_copy['Age'].median())
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].median())

# Fill missing test fares with the test median (median() already skips NaN).
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].median())

# Convert Age from float64 to int now that no NaN values remain.
train_data_copy['Age'] = train_data_copy['Age'].astype(int)
test_data['Age'] = test_data['Age'].astype(int)

train_data_copy.info()

**Data Processing**

* We will include new columns into test data and train data - FamilySize, IsAlone, Title.
* FamilySize column combines the values from SibSp and Parch columns, which will help us to focus on just one column.
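* IsAlone is derived from FamilySize. FamilySize counts the passenger as well, so a FamilySize of 1 means the person was a lone traveller. Note the encoding used in the code: IsAlone = 0 means the passenger travelled alone, IsAlone = 1 means the passenger travelled with family.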
* The full names of the passengers are not directly useful for predicting survival; however, the prefixes (titles) in the names can be used, since they indicate the person's social class and marital status. We will extract these prefixes from the Name column and see whether they can be used in our study.

In [None]:
# Both frames receive the same engineered columns, so keep them in one list and
# modify each frame in place.
clean_dataset = [train_data_copy, test_data]
for dataset in clean_dataset:
    # FamilySize = SibSp + Parch + 1, so the smallest possible value is 1.
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    # NOTE: the encoding is inverted relative to the column name:
    #   IsAlone == 0 -> FamilySize == 1 (travelling alone)
    #   IsAlone == 1 -> FamilySize  > 1 (travelling with family)
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] > 1, 'IsAlone'] = 1

    # Title = first run of letters in Name that follows a space and is
    # immediately followed by a period. The pattern is a raw string so the
    # backslash reaches the regex engine; '\.' in a normal string literal is an
    # invalid escape sequence and triggers a warning on recent Python versions.
    dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Frequency of each extracted title, split by sex, on the training frame.
pd.crosstab(train_data_copy['Title'], train_data_copy['Sex'])

* We will club all the titles which are less frequently used into a common title - Rare
* We can club Mlle and Ms into Miss, and Mme into Mrs.
* After that, we can drop the name, Parch and SibSp columns.

In [None]:
# One lookup table for all title substitutions: uncommon titles collapse into
# 'Rare', and the variants Mlle/Ms/Mme fold into Miss/Miss/Mrs.
title_replacements = dict.fromkeys(
    ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
     'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
    'Rare',
)
title_replacements.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

for frame in clean_dataset:
    frame['Title'] = frame['Title'].replace(title_replacements)

# Name, SibSp and Parch are dropped now that Title, FamilySize and IsAlone
# have been derived from them.
unused_columns = ['Name', 'SibSp', 'Parch']
train_data_copy = train_data_copy.drop(columns=unused_columns)
test_data = test_data.drop(columns=unused_columns)

# drop() returned new frames, so rebuild the list to point at them.
clean_dataset = [train_data_copy, test_data]

* For the age and the fares, we need to create bins for each of them.
* For this, qcut and cut will be used

In [None]:
# Add interval columns to each frame: Fare split into 4 quantile bins (qcut)
# and Age split into 5 equal-width bins (cut). The bins are computed per frame.
for frame in clean_dataset:
    frame['FareBin'] = pd.qcut(frame['Fare'], q=4)
    frame['AgeBin'] = pd.cut(frame['Age'].astype(int), bins=5)

In [None]:
# Show the distinct Age intervals that pd.cut produced for the training frame.
train_data_copy['AgeBin'].unique()

In [None]:
# Show the distinct Fare intervals that pd.qcut produced for the training frame.
train_data_copy['FareBin'].unique()

**Convert categorical values to ordinal values**

1. Sex - Male = 0, Female = 1
2. Title = Rare = 0, Mr = 1, Miss = 2, Mrs = 3, Master = 4
3. Fare - Range from 0-3
4. Age - Range from 0-4

In [None]:
# Replace each title string with its integer code in both frames.
# Any title that is not a key of the mapping becomes NaN under .map().
title_mapping = dict(Rare=0, Mr=1, Miss=2, Mrs=3, Master=4)
for frame in clean_dataset:
    frame['Title'] = frame['Title'].map(title_mapping)

train_data_copy

In [None]:
# Replace the Sex strings with integer codes in both frames: male -> 0, female -> 1.
sex_mapping = dict(male=0, female=1)
for frame in clean_dataset:
    frame['Sex'] = frame['Sex'].map(sex_mapping)

train_data_copy

In [None]:
# Replace each age with an ordinal bucket code (right-closed intervals):
#   <= 16 -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, > 64 -> 4
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]
for frame in clean_dataset:
    frame['Age'] = pd.cut(frame['Age'], bins=age_edges, labels=False)

train_data_copy

In [None]:
# Replace each fare with an ordinal bucket code (right-closed intervals):
#   <= 7.91 -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, > 31 -> 3
fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
for frame in clean_dataset:
    fare_codes = pd.cut(frame['Fare'], bins=fare_edges, labels=False)
    frame['Fare'] = fare_codes.astype(int)

train_data_copy

Now we can remove the below columns which are not required:

1. FamilySize
2. FareBin
3. AgeBin

In [None]:
# FamilySize, FareBin and AgeBin were only intermediate helpers; remove them
# from both frames before analysis and modelling.
helper_columns = ['FamilySize', 'FareBin', 'AgeBin']
train_data_copy = train_data_copy.drop(columns=helper_columns)
test_data = test_data.drop(columns=helper_columns)

# Confirm the remaining columns, dtypes and non-null counts of both frames.
print('Train Data Set')
train_data_copy.info()
print('_' * 40, 'Test Data Set', sep='\n')
test_data.info()

# 3. Data Visualization and Analysis

**1) Co-relation matrix**

This will help us to see the relation between factors against their survival

In [None]:
# Pairwise correlations between the target and the encoded features.
corr_columns = ["Survived", "Pclass", "Sex", "Age", "Fare", "IsAlone", "Title"]
corr_matrix = train_data_copy[corr_columns].corr()

# Draw the annotated heatmap on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", linewidths=0.5, cmap="YlOrBr", ax=ax)
ax.set_title('Correlation Matrix');

**2. Survival Rate Vs Other factors**

* The below barplots demonstrate how the survival rate differs wrt to each factor:

1) Title
2) Pclass
3) Sex
4) IsAlone
5) Fare
6) Age

In [None]:
# One panel per feature, filled row by row on a 3x2 grid. Each bar shows
# seaborn's default estimate (the mean) of Survived for that feature value.
# The panels are driven by a single table instead of six copy-pasted calls, and
# the loop no longer leaves a stray Text(...) repr as the cell output.
survival_panels = [
    ('Title', "Survival Rate vs Title"),
    ('Pclass', "Survival Rate vs Pclass"),
    ('Sex', "Survival Rate vs Sex"),
    ('IsAlone', "Survival Rate vs Alone"),
    ('Fare', "Survival Rate vs Fare"),
    ('Age', "Survival Rate vs Age"),
]

fig, axs = plt.subplots(3, 2, figsize=(20, 20))
for panel_ax, (feature, panel_title) in zip(axs.flat, survival_panels):
    sns.barplot(data=train_data_copy, x=feature, y='Survived', ax=panel_ax)
    panel_ax.set_title(panel_title)

**Some observations from the above graphs:**

* (Rare = 0, Mr = 1, Miss = 2, Mrs = 3, Master = 4), Mrs are more likely to survive.
* Class 1 travelers handled the survival well
* Female travelers were alive by more than 50%
* People who have had family members, likely to survive more.
* People with higher fare were alive more, this is in direct sync with the point 2, that class 1 members survived more
* Children below the age of 16 and middle-aged people are more likely to have survived compared to people in the other age groups.

**3. Survival Rate, Sex vs other factors**

sex mattered in survival, now let's compare sex and features: Pclass, IsALone, Age

In [None]:
# Survival rate by Sex, with each panel split by a different hue feature.
# Each title is built from the hue feature itself, which fixes the third panel:
# it plots hue='Age' but was previously titled "Sex vs IsAlone".
fig, ax = plt.subplots(1, 3, figsize=(14, 12))
for panel, hue_feature in zip(ax, ['Pclass', 'IsAlone', 'Age']):
    sns.barplot(x='Sex', y='Survived', hue=hue_feature, data=train_data_copy, ax=panel)
    panel.set_title(f'Sex vs {hue_feature} Survival Comparison')

**Some observations from the above graphs:**

* Pclass --> Men and women in the upper classes survived more than those in the other classes, although the overall survival rate of men is much lower than that of women.
* IsALone --> Women who were alone (without a family member) survived more whereas in case of men, more of those men survived who had family members.
* Age --> Middle-aged women and children survived more. In the case of men, children below the age of 16 and men in their 20s survived more than males in the other age groups.
* Overall conclusion, women have survived more in the upper class of the mid-age group and children.

**4. Sex vs Pclass vs Age vs Survived**

In [None]:
# Grid of Age histograms: one row per Sex, one column per Pclass, coloured by
# Survived. If the cells above were run in order, Age already holds the
# ordinal bucket codes rather than raw ages.
age_grid = sns.FacetGrid(
    train_data_copy, row='Sex', col='Pclass', hue='Survived'
).map(plt.hist, 'Age', alpha=.75)
age_grid.add_legend()

* As we can see from the above graphs, there is a drastic difference in the number of deaths among young men of the lower class.

**5. Survived vs Pclass vs Fare**

In [None]:
# Fare histograms on a Pclass (rows) x Survived (columns) grid.
# If the cells above were run in order, Fare already holds the ordinal codes.
grid = sns.FacetGrid(
    train_data_copy, row='Pclass', col='Survived', aspect=1.6
).map(plt.hist, 'Fare', alpha=.5, bins=20)
grid.add_legend();

**6. Survived vs Pclass vs Age**

In [None]:
# Age histograms on a Pclass (rows) x Survived (columns) grid.
# If the cells above were run in order, Age already holds the ordinal codes.
grid = sns.FacetGrid(
    train_data_copy, row='Pclass', col='Survived', aspect=1.6
).map(plt.hist, 'Age', alpha=.5, bins=20)
grid.add_legend();

# 4. Model Prediction

Our problem is a classification and regression problem. We want to identify the relationship between the output (Survived or not) and the other variables or features (Sex, Age, Fare...). We are also performing a category of machine learning called supervised learning, as we are training our model with a given labelled dataset. With these two criteria - Supervised Learning plus Classification and Regression - we can narrow down our choice of models to a few. These include:

* Logistic Regression
* KNN or k-Nearest Neighbors
* Support Vector Machines
* Naive Bayes classifier
* Decision Tree
* Random Forest
* Perceptron
* Artificial neural network
* RVM or Relevance Vector Machine

In [None]:
# Preview only the first rows of the processed training frame.
train_data_copy.head()

In [None]:
# Preview only the first rows of the processed test frame.
test_data.head()

In [None]:
# Target is the Survived column; features are every other training column.
Y_train = train_data_copy["Survived"]
X_train = train_data_copy.drop(columns="Survived")

# PassengerId is removed from the test features; test_data itself keeps it.
X_test = test_data.drop(columns="PassengerId").copy()

X_train.shape, Y_train.shape, X_test.shape

In [None]:
# Preview only the first rows of the training feature matrix.
X_train.head()

In [None]:
# Preview only the first rows of the test feature matrix.
X_test.head()

**1. Logistic Regression**

In [None]:
# Logistic regression with default settings, fitted on the full training set.
logreg = LogisticRegression().fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)

# Accuracy in percent (2 decimals), measured on the same rows the model was
# fitted on - this is a training score, not a held-out estimate.
acc_log = round(100 * logreg.score(X_train, Y_train), 2)
acc_log

**2. Support Vector Machines (SVM)**

In [None]:
# Support vector classifier with default settings, fitted on the full training set.
svc = SVC().fit(X_train, Y_train)
Y_pred = svc.predict(X_test)

# Accuracy in percent (2 decimals), measured on the same rows the model was
# fitted on - this is a training score, not a held-out estimate.
acc_svc = round(100 * svc.score(X_train, Y_train), 2)
acc_svc

**3. KNN**

In [None]:
# k-nearest-neighbours classifier using the 3 closest training rows.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
Y_pred = knn.predict(X_test)

# Accuracy in percent (2 decimals), measured on the same rows the model was
# fitted on - this is a training score, not a held-out estimate.
acc_knn = round(100 * knn.score(X_train, Y_train), 2)
acc_knn

**4. Gaussian Naive Bayes**

In [None]:
# Gaussian naive Bayes with default settings, fitted on the full training set.
gaussian = GaussianNB().fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)

# Accuracy in percent (2 decimals), measured on the same rows the model was
# fitted on - this is a training score, not a held-out estimate.
acc_gaussian = round(100 * gaussian.score(X_train, Y_train), 2)
acc_gaussian


**5. Perceptron**

In [None]:
# Perceptron fitted on the full training set.
# random_state is fixed (same seed as the decision tree cell) because the
# estimator shuffles the training data; without a seed the score can change
# from run to run.
perceptron = Perceptron(random_state=2)
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)

# Accuracy in percent (2 decimals), measured on the same rows the model was
# fitted on - this is a training score, not a held-out estimate.
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron

**6. Linear SVC**

In [None]:
# Linear support vector classifier fitted on the full training set.
# random_state is fixed (same seed as the decision tree cell) so the fitted
# model and its score are reproducible across runs.
linear_svc = LinearSVC(random_state=2)
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)

# Accuracy in percent (2 decimals), measured on the same rows the model was
# fitted on - this is a training score, not a held-out estimate.
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc

**7. Stochastic Gradient Descent**

In [None]:
# Linear classifier trained with stochastic gradient descent on the full
# training set. random_state is fixed (same seed as the decision tree cell)
# because the estimator shuffles the training data; without a seed the score
# can change from run to run.
sgd = SGDClassifier(random_state=2)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)

# Accuracy in percent (2 decimals), measured on the same rows the model was
# fitted on - this is a training score, not a held-out estimate.
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd

**8. Decision Tree**

In [None]:
# Gini-criterion decision tree, depth capped at 9, seeded for reproducibility.
decision_tree = DecisionTreeClassifier(
    criterion='gini',
    max_depth=9,
    random_state=2,
).fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)

# Accuracy in percent (2 decimals), measured on the same rows the model was
# fitted on - this is a training score, not a held-out estimate.
acc_decision_tree = round(100 * decision_tree.score(X_train, Y_train), 2)
acc_decision_tree

**9. Random Forest Classifier**

In [None]:
# Random forest of 100 trees, each capped at depth 9.
# random_state is fixed (same seed as the decision tree cell): the forest is
# built from random bootstrap samples and feature subsets, so without a seed
# both the score and the predictions written to the submission file change
# from run to run.
random_forest = RandomForestClassifier(n_estimators=100, max_depth=9, random_state=2)
random_forest.fit(X_train, Y_train)

# Y_pred from this cell is what the submission cell writes out.
Y_pred = random_forest.predict(X_test)

# Accuracy in percent (2 decimals), measured on the same rows the model was
# fitted on - this is a training score, not a held-out estimate.
# (The earlier duplicate score() call whose result was discarded is removed.)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

**An overview of all the models' score**

In [None]:
# One (model name, training accuracy in percent) row per fitted model.
score_rows = [
    ('Support Vector Machines', acc_svc),
    ('KNN', acc_knn),
    ('Logistic Regression', acc_log),
    ('Random Forest', acc_random_forest),
    ('Naive Bayes', acc_gaussian),
    ('Perceptron', acc_perceptron),
    ('Stochastic Gradient Decent', acc_sgd),
    ('Linear SVC', acc_linear_svc),
    ('Decision Tree', acc_decision_tree),
]
models = pd.DataFrame(score_rows, columns=['Model', 'Score'])

# Best-scoring model first.
models.sort_values(by='Score', ascending=False)

**As we can see from above, Random Forest and Decision Tree give us the highest scores. Note that these scores are accuracies measured on the training data itself, not on held-out data.
For our final result, I have chosen Random Forest to avoid overfitting of data.**

Submission:

In [None]:
# Build the submission from the random forest explicitly, instead of relying
# on whichever model cell happened to overwrite the shared Y_pred last.
submission = pd.DataFrame({
    "PassengerId": test_data["PassengerId"],
    "Survived": random_forest.predict(X_test),
})
submission.to_csv('../working/submission2.csv', index=False)

# Spot-check a random sample of rows; as the last expression of the cell it is
# actually displayed (mid-cell, the result was silently discarded).
submission.sample(20)