In [None]:
#Import Python Libraries

In [None]:
import numpy as np
import pandas as pd
import missingno as mn
from collections import Counter

In [None]:
#Visualization
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
%matplotlib inline

In [None]:
# setting up plot style
# Order matters: both calls write matplotlib rcParams, and the style sheet is
# applied after the seaborn context, so it wins wherever the two overlap.
sns.set_context("paper")
style.use('fivethirtyeight')

In [None]:
# ml
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
import warnings
# Silences every Python warning for the rest of the session, including
# deprecation warnings raised by the pandas and seaborn calls further down.
warnings.filterwarnings('ignore')

In [None]:
import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

In [None]:
# Load the Titanic train and test splits from the sibling Titanic folder.
titanic_train, titanic_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../Titanic/train.csv', '../Titanic/test.csv')
)

In [None]:
# Preview the first five training rows
titanic_train.head(n=5)

In [None]:
# Preview the first five test rows
titanic_test.head(n=5)

In [None]:
# Stack train and test rows into one frame for dataset-wide statistics.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. As with append, the original row labels are kept, so
# index values repeat across the two halves.
combined = pd.concat([titanic_train, titanic_test])

In [None]:
# Report the (rows, columns) shape of each frame
print("Database dimension : ")
for label, frame in (
    ("Database dimension - titanic_train     :", titanic_train),
    ("Database dimension - titanic_test      :", titanic_test),
    ("Database dimension - combined          :", combined),
):
    print(label, frame.shape)

print('\n')

In [None]:
# Report the total number of cells (rows * columns) in each frame
print("Database size : ")
for label, frame in (
    ("Database size - titanic_train          :", titanic_train),
    ("Database size - titanic_test           :", titanic_test),
    ("Database size - combined               :", combined),
):
    print(label, frame.size)

In [None]:
#Database column types
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line after
# each report.
print("Titanic Train Dataset Info : ")
titanic_train.info()
print("_"* 40)
print("Titanic Test Dataset Info : ")
titanic_test.info()

Insights: Numerical features: PassengerId, Age, Fare, SibSp, Parch. Categorical features: Pclass, Name, Sex, Embarked.

In [None]:
# Summary statistics of the numeric training columns, one row per column
titanic_train.describe().transpose()

In [None]:
# missingno bar chart for the training frame, used to eyeball which columns have gaps
mn.bar(titanic_train)

In [None]:
# Column-wise null counts and percentages for the train data set
train_null_counts = titanic_train.isnull().sum()
null_train_value = (
    train_null_counts
    .rename_axis('Column Name')
    .reset_index(name='Null Values')
)
null_train_perc = (
    (train_null_counts * 100 / titanic_train.shape[0])
    .rename_axis('Column Name')
    .reset_index(name='Null Values Percentage')
)
# One row per column: absolute count next to its share of all rows
null_train = null_train_value.merge(null_train_perc, on='Column Name')
null_train

Insights: Cabin - 687 rows (77% of the data) are missing, so there is no point trying to fill it. Age - 177 rows (around 20% of the data) are missing; age imputation follows in a later section. Embarked - 2 values are missing; imputation follows in a later section.

In [None]:
# missingno bar chart for the test frame, used to eyeball which columns have gaps
mn.bar(titanic_test)

In [None]:
# Column-wise null counts and percentages for the test data set
test_null_counts = titanic_test.isnull().sum()
null_test_value = (
    test_null_counts
    .rename_axis('Column Name')
    .reset_index(name='Null Values')
)
null_test_perc = (
    (test_null_counts * 100 / titanic_test.shape[0])
    .rename_axis('Column Name')
    .reset_index(name='Null Values Percentage')
)
# One row per column: absolute count next to its share of all rows
null_test = null_test_value.merge(null_test_perc, on='Column Name')
null_test

Insights: Cabin - 327 rows (78% of the data) are missing, so there is no point trying to fill it. Age - 86 rows (around 20.6% of the data) are missing; age imputation follows in a later section. Fare - 1 value is missing; imputation follows in a later section.

In [None]:
# checking the correlation among the numeric variables
# Restrict to numeric columns explicitly: since pandas 2.0, DataFrame.corr()
# no longer drops text columns (Name, Sex, Ticket, ...) on its own and raises
# on them instead.
plt.figure(figsize = (8,6))
ax = sns.heatmap(titanic_train.select_dtypes(include="number").corr(), annot = True, cmap="RdYlGn", linewidth =1)
plt.show()

In [None]:
#survived
# Passenger count per Survived value. The column is passed as x= because
# seaborn >= 0.12 treats the first positional argument as `data`, not as x.
sns.countplot(x='Survived', data=titanic_train, palette='husl')
plt.show()

In [None]:
# Share of passengers per Survived value
titanic_train.Survived.value_counts(normalize=True)

In [None]:
#age

In [None]:
# Age distribution in side-by-side panels, one panel per Survived value.
# Note that `ax` holds the FacetGrid here, not a matplotlib Axes.
ax = sns.FacetGrid(titanic_train, col='Survived',height = 6, aspect =0.5)
ax.map(sns.distplot, "Age")
plt.show()

In [None]:
# Overlay the age density of non-survivors (red) and survivors (green)
age_known = titanic_train["Age"].notnull()
ages_not_survived = titanic_train["Age"][(titanic_train["Survived"] == 0) & age_known]
ages_survived = titanic_train["Age"][(titanic_train["Survived"] == 1) & age_known]
ax = sns.kdeplot(ages_not_survived, color="Red", shade=True)
ax = sns.kdeplot(ages_survived, ax=ax, color="Green", shade=True)
ax.set_xlabel("Age")
ax.set_ylabel("Frequency")
# From here on `ax` refers to the Legend object, as in the rest of the notebook
ax = ax.legend(["Not Survived", "Survived"])

In [None]:
# Age distribution over train + test, with the sample skewness shown in the legend
age_skew = combined["Age"].skew()
ax = sns.distplot(combined["Age"], color="purple", label="Skewness : %.2f" % age_skew)
ax = ax.legend(loc="best")
plt.show()

In [None]:
# Overall Fare distribution of combined test & train database
# Left: histogram/density with the skewness in the legend; right: box plot.
plt.figure(figsize=(16,6))
plt.subplot(1,2,1)
ax = sns.distplot(combined["Fare"], color="blue", label="Skewness : %.2f"%(combined["Fare"].skew()))
ax = ax.legend(loc="best")
plt.subplot(1,2,2)
# x= keeps the box horizontal on every seaborn version; since 0.12 a bare
# positional Series is interpreted as `data` instead of x.
sns.boxplot(x=combined["Fare"], color="blue")
plt.show()

In [None]:
# Fare distribution in side-by-side panels, one panel per Survived value.
# Note that `ax` holds the FacetGrid here, not a matplotlib Axes.
ax = sns.FacetGrid(titanic_train, col='Survived',height = 6, aspect =0.5)
ax.map(sns.distplot, "Fare")
plt.show()

In [None]:
# Overlay the fare density of non-survivors (red) and survivors (green).
# Test rows in `combined` have no Survived value, so they match neither mask.
ax = sns.kdeplot(combined["Fare"][(combined["Survived"] == 0) & (combined["Fare"].notnull())], color="Red", shade = True)
ax = sns.kdeplot(combined["Fare"][(combined["Survived"] == 1) & (combined["Fare"].notnull())], ax =ax, color="Green", shade= True)
# The plotted variable is Fare; the label previously read "Age" (copied from the age plot)
ax.set_xlabel("Fare")
ax.set_ylabel("Frequency")
ax = ax.legend(["Not Survived","Survived"])

In [None]:
# Mean of Survived for each Parch value
plt.figure(figsize=(10, 6))
sns.barplot(data=titanic_train, x="Parch", y="Survived", palette="Set2")
plt.ylabel("Survival Probability")
plt.show()

In [None]:
# Mean of Survived for each SibSp value
plt.figure(figsize=(10, 6))
sns.barplot(data=titanic_train, x="SibSp", y="Survived", palette="husl")
plt.ylabel("Survival Probability")
plt.show()

In [None]:
#sex
# Left: survival rate per sex; right: passenger count per sex.
plt.figure(figsize = (15,6))
plt.subplot(1,2,1)
sns.barplot(x="Sex", y="Survived",data = titanic_train,palette="Set2")
plt.ylabel("Survival Probability")
plt.subplot(1,2,2)
# The column is named via x=: seaborn >= 0.12 accepts only `data` positionally
# and raises a TypeError for a positional column name.
sns.countplot(x="Sex",data = titanic_train,palette="Set2")
plt.show()

In [None]:
# Survival rate and passenger count per sex. The aggregations are given as a
# list (not a set) so the output columns always appear in the same order.
titanic_train[["Sex","Survived"]].groupby('Sex').agg(["mean","count"])

In [None]:
#pclass
# Panels: survival rate per class, passenger count per class, and survival
# rate per class split by sex.
plt.figure(figsize = (18,6))
plt.subplot(1,3,1)
sns.barplot(x="Pclass", y="Survived",data = titanic_train,palette="muted")
plt.ylabel("Survival Probability")
plt.subplot(1,3,2)
# The column is named via x=: seaborn >= 0.12 accepts only `data` positionally
# and raises a TypeError for a positional column name.
sns.countplot(x="Pclass",data = titanic_train,palette="muted")
plt.subplot(1,3,3)
sns.barplot(x="Pclass", y="Survived",data = titanic_train,hue = "Sex",palette="muted")
plt.ylabel("Survival Probability")
plt.show()

In [None]:
#embarked
# Panels: survival rate per port, passenger count per port, and survival rate
# per port split by passenger class.
plt.figure(figsize = (18,6))
plt.subplot(1,3,1)
sns.barplot(x="Embarked", y="Survived",data = titanic_train,palette="Accent")
plt.ylabel("Survival Probability")
plt.subplot(1,3,2)
# The column is named via x=: seaborn >= 0.12 accepts only `data` positionally
# and raises a TypeError for a positional column name.
sns.countplot(x="Embarked",data = titanic_train,palette="Accent")
plt.subplot(1,3,3)
sns.barplot(x="Embarked", y="Survived",data = titanic_train,hue = "Pclass",palette="Accent")
plt.ylabel("Survival Probability")
plt.show()

In [None]:
# Grid of age densities: one row per Sex, one column per Pclass, coloured by Survived.
# NOTE(review): plt.legend() acts on the current axes only, so the legend
# presumably ends up on a single panel - confirm this is intended.
ax= sns.FacetGrid(data = titanic_train, row = 'Sex', col = 'Pclass', hue = 'Survived',palette = 'husl',height = 4, aspect = 1.4)
ax.map(sns.kdeplot, 'Age', alpha = .75, shade = True)
plt.legend()

# Data Cleaning

In [None]:
# Rows of the combined frame that have no Fare
display(combined.loc[combined['Fare'].isna()])

In [None]:
# How many passengers (across train + test) share each ticket number
ticket_counts = combined['Ticket'].value_counts()
for df in (titanic_train, titanic_test, combined):
    df['PeopleInTicket'] = df['Ticket'].map(ticket_counts)
    # Fare split evenly between everyone travelling on the same ticket
    df['FarePerPerson'] = df['Fare'].div(df['PeopleInTicket'])


southampton_third = (titanic_train.Embarked == 'S') & (titanic_train.Pclass == 3)
print('Mean fare for this category: ', titanic_train[southampton_third]['FarePerPerson'].mean())

In [None]:

# Fill the missing test fare (and fare per person) with the mean train fare of
# third-class passengers who embarked at 'S' and travelled alone on their
# ticket, rounded to one decimal.
solo_third_southampton = (
    (titanic_train.Embarked == 'S')
    & (titanic_train.Pclass == 3)
    & (titanic_train.PeopleInTicket == 1)
)
imputed_fare = round(titanic_train[solo_third_southampton]['Fare'].mean(), 1)
titanic_test.loc[titanic_test.Fare.isnull(), ['Fare', 'FarePerPerson']] = imputed_fare
# Should display an empty frame once the gap is filled
display(titanic_test[titanic_test.Fare.isnull()])

In [None]:
# Embarked Imputation

In [None]:
# Rows of the combined frame that have no port of embarkation
display(combined.loc[combined['Embarked'].isna()])

In [None]:
# First-class passengers only: mean fare, mean fare per person and head count per port
first_class = titanic_train[titanic_train.Pclass == 1]
first_class.groupby(['Embarked', "Pclass"]).agg({'FarePerPerson': 'mean', 'Fare': 'mean', 'PassengerId': 'count'})

In [None]:
# Set the port of embarkation to "C" for the two passengers where it is missing
missing_port_ids = [62, 830]
titanic_train.loc[titanic_train.PassengerId.isin(missing_port_ids), 'Embarked'] = "C"
# Should display an empty frame once both gaps are filled
display(titanic_train[titanic_train.Embarked.isnull()])

In [None]:
#Age Imputation

In [None]:
# Extract the honorific from each name: the run of letters that follows a
# space and is terminated by a period.
# The pattern is a raw string so that "\." reaches the regex engine without
# Python flagging it as an invalid string escape (a SyntaxWarning on 3.12+).
titanic_train['Title'], titanic_test['Title'] = [df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False) for df in [titanic_train, titanic_test]]

In [None]:
# Mean age and number of known ages per (Title, Pclass) group in the train data
titanic_train.groupby(['Title', 'Pclass']).Age.agg(['mean', 'count'])

In [None]:
# Collapses raw honorifics into six groups: Officer, Royalty, Mr, Mrs, Miss, Master.
# Titles absent from this mapping become NaN when it is applied with Series.map.
TitleDict = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "Dr": "Royalty",
    "Rev": "Royalty",
    "Countess": "Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Royalty",
}

In [None]:
# Replace each raw title with its group from TitleDict (unmapped titles become NaN)
titanic_train['Title'] = titanic_train.Title.map(TitleDict)
titanic_test['Title'] = titanic_test.Title.map(TitleDict)

# Same statistics as before, now on the grouped titles
titanic_train.groupby(['Title', 'Pclass']).Age.agg(['mean', 'count'])

In [None]:
# Passengers whose name contains 'Master' and whose age is missing
master_without_age = combined.Age.isnull() & combined.Name.str.contains('Master')
display(combined[master_without_age])

In [None]:
# NOTE(review): both labels say "in Pclass 3", but neither filter restricts on
# Pclass; the mean is taken over train only, the max over train + test.
master_train = titanic_train[titanic_train.Name.str.contains('Master')]
master_all = combined[combined.Name.str.contains('Master')]
print("Average age for Masters in Pclass 3 : ", round(master_train['Age'].mean(), 2))
print("Maximum age for Masters in Pclass 3 : ", round(master_all['Age'].max(), 2))

In [None]:
# Hand-set the age of test passenger 1231 (title Master, travelling alone) to 14
titanic_test.loc[titanic_test['PassengerId'].eq(1231), 'Age'] = 14

In [None]:
# Relabel a 'Miss' as 'FemaleChild' when she travels with a parent/child
# (Parch != 0) on a ticket shared with at least one other passenger.
for df in [titanic_train, titanic_test]:
    df.loc[(df.Title=='Miss') & (df.Parch!=0) & (df.PeopleInTicket>1), 'Title']="FemaleChild"

# Extracting the statistics
# The aggregations are given as a list (not a set) so the output columns come
# out in the same order for both frames and on every run.
age_stats = ['mean', 'median', 'count']
print(titanic_train.groupby(['Pclass','Sex','Title'])['Age'].agg(age_stats))
print("_"*60)
print(titanic_test.groupby(['Pclass','Sex','Title'])['Age'].agg(age_stats))

In [None]:
# FemaleChild rows with a missing age, train first, then test

for frame in (titanic_train, titanic_test):
    display(frame[frame.Age.isnull() & (frame.Title == 'FemaleChild')])

In [None]:
# Lookup table: mean train age for every (Pclass, Sex, Title) combination
grp = (
    titanic_train
    .groupby(['Pclass', 'Sex', 'Title'])['Age']
    .mean()
    .reset_index()
    .loc[:, ['Sex', 'Pclass', 'Title', 'Age']]
)
grp

In [None]:
# Updating the missing age based on the lookup table `grp`
def fill_age(x):
    """Return the mean age stored in `grp` for the row's Pclass, Sex and Title.

    `x` is a single passenger row exposing `Pclass`, `Sex` and `Title`.
    Raises IndexError when `grp` has no entry for that combination.
    """
    same_group = (grp.Pclass == x.Pclass) & (grp.Sex == x.Sex) & (grp.Title == x.Title)
    return grp.loc[same_group, 'Age'].values[0]
# Keep known ages; rows with a missing age take the group mean from fill_age
def _age_or_group_mean(row):
    return fill_age(row) if np.isnan(row['Age']) else row['Age']

# Both columns are computed before either frame is modified
titanic_train['Age'], titanic_test['Age'] = (
    titanic_train.apply(_age_or_group_mean, axis=1),
    titanic_test.apply(_age_or_group_mean, axis=1),
)

In [None]:
# Number of ages still missing in train, then in test
print(titanic_train['Age'].isna().sum())
print("_" * 50)
print(titanic_test['Age'].isna().sum())

In [None]:
# Outlier Treatment

In [None]:
#Function to identify outliers
def outliers(df, n, features):
    """Return index labels of rows that are outliers in more than ``n`` features.

    A value counts as an outlier for a column when it lies more than
    1.5 * IQR below the first quartile or above the third quartile.
    Quartiles are computed with NaNs ignored, so a column that still contains
    missing values is screened instead of being skipped silently (with plain
    ``np.percentile`` both quartiles become NaN and nothing is ever flagged).
    NaN cells themselves are never flagged, because both comparisons are
    False for them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to screen.
    n : int
        A row is returned when it is an outlier in strictly more than ``n``
        of the given features.
    features : iterable of str
        Names of the numeric columns to check.

    Returns
    -------
    list
        Index labels of the offending rows, in order of first detection.
    """
    outlier_indices = Counter()
    for col in features:
        # First and third quartile, ignoring missing values
        Q1, Q3 = np.nanpercentile(df[col], [25, 75])
        outlier_step = 1.5 * (Q3 - Q1)
        is_outlier = (df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)
        # Count one hit per row label for this column
        outlier_indices.update(df.index[is_outlier])
    return [label for label, hits in outlier_indices.items() if hits > n]

In [None]:
# Rows that are outliers in more than two of the four numeric features
Outliers_id = outliers(titanic_train, n=2, features=["Age", "SibSp", "Parch", "Fare"])
print(Outliers_id, '\n')
print(titanic_train.loc[Outliers_id])

In [None]:
# Remove the flagged rows and renumber the index from 0
titanic_train = titanic_train.drop(index=Outliers_id).reset_index(drop=True)
titanic_train.shape

# Feature Engineering


In [None]:
# Left: passenger count per title group; right: survival rate per title group.
plt.figure(figsize = (12,6))
plt.subplot(1,2,1)
# The column is named via x=: since seaborn 0.12 a bare positional Series is
# interpreted as `data` instead of x.
sns.countplot(x='Title', data=titanic_train, palette = 'Set2')
plt.subplot(1,2,2)
sns.barplot(x= "Title",y = "Survived", data = titanic_train, palette = "Set2")
plt.ylabel("Survival Probability")
plt.show()

In [None]:
# Test rows whose title did not map to any group
display(titanic_test[titanic_test['Title'].isna()])

In [None]:
# Hand-assign the title group of test passenger 1306
titanic_test.loc[titanic_test['PassengerId'].eq(1306), 'Title'] = "Royalty"

In [None]:
# Family size feature: siblings/spouses + parents/children + the passenger
for dataset in (titanic_train, titanic_test):
    dataset['FamilySize'] = dataset['SibSp'].add(dataset['Parch']).add(1)
titanic_train.head(3)

In [None]:
# IsAlone is 1 when the passenger is the only member of their family aboard, else 0
for dataset in (titanic_train, titanic_test):
    dataset['IsAlone'] = dataset['FamilySize'].eq(1).astype('int64')
titanic_train.head(3)

In [None]:
#cabin feature
# Reduce each cabin to its first character, with 'X' standing in for a missing
# cabin. The column is transformed in place on its own index; building a fresh
# pd.Series from a list gave it a 0..n-1 index, which pandas aligns on
# assignment and which silently misplaces values whenever the frame's index is
# not exactly 0..n-1.
for dataset in [titanic_train,titanic_test]:
    dataset["Cabin"] = dataset["Cabin"].str[0].fillna('X')
titanic_train["Cabin"].value_counts()

In [None]:
# Left: passenger count per cabin letter; right: survival rate per cabin letter.
plt.figure(figsize = (12,5))
plt.subplot(1,2,1)
# The column is named via x=: since seaborn 0.12 a bare positional Series is
# interpreted as `data` instead of x.
sns.countplot(x='Cabin', data=titanic_train, palette = 'husl',order=['A','B','C','D','E','F','G','T','X'])
plt.subplot(1,2,2)
sns.barplot(x= "Cabin",y = "Survived", data = titanic_train, palette = "husl",order=['A','B','C','D','E','F','G','T','X'])
plt.ylabel("Survival Probability")
plt.show()

In [None]:
# Age binning: truncate to whole years, then replace each age with its band
# number (0: <=16, 1: 17-32, 2: 33-48, 3: 49-64, 4: >64).
# The cell overwrites Age in place, so running it twice puts every row in band 0.
for dataset in (titanic_train, titanic_test):
    dataset['Age'] = dataset['Age'].astype(int)
    dataset.loc[dataset['Age'].le(16), 'Age'] = 0
    for band, (low, high) in enumerate([(16, 32), (32, 48), (48, 64)], start=1):
        in_band = dataset['Age'].gt(low) & dataset['Age'].le(high)
        dataset.loc[in_band, 'Age'] = band
    dataset.loc[dataset['Age'].gt(64), 'Age'] = 4

In [None]:
# plotting the data based on new age classification
# Left: passenger count per age band; right: survival rate per age band.
plt.figure(figsize = (12,5))
plt.subplot(1,2,1)
# The column is named via x=: since seaborn 0.12 a bare positional Series is
# interpreted as `data` instead of x.
sns.countplot(x='Age', data=titanic_train, palette = 'husl')
plt.subplot(1,2,2)
sns.barplot(x= "Age",y = "Survived", data = titanic_train, palette = "husl")
plt.ylabel("Survival Probability")
plt.show()

In [None]:
# Fare binning: replace each fare with its band number
# (0: <=7.91, 1: <=14.454, 2: <=31, 3: >31), then store the band as an integer.
# The cell overwrites Fare in place, so running it twice puts every row in band 0.
for dataset in (titanic_train, titanic_test):
    dataset.loc[dataset['Fare'].le(7.91), 'Fare'] = 0
    for band, (low, high) in enumerate([(7.91, 14.454), (14.454, 31)], start=1):
        in_band = dataset['Fare'].gt(low) & dataset['Fare'].le(high)
        dataset.loc[in_band, 'Fare'] = band
    dataset.loc[dataset['Fare'].gt(31), 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

In [None]:
# plotting the data based on new fare classification
# Left: passenger count per fare band; right: survival rate per fare band.
plt.figure(figsize = (12,5))
plt.subplot(1,2,1)
# The column is named via x=: since seaborn 0.12 a bare positional Series is
# interpreted as `data` instead of x.
sns.countplot(x='Fare', data=titanic_train, palette = 'husl')
plt.subplot(1,2,2)
sns.barplot(x= "Fare",y = "Survived", data = titanic_train, palette = "husl")
plt.ylabel("Survival Probability")
plt.show()

# Model Building
