In [327]:
# Data Analysis Libraries;
import numpy as np
import pandas as pd
import random as rnd

# Data Visualization Libraries;
import matplotlib.pyplot as plt
import seaborn as sns

# Silence every warning category for the rest of the session.
import warnings
warnings.simplefilter('ignore')

# Never truncate the column list when a DataFrame is rendered.
pd.options.display.max_columns = None

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB

# Model Selection
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import metrics
from sklearn.preprocessing import StandardScaler,minmax_scale

In [328]:
# Read train and test data with pd.read_csv().
# The data directory can be overridden with the TITANIC_DATA_DIR environment
# variable, so the notebook is not tied to one machine; when the variable is
# unset the original location is used, so existing setups keep working.
import os
from pathlib import Path

DATA_DIR = Path(os.environ.get("TITANIC_DATA_DIR", "c:/users/haider/desktop/wall/titanic"))

train_data = pd.read_csv(DATA_DIR / "train.csv")
test_data = pd.read_csv(DATA_DIR / "test.csv")

# Work on copies so the frames as loaded stay available unchanged:
train = train_data.copy()
test = test_data.copy()

In [None]:
# Overview of the training dataset: column names, non-null counts and dtypes.

train.info()

In [None]:
# First five rows of the training dataset.

train.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# training columns, transposed so that each column becomes one row.

train.describe().T

In [None]:
# Overview of the testing dataset: column names, non-null counts and dtypes.

test.info()

In [None]:
# First five rows of the testing dataset.

test.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# testing columns, transposed so that each column becomes one row.

test.describe().T

In [None]:
# Bar graph to display the non-null and null counts per column of the training set.
# The trailing semicolon suppresses the textual repr of the returned object.

import missingno as msno
msno.bar(train);

In [None]:
# Frequency tables of the categorical variables, one column per cell.

train['Pclass'].value_counts()

In [None]:
# Frequency of each Sex value.
train['Sex'].value_counts()

In [None]:
# Frequency of each SibSp value.
train['SibSp'].value_counts()

In [None]:
# Frequency of each Parch value.
train['Parch'].value_counts()

In [None]:
# Frequency of each Ticket value.
train['Ticket'].value_counts()

In [None]:
# Frequency of each Cabin value (value_counts skips missing entries by default).
train['Cabin'].value_counts()

In [None]:
# Frequency of each Embarked value.
train['Embarked'].value_counts()

In [None]:
# Visualization

# Barplot is used for categorical variables while histogram, density and boxplot are used for numerical data.
# Each plot below shows the mean of Survived for every level of one feature.

sns.barplot(x = 'Pclass', y = 'Survived', data = train);

In [None]:
# Mean of Survived per SibSp value.
sns.barplot(x = 'SibSp', y = 'Survived', data = train);

In [None]:
# Mean of Survived per Parch value.
sns.barplot(x = 'Parch', y = 'Survived', data = train);

In [None]:
# Mean of Survived per Sex value.
sns.barplot(x = 'Sex', y = 'Survived', data = train);

In [None]:
# Data Preparation

# Deleting Unnecessary Variables

# Ticket is removed from both frames because it is unlikely to carry useful information.
train = train.drop(columns = ['Ticket'])
test = test.drop(columns = ['Ticket'])

In [None]:
# Summary statistics with extra percentiles (10%, 25%, 50%, 75%, 90%, 99%),
# transposed so that each column becomes one row.
train.describe([0.10,0.25,0.50,0.75,0.90,0.99]).T

In [None]:
# It looks like there is a problem in Fare max data. Visualize with boxplot.
sns.boxplot(x = train['Fare']);

In [None]:
# Interquartile-range fences for Fare: values outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are treated as outliers.
Q1, Q3 = (train['Fare'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR

# Only the last expression of a cell is echoed, so the upper fence is displayed.
upper_limit

In [None]:
# Observations with Fare data higher than the upper limit.
# The boolean mask is used to filter the frame, so the matching rows themselves
# are displayed instead of one True/False value per passenger.

train[train['Fare'] > upper_limit]

In [None]:
# The boxplot shows many points above the upper limit. Fares equal to
# 512.3292 are rewritten to 312 in both frames; all other values are kept.

fare_replacement = {512.3292: 312}
train['Fare'] = train['Fare'].replace(fare_replacement)
test['Fare'] = test['Fare'].replace(fare_replacement)

In [None]:
# Finding and replacing null values with '0' in the datasets.
# First: is there any null value anywhere in the training frame?

train.isnull().values.any()

In [None]:
# Number of null values per training column.
train.isnull().sum()

In [None]:
# Mark missing ages with the placeholder 0 (a later cell swaps it for the mean).
# The result is assigned back instead of calling fillna(..., inplace=True) on the
# selected column: the in-place form acts on a temporary object and leaves the
# frame untouched when pandas Copy-on-Write is active.
train["Age"] = train["Age"].fillna(0)

# Cabin is deliberately left as is: the CabinBool feature built further down is
# derived from which Cabin entries are null, so overwriting the nulls here would
# erase that information. The Cabin column is dropped before modelling anyway.

In [None]:
# Null counts per training column, re-checked after the fill above.
train.isnull().sum()

In [None]:
# Is there any null value anywhere in the testing frame?
test.isnull().values.any()

In [None]:
# Number of null values per testing column.
test.isnull().sum()

In [None]:
# NOTE(review): this cell repeats the previous one unchanged.
test.isnull().sum()

In [None]:
# Mark missing ages with the placeholder 0 (a later cell swaps it for the mean).
# The result is assigned back instead of calling fillna(..., inplace=True) on the
# selected column: the in-place form acts on a temporary object and leaves the
# frame untouched when pandas Copy-on-Write is active.
test["Age"] = test["Age"].fillna(0)

# Cabin is deliberately left as is: the CabinBool feature built further down is
# derived from which Cabin entries are null, so overwriting the nulls here would
# erase that information. The Cabin column is dropped before modelling anyway.

In [None]:
# Replacing the '0' placeholder in "Age" with the mean of "Age".
# The mean is computed over real ages only (Age > 0): averaging with the
# placeholders included would pull the imputed value towards zero.

train_age_mean = train.loc[train['Age'] > 0, 'Age'].mean()
test_age_mean = test.loc[test['Age'] > 0, 'Age'].mean()

# fillna() additionally covers ages that are still null at this point.
train['Age'] = train['Age'].replace(0, train_age_mean).fillna(train_age_mean)
test['Age'] = test['Age'].replace(0, test_age_mean).fillna(test_age_mean)

In [None]:
# Is there any null value left anywhere in the training frame?
train.isnull().values.any()

In [None]:
# Number of null values per training column.
train.isnull().sum()

In [None]:
# Render the training frame.
display(train)

In [None]:
# Is there any null value left anywhere in the testing frame?
test.isnull().values.any()

In [None]:
# Number of null values per testing column.
test.isnull().sum()

In [None]:
# Render the testing frame.
display(test)

In [None]:
# Frequency of each Embarked value in the training frame.
train["Embarked"].value_counts()

In [None]:
# Missing Embarked entries are filled with "S" in both frames
# (stated in the original notebook to be the most frequent value).
train = train.fillna({"Embarked": "S"})
test = test.fillna({"Embarked": "S"})

In [None]:
# Rows of the testing frame whose Fare is missing.
test[test["Fare"].isnull()]

In [None]:
# Mean Fare per Pclass in the testing frame.
test[["Pclass","Fare"]].groupby("Pclass").mean()

In [None]:
# Missing Fare values are filled with the hard-coded value 12.
# NOTE(review): presumably chosen from the per-class means shown by the
# previous cell — confirm against that output.
test["Fare"] = test["Fare"].fillna(12)

In [None]:
# Number of null values per testing column after the Fare fill.
test.isnull().sum()

In [None]:
# Create CabinBool variable which states if someone has a Cabin data or not:
# 1 = a cabin is recorded, 0 = no cabin data.
# An entry counts as missing when it is null or equal to 0, so the flag stays
# informative even if nulls were replaced with 0 before this point; testing
# isnull() alone after such a fill would yield the same value for every row.

train["CabinBool"] = (train["Cabin"].notnull() & (train["Cabin"].astype(str) != "0")).astype('int')
test["CabinBool"] = (test["Cabin"].notnull() & (test["Cabin"].astype(str) != "0")).astype('int')

# The raw Cabin column is no longer needed once the flag exists.
train = train.drop(['Cabin'], axis = 1)
test = test.drop(['Cabin'], axis = 1)

train.head()

In [None]:
# Number of null values per training column after Cabin was replaced by CabinBool.
train.isnull().sum()

In [None]:
# Number of null values per testing column after Cabin was replaced by CabinBool.
test.isnull().sum()

In [None]:
# Encode every Embarked value as an integer code:

embarked_mapping = dict(S = 1, C = 2, Q = 3)

for frame in (train, test):
    frame['Embarked'] = frame['Embarked'].map(embarked_mapping)

train.head()

In [None]:
# Convert Sex values into 1-0:

from sklearn import preprocessing

lbe = preprocessing.LabelEncoder()

# The encoder is fitted on the training column only and then reused for the
# testing column, so a given label always receives the same code in both frames.
train["Sex"] = lbe.fit_transform(train["Sex"])
test["Sex"] = lbe.transform(test["Sex"])

In [None]:
# Extract the title from Name: the run of letters that directly precedes a dot.
# A raw string is used so that "\." reaches the regex engine unchanged and is
# not treated as an (invalid) Python string escape.
train["Title"] = train["Name"].str.extract(r'([A-Za-z]+)\.', expand = False)
test["Title"] = test["Name"].str.extract(r'([A-Za-z]+)\.', expand = False)

train['Title'].value_counts()

In [None]:
# Collapse the raw titles into a small set of groups, using a single lookup
# shared by both frames so train and test cannot drift apart.
# 'Lady' is listed once, under 'Royal': in the earlier two-step version it was
# rewritten to 'Rare' first, so its 'Royal' entry could never match.
# (Both groups receive the same numeric code in the title mapping further down.)
title_groups = {
    'Capt': 'Rare', 'Col': 'Rare', 'Don': 'Rare', 'Dr': 'Rare',
    'Major': 'Rare', 'Rev': 'Rare', 'Jonkheer': 'Rare', 'Dona': 'Rare',
    'Countess': 'Royal', 'Lady': 'Royal', 'Sir': 'Royal',
    'Mlle': 'Miss', 'Ms': 'Miss',
    'Mme': 'Mrs',
}

train['Title'] = train['Title'].replace(title_groups)
test['Title'] = test['Title'].replace(title_groups)

In [None]:
# Number of passengers (non-null PassengerId values) in every title group.
train.groupby("Title")[["PassengerId"]].count()

In [None]:
# Passenger count and mean of Survived per title group.
# The aggregations are given as a list: a set literal has no defined order, so
# the order of the resulting columns could change from one run to the next.
train[['Title','Survived']].groupby(['Title'], as_index = False).agg(["count", "mean"])

In [None]:
# Numeric code for every title group ("Royal" and "Rare" share code 5).

title_mapping = dict(Mr = 1, Miss = 2, Mrs = 3, Master = 4, Royal = 5, Rare = 5)

train = train.assign(Title = train['Title'].map(title_mapping))

In [None]:
# Apply the same title codes to the testing frame.
test = test.assign(Title = test['Title'].map(title_mapping))

In [None]:
# Name is removed from both frames now that Title has been derived from it.
train = train.drop(columns = ['Name'])
test = test.drop(columns = ['Name'])

In [None]:
# Bucket Age into seven labelled bands delimited by the edges in `bins`
# (one label per interval between consecutive edges).
mylabels = ['Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
bins = [0, 5, 12, 18, 24, 35, 60, np.inf]

for frame in (train, test):
    frame['AgeGroup'] = pd.cut(frame["Age"], bins, labels = mylabels)

In [None]:
# Encode every age band as an integer code, 1 (Baby) through 7 (Senior):
age_mapping = {
    label: code
    for code, label in enumerate(
        ['Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior'],
        start = 1,
    )
}

for frame in (train, test):
    frame['AgeGroup'] = frame['AgeGroup'].map(age_mapping)

In [None]:
# Age is removed from both frames for now (AgeGroup replaces it); this might change.
train = train.drop(columns = ['Age'])
test = test.drop(columns = ['Age'])

In [None]:
# Map Fare values into four bands coded 1-4.
# The band edges are the quartiles of the TRAINING fares and are reused for the
# testing frame, so a given code stands for the same fare range in both frames.
# Computing separate quartiles for test would shift the edges between frames.
fare_labels = [1, 2, 3, 4]
train['FareBand'], fare_edges = pd.qcut(train['Fare'], 4, labels = fare_labels, retbins = True)

# Open the outermost edges so test fares below the training minimum or above
# the training maximum still fall into the first / last band.
fare_edges[0], fare_edges[-1] = -np.inf, np.inf
test['FareBand'] = pd.cut(test['Fare'], bins = fare_edges, labels = fare_labels)

In [None]:
# Fare is removed from both frames; FareBand replaces it.
train = train.drop(columns = ['Fare'])
test = test.drop(columns = ['Fare'])

In [None]:
# FamilySize = SibSp + Parch + 1, computed from the frames as originally loaded.
train["FamilySize"] = train_data["SibSp"].add(train_data["Parch"]).add(1)
test["FamilySize"] = test_data["SibSp"].add(test_data["Parch"]).add(1)

In [None]:
# Create new features of family size: four 0/1 indicators derived from FamilySize
#   Single   -> exactly 1
#   SmallFam -> exactly 2
#   MedFam   -> 3 or 4
#   LargeFam -> 5 or more

for frame in (train, test):
    family_size = frame['FamilySize']
    frame['Single'] = family_size.eq(1).astype('int64')
    frame['SmallFam'] = family_size.eq(2).astype('int64')
    frame['MedFam'] = family_size.between(3, 4).astype('int64')
    frame['LargeFam'] = family_size.ge(5).astype('int64')

In [None]:
# Convert Title and Embarked into dummy variables
# (Title keeps its default prefix, Embarked columns are prefixed with "Em"):

train = (
    train
    .pipe(pd.get_dummies, columns = ["Title"])
    .pipe(pd.get_dummies, columns = ["Embarked"], prefix = "Em")
)

test = (
    test
    .pipe(pd.get_dummies, columns = ["Title"])
    .pipe(pd.get_dummies, columns = ["Embarked"], prefix = "Em")
)

In [None]:
# Mean of Survived for every passenger class.
train["Survived"].groupby(train["Pclass"]).mean()

In [None]:
# Create categorical values for Pclass and expand them into "Pc"-prefixed dummy columns:

train = pd.get_dummies(train.astype({"Pclass": "category"}), columns = ["Pclass"], prefix = "Pc")

test = pd.get_dummies(test.astype({"Pclass": "category"}), columns = ["Pclass"], prefix = "Pc")

In [None]:
# First five rows of the prepared training frame.
train.head()

In [None]:
# First five rows of the prepared testing frame.
test.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Feature matrix and target for fitting; PassengerId is an identifier, not a feature.
X_train = train.drop(['PassengerId','Survived'], axis = 1)
Y_train = train["Survived"]

# The dummy columns were generated separately for train and test, so the test
# features are aligned to the training columns explicitly: same columns, same
# order, and a dummy column that is absent from test is filled with 0.
X_test  = test.drop("PassengerId", axis=1).reindex(columns = X_train.columns, fill_value = 0)

In [None]:
# Shapes of the training features, the training target and the test features.
tuple(part.shape for part in (X_train, Y_train, X_test))

In [None]:
# Logistic Regression

logreg = LogisticRegression().fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)

# Score on the training data itself, as a percentage rounded to two decimals.
acc_log = round(100 * logreg.score(X_train, Y_train), 2)
acc_log

In [None]:
# Support Vector Machines

svc = SVC().fit(X_train, Y_train)
Y_pred = svc.predict(X_test)

# Score on the training data itself, as a percentage rounded to two decimals.
acc_svc = round(100 * svc.score(X_train, Y_train), 2)
acc_svc

In [None]:
# k-Nearest Neighbours (k = 3)

knn = KNeighborsClassifier(n_neighbors = 3).fit(X_train, Y_train)
Y_pred = knn.predict(X_test)

# Score on the training data itself, as a percentage rounded to two decimals.
acc_knn = round(100 * knn.score(X_train, Y_train), 2)
acc_knn

In [None]:
# Gaussian Naive Bayes

gaussian = GaussianNB().fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)

# Score on the training data itself, as a percentage rounded to two decimals.
acc_gaussian = round(100 * gaussian.score(X_train, Y_train), 2)
acc_gaussian

In [None]:
# Perceptron

perceptron = Perceptron().fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)

# Score on the training data itself, as a percentage rounded to two decimals.
acc_perceptron = round(100 * perceptron.score(X_train, Y_train), 2)
acc_perceptron

In [None]:
# Linear SVC

# A fixed random_state makes the fit, and therefore the reported score,
# reproducible across kernel restarts.
linear_svc = LinearSVC(random_state = 42)
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)

# Score on the training data itself, as a percentage rounded to two decimals.
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc

In [None]:
# Stochastic Gradient Descent

# SGD shuffles the training data; a fixed random_state makes the fit, and
# therefore the reported score, reproducible across kernel restarts.
sgd = SGDClassifier(random_state = 42)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)

# Score on the training data itself, as a percentage rounded to two decimals.
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd

In [None]:
# Decision Tree

# A fixed random_state makes tie-breaking between equally good splits, and
# therefore the fitted tree, reproducible across kernel restarts.
decision_tree = DecisionTreeClassifier(random_state = 42)
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)

# Score on the training data itself, as a percentage rounded to two decimals.
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

In [None]:
# Random Forest

# A fixed random_state makes the bootstrap samples and feature subsets, and
# therefore the reported score, reproducible across kernel restarts.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)

# Score on the training data itself, as a percentage rounded to two decimals.
# (The score is computed once; an extra call whose result was discarded is gone.)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

In [None]:
# Leaderboard: one row per fitted model with its training-set score,
# displayed best first.
score_by_model = {
    'Support Vector Machines': acc_svc,
    'KNN': acc_knn,
    'Logistic Regression': acc_log,
    'Random Forest': acc_random_forest,
    'Naive Bayes': acc_gaussian,
    'Perceptron': acc_perceptron,
    'Stochastic Gradient Decent': acc_sgd,
    'Linear SVC': acc_linear_svc,
    'Decision Tree': acc_decision_tree,
}

models = pd.DataFrame({
    'Model': list(score_by_model.keys()),
    'Score': list(score_by_model.values()),
})
models.sort_values(by='Score', ascending=False)