# Titanic Project

This is a classification problem. The goal is to predict whether a given passenger on the Titanic survived the sinking. The dataset has 11 features and 1 label, "Survived", and contains both numerical and categorical data.

Dataset Dictionary: the columns that need explanation are encoded as follows:

Survived: 0 = No (died), 1 = Yes (survived). Pclass (passenger's class): 1 = Upper class, 2 = Middle class, 3 = Lower class. SibSp: the number of siblings and spouses travelling with the passenger. Parch: the number of parents and children travelling with the passenger. Embarked: the port of embarkation, C = Cherbourg, Q = Queenstown, S = Southampton.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import os
import scipy as stats
%matplotlib inline
warnings.filterwarnings('ignore')

In [None]:
# Load the Titanic training data directly from the hosted CSV file.
DATA_URL = "https://raw.githubusercontent.com/dsrscientist/dataset1/master/titanic_train.csv"
df = pd.read_csv(DATA_URL)
df  # last expression of the cell: renders the frame in the notebook

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Checking the dimension of the dataset (rows, columns)
df.shape

In [None]:
# Checking the dtype of every column
df.dtypes

In [None]:
# Column-by-column summary: non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Number of distinct values per column.
df.nunique()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Heat map of the boolean missing-value mask, one cell per (row, column).
sns.heatmap(df.isnull())

In [None]:
# Checking the column names
df.columns

In [None]:
# Distinct values taken by the target column.
df['Survived'].unique()

In [None]:
# Frequency of each target value.
df["Survived"].value_counts()

In [None]:
# Select rows whose target equals a single-space string (a blank placeholder).
# NOTE(review): whether this can ever match depends on the dtype of
# 'Survived' -- confirm the check is still needed.
df.loc[df['Survived']==" "]

In [None]:
# Summary statistics of the columns.
df.describe()

# Data Visualization by Univariate Analysis

In [None]:
print(df["Survived"].value_counts())
sns.countplot(df["Survived"])

In [None]:
print(df["Pclass"].value_counts())
sns.countplot(df["Pclass"])

In [None]:
print(df["Sex"].value_counts())
sns.countplot(df["Sex"])

In [None]:
print(df["Embarked"].value_counts())
sns.factorplot('Embarked',kind='count',data=df,hue='Survived')

In [None]:
print(df["Parch"].value_counts())
sns.factorplot('Parch',kind='count',data=df,hue='Survived')

# Checking the distribution and skewness of each numeric column

In [None]:
sns.distplot(df["Age"])

In [None]:
sns.distplot(df["Fare"])

In [None]:
sns.distplot(df["PassengerId"])

In [None]:
sns.distplot(df["Survived"])

In [None]:
sns.distplot(df["Pclass"])

In [None]:
sns.distplot(df["SibSp"])

In [None]:
sns.distplot(df["Parch"])

# Bivariate Analysis

In [None]:
# Checking the relation between two variables
plt.figure(figsize=[10,6])
plt.title('Comparision between Pclass and Age')
sns.scatterplot(df['Pclass'],df['Age'],hue=df["Survived"]);

In [None]:
plt.figure(figsize=[10,6])
plt.title('Comparision between Pclass and Sex')
sns.scatterplot(df['Pclass'],df['Sex'],hue=df["Survived"]);

In [None]:
# Checking the relation between two variables
plt.figure(figsize=[10,6])
plt.title('Comparision between Age and Embarked')
sns.scatterplot(df['Embarked'],df['Age'],hue=df["Survived"]);

In [None]:
# Checking the relation between two variables
plt.figure(figsize=[10,6])
plt.title('Comparision between Sex and Parch')
sns.scatterplot(df['Sex'],df['Parch'],hue=df["Survived"]);

In [None]:
# Checking the relation between two variables
plt.figure(figsize=[10,6])
plt.title('Comparision between Age and Fare')
sns.scatterplot(df['Age'],df['Fare'],hue=df["Survived"]);

In [None]:
# Checking the pairwise relation in the dataset.
sns.pairplot(df,hue="Survived")

# Correlation between the target variable and independent variables using HEAT map

In [None]:
cor = df.corr()
cor

In [None]:
# Visualizing the correlation matrix by plotting heat map.
plt.figure(figsize=(16,9))
sns.heatmap(df.corr(),linewidths=.1, annot = True)
plt.yticks(rotation=0);
cor['Survived'].sort_values(ascending=False)

# Visualizing the correlation between label and features using bar plot


In [None]:
# Bar chart of each numeric feature's correlation with the target.
plt.figure(figsize=(22,7))
# Only numeric columns can be correlated; calling corr() on the full frame
# raises on pandas >= 2.0 because of the text columns.
target_corr = df.select_dtypes(include="number").corr()['Survived']
# Drop the target's self-correlation before plotting.
target_corr.sort_values(ascending=False).drop(['Survived']).plot(kind='bar',color='c')
plt.xlabel('Feature',fontsize=14)
plt.ylabel('column with target names',fontsize=14)
plt.title('correlation',fontsize=18)
plt.show()

In [None]:
# Remove the PassengerId and Name columns in place; the author treats Name
# as unimportant for building the model.
df.drop(columns=['PassengerId', 'Name'], inplace=True)
df.head()

# Identifying the outliers

In [None]:
sns.boxplot(df["Age"])

In [None]:
sns.boxplot(df["Fare"])

In [None]:
sns.boxplot(df["SibSp"])

In [None]:
sns.boxplot(df["Parch"])


In [None]:
# Missing values per column before imputation.
df.isnull().sum()

In [None]:
# Fill missing ages with the median age.
df["Age"] = df["Age"].fillna(df["Age"].median())

In [None]:
# Fill missing ports with the most frequent port (first mode).
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])

In [None]:
# Fill missing cabins with the placeholder 'U'.
df['Cabin'] = df['Cabin'].fillna('U')

In [None]:
# List the remaining columns (the z-score outlier step itself is applied
# further down, on Age and Fare).
df.columns

In [None]:
# Checking null values after filling them
df.isnull().sum()

In [None]:
# Visualizing the missing value after treating it using heat map.
sns.heatmap(df.isnull())

In [None]:
# Column names once more, for reference.
df.columns

In [None]:
variables = df[['Age','Fare']]

In [None]:
from scipy.stats import zscore
z=np.abs(zscore(variables))
z

In [None]:
new_df = df[(z<3).all(axis=1)]
new_df.head()

In [None]:
df.shape

In [None]:
new_df.shape

In [None]:
# Percentage of rows removed by the z-score filter.
# Row counts come from the frames themselves rather than hard-coded numbers,
# so the figure stays correct if the data or the threshold changes.
rows_before = df.shape[0]
rows_after = new_df.shape[0]
data_loss = (rows_before - rows_after) / rows_before * 100
data_loss

In [None]:
# IQR (Inter Quantile Range) method
# 1st quantile
Q1=variables.quantile(0.25)

# 3rd quantile
Q3=variables.quantile(0.75)

# IQR
IQR=Q3 - Q1

# Fences are computed for the columns in `variables` only, so the comparison
# is done on `variables` as well. Comparing the whole of df against these
# two-label Series is a misaligned DataFrame/Series comparison, which raises
# on pandas >= 2.0.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = ((variables < lower_fence) | (variables > upper_fence)).any(axis=1)

# Rows of df with no value outside the fences.
df1=df[~is_outlier]


In [None]:
df.shape

In [None]:
df1.shape

In [None]:
data_loss = (891-721)/891*100
data_loss

In [None]:
# Skewness of the numeric columns. The frame still contains text columns at
# this point (they are encoded in the next section), and skew() raises on
# them from pandas 2.0 onwards unless numeric_only=True is given.
new_df.skew(numeric_only=True)

# Encoding the data using Label Encoding

In [None]:
cols = ["Sex", "Ticket", "Cabin", "Embarked"]


In [None]:
from sklearn.preprocessing import LabelEncoder
LE=LabelEncoder()
new_df[cols]= new_df[cols].apply(LE.fit_transform)

In [None]:
new_df[cols].head()

In [None]:
new_df.head()

In [None]:
x = new_df.drop("Survived", axis=1)
y = new_df["Survived"]

In [None]:
x.head()

In [None]:
y.value_counts()

In [None]:
x.shape, y.shape

# Modeling
#Finding the best random state

In [None]:
x.shape, y.shape

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
maxAccu=0
maxRS=0
for i in range(1,200):
    x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.30, random_state =i)
    DTC = DecisionTreeClassifier()
    DTC.fit(x_train, y_train)
    pred = DTC.predict(x_test)
    acc=accuracy_score(y_test, pred)
    if acc>maxAccu:
        maxAccu=acc
        maxRS=i
print("Best accuracy is ",maxAccu," on Random_state ",maxRS)

In [None]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.30,random_state=maxRS)

In [None]:
x_train.shape,y_train.shape, x_test.shape,y_test.shape


In [None]:
#classification of algorithms
#importing necessary libraries.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score, accuracy_score

In [None]:
#Decision Tree Classifier
DTC = DecisionTreeClassifier()
DTC.fit(x_train,y_train)
predDTC = DTC.predict(x_test)

print(accuracy_score(y_test, predDTC))
print(confusion_matrix(y_test, predDTC))
print(classification_report(y_test,predDTC))

In [None]:
# Lets plot confusion matrix for DTC
cm = confusion_matrix(y_test,predDTC)

x_axis_labels = ["Died","Survived"]
y_axis_labels = ["Died","Survived"]

f , ax = plt.subplots(figsize=(7,7))
sns.heatmap(cm, annot = True,linewidths=.2, linecolor="black", fmt = ".0f", ax=ax, cmap="Purples", 
xticklabels=x_axis_labels,
yticklabels=y_axis_labels)

plt.xlabel("PREDICTED LABEL")
plt.ylabel("TRUE LABEL")
plt.title('Confusion Matrix for Decision Tree Classifier')

In [None]:
#Random Forest Classifier
RFC = RandomForestClassifier()
RFC.fit(x_train,y_train)
predRFC = RFC.predict(x_test)

print(accuracy_score(y_test, predRFC))
print(confusion_matrix(y_test, predRFC))
print(classification_report(y_test,predRFC))


In [None]:
# Lets plot confusion matrix for Random Forest Classifier
cm = confusion_matrix(y_test,predRFC)

x_axis_labels = ["Died","Survived"]
y_axis_labels = ["Died","Survived"]

f , ax = plt.subplots(figsize=(7,7))
sns.heatmap(cm, annot = True,linewidths=.2, linecolor="black", fmt = ".0f", ax=ax, cmap="Purples", 
xticklabels=x_axis_labels,
yticklabels=y_axis_labels)

plt.xlabel("PREDICTED LABEL")
plt.ylabel("TRUE LABEL")
plt.title('Confusion Matrix for Random Forest Classifier')

In [None]:
#Logistic Regression
LR=LogisticRegression()
LR.fit(x_train,y_train)
predLR=LR.predict(x_test)

print(accuracy_score(y_test, predLR))
print(confusion_matrix(y_test, predLR))
print(classification_report(y_test,predLR))


In [None]:
# Lets plot confusion matrix for Logistic Regression
cm = confusion_matrix(y_test,predLR)

x_axis_labels = ["Died","Survived"]
y_axis_labels = ["Died","Survived"]

f , ax = plt.subplots(figsize=(7,7))
sns.heatmap(cm, annot = True,linewidths=.2, linecolor="black", fmt = ".0f", ax=ax, cmap="Purples", 
xticklabels=x_axis_labels,
yticklabels=y_axis_labels)

plt.xlabel("PREDICTED LABEL")
plt.ylabel("TRUE LABEL")
plt.title('Confusion Matrix for Logistic Regression')

# Checking the Cross Validation Score


In [None]:
from sklearn.model_selection import cross_val_score


In [None]:
# cv score for Decision Tree Classifier
print(cross_val_score(DTC,x,y,cv=5).mean())

In [None]:
# cv score for Random Forest Classifier
print(cross_val_score(RFC,x,y,cv=5).mean())

In [None]:
# cv score for Logistic Regression 
print(cross_val_score(LR,x,y,cv=5).mean())

# Hyper parameter tuning for best model

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Random Forest Classifier
parameters = {'n_estimators':[0,200],
             'criterion':['gini','entropy'],
             'max_depth':np.arange(2,50),
             'max_features':["auto","sqrt","log2"],
             'max_leaf_nodes':[10,20,30,40]}

In [None]:
GCV=GridSearchCV(RandomForestClassifier(),parameters,cv=5)

In [None]:
GCV.fit(x_train,y_train)

In [None]:
GCV.best_params_

In [None]:
Titanic_survived = RandomForestClassifier(criterion='gini',max_depth=27, max_features='sqrt', max_leaf_nodes=40, n_estimators=200)
Titanic_survived.fit(x_train, y_train)
pred = Titanic_survived.predict(x_test)
acc=accuracy_score(y_test,pred)
print(acc*100)

In [None]:
x.shape

# Plotting ROC-AUC curve for best model

In [None]:
# ROC curve of the tuned random forest (Titanic_survived) on the held-out
# Titanic test split.
# The previous version trained a new forest on synthetic make_classification
# data, which plotted an unrelated model and overwrote x, y, the train/test
# splits and df. It also called plot_roc_curve, which has been removed from
# scikit-learn; RocCurveDisplay.from_estimator is its replacement.
from sklearn import metrics
metrics.RocCurveDisplay.from_estimator(Titanic_survived, x_test, y_test)
plt.show()

In [None]:
# Persist the fitted final model to a .pkl file with joblib.
import joblib

MODEL_PATH = "Titanic.pkl"
joblib.dump(Titanic_survived, MODEL_PATH)