In [315]:
# Predicting Survival in the Titanic Data Set
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn
from pandas import Series, DataFrame
from pylab import rcParams
from sklearn import preprocessing
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
import warnings; warnings.simplefilter('ignore')

url= "https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv"
titanic = pd.read_csv(url)
titanic.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [316]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [317]:
print("Understanding the impact of travelling class in the survival ratio \n")
print(titanic[['Pclass','Survived']].groupby(['Pclass'],as_index=False).mean(),"\n")
print("Passengers who were travelling travelling on a higher class had a higher survival ratio")

Understanding the impact of travelling class in the survival ratio 

   Pclass  Survived
0       1  0.629630
1       2  0.472826
2       3  0.242363 

Passengers who were travelling travelling on a higher class had a higher survival ratio


In [318]:
print("Understanding the impact of gender in the survival ratio \n")
print(titanic[['Sex','Survived']].groupby(['Sex'],as_index=False).mean(),"\n")
print("Female passangers have a higher survival rate than the male passengers")

Understanding the impact of gender in the survival ratio 

      Sex  Survived
0  female  0.742038
1    male  0.188908 

Female passangers have a higher survival rate than the male passengers


In [319]:
print("Understanding the impact of Family size in survival rate \n")
for i in titanic:
    titanic['Family_size'] = titanic['SibSp']+titanic['Parch']+1
print(titanic[['Family_size','Survived']].groupby(['Family_size'],as_index=False).mean(),"\n")
print("From the above, we infer that a moderate family has a higher survival rate (Family with many members as well as individual person has a least survival ratio)")

Understanding the impact of Family size in survival rate 

   Family_size  Survived
0            1  0.303538
1            2  0.552795
2            3  0.578431
3            4  0.724138
4            5  0.200000
5            6  0.136364
6            7  0.333333
7            8  0.000000
8           11  0.000000 

From the above, we infer that a moderate family has a higher survival rate (Family with many members as well as individual person has a least survival ratio)


In [320]:
print("Understanding the impact of Age in survival rate \n")
titanic['Age_group'] = pd.cut(titanic['Age'],5)
print(titanic[['Age_group','Survived']].groupby(['Age_group'],as_index=False).mean(),"\n")
print("Children and old people have a higher survival rate")

Understanding the impact of Age in survival rate 

          Age_group  Survived
0    (0.34, 16.336]  0.550000
1  (16.336, 32.252]  0.369942
2  (32.252, 48.168]  0.404255
3  (48.168, 64.084]  0.434783
4    (64.084, 80.0]  0.090909 

Children and old people have a higher survival rate


In [321]:
# Using the qcut function to get a ecenly dustributed range
print("Understanding the impact of Fare in survival rate \n")
titanic['Fare_group'] = pd.qcut(titanic['Fare'],5)
print(titanic[['Fare_group','Survived']].groupby(['Fare_group'],as_index=False).mean(),"\n")
print("People who paid more had a higher survival rate")

Understanding the impact of Fare in survival rate 

          Fare_group  Survived
0    (-0.001, 7.854]  0.217877
1      (7.854, 10.5]  0.201087
2     (10.5, 21.679]  0.424419
3   (21.679, 39.688]  0.444444
4  (39.688, 512.329]  0.642045 

People who paid more had a higher survival rate


In [322]:
# Analyzing and mapping the Age parameter
titanic.loc[titanic['Age'] <= 16,'Age']=0
titanic.loc[(titanic['Age'] > 16) & (titanic['Age'] <= 32),'Age']=1
titanic.loc[(titanic['Age'] > 32) & (titanic['Age'] <= 48),'Age']=2
titanic.loc[(titanic['Age'] > 48) & (titanic['Age'] <= 64),'Age']=3
titanic.loc[(titanic['Age'] > 64) & (titanic['Age'] <= 80),'Age']=4

In [323]:
# Analyzing and mapping the Age parameter
titanic.loc[titanic['Fare'] <= 7.854,'Fare']=0
titanic.loc[(titanic['Fare'] > 7.854) & (titanic['Fare'] <= 10.5),'Fare']=1
titanic.loc[(titanic['Fare'] > 10.5) & (titanic['Fare'] <= 21.679),'Fare']=2
titanic.loc[(titanic['Fare'] > 21.679) & (titanic['Fare'] <= 39.688),'Fare']=3
titanic.loc[(titanic['Fare'] > 39.688) & (titanic['Fare'] <= 512.329),'Fare']=4

In [324]:
# Dropping the unwanted columns (Parch and SibSp are also dropped because we have combined them into Family_size)
drop_elements = ['PassengerId','Name','Ticket','Cabin','SibSp','Parch','Embarked']
titanic = titanic.drop(drop_elements, axis = 1)
titanic = titanic.drop(['Age_group','Fare_group'], axis = 1)
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Family_size
0,0,3,male,1.0,0.0,2
1,1,1,female,2.0,4.0,2
2,1,3,female,1.0,1.0,1
3,1,1,female,2.0,4.0,2
4,0,3,male,2.0,1.0,1


In [325]:
# Mapping the Sex values
titanic['Sex'] = titanic['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

In [326]:
print(titanic['Sex'].isnull().values.any())

False


In [327]:
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Family_size
0,0,3,0,1.0,0.0,2
1,1,1,1,2.0,4.0,2
2,1,3,1,1.0,1.0,1
3,1,1,1,2.0,4.0,2
4,0,3,0,2.0,1.0,1


In [328]:
# Setting our target variable
y = titanic['Survived']
y.isnull().values.any()

False

In [329]:
# Setting our dependent variables
x = titanic[['Pclass','Sex','Age','Fare','Family_size']]
# Filling the NaN values with the mean age in the mean column
x = x.apply(lambda x: x.fillna(x.mean()),axis=0)
print(x.mean())

Pclass         2.308642
Sex            0.352413
Age            1.362745
Fare           3.716035
Family_size    1.904602
dtype: float64


In [330]:
algorithms = [KNeighborsClassifier(3),DecisionTreeClassifier(),RandomForestClassifier(),LogisticRegression()]
log_train  = pd.DataFrame(columns=["Classifier", "Accuracy"])
log_test  = pd.DataFrame(columns=["Classifier", "Accuracy"])

In [331]:
# Splitting our dataset into train and test datasets
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.4,random_state=4)

acc_dict = {}

# Building model using train dataset
for x in algorithms:
    name = x.__class__.__name__
    x.fit(x_train, y_train)
    train_predictions = x.predict(x_train)
    acc = accuracy_score(y_train, train_predictions)
    if name in acc_dict:
        acc_dict[name] += acc
    else:
        acc_dict[name] = acc
print(acc_dict)
for x in acc_dict:
    acc_dict[x] = acc_dict[x] / 10.0
    log_train_entry = pd.DataFrame([[x, acc_dict[x]]], columns=["Classifier", "Accuracy"])
    log_train = log_train.append(log_train_entry)

{'KNeighborsClassifier': 0.848314606741573, 'DecisionTreeClassifier': 0.8707865168539326, 'RandomForestClassifier': 0.8670411985018727, 'LogisticRegression': 0.7715355805243446}


In [332]:
log_train['Accuracy'] = (log_train['Accuracy']*1000).round(2)
log_train

Unnamed: 0,Classifier,Accuracy
0,KNeighborsClassifier,84.83
0,DecisionTreeClassifier,87.08
0,RandomForestClassifier,86.7
0,LogisticRegression,77.15


In [333]:
# Validating our model on the test dataset
for x in algorithms:
    name = x.__class__.__name__
    x.fit(x_test, y_test)
    test_predictions = x.predict(x_test)
    acc = accuracy_score(y_test, test_predictions)
    if name in acc_dict:
        acc_dict[name] += acc
    else:
        acc_dict[name] = acc
print(acc_dict)
for x in acc_dict:
    acc_dict[x] = acc_dict[x] / 10.0
    log_test_entry = pd.DataFrame([[x, acc_dict[x]]], columns=["Classifier", "Accuracy"])
    log_test = log_test.append(log_test_entry)

{'KNeighborsClassifier': 0.9503776791615522, 'DecisionTreeClassifier': 0.9946416768954773, 'RandomForestClassifier': 0.991466024612092, 'LogisticRegression': 0.9062852107134989}


In [334]:
log_test['Accuracy'] = (log_test['Accuracy']*1000).round(2)
log_test

Unnamed: 0,Classifier,Accuracy
0,KNeighborsClassifier,95.04
0,DecisionTreeClassifier,99.46
0,RandomForestClassifier,99.15
0,LogisticRegression,90.63
