<a href="https://colab.research.google.com/github/kiranmahara9600/Titanic-Data-Model/blob/main/Titanic_data_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
##importing all necessary modules

#for data preprocessing
import pandas as pd
import numpy as np

#for visualisation
import matplotlib.pyplot as plt
import seaborn as sns

#for model building
from sklearn.tree import DecisionTreeClassifier

#to split the data
from sklearn.model_selection import train_test_split

#for hyperparameter tuning
from sklearn.model_selection import GridSearchCV

#error metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, auc

#to visualise decision trees
from sklearn.tree import export_graphviz
from sklearn import tree
from IPython.display import SVG
from graphviz import Source
from IPython.display import display

In [2]:
#importing dataset
from sklearn.datasets import fetch_openml
titanic = fetch_openml('titanic', version = 1)

#capturing data in a dataframe
titanic_df = titanic['data']
titanic_df['survived'] = titanic['target']

In [3]:
#data dimension
titanic_df.shape

(1309, 14)

In [4]:

#looking for first few rows of the data
titanic_df.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,survived
0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO",1
1,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON",1
2,1.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0
3,1.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON",0
4,1.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0


In [5]:
titanic_df.tail()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,survived
1304,3.0,"Zabour, Miss. Hileni",female,14.5,1.0,0.0,2665,14.4542,,C,,328.0,,0
1305,3.0,"Zabour, Miss. Thamine",female,,1.0,0.0,2665,14.4542,,C,,,,0
1306,3.0,"Zakarian, Mr. Mapriededer",male,26.5,0.0,0.0,2656,7.225,,C,,304.0,,0
1307,3.0,"Zakarian, Mr. Ortin",male,27.0,0.0,0.0,2670,7.225,,C,,,,0
1308,3.0,"Zimmerman, Mr. Leo",male,29.0,0.0,0.0,315082,7.875,,S,,,,0


***Data Dictionary***

* *pclass - refers to passenger class ( 1st, 2nd, 3rd )*
* *name - name of the passenger*
* *sex - gender of the passenger*
* *age - age of the passenger*
* *sibsp - no. of siblings or spouses passenger have on the ship*
* *parch - no. of parent or guardian passenger have on the ship*
* *ticket - represents ticket no. of the passenger*
* *fare - price of the ticket*
* *cabin - cabin information of the passenger*
* *embarked - prt of departure for the passenger*
* *boat - lifeboat if survived*
* *body - body no. if passenger did not survive and body was recovered*
* *home.dest - destination of the passenger*
* *survived - passenger survived or not*

###***Data Pre-processing***

***Feature Extraction***

*Name feature consists of first name, last name, title etc. let's explore this feature. Let's randomly pick a row to check title of the passenger for that row*

In [6]:
#randomly picked a row
titanic_df['name'][93]

'Dodge, Dr. Washington'

*okay, the title of the passenger in 93rd row is Dr.*

*Let's extract the title from name column as it represents social status of a passenger and might help in prediction of output variable as survival may depend on social status of a person*

In [7]:
#extracting a new feature title
titanic_df['passenger title'] =titanic_df['name'].str.split(',|\\.',expand = True)[1]

#dropping columns 'name'
titanic_df.drop(columns = ['name'],inplace = True)

In [8]:
titanic_df.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,survived,passenger title
0,1.0,female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO",1,Miss
1,1.0,male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON",1,Master
2,1.0,female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0,Miss
3,1.0,male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON",0,Mr
4,1.0,female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0,Mrs


In [9]:
#to remove spaces in passenger title column values
titanic_df['passenger title']=titanic_df['passenger title'].str.strip()

*These titles can be categorise in different segments e.g. religious titles, noble title etc. Let's categorise*

In [10]:
#creating a dictionary to map
status_map={'Military':'Capt',
            'Col':'Military',
            'Don':'Noble',
            'Dona':'Noble',
            'Dr':'Dr',
            'Jonkheer':'Noble',
            'Lady':'Noble',
            'Major':'Military',
            'Master':'Common',
            'Miss':'Common',
            'Mlle':'Common',
            'Mme':'Common',
            'Mr':'Common',
            'Mrs':'Common',
            'Ms':'Common',
            'Rev':'Clergy',
            'Sir':'Noble',
            'the Countess':'Noble'}

#creating a new column 
titanic_df['social status'] = titanic_df['passenger title'].map(status_map)
#dropping passenger title column
titanic_df.drop(columns = ['passenger title'],inplace = True)

*Cabin feature consists of deck and no. deck could be helpful to predict whether a passenger survived or not. So, let's extract deck from cabin feature*

In [11]:
#extracting deck feature
titanic_df['deck'] = titanic_df['cabin'].str.replace('[0-9]','').str.split(' ', expand=True)[0]
#dropping columns 'cabin'
titanic_df.drop(columns = ['cabin'],inplace = True)

  


*As 'sibsp' and 'parch' represents information of the family memebers in a ship for a passenger, so we can replace these two columns with ' no. of family memebers in a ship' feature. Let's do this.*

In [12]:
#creating new feature 'family_members'
titanic_df['family_members'] = titanic_df['sibsp'] + titanic_df['parch']
#dropping columns 'sibsp' and 'parch'
titanic_df.drop(columns = ['sibsp', 'parch'],inplace = True)

*Boat and Body feature might help i survival prediction but as there are lot of null values in these features, so it will be good to drop these features*

In [13]:
#dropping features
titanic_df.drop(columns = ['boat','body'],inplace = True)

In [14]:
#checking basic info of the data
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   pclass          1309 non-null   float64 
 1   sex             1309 non-null   category
 2   age             1046 non-null   float64 
 3   ticket          1309 non-null   object  
 4   fare            1308 non-null   float64 
 5   embarked        1307 non-null   category
 6   home.dest       745 non-null    object  
 7   survived        1309 non-null   category
 8   social status   1308 non-null   object  
 9   deck            295 non-null    object  
 10  family_members  1309 non-null   float64 
dtypes: category(3), float64(4), object(4)
memory usage: 86.1+ KB


In [15]:
#converting datatype of embarked to object ( as category datatype does not allow new category to add on)
titanic_df['embarked'] = titanic_df['embarked'].astype('str')
titanic_df['sex'] = titanic_df['sex'].astype('str')
titanic_df['survived'] = titanic_df['survived'].astype('float')

In [16]:
#checking value counts of some features
list_to_check = ['pclass','sex','embarked']
for features in list_to_check:
  print('Unique values of feature ' +  features +' are :')
  print(titanic_df[features].value_counts())
  print('\n')
  print('--------*--------')
  print('\n')

Unique values of feature pclass are :
3.0    709
1.0    323
2.0    277
Name: pclass, dtype: int64


--------*--------


Unique values of feature sex are :
male      843
female    466
Name: sex, dtype: int64


--------*--------


Unique values of feature embarked are :
S      914
C      270
Q      123
nan      2
Name: embarked, dtype: int64


--------*--------




*There are some features with missing values. Let's try to impute these null values*

***Imputation***

In [17]:
#converting datatype of embarked to object ( as category datatype does not allow new category to add on)
titanic_df['embarked'] = titanic_df['embarked'].astype('object')

*For non continuous features let's impute null values with NA*

In [18]:
##imputing null values with NA 

#for 'deck' column
titanic_df['deck'] = titanic_df['deck'].fillna('Not Available')
#for 'embarked' column
titanic_df['embarked'] = titanic_df['embarked'].fillna('Not Available')
#for 'home.dest' column
titanic_df['home.dest'] = titanic_df['home.dest'].fillna('Not Available')
#for 'social status' column
titanic_df['social status'] = titanic_df['social status'].fillna('Not Available')

In [19]:
#imputing null values of 'age' with mean of age
titanic_df['age'] = titanic_df['age'].fillna(titanic_df['age'].mean())

#imputing null values of 'fare' with mode of fare
titanic_df['fare'] = titanic_df['fare'].fillna(titanic_df['fare'].mode()[0])

*okay, so done with null value treatment*

In [20]:
titanic_df.head()

Unnamed: 0,pclass,sex,age,ticket,fare,embarked,home.dest,survived,social status,deck,family_members
0,1.0,female,29.0,24160,211.3375,S,"St Louis, MO",1.0,Common,B,0.0
1,1.0,male,0.9167,113781,151.55,S,"Montreal, PQ / Chesterville, ON",1.0,Common,C,3.0
2,1.0,female,2.0,113781,151.55,S,"Montreal, PQ / Chesterville, ON",0.0,Common,C,3.0
3,1.0,male,30.0,113781,151.55,S,"Montreal, PQ / Chesterville, ON",0.0,Common,C,3.0
4,1.0,female,25.0,113781,151.55,S,"Montreal, PQ / Chesterville, ON",0.0,Common,C,3.0


In [21]:
#dropping home.dest and ticket column
titanic_df.drop(columns = 'home.dest',inplace =True)
titanic_df.drop(columns = 'ticket',inplace =True)