In [157]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split

## Load the Dataset

In [158]:
# Read the raw Titanic passenger records from the project's data folder
# and preview the first five rows.
titanic_csv = "../data/Titanic.csv"
df = pd.read_csv(titanic_csv)
df.head(5)

Unnamed: 0,Row Number,Passenger Class,Name,Gender,Age,No of Siblings,No of Parents,Ticket Number,Passenger Fare,Cabin,Port of Embarkation,Life Boat,Survived
0,1,First,"Allen, Miss. Elisabeth Walton",Female,29.0,0,0,24160,211.3375,B5,Southampton,2,Yes
1,2,First,"Allison, Master. Hudson Trevor",Male,0.9167,1,2,113781,151.55,C22 C26,Southampton,11,Yes
2,3,First,"Allison, Miss. Helen Loraine",Female,2.0,1,2,113781,151.55,C22 C26,Southampton,?,No
3,4,First,"Allison, Mr. Hudson Joshua Creighton",Male,30.0,1,2,113781,151.55,C22 C26,Southampton,?,No
4,5,First,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",Female,25.0,1,2,113781,151.55,C22 C26,Southampton,?,No


## Rename Features

In [159]:
# Shorten the verbose column headers so later cleaning and analysis code is easier to write.
# NOTE(review): the 'Cabin ' key carries a trailing space; rename() silently skips labels
# that are not present, so confirm the raw header really contains that space.
column_aliases = {
    'Port of Embarkation': 'Port',
    'Passenger Class': 'Class',
    'Passenger Fare': 'Fare',
    'Cabin ': 'Cabin',
}
df = df.rename(columns=column_aliases)

## Simple Data Inspection

In [160]:
# (rows, columns) of the loaded frame
df.shape

(1309, 13)

In [161]:
# Per-column dtype and non-null count, plus overall memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Row Number      1309 non-null   int64 
 1   Class           1309 non-null   object
 2   Name            1309 non-null   object
 3   Gender          1309 non-null   object
 4   Age             1309 non-null   object
 5   No of Siblings  1309 non-null   int64 
 6   No of Parents   1309 non-null   int64 
 7   Ticket Number   1309 non-null   object
 8   Fare            1309 non-null   object
 9   Cabin           1309 non-null   object
 10  Port            1309 non-null   object
 11  Life Boat       1309 non-null   object
 12  Survived        1309 non-null   object
dtypes: int64(3), object(10)
memory usage: 133.1+ KB


In [162]:
# Summary statistics for every column; include='all' also covers the non-numeric ones
df.describe(include='all')

Unnamed: 0,Row Number,Class,Name,Gender,Age,No of Siblings,No of Parents,Ticket Number,Fare,Cabin,Port,Life Boat,Survived
count,1309.0,1309,1309,1309,1309,1309.0,1309.0,1309,1309.0,1309,1309,1309,1309
unique,,3,1307,2,99,,,929,282.0,187,4,28,2
top,,Third,"Connolly, Miss. Kate",Male,?,,,CA. 2343,8.05,?,Southampton,?,No
freq,,709,2,843,263,,,11,60.0,1014,914,823,809
mean,655.0,,,,,0.498854,0.385027,,,,,,
std,378.020061,,,,,1.041658,0.86556,,,,,,
min,1.0,,,,,0.0,0.0,,,,,,
25%,328.0,,,,,0.0,0.0,,,,,,
50%,655.0,,,,,0.0,0.0,,,,,,
75%,982.0,,,,,1.0,0.0,,,,,,


Note the issues with the data:

- Age is of type object even though it should be numeric; Fare is also of type object.
- Age, Cabin and Life Boat all show '?' as their most frequently occurring value. Inspect the other features to determine whether '?' is used in place of NaN and, if so, replace every occurrence.


## Convert Age to Numeric

In [163]:
# Coerce Age to a numeric dtype; entries that cannot be parsed (such as '?') become NaN
age_as_number = pd.to_numeric(df['Age'], errors='coerce')
df['Age'] = age_as_number

In [164]:
# Round fractional ages to zero decimal places; the column remains floating point
df['Age'] = df['Age'].round(decimals=0)

## Loop through the categorical features and inspect the possible values

In [165]:
# Columns treated as categorical when inspecting their distinct values
cat_feats = [
    'Class',
    'Name',
    'Gender',
    'Ticket Number',
    'Cabin',
    'Port',
    'Life Boat',
    'Survived',
]

In [166]:
# Print a banner followed by the distinct values of each categorical column,
# which makes placeholder entries such as '?' easy to spot.
for column_name in cat_feats:
    print(f"********* Feature: {column_name} *********\n")
    distinct_values = df[column_name].unique()
    print(f"{distinct_values}\n")

********* Feature: Class *********

['First' 'Second' 'Third']

********* Feature: Name *********

['Allen, Miss. Elisabeth Walton' 'Allison, Master. Hudson Trevor'
 'Allison, Miss. Helen Loraine' ... 'Zakarian, Mr. Mapriededer'
 'Zakarian, Mr. Ortin' 'Zimmerman, Mr. Leo']

********* Feature: Gender *********

['Female' 'Male']

********* Feature: Ticket Number *********

['24160' '113781' '19952' '13502' '112050' '11769' 'PC 17609' 'PC 17757'
 'PC 17477' '19877' '27042' 'PC 17318' 'PC 17558' '11813' '13050' '11751'
 '111369' 'PC 17483' '13905' '11967' 'PC 17760' '110564' '113784' '112277'
 '36928' '113783' '110489' 'PC 17608' '113505' '111427' '113054'
 'PC 17591' '112379' 'PC 17610' '16966' '113050' '113798' 'PC 17476'
 'PC 17606' 'PC 17755' '695' '113059' '113760' '19924' '17770'
 'W.E.P. 5734' '113806' '110152' 'PC 17594' '112051' '13508' '110465'
 '5727' 'PC 17756' '11770' '113791' 'WE/P 5735' '112901' 'PC 17599'
 '113055' '113804' 'F.C. 12750' '17474' '33638' 'PC 17761' '11755'
 

## Replace "?" with NAs

In [167]:
# Show the passengers whose port of embarkation is recorded as the '?' placeholder
df.loc[df['Port'].eq('?')]

Unnamed: 0,Row Number,Class,Name,Gender,Age,No of Siblings,No of Parents,Ticket Number,Fare,Cabin,Port,Life Boat,Survived
168,169,First,"Icard, Miss. Amelie",Female,38.0,0,0,113572,80,B28,?,6,Yes
284,285,First,"Stone, Mrs. George Nelson (Martha Evelyn)",Female,62.0,0,0,113572,80,B28,?,6,Yes


In [168]:
# Treat every '?' placeholder as a missing value across the whole frame.
# NOTE(review): the df.dtypes output that follows still lists Fare as object after
# this step; convert it with pd.to_numeric if no later cell already does so.
df = df.replace(to_replace='?', value=np.nan)

## Identify Missing Values

In [169]:
# Column dtypes of the frame at this point in the cleaning
df.dtypes

Row Number          int64
Class              object
Name               object
Gender             object
Age               float64
No of Siblings      int64
No of Parents       int64
Ticket Number      object
Fare               object
Cabin              object
Port               object
Life Boat          object
Survived           object
dtype: object

In [170]:
# Count the missing entries in each column
missing_mask = df.isna()
missing_mask.sum()

Row Number           0
Class                0
Name                 0
Gender               0
Age                263
No of Siblings       0
No of Parents        0
Ticket Number        0
Fare                 1
Cabin             1014
Port                 2
Life Boat          823
Survived             0
dtype: int64

In [171]:
# Keep only the passengers for whom both Fare and Port are known
has_fare = df['Fare'].notna()
has_port = df['Port'].notna()
df = df[has_fare & has_port]

In [172]:
# List the current column labels
df.columns

Index(['Row Number', 'Class', 'Name', 'Gender', 'Age', 'No of Siblings',
       'No of Parents', 'Ticket Number', 'Fare', 'Cabin', 'Port', 'Life Boat',
       'Survived'],
      dtype='object')

In [173]:
# Remove the columns that are not carried forward: Cabin, Life Boat and Row Number
columns_to_drop = ['Cabin', 'Life Boat', 'Row Number']
df = df.drop(columns=columns_to_drop)

In [174]:
# Re-count the missing entries per column for the frame as it stands now
df.isna().sum(axis=0)

Class               0
Name                0
Gender              0
Age               263
No of Siblings      0
No of Parents       0
Ticket Number       0
Fare                0
Port                0
Survived            0
dtype: int64

In [175]:
# Display the passengers whose Age is missing
age_missing = df['Age'].isna()
df[age_missing]

Unnamed: 0,Class,Name,Gender,Age,No of Siblings,No of Parents,Ticket Number,Fare,Port,Survived
15,First,"Baumann, Mr. John D",Male,,0,0,PC 17318,25.925,Southampton,No
37,First,"Bradley, Mr. George (""George Arthur Brayton"")",Male,,0,0,111427,26.55,Southampton,Yes
40,First,"Brewe, Dr. Arthur Jackson",Male,,0,0,112379,39.6,Cherbourg,No
46,First,"Cairns, Mr. Alexander",Male,,0,0,113798,31,Southampton,No
59,First,"Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genev...",Female,,0,0,17770,27.7208,Cherbourg,Yes
...,...,...,...,...,...,...,...,...,...,...
1293,Third,"Williams, Mr. Howard Hugh ""Harry""",Male,,0,0,A/5 2466,8.05,Southampton,No
1297,Third,"Wiseman, Mr. Phillippe",Male,,0,0,A/4. 34244,7.25,Southampton,No
1302,Third,"Yousif, Mr. Wazli",Male,,0,0,2647,7.225,Cherbourg,No
1303,Third,"Yousseff, Mr. Gerious",Male,,0,0,2627,14.4583,Cherbourg,No


In [176]:
# Mean Age for every combination of gender, class, sibling count and parent count.
# NOTE(review): presumably the basis for imputing the missing ages; confirm against later cells.
age_group_keys = ['Gender', 'Class', 'No of Siblings', 'No of Parents']
df.groupby(age_group_keys)['Age'].mean()

Gender  Class  No of Siblings  No of Parents
Female  First  0               0                35.826087
                               1                36.625000
                               2                31.666667
               1               0                37.916667
                               1                45.333333
                                                  ...    
Male    Third  3               2                 7.000000
               4               1                 7.100000
                               2                 6.800000
               5               2                 8.750000
               8               2                14.000000
Name: Age, Length: 84, dtype: float64