In [68]:
#import necessary libraries 
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

In [69]:
#import dataset 
df = pd.read_csv("titanicDataset\\test.csv")
#print the data shape
print(df.shape)

(418, 11)


In [70]:
#print the dataset
df.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [71]:
#Check the data types and null elements of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


# Part 2: Data Preprocessing & Feature Engineering

## Step 1:  Handle missing values 
Goal: Fill or remove missing values so the model can train properly.
- Check for missing values first

In [72]:
df.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

## Step 2:  Feature Engineering
Goal: Create new features to improve model performance.
- Extract Title from Name
- Convert ‘Sex’ to Numeric (Label Encoding)
- Create a new IsAlone Feature
- Extract Deck from Cabin
- Filling NaN values in ‘Embarked’ and 'Deck'
- Convert ‘Embarked’ to Numerical (One-Hot Encoding)

### 1. Extract Titles from names.

In [73]:
#import regular expressions 
import re
pattern =  r' ([A-Za-z]+)\.'  #This pattern captures the just the titile from the name
df['Title'] = df['Name'].str.extract(pattern)
#The codes below would help to reduce the categories for better model prediciton
df['Title'] = df['Title'].replace(['Mlle', 'Ms'], 'Miss')
df['Title'] = df['Title'].replace(['Mme', 'Countess', 'Lady', 'Dona'], 'Mrs')
df['Title'] = df['Title'].replace(['Capt', 'Col', 'Dr', 'Major', 'Rev'], 'Officer')
print(df['Title'].unique())

['Mr' 'Mrs' 'Miss' 'Master' 'Officer']


### 2. Convert 'Sex' into Numeric

In [74]:
if df['Sex'].dtypes == 'object':
    df['Sex'] = df['Sex'].map({"male" : 0, "female" : 1})

print(df['Sex'].dtypes, df['Sex'].unique())

int64 [0 1]


### 3. Create a new IsAlone column

In [75]:
df['FamilySize'] = df['SibSp'] + df['Parch']
df['IsAlone'] = (df['FamilySize'] == 0).astype(int)

### 4. Extract Deck from Cabin

In [76]:
df['Deck'] = df['Cabin'].str[0]
df['Deck'] = df['Deck'].fillna('Unknown')

### 5. Filling NaN values in ‘Embarked’ and 'Deck'

In [77]:
df['Embarked'].unique()

array(['Q', 'S', 'C'], dtype=object)

There is no nan value in emabarked column

### 6. Convert ‘Embarked’ to Numerical (One-Hot Encoding)

In [78]:
if 'Embarked' in df:
    df = pd.get_dummies(df, columns = ['Embarked'], drop_first = True) # We use drop_first = True to delete the first colum for reducing multicolinairty
df['Embarked_Q'] = df['Embarked_Q'].astype(int)
df['Embarked_S'] = df['Embarked_S'].astype(int)
df.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Title,FamilySize,IsAlone,Deck,Embarked_Q,Embarked_S
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,Mr,0,1,Unknown,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,Mrs,1,0,Unknown,0,1
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,Mr,0,1,Unknown,1,0
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,Mr,0,1,Unknown,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,Mrs,2,0,Unknown,0,1


## Step 3:   Handle Outliers (Fare & Age)
Goal: Remove or transform extreme values.
- Handle High Fare Outliers
- Normalize Age

In [79]:
df['Fare'] = df['Fare'].apply(lambda x: np.log(x + 1)) #Taking log value to handle the extreme outliers in fare colmn

In [80]:
#Performing group based impuation for age on feature engineered dataset.
df['Age'] = df.groupby(['Sex', 'Pclass'])['Age'].transform(lambda x: x.fillna(x.median()))
#Normalising the age column again
df['Age'] = ( df['Age'] - df['Age'].mean()) / (df['Age'].std()) #Normalising the Age since age is a more normally distributed variable.

### 7. Convert ‘PClass’ to Numerical (One-Hot Encoding)

In [81]:
if 'Pclass' in df:
    df = pd.get_dummies(df, columns = ['Pclass'], drop_first = True) # We use drop_first = True to delete the first colum for reducing multicolinairty
df['Pclass_2'] = df['Pclass_2'].astype(int)
df['Pclass_3'] = df['Pclass_3'].astype(int)
df.head(5)

Unnamed: 0,PassengerId,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Title,FamilySize,IsAlone,Deck,Embarked_Q,Embarked_S,Pclass_2,Pclass_3
0,892,"Kelly, Mr. James",0,0.398973,0,0,330911,2.178064,,Mr,0,1,Unknown,1,0,0,1
1,893,"Wilkes, Mrs. James (Ellen Needs)",1,1.357646,1,0,363272,2.079442,,Mrs,1,0,Unknown,0,1,0,1
2,894,"Myles, Mr. Thomas Francis",0,2.508054,0,0,240276,2.369075,,Mr,0,1,Unknown,1,0,1,0
3,895,"Wirz, Mr. Albert",0,-0.176231,0,0,315154,2.268252,,Mr,0,1,Unknown,0,1,0,1
4,896,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,-0.5597,1,1,3101298,2.586824,,Mrs,2,0,Unknown,0,1,0,1


In [82]:
df.isnull().sum()

PassengerId      0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Title            0
FamilySize       0
IsAlone          0
Deck             0
Embarked_Q       0
Embarked_S       0
Pclass_2         0
Pclass_3         0
dtype: int64

In [83]:
df[df['Fare'].isna()]


Unnamed: 0,PassengerId,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Title,FamilySize,IsAlone,Deck,Embarked_Q,Embarked_S,Pclass_2,Pclass_3
152,1044,"Storey, Mr. Thomas",0,2.393013,0,0,3701,,,Mr,0,1,Unknown,0,1,0,1


In [84]:
mean_fare = df['Fare'].mean()
df = df.fillna({'Fare' : mean_fare})
df.isnull().sum()

PassengerId      0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Title            0
FamilySize       0
IsAlone          0
Deck             0
Embarked_Q       0
Embarked_S       0
Pclass_2         0
Pclass_3         0
dtype: int64

In [85]:
df.head(5)

Unnamed: 0,PassengerId,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Title,FamilySize,IsAlone,Deck,Embarked_Q,Embarked_S,Pclass_2,Pclass_3
0,892,"Kelly, Mr. James",0,0.398973,0,0,330911,2.178064,,Mr,0,1,Unknown,1,0,0,1
1,893,"Wilkes, Mrs. James (Ellen Needs)",1,1.357646,1,0,363272,2.079442,,Mrs,1,0,Unknown,0,1,0,1
2,894,"Myles, Mr. Thomas Francis",0,2.508054,0,0,240276,2.369075,,Mr,0,1,Unknown,1,0,1,0
3,895,"Wirz, Mr. Albert",0,-0.176231,0,0,315154,2.268252,,Mr,0,1,Unknown,0,1,0,1
4,896,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,-0.5597,1,1,3101298,2.586824,,Mrs,2,0,Unknown,0,1,0,1


In [86]:
#Drop unnecessary columns
df = df.drop( ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin'], axis = 1)
new_order = ['Title', 'Age', 'Sex', 'Fare', 'Pclass_2',  'Pclass_3', 'Deck', 'FamilySize', 'IsAlone', 'Embarked_Q', 'Embarked_S']
df = df[new_order]
df.head()

Unnamed: 0,Title,Age,Sex,Fare,Pclass_2,Pclass_3,Deck,FamilySize,IsAlone,Embarked_Q,Embarked_S
0,Mr,0.398973,0,2.178064,0,1,Unknown,0,1,1,0
1,Mrs,1.357646,1,2.079442,0,1,Unknown,1,0,0,1
2,Mr,2.508054,0,2.369075,1,0,Unknown,0,1,1,0
3,Mr,-0.176231,0,2.268252,0,1,Unknown,0,1,0,1
4,Mrs,-0.5597,1,2.586824,0,1,Unknown,2,0,0,1


#### Encoding methods for Name and Deck

In [87]:
#Deck: for now lets use label encoding because deck has an order.
print(df['Deck'].unique())

['Unknown' 'B' 'E' 'A' 'C' 'D' 'F' 'G']


In [88]:
deck_mapping = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T':8, 'Unknown': 0}
df['Deck'] = df['Deck'].map(deck_mapping)
#Normal question, do we need to scale?
# Since Deck is a category (A, B, C... → 1, 2, 3...), it doesn’t represent a numerical value like Age or Fare.
# Scaling is only needed for continuous numerical data like Age and Fare.

In [89]:
df.head(5)

Unnamed: 0,Title,Age,Sex,Fare,Pclass_2,Pclass_3,Deck,FamilySize,IsAlone,Embarked_Q,Embarked_S
0,Mr,0.398973,0,2.178064,0,1,0,0,1,1,0
1,Mrs,1.357646,1,2.079442,0,1,0,1,0,0,1
2,Mr,2.508054,0,2.369075,1,0,0,0,1,1,0
3,Mr,-0.176231,0,2.268252,0,1,0,0,1,0,1
4,Mrs,-0.5597,1,2.586824,0,1,0,2,0,0,1


In [90]:
#Name: for now lets use one hot encoding because title has no order.
print(df['Title'].unique())

['Mr' 'Mrs' 'Miss' 'Master' 'Officer']


In [91]:
if 'Title' in df:
    if df['Title'].dtypes == 'object':
        df = pd.get_dummies(df, columns = ['Title']) # We use drop_first = True to delete the first colum for reducing multicolinairty
df['Title_Master'] = df['Title_Master'].astype(int)
df['Title_Miss'] = df['Title_Miss'].astype(int)
df['Title_Mr'] = df['Title_Mr'].astype(int)
df['Title_Mrs'] = df['Title_Mrs'].astype(int)
df['Title_Officer'] = df['Title_Officer'].astype(int)

In [94]:
#Rearranging the dataset
new_order_post_title = ['Title_Officer' ,'Title_Master','Title_Mr',
                        'Title_Miss', 'Title_Mrs','Age', 'Sex', 'Fare', 'Pclass_2', 'Pclass_3', 'Deck', 'FamilySize', 'IsAlone', 'Embarked_Q', 'Embarked_S']
df = df[new_order_post_title]

In [95]:
df.head(5)

Unnamed: 0,Title_Officer,Title_Master,Title_Mr,Title_Miss,Title_Mrs,Age,Sex,Fare,Pclass_2,Pclass_3,Deck,FamilySize,IsAlone,Embarked_Q,Embarked_S
0,0,0,1,0,0,0.398973,0,2.178064,0,1,0,0,1,1,0
1,0,0,0,0,1,1.357646,1,2.079442,0,1,0,1,0,0,1
2,0,0,1,0,0,2.508054,0,2.369075,1,0,0,0,1,1,0
3,0,0,1,0,0,-0.176231,0,2.268252,0,1,0,0,1,0,1
4,0,0,0,0,1,-0.5597,1,2.586824,0,1,0,2,0,0,1


### Exporting dataframe into csv

In [97]:
from pathlib import Path  
filepath = Path('C:\\Users\\karth\\Machine_Learning\\titanicPassengerSurvival\\titanicDataset\\processed_test.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
df.to_csv(filepath) 

In [98]:
#import dataset 
df = pd.read_csv("titanicDataset\\processed_test.csv")
#print the data shape
print(df.shape)

(418, 16)


In [99]:
#print the dataset
df.head(10)

Unnamed: 0.1,Unnamed: 0,Title_Officer,Title_Master,Title_Mr,Title_Miss,Title_Mrs,Age,Sex,Fare,Pclass_2,Pclass_3,Deck,FamilySize,IsAlone,Embarked_Q,Embarked_S
0,0,0,0,1,0,0,0.398973,0,2.178064,0,1,0,0,1,1,0
1,1,0,0,0,0,1,1.357646,1,2.079442,0,1,0,1,0,0,1
2,2,0,0,1,0,0,2.508054,0,2.369075,1,0,0,0,1,1,0
3,3,0,0,1,0,0,-0.176231,0,2.268252,0,1,0,0,1,0,1
4,4,0,0,0,0,1,-0.5597,1,2.586824,0,1,0,2,0,0,1
5,5,0,0,1,0,0,-1.173251,0,2.324836,0,1,0,0,1,0,1
6,6,0,0,0,1,0,0.053851,1,2.155152,0,1,0,0,1,1,0
7,7,0,0,1,0,0,-0.252925,0,3.401197,1,0,0,2,0,0,1
8,8,0,0,0,0,1,-0.866475,1,2.107689,0,1,0,0,1,0,0
9,9,0,0,1,0,0,-0.636394,0,3.224858,0,1,0,2,0,0,1
