Titanic Data Preprocessing – Part 1 (Missing Values)

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import os

# List all files in input folder
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))




/kaggle/input/titanic-dataset/Titanic-Dataset.csv


In [3]:
import pandas as pd

train = pd.read_csv("/kaggle/input/titanic-dataset/Titanic-Dataset.csv")
test = pd.read_csv("/kaggle/input/titanic-dataset/Titanic-Dataset.csv")

train.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


 ==============================
# 3. Missing Values Overview
# ==============================

In [4]:

print(train.isnull().sum())




PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [5]:
print(train.columns)



Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


# ==============================
# 5. Handle 'Embarked' Missing Values
# ==============================
# Fill missing Embarked with mode (most frequent value)

In [6]:

train['Embarked'].fillna(train['Embarked'].mode()[0])


0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

# Replace rare titles # Map similar titles # Fill Age missing values with median Age per Title group

In [7]:
#
train['Title'] = train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
test['Title'] = test['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)


rare_titles = ['Lady', 'Countess','Capt','Col','Don','Dr','Major','Rev',
               'Sir','Jonkheer','Dona']
train['Title'] = train['Title'].replace(rare_titles, 'Rare')
test['Title'] = test['Title'].replace(rare_titles, 'Rare')


train['Title'] = train['Title'].replace(['Mlle','Ms'], 'Miss')
train['Title'] = train['Title'].replace('Mme', 'Mrs')
test['Title'] = test['Title'].replace(['Mlle','Ms'], 'Miss')
test['Title'] = test['Title'].replace('Mme', 'Mrs')


train['Age'] = train.groupby('Title')['Age'].transform(lambda x: x.fillna(x.median()))
test['Age'] = test.groupby('Title')['Age'].transform(lambda x: x.fillna(x.median()))


# ==============================
# 7. Handle 'Fare' Missing Value in Test
# ==============================

In [8]:

test['Fare'].fillna(test['Fare'].median())


0       7.2500
1      71.2833
2       7.9250
3      53.1000
4       8.0500
        ...   
886    13.0000
887    30.0000
888    23.4500
889    30.0000
890     7.7500
Name: Fare, Length: 891, dtype: float64

# ==============================
# 8. Verify Missing Values
# ==============================

In [9]:

print("Train missing values:\n", train.isnull().sum())
print("\nTest missing values:\n", test.isnull().sum())


Train missing values:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Title            0
dtype: int64

Test missing values:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Title            0
dtype: int64


: Encode Categorical Features

In [10]:
from sklearn.preprocessing import LabelEncoder

# Encode Sex
le = LabelEncoder()
train['Sex'] = le.fit_transform(train['Sex'])
test['Sex'] = le.transform(test['Sex'])

# One-hot encode Embarked + Title
train = pd.get_dummies(train, columns=['Embarked', 'Title'], drop_first=True)
test = pd.get_dummies(test, columns=['Embarked', 'Title'], drop_first=True)



In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
num_cols = ['Age', 'Fare']

train[num_cols] = scaler.fit_transform(train[num_cols])
test[num_cols] = scaler.transform(test[num_cols])


In [12]:
print("Train shape:", train.shape)
print("Test shape:", test.shape)


Train shape: (891, 17)
Test shape: (891, 17)
