### Import Libraries & Load Data

In [1]:
import warnings

import numpy as np
import pandas as pd

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Read the raw passenger table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data/Titanic-Dataset.csv")
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Basic Inspection

In [2]:
# Print a per-column summary: dtype, non-null count, and overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [3]:
# Count the missing entries in each column.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### Standardize Column Names

In [4]:
# Normalize each header: trim surrounding whitespace, lowercase it,
# and replace inner spaces with underscores.
df.columns = [
    column_name.strip().lower().replace(" ", "_")
    for column_name in df.columns
]

df.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

### Handle Missing Values – Age

In [5]:
# Impute missing ages with the column median.
# Assign the result back to the frame: calling fillna(inplace=True) on the
# selected column is chained assignment, which is deprecated and does not
# update `df` when pandas Copy-on-Write is active.
df['age'] = df['age'].fillna(df['age'].median())

# Sanity check: number of ages still missing (expected 0).
df['age'].isnull().sum()

np.int64(0)

### Handle Missing Values – Embarked

In [6]:
# Impute missing embarkation ports with the most frequent value.
# Assign the result back to the frame: calling fillna(inplace=True) on the
# selected column is chained assignment, which is deprecated and does not
# update `df` when pandas Copy-on-Write is active.
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])

# Sanity check: number of embarkation values still missing (expected 0).
df['embarked'].isnull().sum()

np.int64(0)

### Handle Missing Values – Cabin

In [7]:
# Fraction of rows whose cabin value is missing.
df['cabin'].isna().mean()

np.float64(0.7710437710437711)

In [8]:
# Remove the cabin column from the frame.
# Reassigning (rather than inplace=True) together with errors='ignore'
# keeps this cell safe to re-run once the column is already gone.
df = df.drop(columns=['cabin'], errors='ignore')

In [9]:
# Enforce column types in one pass:
#  - age / fare: numeric, with unparseable entries coerced to NaN
#  - pclass: string labels instead of integers (a plain str cast,
#    not a pandas `category` dtype)
df = df.assign(
    age=pd.to_numeric(df['age'], errors='coerce'),
    fare=pd.to_numeric(df['fare'], errors='coerce'),
    pclass=df['pclass'].astype(str),
)

df.dtypes

passengerid      int64
survived         int64
pclass          object
name            object
sex             object
age            float64
sibsp            int64
parch            int64
ticket          object
fare           float64
embarked        object
dtype: object

### Remove Duplicates

In [10]:
# Keep the first occurrence of each fully duplicated row.
df = df.drop_duplicates(keep='first')

# Sanity check: number of duplicate rows remaining (expected 0).
df.duplicated(keep='first').sum()

np.int64(0)

### Final Missing Value Check

In [11]:
# Final per-column count of missing entries after cleaning.
df.isna().sum()

passengerid    0
survived       0
pclass         0
name           0
sex            0
age            0
sibsp          0
parch          0
ticket         0
fare           0
embarked       0
dtype: int64

### Save Cleaned Dataset

In [12]:
# Write the cleaned frame to disk, omitting the index column.
df.to_csv(path_or_buf="titanic_cleaned.csv", index=False)

print("Cleaned dataset saved as titanic_cleaned.csv")

Cleaned dataset saved as titanic_cleaned.csv


### Create a Simple Cleaning Log

In [13]:
# Summary of the cleaned frame: its size and the per-column missing counts.
cleaning_log = dict(
    rows_after_cleaning=df.shape[0],
    columns_after_cleaning=df.shape[1],
    missing_values_remaining=df.isnull().sum().to_dict(),
)

# One row per log entry: the entry name in "Step", its value in "Details".
log_df = pd.DataFrame(
    {
        "Step": list(cleaning_log.keys()),
        "Details": list(cleaning_log.values()),
    }
)
log_df.to_csv("cleaning_log.csv", index=False)

log_df

Unnamed: 0,Step,Details
0,rows_after_cleaning,891
1,columns_after_cleaning,11
2,missing_values_remaining,"{'passengerid': 0, 'survived': 0, 'pclass': 0,..."
