# 01 — EDA & Cleaning
This notebook contains the data exploration and cleaning steps for the Titanic dataset.

In [8]:
import sagemaker
import boto3
import pandas as pd
import numpy as np

# --- 1. SageMaker Session & Role ---
# The session is created first because the default bucket and the region
# are both read from it; the execution role is looked up separately.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_region_name
prefix = 'titanic-ml'  # Project prefix for S3 organization

# Echo the resolved configuration, one setting per line.
print(
    f"SageMaker Role ARN: {role}",
    f"S3 Bucket: {bucket}",
    f"S3 Prefix: {prefix}",
    sep="\n",
)

SageMaker Role ARN: arn:aws:iam::323649454838:role/service-role/AmazonSageMaker-ExecutionRole-20251106T213876
S3 Bucket: sagemaker-ap-southeast-1-323649454838
S3 Prefix: titanic-ml


Target Distribution (Survived):
Survived
0    61.616162
1    38.383838
Name: proportion, dtype: float64
------------------------------

Descriptive Statistics for Numerical Features:


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


------------------------------

Features with Missing Values:
Cabin       687
Age         177
Embarked      2
dtype: int64


In [11]:
# Class balance of the target, expressed as percentages of all rows
print("Target Distribution (Survived):")
print(df['Survived'].value_counts(normalize=True).mul(100))
print("-" * 30)

# Summary statistics (count, mean, std, quartiles, min/max) via describe()
print("\nDescriptive Statistics for Numerical Features:")
display(df.describe())
print("-" * 30)

# Per-column null counts, keeping only columns with at least one null and
# ordering them from most to least missing
missing_values = (
    df.isnull()
    .sum()
    .loc[lambda null_counts: null_counts > 0]
    .sort_values(ascending=False)
)
print("\nFeatures with Missing Values:")
print(missing_values)

Target Distribution (Survived):
Survived
0    61.616162
1    38.383838
Name: proportion, dtype: float64
------------------------------

Descriptive Statistics for Numerical Features:


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


------------------------------

Features with Missing Values:
Cabin       687
Age         177
Embarked      2
dtype: int64


In [12]:
# 3.1 Drop columns that are not used directly as model inputs.
# PassengerId is an identifier, Cabin is mostly null (687 of 891 rows missing),
# Ticket is treated as too complex to use as-is, and Name is only needed to
# derive Title below (which reads it from the original `df`, not `df_clean`).
df_clean = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# 3.2 Feature Engineering: Extract Title from Name
# A common feature for this dataset, often related to social status/age.
# The title is the run of letters between a space and the first following "."
# (e.g. "Surname, Mr. Given" -> "Mr"). A raw string is used so that "\." reaches
# the regex engine as an escaped dot rather than an invalid Python string escape.
df_clean['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Collapse uncommon titles into 'Rare' and fold spelling variants into their
# common form, in a single replace pass.
RARE_TITLES = ['Lady', 'Countess', 'Capt', 'Col',
               'Don', 'Dr', 'Major', 'Rev', 'Sir',
               'Jonkheer', 'Dona']
TITLE_ALIASES = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
title_map = {title: 'Rare' for title in RARE_TITLES}
title_map.update(TITLE_ALIASES)
df_clean['Title'] = df_clean['Title'].replace(title_map)

# 3.3 Impute Missing Values

# Fill missing 'Age' with the median age of the passenger's 'Title' group.
# Only missing ages are touched. If a title group has no observed ages, or the
# row has no extracted title, fall back to the overall median so no gap is left.
overall_median_age = df_clean['Age'].median()
title_median_age = df_clean.groupby('Title')['Age'].transform('median')
df_clean['Age'] = (
    df_clean['Age']
    .fillna(title_median_age)
    .fillna(overall_median_age)
)

# Fill missing 'Embarked' with the mode (most frequent port)
df_clean['Embarked'] = df_clean['Embarked'].fillna(df_clean['Embarked'].mode()[0])

# Fill missing 'Fare' with the median (robust to outliers). The summary above
# shows no missing Fare in this data (count 891), so this is a safeguard only.
df_clean['Fare'] = df_clean['Fare'].fillna(df_clean['Fare'].median())

# 3.4 Create Family Size feature: siblings/spouses + parents/children + the passenger
df_clean['FamilySize'] = df_clean['SibSp'] + df_clean['Parch'] + 1
df_clean = df_clean.drop(columns=['SibSp', 'Parch'])  # Drop the components

# 3.5 Check final missing values
print("\nMissing values after cleaning:")
print(df_clean.isnull().sum().sum())  # Should be 0

print("\nCleaned DataFrame Info:")
df_clean.info()


Missing values after cleaning:
0

Cleaned DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Pclass      891 non-null    int64  
 2   Sex         891 non-null    object 
 3   Age         891 non-null    float64
 4   Fare        891 non-null    float64
 5   Embarked    891 non-null    object 
 6   Title       891 non-null    object 
 7   FamilySize  891 non-null    int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 55.8+ KB


In [13]:
# Gather the names of all object-dtype columns; these are the categorical features
categorical_cols = list(df_clean.select_dtypes(include='object').columns)

# One-hot encode the categorical features so downstream Sklearn/PyTorch code
# receives indicator columns instead of strings. drop_first=True omits the
# first level of each feature, so k categories become k-1 columns.
df_processed = pd.get_dummies(
    df_clean,
    columns=categorical_cols,
    drop_first=True,
)

print("\nFinal Processed DataFrame:")
display(df_processed.head())
print(f"Final Shape: {df_processed.shape}")

# Optional: persist the processed data for the next step.
# On SageMaker this would typically go to S3; for local testing:
# df_processed.to_csv('../data/processed/processed_train.csv', index=False)
# print("\nProcessed data saved to ../data/processed/processed_train.csv")


Final Processed DataFrame:


Unnamed: 0,Survived,Pclass,Age,Fare,FamilySize,Sex_male,Embarked_Q,Embarked_S,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,0,3,22.0,7.25,2,True,False,True,False,True,False,False
1,1,1,38.0,71.2833,2,False,False,False,False,False,True,False
2,1,3,26.0,7.925,1,False,False,True,True,False,False,False
3,1,1,35.0,53.1,2,False,False,True,False,False,True,False
4,0,3,35.0,8.05,1,True,False,True,False,True,False,False


Final Shape: (891, 12)
