In [42]:
import sys
import os

# Resolve the project root as the parent of the current working directory and make
# it importable, so the `scripts.*` imports in this cell can be resolved.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append: re-running this cell must not stack duplicate sys.path entries.
if project_root not in sys.path:
    sys.path.append(project_root)

# Import reusable scripts
from scripts.data_loader import load_data  # Load dataset
from scripts.eda import summarize_data  # EDA functions
from scripts.preprocessing import fill_missing_values  # Missing value imputation



In [47]:
# Refresh the helper modules so that edits made to their source files after the
# first import take effect without restarting the kernel.

import importlib

import scripts.eda
import scripts.preprocessing

# Reload every module whose functions are re-imported below. A `from ... import`
# on a module that was NOT reloaded only rebinds the old, cached function object.
importlib.reload(scripts.eda)
importlib.reload(scripts.preprocessing)

# Rebind the notebook-level names to the freshly reloaded implementations.
from scripts.eda import summarize_data
from scripts.preprocessing import fill_missing_values

In [48]:

# 🔹 Step 1: Load Data and Data Quality Assessment

# Read the dataset through the project's loader helper.
df = load_data(data_path="../data/processed", filename="train_eda.csv")

# A bare `df.head()` in the middle of a cell is evaluated and thrown away (only the
# last expression of a cell is rendered), so the preview is displayed explicitly.
# `display` is injected into the namespace by the IPython/Jupyter kernel.
display(df.head())

# Print the column/dtype overview and the per-column missing-value counts.
summarize_data(df, show=('isnull', 'info'))

📌 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
 12  Title        891 non-null    object 
 13  FamilySize   891 non-null    int64  
dtypes: float64(2), int64(6), object(6)
memory usage: 97.6+ KB
--------------------------------------------------

📌 Missing Values:
Age         177
Cabin       687
Embarked      2
dtype: int64
-------------

Strategy (applied to `Age` in two passes):

* Median Imputation: Fill the missing values with the column median as a baseline.
* Predictive Imputation: Build a regression model to refine the imputed values, using features such as Pclass, Title, and Fare.

In [49]:
# 🔹 Step 2: Handle Missing Values


# Handle Missing Values for Age (Hybrid Approach)

# Step 1: Fill missing 'Age' with median to reduce model bias
df = fill_missing_values(df, num_strategy="median", columns=["Age"])
print("✅ Missing values in 'Age' initially filled with median.")

# Step 2: Refine 'Age' values using Predictive Imputation based on related features
# NOTE(review): Step 1 presumably leaves no NaN in 'Age'. Confirm that the
# "predictive" strategy of fill_missing_values still knows which rows to refine;
# if it only targets NaN cells, this second pass would change nothing.
df = fill_missing_values(df, num_strategy="predictive", columns=["Age"])
print("✅ Missing 'Age' values refined using predictive imputation.")

# Pass `show` as a real one-element tuple: ('isnull') without the trailing comma is
# just the string 'isnull', which differs in type from calls like ('isnull', 'info').
summarize_data(df, show=('isnull',))

✅ Missing values in 'Age' initially filled with median.
✅ Missing 'Age' values refined using predictive imputation.

📌 Missing Values:
Cabin       687
Embarked      2
dtype: int64
--------------------------------------------------


In [50]:
# Handle Missing Values for Embarked (Mode Imputation)

# Fill the missing 'Embarked' entries using the helper's "mode" categorical strategy.
df = fill_missing_values(df, cat_strategy="mode", columns=["Embarked"])
print("✅ Missing 'Embarked' values filled using mode imputation.")

# Pass `show` as a real one-element tuple: ('isnull') without the trailing comma is
# just the string 'isnull', which differs in type from calls like ('isnull', 'info').
summarize_data(df, show=('isnull',))

✅ Missing 'Embarked' values filled using mode imputation.

📌 Missing Values:
Cabin    687
dtype: int64
--------------------------------------------------


In [51]:
# Handle Missing Values for Cabin

# Only transform while 'Cabin' is still present, so re-running this cell after the
# column has been dropped does not fail with a KeyError on df['Cabin'].
if 'Cabin' in df.columns:
    # Create a binary indicator for missing 'Cabin' (1 = value was missing)
    df['Cabin_Missing'] = df['Cabin'].isna().astype(int)
    print(f"✅ Created 'Cabin_Missing' indicator. Missing count: {df['Cabin_Missing'].sum()}")

    # Drop the original 'Cabin' column due to high missing rate.
    # Reassign instead of using inplace=True so the step stays an explicit rebinding.
    df = df.drop(columns=['Cabin'])
    print("✅ Dropped 'Cabin' column due to high missing rate.")

# Pass `show` as a real one-element tuple: ('isnull') without the trailing comma is
# just the string 'isnull', which differs in type from calls like ('isnull', 'info').
summarize_data(df, show=('isnull',))

✅ Created 'Cabin_Missing' indicator. Missing count: 687
✅ Dropped 'Cabin' column due to high missing rate.

📌 Missing Values:
No missing values.
--------------------------------------------------


In [53]:
# Final verification: print a header, then the structural overview together with
# the missing-value report for the current state of the frame.
summary_sections = ('isnull', 'info')
print("\n📊 **Remaining Missing Values Summary:**")
summarize_data(df, show=summary_sections)


📊 **Remaining Missing Values Summary:**
📌 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PassengerId    891 non-null    int64  
 1   Survived       891 non-null    int64  
 2   Pclass         891 non-null    int64  
 3   Name           891 non-null    object 
 4   Sex            891 non-null    object 
 5   Age            891 non-null    float64
 6   SibSp          891 non-null    int64  
 7   Parch          891 non-null    int64  
 8   Ticket         891 non-null    object 
 9   Fare           891 non-null    float64
 10  Embarked       891 non-null    object 
 11  Title          891 non-null    object 
 12  FamilySize     891 non-null    int64  
 13  Cabin_Missing  891 non-null    int32  
dtypes: float64(2), int32(1), int64(6), object(5)
memory usage: 94.1+ KB
--------------------------------------------------

📌 Missing