# <span style="font-weight:bold"> Step 2: Data Cleaning </span>
---
---

Importing and setting up the environment

In [1]:
import pandas as pd

# Load the raw splits; exact duplicate rows (if any) are removed from the
# training frame right after it is read.
df_train = pd.read_csv("../data/train.csv").drop_duplicates()
df_test = pd.read_csv("../data/test.csv")

# Working copy for preprocessing, so df_train itself stays as loaded/deduplicated
df_train_cleaned = df_train.copy()

# Sanity check: report columns present in one split but not in the other
train_cols = set(df_train.columns)
test_cols = set(df_test.columns)
print("\nColumns in Train not in Test:", train_cols.difference(test_cols))
print("Columns in Test not in Train:", test_cols.difference(train_cols))


Columns in Train not in Test: {'TARGET_LifeExpectancy'}
Columns in Test not in Train: set()


#### <span style="font-style:italic"> 1. Feature Selection </span>

In [2]:
# Features removed based on the EDA-driven feature selection
drop_features = [
    "ID",
    "Country",
    "SLS",
    "Thinness5-9years",
    "AdultMortality-Female",
    "AdultMortality-Male",
    "Measles",
    "PercentageExpenditure",
]

# Column count of the training frame before any feature is removed
print("Train Columns BEFORE Drop:", df_train.shape[1])

# Build the reduced frame from df_train instead of dropping in place on
# df_train_cleaned: an in-place drop raises KeyError when the cell is re-run
# (the columns are already gone), whereas this form gives the same result
# every time it is executed.
df_train_cleaned = df_train.drop(columns=drop_features)

# Column count after the drop
print("\nTrain Columns AFTER Drop:", df_train_cleaned.shape[1])


Train Columns BEFORE Drop: 24

Train Columns AFTER Drop: 16


#### <span style="font-style:italic"> 2. Handle Missing Values</span>


In [3]:
# === Custom Imputation Strategy ===

# Columns whose missing entries are replaced by the column median
median_impute_cols = [
    'BMI', 'GDP', 'Schooling', 'Alcohol',
    'IncomeCompositionOfResources', 'TotalExpenditure',
    'Polio', 'HIV-AIDS', 'Population', 'Thinness1-19years', 'Under5LS',
]

# Compute every median in one pass, then fill all listed columns at once;
# fillna matches each median to its column by label.
median_fill = df_train_cleaned[median_impute_cols].median()
df_train_cleaned[median_impute_cols] = (
    df_train_cleaned[median_impute_cols].fillna(median_fill)
)

# AdultMortality is the one column filled with its mean rather than its median
mean_val = df_train_cleaned['AdultMortality'].mean()
df_train_cleaned['AdultMortality'] = df_train_cleaned['AdultMortality'].fillna(mean_val)

# Report remaining nulls, restricted to columns that had gaps in df_train
# (the mask is built from the pre-imputation frame on purpose)
had_missing = df_train.isnull().sum() > 0
remaining_nulls = df_train_cleaned.isnull().sum()
separator = "-" * 30
print("Missing values after imputation:")
print(separator)
print("Train Set:", remaining_nulls[had_missing])
print(separator)

Missing values after imputation:
------------------------------
Train Set: AdultMortality                  0
Alcohol                         0
BMI                             0
Under5LS                        0
Polio                           0
TotalExpenditure                0
HIV-AIDS                        0
GDP                             0
Population                      0
Thinness1-19years               0
IncomeCompositionOfResources    0
Schooling                       0
dtype: int64
------------------------------


After completing all essential preprocessing steps—feature selection, missing-value imputation, and duplicate removal—we now save the cleaned training set to disk. This file will be used in the modeling stage.

In [4]:
# Persist the cleaned training frame for the modeling stage
# (index=False keeps the row index out of the CSV)
df_train_cleaned.to_csv(path_or_buf="../data/cleaned_train.csv", index=False)

# Confirm where the output was written
print("Cleaned datasets saved as:", "- ../data/cleaned_train.csv", sep="\n")

Cleaned datasets saved as:
- ../data/cleaned_train.csv


In [5]:
# Read the saved CSV back in so the checks below run against the file on disk
data_train = pd.read_csv(filepath_or_buffer="../data/cleaned_train.csv")

In [6]:
# Structural summary of the reloaded training data
print("\n____________DATA TRAIN____________")
data_train.info()
# A bare `data_train.head()` in the middle of a cell is evaluated and thrown
# away (only a cell's last expression is rendered), so print the preview.
print(data_train.head())
print("TRAIN DATA SHAPE:", data_train.shape)
print("______________________________")


____________DATA TRAIN____________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2071 entries, 0 to 2070
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   TARGET_LifeExpectancy         2071 non-null   float64
 1   Year                          2071 non-null   int64  
 2   Status                        2071 non-null   int64  
 3   AdultMortality                2071 non-null   float64
 4   Alcohol                       2071 non-null   float64
 5   BMI                           2071 non-null   float64
 6   Under5LS                      2071 non-null   float64
 7   Polio                         2071 non-null   float64
 8   TotalExpenditure              2071 non-null   float64
 9   Diphtheria                    2071 non-null   float64
 10  HIV-AIDS                      2071 non-null   float64
 11  GDP                           2071 non-null   float64
 12  Population                

In [7]:
# Re-verify on the reloaded file that no column contains missing values
null_counts = data_train.isnull().sum()
print("\n____________NULL COUNT____________")
print("TRAIN NULL COUNT:")
print(null_counts)


____________NULL COUNT____________
TRAIN NULL COUNT:
TARGET_LifeExpectancy           0
Year                            0
Status                          0
AdultMortality                  0
Alcohol                         0
BMI                             0
Under5LS                        0
Polio                           0
TotalExpenditure                0
Diphtheria                      0
HIV-AIDS                        0
GDP                             0
Population                      0
Thinness1-19years               0
IncomeCompositionOfResources    0
Schooling                       0
dtype: int64
