                                   `Imports + Logging`                                          

In [1]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
from dotenv import load_dotenv
import os
import logging
from sklearn.preprocessing import MinMaxScaler

# Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


                              `Load .env and MongoDB Connection`                                

In [2]:
# Step 1: Load environment variables
load_dotenv()

MONGO_URI = os.getenv("MONGO_URI")
if not MONGO_URI:
    raise ValueError("Mongo URI not found in environment.")


In [3]:
# MongoDB Connection

client = MongoClient(MONGO_URI)
db = client["healthcare"]
silver = db["heart_disease_silver"]
gold = db["heart_disease_gold"]
logging.info(" Connected to MongoDB.")

2025-06-10 04:32:18,822 - INFO -  Connected to MongoDB.


                                `Load Silver Layer`                                               

In [4]:
# Step 1: Load Silver Layer
silver_data = list(silver.find({}, {'_id': 0}))
df = pd.DataFrame(silver_data)

logging.info(f" Loaded Silver layer data: shape = {df.shape}")

2025-06-10 04:32:20,605 - INFO -  Loaded Silver layer data: shape = (920, 16)


In [5]:
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,1,0,3,145.0,233.0,True,0,150.0,False,2.3,0,0.0,0,0
1,2,67,1,0,0,160.0,286.0,False,0,108.0,True,1.5,1,3.0,1,2
2,3,67,1,0,0,120.0,229.0,False,0,129.0,True,2.6,1,2.0,2,1
3,4,37,1,0,2,130.0,250.0,False,1,187.0,False,3.5,0,0.0,1,0
4,5,41,0,0,1,130.0,204.0,False,0,172.0,False,1.4,2,0.0,1,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    int64  
 3   dataset   920 non-null    int64  
 4   cp        920 non-null    int64  
 5   trestbps  920 non-null    float64
 6   chol      920 non-null    float64
 7   fbs       920 non-null    bool   
 8   restecg   920 non-null    int64  
 9   thalch    920 non-null    float64
 10  exang     920 non-null    bool   
 11  oldpeak   920 non-null    float64
 12  slope     920 non-null    int64  
 13  ca        920 non-null    float64
 14  thal      920 non-null    int64  
 15  num       920 non-null    int64  
dtypes: bool(2), float64(5), int64(9)
memory usage: 102.6 KB


                                  `Normalize Numerical Features`                               

In [7]:
# Step 2: Normalize numerical features (except label column 'num')
#features_to_scale = df.drop(columns=['num']).columns
target_col = 'num'
features_to_scale = df.select_dtypes(include=np.number).columns.tolist()
features_to_scale.remove(target_col)

# Apply MinMax scaling
scaler = MinMaxScaler()
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])
display(df.head())
logging.info(f" Normalized features using MinMaxScaler.")

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,0.0,0.714286,1.0,0.0,1.0,0.725,0.386401,True,0.0,0.633803,False,0.556818,0.0,0.0,0.0,0
1,0.001088,0.795918,1.0,0.0,0.0,0.8,0.474295,False,0.0,0.338028,True,0.465909,0.5,1.0,0.5,2
2,0.002176,0.795918,1.0,0.0,0.0,0.6,0.379768,False,0.0,0.485915,True,0.590909,0.5,0.666667,1.0,1
3,0.003264,0.183673,1.0,0.0,0.666667,0.65,0.414594,False,0.5,0.894366,False,0.693182,0.0,0.0,0.5,0
4,0.004353,0.265306,0.0,0.0,0.333333,0.65,0.338308,False,0.0,0.788732,False,0.454545,1.0,0.0,0.5,0


2025-06-10 04:32:20,817 - INFO -  Normalized features using MinMaxScaler.


                            `Feature Selection (Correlation-based)`                                  

In [8]:
# Step: Correlation-based Feature Selection
target_col = 'num'

# 1. Compute absolute correlation with target
correlations = df.corr()[target_col].abs().sort_values(ascending=False)

# 2. Select features with correlation > 0.25
selected_features = correlations[correlations > 0.25].index.tolist()

# 3. Remove target column from selected features
if target_col in selected_features:
    selected_features.remove(target_col)

logging.info(f" Selected features (correlation > 0.25): {selected_features}")

# 4. Create final Gold DataFrame
df_gold = df[selected_features + [target_col]]
display(df_gold.head())


2025-06-10 04:32:20,845 - INFO -  Selected features (correlation > 0.25): ['oldpeak', 'thalch', 'exang', 'age', 'ca', 'cp', 'dataset', 'id', 'sex']


Unnamed: 0,oldpeak,thalch,exang,age,ca,cp,dataset,id,sex,num
0,0.556818,0.633803,False,0.714286,0.0,1.0,0.0,0.0,1.0,0
1,0.465909,0.338028,True,0.795918,1.0,0.0,0.0,0.001088,1.0,2
2,0.590909,0.485915,True,0.795918,0.666667,0.0,0.0,0.002176,1.0,1
3,0.693182,0.894366,False,0.183673,0.0,0.666667,0.0,0.003264,1.0,0
4,0.454545,0.788732,False,0.265306,0.0,0.333333,0.0,0.004353,0.0,0


                               `Upload to Gold Layer`                                                  

In [9]:
# Step 4: Upload to Gold Layer
gold.delete_many({})
gold.insert_many(df_gold.to_dict(orient="records"))

print(f"✅ Inserted {len(df_gold)} records into `heart_disease_gold`.")
print(f"📌 Selected features: {selected_features}")

✅ Inserted 920 records into `heart_disease_gold`.
📌 Selected features: ['oldpeak', 'thalch', 'exang', 'age', 'ca', 'cp', 'dataset', 'id', 'sex']


                              `Preview Stored Gold Data`                                         

In [10]:
# Show a couple of records for validation
preview = list(gold.find({}, {"_id": 0}).limit(5))
pd.DataFrame(preview)


Unnamed: 0,oldpeak,thalch,exang,age,ca,cp,dataset,id,sex,num
0,0.556818,0.633803,False,0.714286,0.0,1.0,0.0,0.0,1.0,0
1,0.465909,0.338028,True,0.795918,1.0,0.0,0.0,0.001088,1.0,2
2,0.590909,0.485915,True,0.795918,0.666667,0.0,0.0,0.002176,1.0,1
3,0.693182,0.894366,False,0.183673,0.0,0.666667,0.0,0.003264,1.0,0
4,0.454545,0.788732,False,0.265306,0.0,0.333333,0.0,0.004353,0.0,0


                                   `Gold Layer`                                                
- Loaded cleaned data from `heart_disease_silver` collection

- Normalized numerical features (e.g., age, cholesterol) to [0, 1] range using MinMaxScaler

- Selected top features based on correlation with the target column (`num`)

- Stored refined and modeling-ready data in `heart_disease_gold` collection

- Dataset now ready for use in model training
