                                `Imports + Setup Logging`                                           

In [1]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
from dotenv import load_dotenv
import os
import logging
from sklearn.preprocessing import LabelEncoder
# Configure the root logger once for the whole notebook:
# INFO level, each line rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)



                             `Load .env & MongoDB URI`                                        

In [2]:
# Step 1: Load environment variables
# Pull variables from a local .env file into the process environment.
load_dotenv()

# The connection string is required; stop early if it is missing or empty.
MONGO_URI = os.environ.get("MONGO_URI")
if MONGO_URI is None or MONGO_URI == "":
    raise ValueError("Mongo URI not found in environment.")

                                    `MongoDB Connection`                                           

In [3]:
# Step 2: MongoDB connection
# Names of the target database and of the collection that receives cleaned data.
DB_NAME = "healthcare"
SILVER_COLLECTION = "heart_disease_silver"

client = MongoClient(MONGO_URI)
db = client[DB_NAME]
silver = db[SILVER_COLLECTION]

                                `Load Data from Bronze Collection`                                  

In [4]:
# Step 3: Fetch raw data from Bronze layer
bronze = db["heart_disease_bronze"]

# Empty filter = every document; the projection drops Mongo's _id field
# so it does not become a DataFrame column.
bronze_cursor = bronze.find({}, {'_id': 0})
bronze_data = list(bronze_cursor)

df = pd.DataFrame(bronze_data)
logging.info(f" Loaded data from Bronze layer: {df.shape}")

2025-06-10 03:18:48,964 - INFO -  Loaded data from Bronze layer: (920, 16)


In [5]:
# Preview the first rows of the frame built from the Bronze documents.
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [6]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num
count,920.0,920.0,861.0,890.0,865.0,858.0,309.0,920.0
mean,460.5,53.51087,132.132404,199.130337,137.545665,0.878788,0.676375,0.995652
std,265.725422,9.424685,19.06607,110.78081,25.926276,1.091226,0.935653,1.142693
min,1.0,28.0,0.0,0.0,60.0,-2.6,0.0,0.0
25%,230.75,47.0,120.0,175.0,120.0,0.0,0.0,0.0
50%,460.5,54.0,130.0,223.0,140.0,0.5,0.0,1.0
75%,690.25,60.0,140.0,268.0,157.0,1.5,1.0,2.0
max,920.0,77.0,200.0,603.0,202.0,6.2,3.0,4.0


In [7]:
# Per-column dtype and non-null count; shows which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


                                  `Data Preprocessing`                                                 

In [12]:
# Step 4: Handle missing values
# Numeric columns are filled with their mean; every other column (strings,
# booleans stored as objects, ...) is filled with its most frequent value.
for col in df.columns:
    if not df[col].isna().any():
        continue  # nothing to impute in this column

    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].fillna(df[col].mean())
    else:
        modes = df[col].mode()
        if modes.empty:
            # An entirely-null column has no mode; indexing it would raise,
            # so leave the column untouched and report it instead.
            logging.warning(f"Column '{col}' has no non-null values; skipping imputation.")
            continue
        df[col] = df[col].fillna(modes.iloc[0])

In [9]:
# Step 5: Encode categorical features
# Each object-dtype column gets its own fitted LabelEncoder, stored by column
# name so the integer codes can be mapped back with inverse_transform later.
label_encoders = {}
object_columns = df.select_dtypes(include='object').columns
for col in object_columns:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

                              `Insert into MongoDB Silver Layer`                                    

In [10]:
# Step 6: Push to Silver collection
# Convert the frame to one dict per row, the shape insert_many expects.
records = df.to_dict(orient="records")

# insert_many rejects an empty list. Check before delete_many so that a run
# with no data cannot wipe the existing Silver collection and then fail.
if not records:
    raise ValueError("No records to insert; heart_disease_silver left unchanged.")

silver.delete_many({})  # clear if re-running
silver.insert_many(records)
print(f"✅ Inserted {len(records)} cleaned records into heart_disease_silver.")

✅ Inserted 920 cleaned records into heart_disease_silver.


In [11]:
# Preview the first rows of df after the preprocessing steps.
df.head()


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,1,0,3,145.0,233.0,True,0,150.0,False,2.3,0,0.0,0,0
1,2,67,1,0,0,160.0,286.0,False,0,108.0,True,1.5,1,3.0,1,2
2,3,67,1,0,0,120.0,229.0,False,0,129.0,True,2.6,1,2.0,2,1
3,4,37,1,0,2,130.0,250.0,False,1,187.0,False,3.5,0,0.0,1,0
4,5,41,0,0,1,130.0,204.0,False,0,172.0,False,1.4,2,0.0,1,0


                                     `Silver Layer`                                              
- Data loaded from the `heart_disease_bronze` collection

- Preprocessing included:
  - Imputed missing values (mean for numericals, mode for categoricals)

  - Encoded categorical variables using label encoding (scikit-learn `LabelEncoder`)

- Stored processed data in the `heart_disease_silver` collection under `healthcare` database

- MongoDB interaction handled with `pymongo`

- Logging added for transparency and debugging
