In [2]:
import pandas as pd
from pymongo import MongoClient
from sklearn.preprocessing import LabelEncoder
from dotenv import load_dotenv
import os

In [3]:
# Step 1: Load environment variables
# load_dotenv() runs first so that the lookup below happens after any
# dotenv-provided settings have been loaded into the process environment.
load_dotenv()
MONGO_URI = os.environ.get("MONGO_URI")
# Fail fast: an unset or empty connection string would otherwise only surface
# later, when the client is created.
if MONGO_URI is None or MONGO_URI == "":
    raise ValueError("MONGO_URI environment variable is not set.")


In [4]:
# Step 2: MongoDB connection
# One client, one database handle, and a collection handle per layer:
# `bronze` is read from in Step 3, `silver` is written to in Step 6.
client = MongoClient(MONGO_URI)
db = client.get_database("healthcare")
bronze, silver = (
    db.get_collection(layer_name)
    for layer_name in ("heart_disease_bronze", "heart_disease_silver")
)

In [None]:
# Step 3: Fetch raw data from Bronze layer
# The empty filter matches every document; the projection drops Mongo's `_id`
# so it does not become a DataFrame column.
bronze_data = [
    document
    for document in bronze.find(filter={}, projection={'_id': 0})
]
df = pd.DataFrame(bronze_data)


In [9]:
# Schema check on the frame loaded from Bronze: prints each column's dtype and
# non-null count, plus the overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    int64  
 3   dataset   920 non-null    int64  
 4   cp        920 non-null    int64  
 5   trestbps  920 non-null    float64
 6   chol      920 non-null    float64
 7   fbs       920 non-null    bool   
 8   restecg   920 non-null    int64  
 9   thalch    920 non-null    float64
 10  exang     920 non-null    bool   
 11  oldpeak   920 non-null    float64
 12  slope     920 non-null    int64  
 13  ca        920 non-null    float64
 14  thal      920 non-null    int64  
 15  num       920 non-null    int64  
dtypes: bool(2), float64(5), int64(9)
memory usage: 102.6 KB


Data Preprocessing

In [10]:
# Step 4: Handle missing values
# Object, string, categorical and boolean columns are filled with their most
# frequent value; every other column is filled with its mean.
for col in df.columns:
    series = df[col]
    if not series.isna().any():
        continue  # nothing to impute; skip the mode/mean computation entirely

    dtype = series.dtype
    use_mode = (
        pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
        or pd.api.types.is_bool_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
    )
    if use_mode:
        modes = series.mode()
        if modes.empty:
            # Column is entirely missing, so there is no mode to fill with.
            # Leave it untouched instead of failing on an empty lookup.
            continue
        df[col] = series.fillna(modes.iloc[0])
    else:
        # NOTE(review): integer-coded columns take this branch too, so a
        # code-like column with gaps would receive a fractional mean.
        # Confirm that is acceptable for columns such as `cp` or `thal`.
        # If the whole column is missing the mean is NaN and nothing is filled.
        df[col] = series.fillna(series.mean())

In [None]:
# Step 5: Encode categorical features
# Every object-dtype column is replaced by integer labels. The fitted encoder
# for each column is kept in `label_encoders`, keyed by column name, so the
# codes can be mapped back later with
# label_encoders[<column>].inverse_transform([<code>]).
label_encoders = {}
object_cols = df.select_dtypes(include='object').columns
for name in object_cols:
    encoder = LabelEncoder()
    df[name] = encoder.fit_transform(df[name])
    label_encoders[name] = encoder

In [8]:
# Step 6: Push to Silver collection
# Converts the cleaned frame to one dict per row and replaces the contents of
# the Silver collection with it.
records = df.to_dict(orient="records")
if not records:
    # insert_many() rejects an empty list. Stop here, *before* delete_many(),
    # so an empty upstream load cannot wipe the existing Silver data.
    raise ValueError("No records to insert; heart_disease_silver left unchanged.")
silver.delete_many({})  # clear if re-running
silver.insert_many(records)
print(f"✅ Inserted {len(records)} cleaned records into heart_disease_silver.")

✅ Inserted 920 cleaned records into heart_disease_silver.
