In [1]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from pymongo import MongoClient
from dotenv import load_dotenv
import os

In [2]:
# Step 1: Load environment variables
load_dotenv()
MONGO_URI = os.getenv("MONGO_URI")

In [3]:
# MongoDB Connection
client = MongoClient(MONGO_URI)
db = client["healthcare"]
silver = db["heart_disease_silver"]
gold = db["heart_disease_gold"]

In [4]:
# Step 1: Load Silver Layer
silver_data = list(silver.find({}, {'_id': 0}))
df = pd.DataFrame(silver_data)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    float64
 1   age       920 non-null    float64
 2   sex       920 non-null    float64
 3   dataset   920 non-null    float64
 4   cp        920 non-null    float64
 5   trestbps  920 non-null    float64
 6   chol      920 non-null    float64
 7   fbs       920 non-null    float64
 8   restecg   920 non-null    float64
 9   thalch    920 non-null    float64
 10  exang     920 non-null    float64
 11  oldpeak   920 non-null    float64
 12  slope     920 non-null    float64
 13  ca        920 non-null    float64
 14  thal      920 non-null    float64
 15  num       920 non-null    int64  
dtypes: float64(15), int64(1)
memory usage: 115.1 KB


In [5]:
# Step 2: Normalize numerical features (except label column 'num')
features_to_scale = df.drop(columns=['num']).columns
scaler = MinMaxScaler()
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

In [6]:
# Step 3: Feature Selection using RandomForest
X = df.drop(columns=['num'])
y = df['num']

model = RandomForestClassifier(random_state=42)
model.fit(X, y)

importances = pd.Series(model.feature_importances_, index=X.columns)
selected_features = importances[importances > 0.05].index.tolist()

# Final Gold Data
final_df = df[selected_features + ['num']]

In [7]:
# Step 4: Upload to Gold Layer
gold.delete_many({})
gold.insert_many(final_df.to_dict(orient="records"))

print(f"✅ Inserted {len(final_df)} records into `heart_disease_gold`.")
print(f"📌 Selected features: {selected_features}")

✅ Inserted 920 records into `heart_disease_gold`.
📌 Selected features: ['id', 'age', 'dataset', 'cp', 'trestbps', 'chol', 'thalch', 'oldpeak']
