In [None]:
!pip install pymongo pandas scikit-learn


In [None]:
import os
from getpass import getpass

import pandas as pd
from pymongo import MongoClient

# --- Bronze layer: land the raw CSV rows in MongoDB without transforming them ---

# Load raw CSV
df_raw = pd.read_csv('/content/heart_disease_uci.csv')

# Connect to MongoDB.
# The connection string embeds the database password, so it is read from the
# MONGODB_URI environment variable (or prompted for) instead of being stored in
# the notebook.
# NOTE(review): rotate any database password that was ever committed in this file.
mongo_uri = os.environ.get("MONGODB_URI")
if not mongo_uri:
    mongo_uri = getpass("MongoDB connection URI: ")
    os.environ["MONGODB_URI"] = mongo_uri  # remembered for the rest of this kernel session
client = MongoClient(mongo_uri)
db = client['healthcare']
bronze_collection = db['heart_disease_bronze']

# Insert raw data as JSON documents, one per CSV row.
bronze_records = df_raw.to_dict(orient='records')
if not bronze_records:
    # insert_many() rejects an empty list; fail with a message that names the cause.
    raise ValueError("The CSV contained no rows; nothing to load into the Bronze layer.")

# Clear the collection first so re-running this cell does not append a second
# copy of every row (same approach as the Gold cell).
bronze_collection.delete_many({})
bronze_collection.insert_many(bronze_records)


In [None]:
import os
from getpass import getpass

import pandas as pd
from pymongo import MongoClient
from sklearn.preprocessing import LabelEncoder

# --- Silver layer: impute missing values and label-encode categorical columns ---

# Step 1: Connect to MongoDB and fetch Bronze layer data
# The connection string embeds the database password, so it comes from the
# MONGODB_URI environment variable (or a prompt) rather than from the notebook.
mongo_uri = os.environ.get("MONGODB_URI")
if not mongo_uri:
    mongo_uri = getpass("MongoDB connection URI: ")
    os.environ["MONGODB_URI"] = mongo_uri  # remembered for the rest of this kernel session
client = MongoClient(mongo_uri)
db = client['healthcare']
bronze_collection = db['heart_disease_bronze']
data = list(bronze_collection.find({}, {'_id': 0}))  # exclude _id
if not data:
    raise ValueError(
        "Bronze collection 'heart_disease_bronze' is empty; run the Bronze load cell first."
    )
df_silver = pd.DataFrame(data)

# Step 2: Handle missing values
numerical_cols = df_silver.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = df_silver.select_dtypes(include=['object', 'category']).columns

# Numeric columns: replace missing values with the column mean.
df_silver[numerical_cols] = df_silver[numerical_cols].fillna(df_silver[numerical_cols].mean())

# Categorical columns: replace missing values with the most frequent value.
for col in categorical_cols:
    modes = df_silver[col].mode()
    if modes.empty:
        # Column has no non-null value at all, so there is no mode to fill with.
        continue
    df_silver[col] = df_silver[col].fillna(modes.iloc[0])

# Step 3: Encode categorical features
label_enc_cols = ['sex', 'cp', 'thal', 'slope', 'restecg']  # adjust based on dataset

for col in label_enc_cols:
    if col in df_silver.columns:
        # A fresh encoder per column: each column has its own set of categories.
        le = LabelEncoder()
        df_silver[col] = le.fit_transform(df_silver[col])

# Step 4: Store preprocessed data into Silver layer
silver_collection = db['heart_disease_silver']
# Clear first so re-running this cell replaces the Silver data instead of
# duplicating it (same approach as the Gold cell).
silver_collection.delete_many({})
silver_collection.insert_many(df_silver.to_dict(orient='records'))

print("Silver layer preprocessing complete and data inserted into MongoDB.")


In [None]:
import os
from getpass import getpass

import pandas as pd
from pymongo import MongoClient
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# --- Gold layer: model-ready table (drop ids, rename target, scale, encode) ---

# Step 1: Connect to MongoDB and load Silver data
# The connection string embeds the database password, so it comes from the
# MONGODB_URI environment variable (or a prompt) rather than from the notebook.
mongo_uri = os.environ.get("MONGODB_URI")
if not mongo_uri:
    mongo_uri = getpass("MongoDB connection URI: ")
    os.environ["MONGODB_URI"] = mongo_uri  # remembered for the rest of this kernel session
client = MongoClient(mongo_uri)
db = client['healthcare']
silver_collection = db['heart_disease_silver']
data = list(silver_collection.find({}, {'_id': 0}))
if not data:
    # Checked before the Gold collection is cleared below, so an empty Silver
    # layer cannot wipe existing Gold data.
    raise ValueError(
        "Silver collection 'heart_disease_silver' is empty; run the Silver cell first."
    )
df = pd.DataFrame(data)

# Step 2: Drop unwanted columns only (keep 'sex')
df = df.drop(columns=[col for col in ['id', 'dataset'] if col in df.columns])

# Step 3: Rename 'num' to 'target' if present
# (rename() ignores mapping keys that are not columns, so no explicit check is needed)
df = df.rename(columns={'num': 'target'})

# Step 4: Normalize numerical features to [0, 1]
# MinMaxScaler scales every column independently, so fitting all present columns
# in one call gives the same values as fitting them one by one, and leaves
# `scaler` fitted on the full set of columns so it can be reused on new data.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed in the training cell, so test rows influence the scaling.
numerical_cols = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak']
present_numerical_cols = [col for col in numerical_cols if col in df.columns]
scaler = MinMaxScaler()
if present_numerical_cols:
    df[present_numerical_cols] = scaler.fit_transform(df[present_numerical_cols])

# Step 5: Encode categorical features
label_enc = LabelEncoder()
for col in ['fbs', 'exang']:
    if col in df.columns:
        # fit_transform refits the encoder, so reusing one instance is safe here.
        df[col] = label_enc.fit_transform(df[col])

# Step 6: Save to Gold collection
gold_collection = db['heart_disease_gold']
gold_collection.delete_many({})  # clear existing so re-runs do not duplicate rows
gold_collection.insert_many(df.to_dict(orient='records'))

# Step 7: Preview Gold layer
df_gold = pd.DataFrame(list(gold_collection.find({}, {'_id': 0})))
print("✅ Gold layer sample:")
display(df_gold.head())


In [None]:
import os
from getpass import getpass

import pandas as pd
from pymongo import MongoClient
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score
)

# --- Train four classifiers on the Gold layer and compare them on a held-out split ---

# Connect to MongoDB and fetch gold layer data
# The connection string embeds the database password, so it comes from the
# MONGODB_URI environment variable (or a prompt) rather than from the notebook.
mongo_uri = os.environ.get("MONGODB_URI")
if not mongo_uri:
    mongo_uri = getpass("MongoDB connection URI: ")
    os.environ["MONGODB_URI"] = mongo_uri  # remembered for the rest of this kernel session
client = MongoClient(mongo_uri)
db = client['healthcare']
gold_data = list(db['heart_disease_gold'].find({}, {'_id': 0}))
df = pd.DataFrame(gold_data)

# Features and target (X and y are reused by the model-saving cell below)
X = df.drop(columns=['target'])
y = df['target']

# Train/test split
# Stratify on the target so every class appears in both the training and the
# test set in the same proportion; rare classes could otherwise be missing from
# one side, which breaks the per-class ROC-AUC computation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Models
# `use_label_encoder` is no longer passed to XGBClassifier: it is deprecated and
# only produces an "unused parameter" warning on current XGBoost releases.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'XGBoost': XGBClassifier(eval_metric='mlogloss', random_state=42)
}


def _roc_auc(fitted_model, y_true, y_proba):
    """Return ROC-AUC for a binary or multiclass classifier.

    roc_auc_score expects a 1-D score for binary targets and the full
    probability matrix (with multi_class set) for multiclass targets.
    """
    if y_proba.shape[1] == 2:
        # Binary target: score with the probability of the second class.
        return roc_auc_score(y_true, y_proba[:, 1])
    # Multiclass: tell the metric which label each probability column refers to.
    return roc_auc_score(
        y_true, y_proba, multi_class='ovr', labels=fitted_model.classes_
    )


# Evaluation
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Predict probabilities (needed for ROC AUC)
    if hasattr(model, 'predict_proba'):
        y_proba = model.predict_proba(X_test)
    else:
        y_proba = None

    # Macro averaging weights every class equally regardless of its frequency.
    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='macro', zero_division=0),
        'Recall': recall_score(y_test, y_pred, average='macro', zero_division=0),
        'F1 Score': f1_score(y_test, y_pred, average='macro', zero_division=0),
        'ROC-AUC': _roc_auc(model, y_test, y_proba) if y_proba is not None else 'N/A'
    }

# Print results
for model_name, metrics in results.items():
    print(f"\n📊 {model_name}")
    for metric, value in metrics.items():
        if isinstance(value, float):
            print(f"{metric}: {value:.4f}")
        else:
            print(f"{metric}: {value}")


In [None]:
# Print the installed xgboost package metadata (e.g. its version) for reference.
!pip show xgboost


In [None]:
import joblib
from xgboost import XGBClassifier

# --- Fit the final XGBoost model on all Gold-layer rows and persist it ---

# Re-train XGBoost on the full dataset (optional but recommended).
# X and y are the feature matrix and target built from the Gold layer in the
# training cell above; that cell must have been run first.
# `use_label_encoder` is not passed: it is deprecated and only triggers an
# "unused parameter" warning on current XGBoost releases.
final_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
final_model.fit(X, y)

# Save model with joblib (read it back with joblib.load).
# NOTE(review): only the model is saved; the scaling/encoding applied in the
# Silver and Gold cells is not persisted alongside it.
joblib.dump(final_model, 'xgboost_heart_disease_model.pkl')

print("✅ Model saved as 'xgboost_heart_disease_model.pkl'")


In [None]:
import pickle

import joblib

# Load your XGBoost model.
# The file was written with joblib.dump, so it is read back with joblib.load;
# plain pickle.load is not guaranteed to understand joblib's on-disk format.
# SECURITY: like pickle, joblib.load can execute arbitrary code while loading;
# only load model files you created yourself or otherwise trust.
model = joblib.load("xgboost_heart_disease_model.pkl")

# Show feature names
print(model.feature_names_in_)
