# Cardio Disease Prediction Model Training (No BMI)

This notebook documents how the cardiovascular disease prediction model used in the application is trained.
The model is a Random Forest classifier trained on 11 input features; BMI is not one of them.

In [None]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read the preprocessed dataset from disk
df = pd.read_csv('preProcessed.csv')

# Model inputs: the 11 columns listed below (BMI is excluded); the target is 'cardio'
feature_cols = [
    'gender',
    'height',
    'weight',
    'ap_hi',
    'ap_lo',
    'cholesterol',
    'gluc',
    'smoke',
    'alco',
    'active',
    'age_years',
]
X, y = df[feature_cols], df['cardio']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features; the scaler is fitted on the training rows only
# and then applied unchanged to the test rows
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the Random Forest on the scaled training data
rf_params = {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}
model = RandomForestClassifier(**rf_params)
model.fit(X_train_scaled, y_train)

# Score the fitted model on both splits (accuracy), reusing the test
# predictions for the per-class report
y_test_pred = model.predict(X_test_scaled)
train_acc = accuracy_score(y_train, model.predict(X_train_scaled))
test_acc = accuracy_score(y_test, y_test_pred)
report = classification_report(y_test, y_test_pred)

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print("\nClassification Report:\n", report)

In [None]:
# Persist the fitted model and scaler with joblib.
# Note: the app expects these exact file names ('model.pkl' and
# 'scaler_final.pkl') for deployment.
for artifact_path, artifact in (
    ('model.pkl', model),
    ('scaler_final.pkl', scaler),
):
    joblib.dump(artifact, artifact_path)

print("Model and Scaler saved successfully.")