In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Change: Import RandomForestClassifier instead of DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score
import joblib 

# Read the cleaned cardio dataset from the working directory.
df = pd.read_csv('cardio_train_cleaned.csv')

# The 'cardio' column is the prediction target; every other column is a feature.
y = df['cardio']
X = df.drop(columns='cardio')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Baseline forest: n_estimators is the number of trees in the forest
# (100 is also the library default); depth is capped at 10 and splits use
# the entropy criterion.
model = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_depth=10,
    random_state=0,
).fit(X_train, y_train)

# Score the held-out rows and report accuracy as a percentage.
y_pred = model.predict(X_test)
accuracy = 100 * accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.2f}%")

# Persisting this baseline model and its scaler is currently disabled:
# joblib.dump(model, 'cardio_rf_model.pkl')
# joblib.dump(scaler, 'rf_scaler.pkl')

Random Forest Accuracy: 73.51%


In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score
import joblib 

# Pipeline is used only by this cell; scikit-learn is already a dependency of
# the notebook, so no new package is introduced.
from sklearn.pipeline import Pipeline

# Load data
df = pd.read_csv('cardio_train_cleaned.csv')

# The 'cardio' column is the prediction target; every other column is a feature.
X = df.drop('cardio', axis=1)
y = df['cardio']

# Hold out 20% of the rows for the final evaluation. The *unscaled* split is
# kept under its own names so that scaling can happen inside cross-validation.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# 1. K-Fold Cross-Validation Setup
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# 2. Hyperparameter Tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'criterion': ['gini', 'entropy']
}

rf = RandomForestClassifier(random_state=0)

# Fix: the scaler used to be fit on the whole training split *before* the grid
# search, so every validation fold had already contributed to the scaling
# statistics (pre-processing leakage). Wrapping scaler + forest in a Pipeline
# makes GridSearchCV re-fit the scaler on the training part of each fold only.
# NOTE(review): for a tree-based model the numeric effect is likely small, so
# the selected parameters and scores may change only marginally or not at all.
tuning_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', rf),
])

# Pipeline parameters are addressed as '<step>__<param>'; derive the prefixed
# grid from the readable one above instead of maintaining two copies.
pipeline_param_grid = {
    f'rf__{name}': values for name, values in param_grid.items()
}
grid_search = GridSearchCV(
    estimator=tuning_pipeline,
    param_grid=pipeline_param_grid,
    cv=kf,
    scoring='accuracy',
    n_jobs=-1,
)

print("Starting Hyperparameter Tuning...")
grid_search.fit(X_train_raw, y_train)

# Get the best model. With the default refit behaviour, best_estimator_ is the
# winning pipeline re-fit on the full training split, so its two steps are the
# same kind of objects this cell exposed before: a scaler fit on the training
# rows and a forest fit on the scaled training rows.
best_pipeline = grid_search.best_estimator_
scaler = best_pipeline.named_steps['scaler']
best_model = best_pipeline.named_steps['rf']

# Strip the 'rf__' step prefix so the report keeps plain forest parameter names.
best_params = {
    name.split('__', 1)[1]: value
    for name, value in grid_search.best_params_.items()
}
print(f"Best Parameters: {best_params}")

# Keep X_train / X_test available as scaled arrays, as this cell did before.
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# 3. Final Evaluation
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"Tuned Random Forest Accuracy: {accuracy:.2f}%")

# Save the best model and the scaler it was trained with (same file names and
# object types as before, so existing loaders keep working).
joblib.dump(best_model, 'cardio_rf_tuned_model.pkl')
joblib.dump(scaler, 'rf_scaler.pkl')

Starting Hyperparameter Tuning...
Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'n_estimators': 100}
Tuned Random Forest Accuracy: 73.49%


['rf_scaler.pkl']