# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# -------------------------------
# Load dataset
# -------------------------------
# Read the CSV into a DataFrame; the path is relative to the working directory.
df = pd.read_csv("cardio_train_cleaned.csv")

# "cardio" is the label column; every remaining column is used as a feature.
y = df["cardio"]
X = df.drop(columns="cardio")

# -------------------------------
# 1️⃣ Train-Test Split
# -------------------------------
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Baseline forest with default hyperparameters. class_weight="balanced"
# reweights classes inversely to their frequency in the training labels.
# fit() returns the estimator itself, so construction and training are chained.
rf_tt = RandomForestClassifier(
    class_weight="balanced",
    random_state=42,
).fit(X_train, y_train)

# Accuracy is measured on the held-out partition only.
tt_predictions = rf_tt.predict(X_test)
train_test_accuracy = accuracy_score(y_test, tt_predictions)

print("Random Forest - Train-Test Accuracy:", train_test_accuracy)

# -------------------------------
# 2️⃣ K-Fold Cross Validation
# -------------------------------
# Unfitted forest; cross_val_score clones and fits it once per fold.
rf_kf = RandomForestClassifier(class_weight="balanced", random_state=42)

# 5-fold cross-validation over the full dataset (X, y), scored by accuracy.
fold_accuracies = cross_val_score(
    rf_kf,
    X,
    y,
    scoring="accuracy",
    cv=5,
)

# Report the mean of the per-fold accuracies.
kfold_accuracy = fold_accuracies.mean()

print("Random Forest - K-Fold Accuracy:", kfold_accuracy)

# -------------------------------
# 3️⃣ Hyperparameter Tuning
# -------------------------------
# Base estimator for the search; GridSearchCV clones it for every candidate.
rf_tuned = RandomForestClassifier(class_weight="balanced", random_state=42)

# Search space: 2 * 3 * 2 * 2 * 2 = 48 hyperparameter combinations.
param_grid = {
    "n_estimators": [200, 300],
    "max_depth": [10, 15, None],  # None leaves tree depth unlimited
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
    "max_features": ["sqrt", "log2"],
}

# Exhaustive search with 5-fold CV on the training partition only, so the
# held-out test rows play no part in model selection. n_jobs=-1 uses all cores.
grid = GridSearchCV(
    estimator=rf_tuned,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# With the default refit=True, grid.predict uses the best configuration
# refitted on all of X_train.
tuned_predictions = grid.predict(X_test)
tuned_accuracy = accuracy_score(y_test, tuned_predictions)

print("Random Forest - Tuned Accuracy:", tuned_accuracy)
print("Best Parameters:", grid.best_params_)
