In [1]:
# Step 1: Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 2: Load the merged cleaned dataset
df = pd.read_csv("D:/diabetes-risk-prediction-model/Resources/full_cleaned_merged.csv")

# Step 3: Drop rows with missing target values
df = df.dropna(subset=["Diabetes_binary"]).reset_index(drop=True)

# Step 4: Split into features and target
X = df.drop("Diabetes_binary", axis=1)
y = df["Diabetes_binary"]

# Step 5: Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 6: Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [3]:
# Step 7: Train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Step 8: Make predictions
y_pred = rf_model.predict(X_test_scaled)

# Step 9: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy Score:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", report)


Accuracy Score: 0.9093534199276554

Confusion Matrix:
 [[72272  3665]
 [ 5156 16219]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.93      0.95      0.94     75937
         1.0       0.82      0.76      0.79     21375

    accuracy                           0.91     97312
   macro avg       0.87      0.86      0.86     97312
weighted avg       0.91      0.91      0.91     97312

