In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%store -r filtered_df

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

# Define features (X) and target (y)
X = filtered_df.drop(columns=['Outcome'])
X.dropna(inplace=True)
y = filtered_df['Outcome']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling (Standardization)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# # Initialize and train the models
logistic_model = LogisticRegression(random_state=42)
logistic_model.fit(X_train, y_train)


rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_pred_logistic = logistic_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)

# Model evaluation
def evaluate_model(model_name, y_true, y_pred):
    print(f"Evaluation for {model_name}:\n")
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred)

    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print(f"ROC AUC Score: {roc_auc:.2f}")
    print("\nClassification Report:\n", classification_report(y_true, y_pred))


In [5]:
# Evaluate Logistic Regression
evaluate_model("Logistic Regression", y_test, y_pred_logistic)

Evaluation for Logistic Regression:

Accuracy: 0.74
Precision: 0.80
Recall: 0.46
F1 Score: 0.59
ROC AUC Score: 0.69

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.93      0.81        40
           1       0.80      0.46      0.59        26

    accuracy                           0.74        66
   macro avg       0.76      0.69      0.70        66
weighted avg       0.75      0.74      0.72        66



In [6]:
# Evaluate Random Forest Classifier
evaluate_model("Random Forest Classifier", y_test, y_pred_rf)

Evaluation for Random Forest Classifier:

Accuracy: 0.73
Precision: 0.79
Recall: 0.42
F1 Score: 0.55
ROC AUC Score: 0.67

Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.93      0.80        40
           1       0.79      0.42      0.55        26

    accuracy                           0.73        66
   macro avg       0.75      0.67      0.68        66
weighted avg       0.74      0.73      0.70        66



**Class Imbalance**: The classification report for both models shows an imbalanced dataset with a higher number of individuals without diabetes (Class 0) compared to those with diabetes (Class 1). This class imbalance can affect the models' ability to correctly identify individuals with diabetes, as indicated by the lower recall values for Class 1.

In [7]:
from imblearn.over_sampling import SMOTE

X = filtered_df.drop('Outcome', axis=1)  # Features
y = filtered_df['Outcome']  # Binary class column

# Create a SMOTE object
smote = SMOTE(random_state=42)

# Fit and transform the data to oversample the minority class
X_resampled, y_resampled = smote.fit_resample(X, y)

# Now, X_resampled and y_resampled contain the oversampled data
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Feature scaling (Standardization)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# # Initialize and train the models
logistic_model = LogisticRegression(random_state=42)
logistic_model.fit(X_train, y_train)


rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_pred_logistic = logistic_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)

# Model evaluation
def evaluate_model(model_name, y_true, y_pred):
    print(f"Evaluation for {model_name}:\n")
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred)

    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print(f"ROC AUC Score: {roc_auc:.2f}")
    print("\nClassification Report:\n", classification_report(y_true, y_pred))

In [8]:
# Evaluate Logistic Regression
evaluate_model("Logistic Regression", y_test, y_pred_logistic)

Evaluation for Logistic Regression:

Accuracy: 0.68
Precision: 0.68
Recall: 0.81
F1 Score: 0.74
ROC AUC Score: 0.67

Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.53      0.61        43
           1       0.68      0.81      0.74        52

    accuracy                           0.68        95
   macro avg       0.69      0.67      0.67        95
weighted avg       0.69      0.68      0.68        95



In [9]:
# Evaluate Random Forest Classifier
evaluate_model("Random Forest Classifier", y_test, y_pred_rf)

Evaluation for Random Forest Classifier:

Accuracy: 0.86
Precision: 0.83
Recall: 0.94
F1 Score: 0.88
ROC AUC Score: 0.85

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.77      0.84        43
           1       0.83      0.94      0.88        52

    accuracy                           0.86        95
   macro avg       0.87      0.85      0.86        95
weighted avg       0.87      0.86      0.86        95

