## Predicting Diabetes with Machine Learning

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import matplotlib.pyplot as plt

In [None]:
# Load the raw dataset from the CSV file in the working directory.
data = pd.read_csv("diabetes_prediction_dataset.csv")

# Normalize column names: trim, lower-case, then remove spaces and parentheses.
# One replace with an explicit `regex=True` character class is used so the
# result does not depend on the pandas version (the default value of `regex`
# for `.str.replace` changed in pandas 2.0).
data.columns = (
    data.columns.str.strip()
    .str.lower()
    .str.replace(r"[ ()]", "", regex=True)
)

# Convert the target to numeric; labels that cannot be parsed become NaN.
data['diabetes'] = pd.to_numeric(data['diabetes'], errors='coerce')

# Rows whose label is missing (or was coerced to NaN above) cannot be used
# for supervised training, so drop them instead of letting NaN reach the
# target vector.
data = data.dropna(subset=['diabetes'])

# Fill missing values in the continuous measurements with the column median.
for column in ('bmi', 'hba1c_level', 'blood_glucose_level'):
    data[column] = data[column].fillna(data[column].median())

# One-hot encode the categorical features; drop_first=True removes one level
# per feature so the dummy columns are not redundant.
data = pd.get_dummies(data, columns=['gender', 'smoking_history'], drop_first=True)

# Split the frame into the feature matrix and the target vector.
X = data.drop(columns=['diabetes'])  # Features
y = data['diabetes']  # Target variable

In [None]:
# Single-feature benchmark: fit one Random Forest per column of X and record
# the hold-out accuracy obtained when that column is the only input.
feature_scores = {}

for feature in X.columns:
    # Keep the selection as a one-column DataFrame (2-D input for the model)
    X_single = X.loc[:, [feature]]

    # 80/20 train/test split with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(
        X_single, y, test_size=0.2, random_state=46
    )

    # Learn the scaling on the training part only, then apply it to both parts
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Fit the classifier on this single feature
    model = RandomForestClassifier(random_state=46).fit(X_train, y_train)

    # Score the predictions on the held-out part and store the result
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    feature_scores[feature] = accuracy

# Tabulate the scores (one row per feature), best feature first
feature_accuracy_df = pd.DataFrame(
    list(feature_scores.values()),
    index=list(feature_scores.keys()),
    columns=['Accuracy'],
)
feature_accuracy_df = feature_accuracy_df.sort_values(by='Accuracy', ascending=False)
print("Feature Accuracy Scores:", feature_accuracy_df, sep="\n")

In [None]:
# Fit a Random Forest on all features and rank them by the model's
# feature_importances_ attribute.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=46
)

# Learn the scaling on the training part only, then apply it to both parts
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit the classifier on the full feature set
rf_model = RandomForestClassifier(random_state=46).fit(X_train, y_train)

# Pair each column name with its importance and order from most to least important
importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print("\nFeature Importance Rankings from Random Forest Classifier:")
print(feature_importance_df)

# Horizontal bar chart of the ranking; the y-axis is inverted so the first
# (most important) row is drawn at the top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
ax.set_title("Feature Importance using Random Forest")
ax.invert_yaxis()
plt.show()