In [78]:
# Health_Risk.ipynb

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV ,cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
import warnings


warnings.filterwarnings('ignore')


# Step 1: Load the Data
data = pd.read_csv('../data/diabetes.csv')  
data.head(10)





   


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [79]:
# Split the data into features and target
X = data.drop('Outcome', axis=1)
y = data['Outcome']


In [80]:
# Check for missing values
print("Missing values in each column:")
print(data.isnull().sum())

# Handle missing values if necessary (e.g., fill with median)
X.fillna(X.median(), inplace=True)

Missing values in each column:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [81]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [82]:
# Define a list of models to evaluate
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Support Vector Classifier": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "XGBoost": XGBClassifier(eval_metric='logloss', use_label_encoder=False),
    "Extra Trees": ExtraTreesClassifier(),
    "AdaBoost": AdaBoostClassifier()
}



In [83]:
# Evaluate each model and collect results
results = {}
for model_name, model in models.items():
    # Perform cross-validation to get a more reliable performance estimate
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Store results
    results[model_name] = {
        "Cross-Validation Accuracy": np.mean(cv_scores),
        "Test Accuracy": accuracy,
        "Classification Report": report
    }


In [84]:
# Display results
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"Cross-Validation Accuracy: {metrics['Cross-Validation Accuracy']:.4f}")
    print(f"Test Accuracy: {metrics['Test Accuracy']:.4f}")
    print("Classification Report:", metrics['Classification Report'])
    print("="*60)

Model: Logistic Regression
Cross-Validation Accuracy: 0.7606
Test Accuracy: 0.7532
Classification Report: {'0': {'precision': 0.8144329896907216, 'recall': 0.797979797979798, 'f1-score': 0.8061224489795918, 'support': 99.0}, '1': {'precision': 0.6491228070175439, 'recall': 0.6727272727272727, 'f1-score': 0.6607142857142857, 'support': 55.0}, 'accuracy': 0.7532467532467533, 'macro avg': {'precision': 0.7317778983541328, 'recall': 0.7353535353535354, 'f1-score': 0.7334183673469388, 'support': 154.0}, 'weighted avg': {'precision': 0.7553936387360154, 'recall': 0.7532467532467533, 'f1-score': 0.7541909620991253, 'support': 154.0}}
Model: Decision Tree
Cross-Validation Accuracy: 0.7378
Test Accuracy: 0.7532
Classification Report: {'0': {'precision': 0.8210526315789474, 'recall': 0.7878787878787878, 'f1-score': 0.8041237113402062, 'support': 99.0}, '1': {'precision': 0.6440677966101694, 'recall': 0.6909090909090909, 'f1-score': 0.6666666666666666, 'support': 55.0}, 'accuracy': 0.753246753246