In [42]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

In [43]:
# Read diabetes_data_encoded.csv and normalise its headers to snake_case:
# surrounding whitespace trimmed, lowercased, inner spaces turned into underscores.
diabetes = pd.read_csv("diabetes_data_encoded.csv")
diabetes.columns = [
    header.strip().lower().replace(' ', '_')
    for header in diabetes.columns
]

# Split the frame into the 'class' target and the remaining predictor columns.
y = diabetes['class']
X = diabetes.drop(columns='class')

# Column groups consumed by the preprocessing step: 'age' is the only numeric
# predictor; every other listed column is handled as a categorical feature.
numeric_features = ['age']
categorical_features = [
    'gender',
    'polyuria',
    'polydipsia',
    'sudden_weight_loss',
    'weakness',
    'polyphagia',
    'genital_thrush',
    'visual_blurring',
    'itching',
    'irritability',
    'delayed_healing',
    'partial_paresis',
    'muscle_stiffness',
    'alopecia',
    'obesity',
]


In [44]:
# Preprocessing: one transformer per column group, combined in a ColumnTransformer.

# Numeric columns are standardised.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns are one-hot encoded, dropping the first level of each feature.
# NOTE(review): with drop='first', a category not seen during fit is encoded as
# all zeros, which is the same encoding as the dropped first level - confirm
# that conflating the two is acceptable for this data.
onehot_encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
categorical_transformer = Pipeline([('onehot', onehot_encoder)])

# Route each column group to its transformer; any column that appears in
# neither list is passed through to the model unchanged.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(column_routes, remainder='passthrough')


In [48]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Chain preprocessing and logistic regression into a single estimator, so the
# preprocessing is fitted on the training partition only.
log_reg = LogisticRegression(random_state=42, max_iter=1000)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', log_reg),
])

# Fit the full pipeline on the training partition.
pipeline.fit(X_train, y_train)

# Hard labels feed the classification report; the probabilities in column 1 of
# predict_proba (the second class) feed the AUC.
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]

# Report test-set AUC and per-class precision / recall / F1.
test_auc = roc_auc_score(y_test, y_proba)
print("AUC:", test_auc)
print(classification_report(y_test, y_pred))


AUC: 0.9906250000000001
              precision    recall  f1-score   support

           0       0.89      0.97      0.93        40
           1       0.98      0.92      0.95        64

    accuracy                           0.94       104
   macro avg       0.93      0.95      0.94       104
weighted avg       0.95      0.94      0.94       104



On the held-out test set (104 patients), the model shows strong predictive performance, with an AUC of 0.99, indicating that it can almost perfectly distinguish between patients with and without early-stage diabetes. Overall accuracy is high at 94%, meaning that the large majority of predictions are correct. Performance is strong across both classes: the model identifies 97% of non-diabetic patients (recall for class 0) and 92% of diabetic patients (recall for class 1), with precision of 0.89 for class 0 and 0.98 for class 1. In other words, it makes very few incorrect diabetes predictions and misses only a small number of true cases.

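The balance between precision and recall is reflected in the high F1-scores for both classes (0.93 and 0.95), showing that the model performs consistently despite the slight class imbalance (40 non-diabetic vs. 64 diabetic patients in the test set). In practical terms, this model appears well-suited for early-stage diabetes risk prediction, providing accurate classifications while keeping both false alarms and missed detections low. Because these figures come from a single train/test split with a small test set, they should be confirmed with cross-validation before being relied upon.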

This provides a solid baseline and confirms that the data contains meaningful relationships worth exploring further. Building on this foundation, fitting separate models for men and women, or adding gender interaction terms, will allow us to identify how symptom importance differs by gender. These gender-specific symptom patterns can then be examined with greater confidence in future work.