In [30]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
import os


In [31]:

# Read the CSV file
df = pd.read_csv('diabetes_prediction_dataset.csv')
print(df.columns)

Index(['gender', 'age', 'hypertension', 'heart_disease', 'smoking_history',
       'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes'],
      dtype='object')


## <span style="color:red">Preprocessing</span>

In [32]:
null_counts = df.isnull().sum()
print(null_counts)

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64


### <span style="color:yellow">No columns have null values</span>

In [33]:
for i in df.columns:
    print("number of unique values in", i, ":", df[i].nunique())

number of unique values in gender : 3
number of unique values in age : 102
number of unique values in hypertension : 2
number of unique values in heart_disease : 2
number of unique values in smoking_history : 6
number of unique values in bmi : 4247
number of unique values in HbA1c_level : 18
number of unique values in blood_glucose_level : 18
number of unique values in diabetes : 2


### <span style="color:yellow">One-hot encoding for gender and smoking_history</span>

In [34]:
df_encoded = pd.get_dummies(df, columns=['gender', 'smoking_history'], drop_first=True)
print(df_encoded.head())

    age  hypertension  heart_disease    bmi  HbA1c_level  blood_glucose_level  \
0  80.0             0              1  25.19          6.6                  140   
1  54.0             0              0  27.32          6.6                   80   
2  28.0             0              0  27.32          5.7                  158   
3  36.0             0              0  23.45          5.0                  155   
4  76.0             1              1  20.14          4.8                  155   

   diabetes  gender_Male  gender_Other  smoking_history_current  \
0         0            0             0                        0   
1         0            0             0                        0   
2         0            1             0                        0   
3         0            0             0                        1   
4         0            1             0                        1   

   smoking_history_ever  smoking_history_former  smoking_history_never  \
0                     0             

### <span style="color:yellow">Normalizing the data</span>

In [35]:
boolean_columns = df_encoded.select_dtypes(include=bool).columns
numerical_columns = df_encoded.select_dtypes(include=np.number).columns
print(boolean_columns)
print(numerical_columns)

Index([], dtype='object')
Index(['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level',
       'blood_glucose_level', 'diabetes', 'gender_Male', 'gender_Other',
       'smoking_history_current', 'smoking_history_ever',
       'smoking_history_former', 'smoking_history_never',
       'smoking_history_not current'],
      dtype='object')


In [36]:
# Separate features and target
X = df_encoded.drop(columns=['diabetes'])
y = df_encoded['diabetes']

# Identify numerical, boolean, and one-hot encoded columns
numerical_columns = X.select_dtypes(include=['float64', 'int64']).columns
boolean_columns = X.select_dtypes(include=['bool']).columns
one_hot_encoded_columns = X.columns.difference(numerical_columns.union(boolean_columns))

# Resample the data using SMOTE
# smote = SMOTE(random_state=42, sampling_strategy=0.10)

# Nevermind let's not do the resampling for now
X_resampled, y_resampled = X,y

# Normalize the numerical features
scaler = StandardScaler()
X_resampled_numerical = pd.DataFrame(scaler.fit_transform(X_resampled[numerical_columns]), columns=numerical_columns)

# Combine the normalized numerical features with the boolean and one-hot encoded features
X_resampled_scaled = pd.concat([X_resampled_numerical, X_resampled[boolean_columns].reset_index(drop=True), X_resampled[one_hot_encoded_columns].reset_index(drop=True)], axis=1)

# Add the target variable back to the DataFrame
df_resampled_scaled = pd.concat([X_resampled_scaled, y_resampled.reset_index(drop=True)], axis=1)

# Display the first few rows of the resampled and scaled DataFrame
print("Resampled and Scaled DataFrame:")
print(df_resampled_scaled.head())

# Check the class distribution after resampling
print("\nClass Distribution After Resampling:")
print(df_resampled_scaled['diabetes'].value_counts())

print("\nClass distribution before resampling:")
print(df['diabetes'].value_counts())

Resampled and Scaled DataFrame:
        age  hypertension  heart_disease       bmi  HbA1c_level  \
0  1.692704     -0.284439       4.936379 -0.321056     1.001706   
1  0.538006     -0.284439      -0.202578 -0.000116     1.001706   
2 -0.616691     -0.284439      -0.202578 -0.000116     0.161108   
3 -0.261399     -0.284439      -0.202578 -0.583232    -0.492690   
4  1.515058      3.515687       4.936379 -1.081970    -0.679490   

   blood_glucose_level  gender_Male  gender_Other  smoking_history_current  \
0             0.047704            0             0                        0   
1            -1.426210            0             0                        0   
2             0.489878            1             0                        0   
3             0.416183            0             0                        1   
4             0.416183            1             0                        1   

   smoking_history_ever  smoking_history_former  smoking_history_never  \
0                     

### <span style="color:yellow">Train test split</span>

In [37]:
x = df_resampled_scaled.drop('diabetes', axis=1)
y = df_resampled_scaled['diabetes']

x_train_val, x_test, y_train_val, y_test = train_test_split(x, y, test_size=0.15, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, test_size=0.1765, random_state=42)

print(f"Train set size: {len(x_train)}")
print(f"Validation set size: {len(x_val)}")
print(f"Test set size: {len(x_test)}")

Train set size: 69997
Validation set size: 15003
Test set size: 15000


### <span style="color:yellow">Training an MLP model</span>

In [38]:
# import os
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import DataLoader, TensorDataset

# # Check if CUDA is available and print the device being used
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

# # Convert data to PyTorch tensors
# X_train_tensor = torch.tensor(x_train.values, dtype=torch.float32)
# y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
# X_test_tensor = torch.tensor(x_test.values, dtype=torch.float32)
# y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

# # Create DataLoader
# train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
# test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# class MLP(nn.Module):
#     def __init__(self, input_dim):
#         super(MLP, self).__init__()
#         self.model = nn.Sequential(
#             nn.Linear(input_dim, 64),
#             nn.ReLU(),
#             nn.Linear(64, 32),
#             nn.ReLU(),
#             nn.Linear(32, 1),
#             nn.Sigmoid()  # Sigmoid for binary classification
#         )
        
#     def forward(self, x):
#         return self.model(x)


# # Initialize the model, loss function, and optimizer
# input_dim = x_train.shape[1]
# model = MLP(input_dim).to(device)
# criterion = nn.BCELoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# # Define the path to the saved model file
# model_file = './models/mlp_model.pth'

# # Check if the model file exists
# if os.path.exists(model_file):
#     print("Model file exists. Loading the model...")
#     model.load_state_dict(torch.load(model_file))
# else:
#     print("Model file does not exist. Training a new model...")

#     # Train the model
#     num_epochs = 50
#     best_loss = float('inf')
#     patience = 3
#     patience_counter = 0

#     for epoch in range(num_epochs):
#         model.train()
#         running_loss = 0.0
#         for X_batch, y_batch in train_loader:
#             X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            
#             optimizer.zero_grad()
#             outputs = model(X_batch)
#             loss = criterion(outputs, y_batch)
#             loss.backward()
#             optimizer.step()
            
#             running_loss += loss.item() * X_batch.size(0)
        
#         epoch_loss = running_loss / len(train_loader.dataset)
#         print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

#         # Early stopping
#         model.eval()
#         val_loss = 0.0
#         with torch.no_grad():
#             for X_batch, y_batch in test_loader:
#                 X_batch, y_batch = X_batch.to(device), y_batch.to(device)
#                 outputs = model(X_batch)
#                 loss = criterion(outputs, y_batch)
#                 val_loss += loss.item() * X_batch.size(0)
        
#         val_loss /= len(test_loader.dataset)
#         print(f'Validation Loss: {val_loss:.4f}')

#         if val_loss < best_loss:
#             best_loss = val_loss
#             patience_counter = 0
#         else:
#             patience_counter += 1
#             if patience_counter >= patience:
#                 print("Early stopping triggered.")
#                 break

#     # Save the trained model after training is complete
#     torch.save(model.state_dict(), model_file)
#     print("Model trained and saved.")


In [39]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class MLP(nn.Module):
    def __init__(self, input_dim, hidden_size, dropout):
        super(MLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, 32),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        return self.model(x)

def train_mlp(train_dataset, val_dataset, batch_size, hidden_size, learning_rate, dropout, num_epochs, patience, model_file='./models/mlp_model.pth'):
    print(f"Using device: {device}")

    # Create DataLoader
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize the model, loss function, and optimizer
    input_dim = train_dataset.tensors[0].shape[1]
    model = MLP(input_dim, hidden_size, dropout).to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Check if the model file exists
    if os.path.exists(model_file):
        print("Model file exists. Loading the model...")
        model.load_state_dict(torch.load(model_file))
    else:
        print("Model file does not exist. Training a new model...")
        best_loss = float('inf')
        patience_counter = 0

        for epoch in range(num_epochs):
            model.train()
            running_loss = 0.0
            for X_batch, y_batch in train_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                
                optimizer.zero_grad()
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                loss.backward()
                optimizer.step()
                
                running_loss += loss.item() * X_batch.size(0)
            
            epoch_loss = running_loss / len(train_loader.dataset)
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

            # Early stopping
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for X_batch, y_batch in val_loader:
                    X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                    outputs = model(X_batch)
                    loss = criterion(outputs, y_batch)
                    val_loss += loss.item() * X_batch.size(0)
            
            val_loss /= len(val_loader.dataset)
            print(f'Validation Loss: {val_loss:.4f}')

            if val_loss < best_loss:
                best_loss = val_loss
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered.")
                    break

        # Save the trained model after training is complete
        torch.save(model.state_dict(), model_file)
        print("Model trained and saved.")
    
    return model


In [40]:
X_train_tensor = torch.tensor(x_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
X_val_tensor = torch.tensor(x_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(x_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

In [41]:
# # Hyperparameters
# batch_size = 32
# hidden_size = 64
# learning_rate = 0.001
# dropout = 0.2
# num_epochs = 250
# patience = 20

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# model = train_mlp(train_dataset, val_dataset, batch_size, hidden_size, learning_rate, dropout, num_epochs, patience)



In [42]:
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# # Evaluate the model
# model.eval()
# test_loss = 0.0
# with torch.no_grad():
#     for X_batch, y_batch in test_loader:
#         X_batch, y_batch = X_batch.to(device), y_batch.to(device)
#         outputs = model(X_batch)

# # Calculate Mean Absolute Error (MAE)
# test_mae = 0.0
# with torch.no_grad():
#     for X_batch, y_batch in test_loader:
#         X_batch, y_batch = X_batch.to(device), y_batch.to(device)
#         outputs = model(X_batch)
#         mae = torch.mean(torch.abs(outputs - y_batch))
#         test_mae += mae.item() * X_batch.size(0)

# test_mae /= len(test_loader.dataset)
# print(f'Test MAE: {test_mae:.4f}')

# # Calculate Prediction Accuracy
# correct_predictions = 0
# total_predictions = 0
# with torch.no_grad():
#     for X_batch, y_batch in test_loader:
#         X_batch, y_batch = X_batch.to(device), y_batch.to(device)
#         outputs = model(X_batch)  # Outputs are probabilities due to sigmoid
#         predictions = (outputs > 0.5).float()  # Convert probabilities to binary predictions
#         correct_predictions += torch.sum(predictions == y_batch).item()
#         total_predictions += y_batch.size(0)

# accuracy = correct_predictions / total_predictions
# print("Total Predictions:", total_predictions)
# print("Correct Predictions:", correct_predictions)
# print("Incorrect Predictions:", total_predictions - correct_predictions)
# print(f'Prediction Accuracy: {accuracy:.4f}')


In [43]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Hyperparameters
hidden_sizes = 128
learning_rates = 0.001
dropouts = 0.3
batch_sizes = 64
num_epochs = 50
patience = 10

# Store results for each trial
results = []

for i in range(10):
    print(f"Trial {i+1}/10")

    # Define a unique model file for each trial
    model_file = f'./models/mlp_model_trial_{i+1}.pth'

    model = train_mlp(train_dataset, val_dataset, batch_sizes, hidden_sizes, learning_rates, dropouts, num_epochs, patience, model_file=model_file)
    model.eval()

    # Lists to store ground truth labels and predictions
    y_true_list = []
    y_pred_list = []

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            outputs = model(X_batch)  # Get predictions (probabilities from sigmoid)
            predictions = (outputs > 0.5).float()  # Convert probabilities to binary (0 or 1)

            y_true_list.extend(y_batch.cpu().numpy())  # Store ground truth labels
            y_pred_list.extend(predictions.cpu().numpy())  # Store predictions

    # Convert lists to NumPy arrays
    y_true = np.array(y_true_list)
    y_pred = np.array(y_pred_list)

    # Compute metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # Store results for this trial
    results.append({
        "trial": i+1,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1
    })

    # Print results for this trial
    print(f"Saved model: {model_file}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

# Convert results to a DataFrame for easier analysis
results_df = pd.DataFrame(results)




            
            
            

Trial 1/10
Using device: cuda
Model file does not exist. Training a new model...


Epoch [1/50], Loss: 0.1446
Validation Loss: 0.1023
Epoch [2/50], Loss: 0.1076
Validation Loss: 0.0916
Epoch [3/50], Loss: 0.0981
Validation Loss: 0.0863
Epoch [4/50], Loss: 0.0938
Validation Loss: 0.0829
Epoch [5/50], Loss: 0.0912
Validation Loss: 0.0817
Epoch [6/50], Loss: 0.0892
Validation Loss: 0.0807
Epoch [7/50], Loss: 0.0882
Validation Loss: 0.0796
Epoch [8/50], Loss: 0.0873
Validation Loss: 0.0796
Epoch [9/50], Loss: 0.0867
Validation Loss: 0.0793
Epoch [10/50], Loss: 0.0862
Validation Loss: 0.0790
Epoch [11/50], Loss: 0.0854
Validation Loss: 0.0793
Epoch [12/50], Loss: 0.0853
Validation Loss: 0.0791
Epoch [13/50], Loss: 0.0847
Validation Loss: 0.0784
Epoch [14/50], Loss: 0.0842
Validation Loss: 0.0786
Epoch [15/50], Loss: 0.0845
Validation Loss: 0.0781
Epoch [16/50], Loss: 0.0844
Validation Loss: 0.0782
Epoch [17/50], Loss: 0.0833
Validation Loss: 0.0782
Epoch [18/50], Loss: 0.0837
Validation Loss: 0.0784
Epoch [19/50], Loss: 0.0833
Validation Loss: 0.0783
Epoch [20/50], Loss: 

In [46]:
results_df.head(10)

Unnamed: 0,trial,accuracy,precision,recall,f1_score
0,1,0.9718,0.993127,0.675234,0.803894
1,2,0.9704,0.960526,0.682243,0.797814
2,3,0.9714,0.99308,0.670561,0.800558
3,4,0.971733,1.0,0.669782,0.802239
4,5,0.971267,0.977604,0.679907,0.802021
5,6,0.9718,0.998841,0.67134,0.802981
6,7,0.971733,0.996536,0.672118,0.802791
7,8,0.971667,0.995386,0.672118,0.802417
8,9,0.971667,0.993111,0.673676,0.802784
9,10,0.971733,0.99884,0.670561,0.802423


In [47]:
# sort by f1 score
temp_df = results_df.sort_values(by='f1_score', ascending=False)
temp_df.head(10)


Unnamed: 0,trial,accuracy,precision,recall,f1_score
0,1,0.9718,0.993127,0.675234,0.803894
5,6,0.9718,0.998841,0.67134,0.802981
6,7,0.971733,0.996536,0.672118,0.802791
8,9,0.971667,0.993111,0.673676,0.802784
9,10,0.971733,0.99884,0.670561,0.802423
7,8,0.971667,0.995386,0.672118,0.802417
3,4,0.971733,1.0,0.669782,0.802239
4,5,0.971267,0.977604,0.679907,0.802021
2,3,0.9714,0.99308,0.670561,0.800558
1,2,0.9704,0.960526,0.682243,0.797814
