In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
import os


In [4]:

# Read the CSV file
df = pd.read_csv('diabetes_prediction_dataset.csv')
print(df.columns)

Index(['gender', 'age', 'hypertension', 'heart_disease', 'smoking_history',
       'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes'],
      dtype='object')


## <span style="color:red">Preprocessing</span>

In [5]:
null_counts = df.isnull().sum()
print(null_counts)

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64


### <span style="color:yellow">No columns have null values</span>

In [6]:
for i in df.columns:
    print("number of unique values in", i, ":", df[i].nunique())

number of unique values in gender : 3
number of unique values in age : 102
number of unique values in hypertension : 2
number of unique values in heart_disease : 2
number of unique values in smoking_history : 6
number of unique values in bmi : 4247
number of unique values in HbA1c_level : 18
number of unique values in blood_glucose_level : 18
number of unique values in diabetes : 2


### <span style="color:yellow">One-hot encoding for gender and smoking_history</span>

In [7]:
df_encoded = pd.get_dummies(df, columns=['gender', 'smoking_history'], drop_first=True)
print(df_encoded.head())

    age  hypertension  heart_disease    bmi  HbA1c_level  blood_glucose_level  \
0  80.0             0              1  25.19          6.6                  140   
1  54.0             0              0  27.32          6.6                   80   
2  28.0             0              0  27.32          5.7                  158   
3  36.0             0              0  23.45          5.0                  155   
4  76.0             1              1  20.14          4.8                  155   

   diabetes  gender_Male  gender_Other  smoking_history_current  \
0         0        False         False                    False   
1         0        False         False                    False   
2         0         True         False                    False   
3         0        False         False                     True   
4         0         True         False                     True   

   smoking_history_ever  smoking_history_former  smoking_history_never  \
0                 False             

### <span style="color:yellow">Normalizing the data</span>

In [8]:
boolean_columns = df_encoded.select_dtypes(include=bool).columns
numerical_columns = df_encoded.select_dtypes(include=np.number).columns
print(boolean_columns)
print(numerical_columns)

Index(['gender_Male', 'gender_Other', 'smoking_history_current',
       'smoking_history_ever', 'smoking_history_former',
       'smoking_history_never', 'smoking_history_not current'],
      dtype='object')
Index(['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level',
       'blood_glucose_level', 'diabetes'],
      dtype='object')


In [9]:
# Separate features and target
X = df_encoded.drop(columns=['diabetes'])
y = df_encoded['diabetes']

# Identify numerical, boolean, and one-hot encoded columns
numerical_columns = X.select_dtypes(include=['float64', 'int64']).columns
boolean_columns = X.select_dtypes(include=['bool']).columns
one_hot_encoded_columns = X.columns.difference(numerical_columns.union(boolean_columns))

# Resample the data using SMOTE
# smote = SMOTE(random_state=42, sampling_strategy=0.10)

# Nevermind let's not do the resampling for now
X_resampled, y_resampled = X,y

# Normalize the numerical features
scaler = StandardScaler()
X_resampled_numerical = pd.DataFrame(scaler.fit_transform(X_resampled[numerical_columns]), columns=numerical_columns)

# Combine the normalized numerical features with the boolean and one-hot encoded features
X_resampled_scaled = pd.concat([X_resampled_numerical, X_resampled[boolean_columns].reset_index(drop=True), X_resampled[one_hot_encoded_columns].reset_index(drop=True)], axis=1)

# Add the target variable back to the DataFrame
df_resampled_scaled = pd.concat([X_resampled_scaled, y_resampled.reset_index(drop=True)], axis=1)

# Display the first few rows of the resampled and scaled DataFrame
print("Resampled and Scaled DataFrame:")
print(df_resampled_scaled.head())

# Check the class distribution after resampling
print("\nClass Distribution After Resampling:")
print(df_resampled_scaled['diabetes'].value_counts())

print("\nClass distribution before resampling:")
print(df['diabetes'].value_counts())

Resampled and Scaled DataFrame:
        age  hypertension  heart_disease       bmi  HbA1c_level  \
0  1.692704     -0.284439       4.936379 -0.321056     1.001706   
1  0.538006     -0.284439      -0.202578 -0.000116     1.001706   
2 -0.616691     -0.284439      -0.202578 -0.000116     0.161108   
3 -0.261399     -0.284439      -0.202578 -0.583232    -0.492690   
4  1.515058      3.515687       4.936379 -1.081970    -0.679490   

   blood_glucose_level  gender_Male  gender_Other  smoking_history_current  \
0             0.047704        False         False                    False   
1            -1.426210        False         False                    False   
2             0.489878         True         False                    False   
3             0.416183        False         False                     True   
4             0.416183         True         False                     True   

   smoking_history_ever  smoking_history_former  smoking_history_never  \
0                 Fals

### <span style="color:yellow">Train test split</span>

In [10]:
x = df_resampled_scaled.drop('diabetes', axis=1)
y = df_resampled_scaled['diabetes']

x_train_val, x_test, y_train_val, y_test = train_test_split(x, y, test_size=0.15, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, test_size=0.1765, random_state=42)

print(f"Train set size: {len(x_train)}")
print(f"Validation set size: {len(x_val)}")
print(f"Test set size: {len(x_test)}")

Train set size: 69997
Validation set size: 15003
Test set size: 15000


In [11]:
x_train

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,gender_Male,gender_Other,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
6237,-0.927571,-0.284439,-0.202578,-0.991565,-1.893686,0.539009,False,False,False,False,False,False,False
55777,1.159766,-0.284439,-0.202578,-0.331603,1.842303,2.504228,False,False,False,False,True,False,False
35621,0.626829,-0.284439,-0.202578,0.608616,0.534707,3.486837,True,False,True,False,False,False,False
36264,-0.616691,-0.284439,-0.202578,0.189736,0.441307,2.012923,False,False,False,False,False,True,False
72023,-0.572279,-0.284439,-0.202578,-0.000116,-1.893686,0.539009,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4783,-1.726976,-0.284439,-0.202578,-1.339627,1.001706,-1.303384,True,False,False,False,False,False,False
62465,0.538006,-0.284439,-0.202578,0.289183,0.441307,0.416183,False,False,False,False,False,True,False
61720,0.449184,-0.284439,-0.202578,0.161108,0.161108,-0.296209,False,False,False,True,False,False,False
31640,0.404772,-0.284439,-0.202578,0.554373,0.161108,1.521618,True,False,False,False,False,False,False


In [19]:
boolean_cols = x_train.select_dtypes(include=bool).columns

x_train[boolean_cols] = x_train[boolean_cols].astype(int)
x_val[boolean_cols] = x_val[boolean_cols].astype(int)
x_test[boolean_cols] = x_test[boolean_cols].astype(int)

### <span style="color:yellow">Training an MLP model</span>

In [13]:
# import os
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import DataLoader, TensorDataset

# # Check if CUDA is available and print the device being used
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

# # Convert data to PyTorch tensors
# X_train_tensor = torch.tensor(x_train.values, dtype=torch.float32)
# y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
# X_test_tensor = torch.tensor(x_test.values, dtype=torch.float32)
# y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

# # Create DataLoader
# train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
# test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# class MLP(nn.Module):
#     def __init__(self, input_dim):
#         super(MLP, self).__init__()
#         self.model = nn.Sequential(
#             nn.Linear(input_dim, 64),
#             nn.ReLU(),
#             nn.Linear(64, 32),
#             nn.ReLU(),
#             nn.Linear(32, 1),
#             nn.Sigmoid()  # Sigmoid for binary classification
#         )

#     def forward(self, x):
#         return self.model(x)


# # Initialize the model, loss function, and optimizer
# input_dim = x_train.shape[1]
# model = MLP(input_dim).to(device)
# criterion = nn.BCELoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# # Define the path to the saved model file
# model_file = './models/mlp_model.pth'

# # Check if the model file exists
# if os.path.exists(model_file):
#     print("Model file exists. Loading the model...")
#     model.load_state_dict(torch.load(model_file))
# else:
#     print("Model file does not exist. Training a new model...")

#     # Train the model
#     num_epochs = 50
#     best_loss = float('inf')
#     patience = 3
#     patience_counter = 0

#     for epoch in range(num_epochs):
#         model.train()
#         running_loss = 0.0
#         for X_batch, y_batch in train_loader:
#             X_batch, y_batch = X_batch.to(device), y_batch.to(device)

#             optimizer.zero_grad()
#             outputs = model(X_batch)
#             loss = criterion(outputs, y_batch)
#             loss.backward()
#             optimizer.step()

#             running_loss += loss.item() * X_batch.size(0)

#         epoch_loss = running_loss / len(train_loader.dataset)
#         print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

#         # Early stopping
#         model.eval()
#         val_loss = 0.0
#         with torch.no_grad():
#             for X_batch, y_batch in test_loader:
#                 X_batch, y_batch = X_batch.to(device), y_batch.to(device)
#                 outputs = model(X_batch)
#                 loss = criterion(outputs, y_batch)
#                 val_loss += loss.item() * X_batch.size(0)

#         val_loss /= len(test_loader.dataset)
#         print(f'Validation Loss: {val_loss:.4f}')

#         if val_loss < best_loss:
#             best_loss = val_loss
#             patience_counter = 0
#         else:
#             patience_counter += 1
#             if patience_counter >= patience:
#                 print("Early stopping triggered.")
#                 break

#     # Save the trained model after training is complete
#     torch.save(model.state_dict(), model_file)
#     print("Model trained and saved.")


In [14]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class MLP(nn.Module):
    def __init__(self, input_dim, hidden_size, dropout):
        super(MLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, 32),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

def train_mlp(train_dataset, val_dataset, batch_size, hidden_size, learning_rate, dropout, num_epochs, patience, model_file='./models/mlp_model.pth'):
    print(f"Using device: {device}")

    # Create DataLoader
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize the model, loss function, and optimizer
    input_dim = train_dataset.tensors[0].shape[1]
    model = MLP(input_dim, hidden_size, dropout).to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Check if the model file exists
    if os.path.exists(model_file):
        print("Model file exists. Loading the model...")
        model.load_state_dict(torch.load(model_file))
    else:
        print("Model file does not exist. Training a new model...")
        best_loss = float('inf')
        patience_counter = 0

        for epoch in range(num_epochs):
            model.train()
            running_loss = 0.0
            for X_batch, y_batch in train_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)

                optimizer.zero_grad()
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                loss.backward()
                optimizer.step()

                running_loss += loss.item() * X_batch.size(0)

            epoch_loss = running_loss / len(train_loader.dataset)
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

            # Early stopping
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for X_batch, y_batch in val_loader:
                    X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                    outputs = model(X_batch)
                    loss = criterion(outputs, y_batch)
                    val_loss += loss.item() * X_batch.size(0)

            val_loss /= len(val_loader.dataset)
            print(f'Validation Loss: {val_loss:.4f}')

            if val_loss < best_loss:
                best_loss = val_loss
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered.")
                    break

        # Save the trained model after training is complete
        # torch.save(model.state_dict(), model_file)
        print("Model trained and saved.")

    return model


In [15]:
X_train_tensor = torch.tensor(x_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
X_val_tensor = torch.tensor(x_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(x_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

In [16]:
# Hyperparameters
batch_size = 32
hidden_size = 64
learning_rate = 0.001
dropout = 0.2
num_epochs = 250
patience = 10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = train_mlp(train_dataset, val_dataset, batch_size, hidden_size, learning_rate, dropout, num_epochs, patience)



Using device: cpu
Model file does not exist. Training a new model...
Epoch [1/250], Loss: 0.1365
Validation Loss: 0.1070
Epoch [2/250], Loss: 0.1081
Validation Loss: 0.0907
Epoch [3/250], Loss: 0.0975
Validation Loss: 0.0850
Epoch [4/250], Loss: 0.0932
Validation Loss: 0.0828
Epoch [5/250], Loss: 0.0909
Validation Loss: 0.0813
Epoch [6/250], Loss: 0.0890
Validation Loss: 0.0805
Epoch [7/250], Loss: 0.0872
Validation Loss: 0.0825
Epoch [8/250], Loss: 0.0866
Validation Loss: 0.0794
Epoch [9/250], Loss: 0.0867
Validation Loss: 0.0789
Epoch [10/250], Loss: 0.0856
Validation Loss: 0.0798
Epoch [11/250], Loss: 0.0855
Validation Loss: 0.0790
Epoch [12/250], Loss: 0.0851
Validation Loss: 0.0788
Epoch [13/250], Loss: 0.0850
Validation Loss: 0.0798
Epoch [14/250], Loss: 0.0845
Validation Loss: 0.0786
Epoch [15/250], Loss: 0.0845
Validation Loss: 0.0783
Epoch [16/250], Loss: 0.0840
Validation Loss: 0.0787
Epoch [17/250], Loss: 0.0839
Validation Loss: 0.0796
Epoch [18/250], Loss: 0.0843
Validation

In [17]:
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Evaluate the model
model.eval()
test_loss = 0.0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)

# Calculate Mean Absolute Error (MAE)
test_mae = 0.0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        mae = torch.mean(torch.abs(outputs - y_batch))
        test_mae += mae.item() * X_batch.size(0)

test_mae /= len(test_loader.dataset)
print(f'Test MAE: {test_mae:.4f}')

# Calculate Prediction Accuracy
correct_predictions = 0
total_predictions = 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)  # Outputs are probabilities due to sigmoid
        predictions = (outputs > 0.5).float()  # Convert probabilities to binary predictions
        correct_predictions += torch.sum(predictions == y_batch).item()
        total_predictions += y_batch.size(0)

accuracy = correct_predictions / total_predictions
print("Total Predictions:", total_predictions)
print("Correct Predictions:", correct_predictions)
print("Incorrect Predictions:", total_predictions - correct_predictions)
print(f'Prediction Accuracy: {accuracy:.4f}')


Test MAE: 0.0443
Total Predictions: 15000
Correct Predictions: 14574
Incorrect Predictions: 426
Prediction Accuracy: 0.9716


In [18]:
import itertools

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameter tuning
hidden_sizes = [32, 64, 128]
learning_rates = [0.01, 0.005, 0.001]
dropouts = [0.0, 0.1, 0.2, 0.3]
batch_sizes = [16, 32, 64]
num_epochs = 50
patience = 10

num_trials = 5

results = []

total_combinations = len(hidden_sizes) * len(learning_rates) * len(dropouts) * len(batch_sizes)
count = 0

for lr, batch_size, hidden_size, dropout in itertools.product(learning_rates, batch_sizes, hidden_sizes, dropouts):
    print(f"Learning Rate: {lr}, Batch Size: {batch_size}, Hidden Size: {hidden_size}, Dropout: {dropout}")
    print(f"Completed {count}/{total_combinations} combinations")
    count += 1

    maes, accuracies = [], []
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    for i in range(num_trials):
        model = train_mlp(train_dataset, val_dataset, batch_size, hidden_size, lr, dropout, num_epochs, patience)

        model.eval()
        test_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)

        # Calculate Mean Absolute Error (MAE)
        test_mae = 0.0
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                mae = torch.mean(torch.abs(outputs - y_batch))
                test_mae += mae.item() * X_batch.size(0)

        test_mae /= len(test_loader.dataset)
        print(f'Test MAE: {test_mae:.4f}')

        # Calculate Prediction Accuracy
        correct_predictions = 0
        total_predictions = 0
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)  # Outputs are probabilities due to sigmoid
                predictions = (outputs > 0.5).float()  # Convert probabilities to binary predictions
                correct_predictions += torch.sum(predictions == y_batch).item()
                total_predictions += y_batch.size(0)

        accuracy = correct_predictions / total_predictions
        print(f'Prediction Accuracy: {accuracy:.4f}%')

        maes.append(test_mae)
        accuracies.append(accuracy)

    mae_mean = np.mean(maes)
    accuracy_mean = np.mean(accuracies)
    results.append([lr, batch_size, hidden_size, dropout, mae_mean, accuracy_mean])

results_df = pd.DataFrame(results, columns=['Learning Rate', 'Batch Size', 'Hidden Size', 'Dropout', 'MAE', 'Accuracy'])
results_df.to_csv('results.csv', index=False)





[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Validation Loss: 0.0785
Epoch [25/50], Loss: 0.0816
Validation Loss: 0.0791
Epoch [26/50], Loss: 0.0815
Validation Loss: 0.0790
Early stopping triggered.
Model trained and saved.
Test MAE: 0.0458
Prediction Accuracy: 0.9719%
Using device: cpu
Model file does not exist. Training a new model...
Epoch [1/50], Loss: 0.1241
Validation Loss: 0.0939
Epoch [2/50], Loss: 0.0950
Validation Loss: 0.0839
Epoch [3/50], Loss: 0.0893
Validation Loss: 0.0830
Epoch [4/50], Loss: 0.0869
Validation Loss: 0.0801
Epoch [5/50], Loss: 0.0856
Validation Loss: 0.0801
Epoch [6/50], Loss: 0.0852
Validation Loss: 0.0795
Epoch [7/50], Loss: 0.0845
Validation Loss: 0.0815
Epoch [8/50], Loss: 0.0842
Validation Loss: 0.0794
Epoch [9/50], Loss: 0.0839
Validation Loss: 0.0794
Epoch [10/50], Loss: 0.0832
Validation Loss: 0.0789
Epoch [11/50], Loss: 0.0828
Validation Loss: 0.0792
Epoch [12/50], Loss: 0.0831
Validation Loss: 0.0794
Epoch [13/50], Loss: 0.082

In [21]:

results_df.sort_values(by='Accuracy', ascending=False).head(5)


Unnamed: 0,Learning Rate,Batch Size,Hidden Size,Dropout,MAE,Accuracy
107,0.001,64,128,0.3,0.046189,0.971813
48,0.005,32,32,0.0,0.045561,0.971813
49,0.005,32,32,0.1,0.046554,0.9718
85,0.001,32,32,0.1,0.046237,0.971787
103,0.001,64,64,0.3,0.044846,0.97172


In [22]:
results_df.sort_values(by='MAE', ascending=False).head(5)


Unnamed: 0,Learning Rate,Batch Size,Hidden Size,Dropout,MAE,Accuracy
11,0.01,16,128,0.3,0.052501,0.969427
14,0.01,32,32,0.2,0.052433,0.971147
9,0.01,16,128,0.1,0.051909,0.97072
6,0.01,16,64,0.2,0.051487,0.970107
4,0.01,16,64,0.0,0.051299,0.97008
