In [104]:
import tensorflow

In [105]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
import os


In [106]:

# Read the CSV file
df = pd.read_csv('diabetes_prediction_dataset.csv')
print(df.columns)

Index(['gender', 'age', 'hypertension', 'heart_disease', 'smoking_history',
       'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes'],
      dtype='object')


## <span style="color:red">Preprocessing</span>

In [107]:
null_counts = df.isnull().sum()
print(null_counts)

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64


### <span style="color:yellow">No columns have null values</span>

In [108]:
for i in df.columns:
    print("number of unique values in", i, ":", df[i].nunique())

number of unique values in gender : 3
number of unique values in age : 102
number of unique values in hypertension : 2
number of unique values in heart_disease : 2
number of unique values in smoking_history : 6
number of unique values in bmi : 4247
number of unique values in HbA1c_level : 18
number of unique values in blood_glucose_level : 18
number of unique values in diabetes : 2


### <span style="color:yellow">One-hot encoding for gender and smoking_history</span>

In [109]:
df_encoded = pd.get_dummies(df, columns=['gender', 'smoking_history'], drop_first=True)
print(df_encoded.head())

    age  hypertension  heart_disease    bmi  HbA1c_level  blood_glucose_level  \
0  80.0             0              1  25.19          6.6                  140   
1  54.0             0              0  27.32          6.6                   80   
2  28.0             0              0  27.32          5.7                  158   
3  36.0             0              0  23.45          5.0                  155   
4  76.0             1              1  20.14          4.8                  155   

   diabetes  gender_Male  gender_Other  smoking_history_current  \
0         0        False         False                    False   
1         0        False         False                    False   
2         0         True         False                    False   
3         0        False         False                     True   
4         0         True         False                     True   

   smoking_history_ever  smoking_history_former  smoking_history_never  \
0                 False             

### <span style="color:yellow">Normalizing the data</span>

In [110]:
boolean_columns = df_encoded.select_dtypes(include=bool).columns
numerical_columns = df_encoded.select_dtypes(include=np.number).columns
print(boolean_columns)
print(numerical_columns)

Index(['gender_Male', 'gender_Other', 'smoking_history_current',
       'smoking_history_ever', 'smoking_history_former',
       'smoking_history_never', 'smoking_history_not current'],
      dtype='object')
Index(['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level',
       'blood_glucose_level', 'diabetes'],
      dtype='object')


In [111]:
# Separate features and target
X = df_encoded.drop(columns=['diabetes'])
y = df_encoded['diabetes']

# Identify numerical and boolean columns
numerical_columns = X.select_dtypes(include=['float64', 'int64']).columns
boolean_columns = X.select_dtypes(include=['bool']).columns

# Resample the data using SMOTE
smote = SMOTE(random_state=42, sampling_strategy=0.15)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Normalize the numerical features
scaler = StandardScaler()
X_resampled_numerical = pd.DataFrame(scaler.fit_transform(X_resampled[numerical_columns]), columns=numerical_columns)

# Combine the normalized numerical features with the boolean features
X_resampled_scaled = pd.concat([X_resampled_numerical, X_resampled[boolean_columns].reset_index(drop=True)], axis=1)

# Add the target variable back to the DataFrame
df_resampled_scaled = pd.concat([X_resampled_scaled, y_resampled.reset_index(drop=True)], axis=1)

# Display the first few rows of the resampled and scaled DataFrame
print("Resampled and Scaled DataFrame:")
print(df_resampled_scaled.head())

# Check the class distribution after resampling
print("\nClass Distribution After Resampling:")
print(df_resampled_scaled['diabetes'].value_counts())

Resampled and Scaled DataFrame:
        age  hypertension  heart_disease       bmi  HbA1c_level  \
0  1.645531     -0.284262       4.941944 -0.350389     0.908296   
1  0.493983     -0.284262      -0.202350 -0.034864     0.908296   
2 -0.657565     -0.284262      -0.202350 -0.034864     0.094456   
3 -0.303243     -0.284262      -0.202350 -0.608142    -0.538531   
4  1.468370      3.517882       4.941944 -1.098465    -0.719385   

   blood_glucose_level  gender_Male  gender_Other  smoking_history_current  \
0            -0.018695        False         False                    False   
1            -1.399270        False         False                    False   
2             0.395478         True         False                    False   
3             0.326449        False         False                     True   
4             0.326449         True         False                     True   

   smoking_history_ever  smoking_history_former  smoking_history_never  \
0                 Fals

### <span style="color:yellow">Train test split</span>

In [112]:
x = df_resampled_scaled.drop('diabetes', axis=1)
y = df_resampled_scaled['diabetes']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

### <span style="color:yellow">Training an MLP model</span>

In [113]:
# Define the path to the saved model file
model_file = './models/mlp_model_resampled_0.15.keras'

# Check if the model file exists
if os.path.exists(model_file):
    print("Model file exists. Loading the model...")
    # Load the saved model
    mlp_model = load_model(model_file)
else:
    print("Model file does not exist. Training a new model...")
    # Define the MLP model
    mlp_model = Sequential([
        Dense(64, activation='relu', input_shape=(x_train.shape[1],)),  # Input layer
        Dense(32, activation='relu'),  # Hidden layer
        Dense(1)  # Output layer for regression
    ])

    # Compile the model
    mlp_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

    # Define early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    # Train the model with early stopping
    mlp_model.fit(x_train, y_train, epochs=50, validation_split=0.2, verbose=1, callbacks=[early_stopping])

    # Save the trained model
    mlp_model.save(model_file)
    print("Model trained and saved.")

Model file does not exist. Training a new model...
Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 0.0514 - val_loss: 0.0423
Epoch 2/50
[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 0.0414 - val_loss: 0.0395
Epoch 3/50
[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 0.0397 - val_loss: 0.0377
Epoch 4/50
[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 0.0377 - val_loss: 0.0349
Epoch 5/50
[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 0.0343 - val_loss: 0.0337
Epoch 6/50
[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 0.0328 - val_loss: 0.0331
Epoch 7/50
[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 0.0325 - val_loss: 0.0322
Epoch 8/50
[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 0.0315 - val_loss: 0.0318
Epoch 9/50
[1m2105/2105[0m [32m━

In [114]:
# Make predictions on the test data
y_pred = mlp_model.predict(x_test)

# Calculate the Mean Squared Error (MSE) on the test data
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error on Test Set: {mse}')

# Convert y_pred to binary predictions
y_pred_binary = np.where(y_pred >= 0.5, 1, 0)

# Ensure y_test is a numpy array and reshape if necessary
y_test_array = y_test.values if hasattr(y_test, 'values') else y_test
if y_test_array.ndim > 1:
    y_test_array = y_test_array.reshape(-1)

# Ensure y_pred_binary is a 1-dimensional array
if y_pred_binary.ndim > 1:
    y_pred_binary = y_pred_binary.reshape(-1)

# Compare y_pred_binary with y_test
correct_predictions = np.sum(y_pred_binary == y_test_array)
wrong_predictions = np.sum(y_pred_binary != y_test_array)

# Calculate additional metrics
true_positives = np.sum((y_pred_binary == 1) & (y_test_array == 1))
true_negatives = np.sum((y_pred_binary == 0) & (y_test_array == 0))
false_positives = np.sum((y_pred_binary == 1) & (y_test_array == 0))
false_negatives = np.sum((y_pred_binary == 0) & (y_test_array == 1))

print(f"Number of Correct Predictions: {correct_predictions}")
print(f"Number of Wrong Predictions: {wrong_predictions}")
print(f"Number of True Positives (1 predicted as 1): {true_positives}")
print(f"Number of True Negatives (0 predicted as 0): {true_negatives}")
print(f"Number of False Positives (0 predicted as 1): {false_positives}")
print(f"Number of False Negatives (1 predicted as 0): {false_negatives}")

[1m658/658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Mean Squared Error on Test Set: 0.028899969714189382
Number of Correct Predictions: 20276
Number of Wrong Predictions: 769
Number of True Positives (1 predicted as 1): 1946
Number of True Negatives (0 predicted as 0): 18330
Number of False Positives (0 predicted as 1): 44
Number of False Negatives (1 predicted as 0): 725
