In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import Dense, Input, LeakyReLU, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import os


In [2]:

# Load the raw diabetes dataset and list the columns it provides.
csv_path = 'diabetes_prediction_dataset.csv'
df = pd.read_csv(csv_path)
print(df.columns)

Index(['gender', 'age', 'hypertension', 'heart_disease', 'smoking_history',
       'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes'],
      dtype='object')


In [3]:
# One-hot encode the two categorical columns; drop_first=True omits the first
# level of each, so that level is represented by all of its indicators being False.
categorical_columns = ['gender', 'smoking_history']
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
encoded_preview = df_encoded.head()
print(encoded_preview)

    age  hypertension  heart_disease    bmi  HbA1c_level  blood_glucose_level  \
0  80.0             0              1  25.19          6.6                  140   
1  54.0             0              0  27.32          6.6                   80   
2  28.0             0              0  27.32          5.7                  158   
3  36.0             0              0  23.45          5.0                  155   
4  76.0             1              1  20.14          4.8                  155   

   diabetes  gender_Male  gender_Other  smoking_history_current  \
0         0        False         False                    False   
1         0        False         False                    False   
2         0         True         False                    False   
3         0        False         False                     True   
4         0         True         False                     True   

   smoking_history_ever  smoking_history_former  smoking_history_never  \
0                 False             

In [4]:
# Partition the encoded columns by dtype: the one-hot indicators are bool,
# the remaining columns (including the 'diabetes' label) are numeric.
boolean_columns = df_encoded.select_dtypes(include='bool').columns
numerical_columns = df_encoded.select_dtypes(include='number').columns
for column_index in (boolean_columns, numerical_columns):
    print(column_index)

Index(['gender_Male', 'gender_Other', 'smoking_history_current',
       'smoking_history_ever', 'smoking_history_former',
       'smoking_history_never', 'smoking_history_not current'],
      dtype='object')
Index(['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level',
       'blood_glucose_level', 'diabetes'],
      dtype='object')


In [5]:
# Split the columns into the ones to standardise and the ones carried through
# unchanged. 'diabetes' is the label and 'age' is used later for cohort
# bucketing, so neither is scaled.
#
# errors='ignore' plus rebuilding the pass-through index from scratch keeps
# this cell idempotent: a plain drop()/insert() raises KeyError (drop) or
# duplicates 'age'/'diabetes' (insert) when the cell is run a second time in
# the same kernel.
unscaled_columns = ['age', 'diabetes']
numerical_columns = numerical_columns.drop(unscaled_columns, errors='ignore')
boolean_columns = pd.Index(unscaled_columns).append(
    boolean_columns.drop(unscaled_columns, errors='ignore')
)

# NOTE(review): the scaler is fitted on every row, before any train/test
# split, so held-out statistics leak into the features — consider fitting on
# the training rows only.
scaler = StandardScaler()
df_numerical_scaled = pd.DataFrame(
    scaler.fit_transform(df_encoded[numerical_columns]),
    columns=numerical_columns,
    index=df_encoded.index,  # keep the row labels so the concat below aligns row-for-row
)
df_scaled = pd.concat([df_numerical_scaled, df_encoded[boolean_columns]], axis=1)

In [6]:
# Preview the first five rows of the scaled / encoded feature table.
df_scaled.head(n=5)

Unnamed: 0,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,age,diabetes,gender_Male,gender_Other,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,-0.284439,4.936379,-0.321056,1.001706,0.047704,80.0,0,False,False,False,False,False,True,False
1,-0.284439,-0.202578,-0.000116,1.001706,-1.42621,54.0,0,False,False,False,False,False,False,False
2,-0.284439,-0.202578,-0.000116,0.161108,0.489878,28.0,0,True,False,False,False,False,True,False
3,-0.284439,-0.202578,-0.583232,-0.49269,0.416183,36.0,0,False,False,True,False,False,False,False
4,3.515687,4.936379,-1.08197,-0.67949,0.416183,76.0,0,True,False,True,False,False,False,False


In [7]:
# Count how many rows share each distinct age value (most frequent first).
age_column = df_scaled['age']
age_distribution = age_column.value_counts()
print(age_distribution)

age
80.00    5621
51.00    1619
47.00    1574
48.00    1568
53.00    1542
         ... 
0.48       83
1.00       83
0.40       66
0.16       59
0.08       36
Name: count, Length: 102, dtype: int64


In [8]:
# Inclusive whole-year bounds of each age cohort; the "start-end" label built
# from them is the key used to look the cohort up in `datasets`.
age_groups = [(0, 29), (30, 59), (60, 89)]

# 'age' holds fractional values (e.g. 0.08, 0.48), so a closed [start, end]
# test would silently drop any row whose age falls strictly between two
# cohorts (29.5, 59.9, ...). Select on the half-open interval
# [start, end + 1) instead; for whole-number ages this picks exactly the same
# rows as the closed interval.
datasets = {}
for start, end in age_groups:
    group_name = f"{start}-{end}"
    in_group = (df_scaled['age'] >= start) & (df_scaled['age'] < end + 1)
    datasets[group_name] = df_scaled[in_group]

# Display the number of records in each dataset
for group_name, dataset in datasets.items():
    print(f"Age group {group_name}: {len(dataset)} records")

Age group 0-29: 32435 records
Age group 30-59: 42510 records
Age group 60-89: 25055 records


In [9]:
# Pick the 30-59 age cohort as the working dataset.
selected_group = '30-59'
testDF = datasets[selected_group]

In [10]:
# Model inputs are every column except the label and the age used for bucketing.
non_feature_columns = ['diabetes', 'age']
x = testDF.drop(columns=non_feature_columns)
y = testDF['diabetes']

# Hold out 20% of the cohort; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Filename for the autoencoder model
model_file = 'autoencoder.keras'

# Feed Keras explicit float32 data: the one-hot columns are bool, and a mixed
# bool/float DataFrame does not reliably convert to a single numeric tensor.
x_train_values = x_train.astype('float32').to_numpy()
x_test_float = x_test.astype('float32')

# Reuse a previously saved model when one exists; otherwise train and save one.
# NOTE: an existing model file is loaded as-is, whatever produced it — delete
# the file to retrain with the architecture defined below.
if os.path.exists(model_file):
    print("Model file exists. Loading the model...")
    autoencoder = load_model(model_file)
else:
    print("Model file does not exist. Training a new model...")

    input_dim = x_train_values.shape[1]
    # NOTE(review): the feature table appears to have 12 columns, in which
    # case this equals input_dim and the "bottleneck" compresses nothing —
    # consider a smaller value.
    encoding_dim = 12  # Bottleneck layer dimension

    # Encoder: input_dim -> 256 -> 128 -> 64 -> encoding_dim, with dropout
    # after each hidden layer.
    input_layer = Input(shape=(input_dim,))
    encoded = input_layer
    for units in (256, 128, 64):
        encoded = Dense(units)(encoded)
        encoded = LeakyReLU()(encoded)
        encoded = Dropout(0.2)(encoded)
    encoded = Dense(encoding_dim, activation='linear')(encoded)  # Bottleneck layer
    encoder = Model(input_layer, encoded)

    # Decoder: encoding_dim -> 64 -> 128 -> 256 -> input_dim.
    # The output layer is linear, not sigmoid: the numeric inputs are
    # standardised and take values outside (0, 1) (e.g. -0.28 or 4.9), which a
    # sigmoid can never reproduce.
    encoded_input = Input(shape=(encoding_dim,))
    decoded = encoded_input
    for units in (64, 128, 256):
        decoded = Dense(units)(decoded)
        decoded = LeakyReLU()(decoded)
    decoded = Dense(input_dim, activation='linear')(decoded)
    decoder = Model(encoded_input, decoded)

    # Combine Encoder and Decoder to form the Autoencoder
    autoencoder_input = Input(shape=(input_dim,))
    encoded_repr = encoder(autoencoder_input)
    reconstructed = decoder(encoded_repr)
    autoencoder = Model(autoencoder_input, reconstructed)
    autoencoder.compile(optimizer=Adam(learning_rate=0.00005), loss='mse')

    # Train the whole autoencoder end-to-end on the reconstruction objective.
    # Fitting the encoder on its own against x_train only type-checks while
    # encoding_dim == input_dim, teaches it to copy its input rather than to
    # encode it, and leaves the combined model itself untrained.
    autoencoder.fit(
        x_train_values, x_train_values,
        epochs=200, validation_split=0.2, verbose=1,
        callbacks=[EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)],
    )

    # Save the model as autoencoder.keras
    autoencoder.save(model_file)
    print("Model trained and saved.")

# Per-row reconstruction error on the held-out split: mean squared difference
# between each row's features and their reconstruction.
reconstructed_test = autoencoder.predict(x_test_float.to_numpy())
reconstruction_error = np.mean(np.square(x_test_float - reconstructed_test), axis=1)

# NOTE: this is the mean squared error multiplied by 100, not a true
# percentage; the label is kept unchanged so existing outputs stay comparable.
reconstruction_error_percentage = np.mean(reconstruction_error) * 100

# Print reconstruction error percentage
print(f'Reconstruction Error Percentage: {reconstruction_error_percentage}')


Model file exists. Loading the model...
[1m  1/266[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m34s[0m 129ms/step

  saveable.load_own_variables(weights_store.get(inner_path))


[1m266/266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Reconstruction Error Percentage: 21.350165776419573


In [12]:
# Reload the persisted autoencoder from disk.
model_file = 'autoencoder.keras'
autoencoder = load_model(model_file)
print("Model loaded successfully.")

# Pull out the two remaining age cohorts as additional evaluation sets.
testDF_0_29, testDF_60_89 = datasets['0-29'], datasets['60-89']

# For each cohort, separate the model inputs from the diabetes label; the
# label and the age used for bucketing are not model inputs.
excluded_columns = ['diabetes', 'age']

x_test_0_29 = testDF_0_29.drop(columns=excluded_columns)
y_test_0_29 = testDF_0_29['diabetes']

x_test_60_89 = testDF_60_89.drop(columns=excluded_columns)
y_test_60_89 = testDF_60_89['diabetes']

print("Test data for age range 0-29 and 60-89 created successfully.")


Model loaded successfully.
Test data for age range 0-29 and 60-89 created successfully.


  saveable.load_own_variables(weights_store.get(inner_path))


In [13]:
# Run the 0-29 cohort through the autoencoder and score every row by the mean
# squared difference between its features and their reconstruction.
reconstructed_test_0_29 = autoencoder.predict(x_test_0_29)
squared_residuals_0_29 = np.square(x_test_0_29 - reconstructed_test_0_29)
reconstruction_error_0_29 = np.mean(squared_residuals_0_29, axis=1)

# Cohort-level summary: the average per-row error, scaled by 100.
mean_error_0_29 = np.mean(reconstruction_error_0_29)
reconstruction_error_percentage_0_29 = mean_error_0_29 * 100

# Report the summary figure for the 0-29 age group.
print(f'Reconstruction Error Percentage for 0-29 age group: {reconstruction_error_percentage_0_29}')

[1m1014/1014[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Reconstruction Error Percentage for 0-29 age group: 19.063118658012844


In [14]:
# Run the 60-89 cohort through the autoencoder and score every row by the mean
# squared difference between its features and their reconstruction.
reconstructed_test_60_89 = autoencoder.predict(x_test_60_89)
squared_residuals_60_89 = np.square(x_test_60_89 - reconstructed_test_60_89)
reconstruction_error_60_89 = np.mean(squared_residuals_60_89, axis=1)

# Cohort-level summary: the average per-row error, scaled by 100.
mean_error_60_89 = np.mean(reconstruction_error_60_89)
reconstruction_error_percentage_60_89 = mean_error_60_89 * 100

# Report the summary figure for the 60-89 age group.
print(f'Reconstruction Error Percentage for 60-89 age group: {reconstruction_error_percentage_60_89}')

[1m783/783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Reconstruction Error Percentage for 60-89 age group: 40.02930996411611
