In [2]:
import sqlite3
import pandas as pd

# Load the dataset
file_path = 'data/diabetes_binary_5050split_health_indicators_BRFSS2023.csv'
df_new = pd.read_csv(file_path)

# Create an SQLite database and save the dataset
conn = sqlite3.connect('diabetes_data.db')  # Creates a file-based SQLite database
df_new.to_sql('diabetes_data', conn, if_exists='replace', index=False)


16944

In [3]:
# Query data from the SQL database
query = "SELECT * FROM diabetes_data"
df_sql = pd.read_sql_query(query, conn)

# Display the first few rows to verify
print(df_sql.head())


   Diabetes_binary  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0              0.0     0.0       1.0        0.0  25.0     1.0     0.0   
1              0.0     0.0       0.0        4.0  23.0     0.0     0.0   
2              0.0     1.0       1.0        0.0  33.0     1.0     0.0   
3              0.0     1.0       0.0        0.0  31.0     0.0     0.0   
4              0.0     0.0       0.0        0.0  25.0     0.0     0.0   

   HeartDiseaseorAttack  PhysActivity  HvyAlcoholConsump  AnyHealthcare  \
0                   0.0           1.0                0.0            1.0   
1                   0.0           1.0                0.0            1.0   
2                   0.0           1.0                1.0            1.0   
3                   0.0           0.0                0.0            1.0   
4                   0.0           1.0                0.0            1.0   

   NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
0          0.0      2.0       0

In [4]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report


# Separate features and target variable
X = df_sql.drop(columns=['Diabetes_binary'])
y = df_sql['Diabetes_binary']

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Apply SMOTE to balance the training set
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Convert target variable to categorical (one-hot encoding)
y_train_smote_categorical = to_categorical(y_train_smote, num_classes=3)
y_test_categorical = to_categorical(y_test, num_classes=3)

# Compute class weights based on original distribution
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y),
    y=y
)
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Build the neural network
model = Sequential([
    Dense(128, input_dim=X_train_smote.shape[1], activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')  # 3 classes
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    X_train_smote, y_train_smote_categorical,
    epochs=20,
    batch_size=32,
    class_weight=class_weight_dict,
    validation_data=(X_test, y_test_categorical),
    verbose=1
)

# Evaluate the model
y_pred = np.argmax(model.predict(X_test), axis=-1)  # Convert probabilities to class labels
print("Classification Report:")
print(classification_report(y_test, y_pred))


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m552/552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5046 - loss: 0.9990 - val_accuracy: 0.4099 - val_loss: 1.1440
Epoch 2/20
[1m552/552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 790us/step - accuracy: 0.5667 - loss: 0.8678 - val_accuracy: 0.4105 - val_loss: 1.2173
Epoch 3/20
[1m552/552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 826us/step - accuracy: 0.5797 - loss: 0.8322 - val_accuracy: 0.4131 - val_loss: 1.2359
Epoch 4/20
[1m552/552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 828us/step - accuracy: 0.5970 - loss: 0.8031 - val_accuracy: 0.4508 - val_loss: 1.1593
Epoch 5/20
[1m552/552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 774us/step - accuracy: 0.6096 - loss: 0.7629 - val_accuracy: 0.4386 - val_loss: 1.2082
Epoch 6/20
[1m552/552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 774us/step - accuracy: 0.6338 - loss: 0.7212 - val_accuracy: 0.4422 - val_loss: 1.2133
Epoch 7/20
[1m552/552[0m [

In [6]:

# Separate features and target variable
X = df_sql.drop(columns=['Diabetes_binary'])
y = df_sql['Diabetes_binary']

# Step 3: Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 4: One-hot encode the target variable for multiclass classification
y_categorical = to_categorical(y, num_classes=3)

# Step 5: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_categorical, test_size=0.3, random_state=42)

# Step 6: Build the Neural Network
model = Sequential([
    Dense(128, input_dim=X_train.shape[1], activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')  # Output layer for 3 classes
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Step 7: Train the Model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test), verbose=1)

# Step 8: Evaluate the Model
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_test_classes = y_test.argmax(axis=1)

# Generate classification report
report = classification_report(y_test_classes, y_pred_classes)

report


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m371/371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5253 - loss: 0.8900 - val_accuracy: 0.5645 - val_loss: 0.8152
Epoch 2/20
[1m371/371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 974us/step - accuracy: 0.5666 - loss: 0.8254 - val_accuracy: 0.5533 - val_loss: 0.8214
Epoch 3/20
[1m371/371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 899us/step - accuracy: 0.5749 - loss: 0.8071 - val_accuracy: 0.5602 - val_loss: 0.8123
Epoch 4/20
[1m371/371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 858us/step - accuracy: 0.5720 - loss: 0.8131 - val_accuracy: 0.5606 - val_loss: 0.8137
Epoch 5/20
[1m371/371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 873us/step - accuracy: 0.5900 - loss: 0.7986 - val_accuracy: 0.5624 - val_loss: 0.8139
Epoch 6/20
[1m371/371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 889us/step - accuracy: 0.5829 - loss: 0.7989 - val_accuracy: 0.5661 - val_loss: 0.8198
Epoch 7/20
[1m371/371[0m [

'              precision    recall  f1-score   support\n\n           0       0.60      0.61      0.60      2591\n           1       0.50      0.50      0.50      1946\n           2       0.35      0.31      0.33       547\n\n    accuracy                           0.54      5084\n   macro avg       0.48      0.47      0.48      5084\nweighted avg       0.53      0.54      0.54      5084\n'