In [10]:
import sqlite3
import pandas as pd

# Load the dataset
file_path = 'data/diabetes_binary_5050split_health_indicators_BRFSS2023.csv'
df_new = pd.read_csv(file_path)

# Create an SQLite database and save the dataset
conn = sqlite3.connect('diabetes_data.db')  # Creates a file-based SQLite database
df_new.to_sql('diabetes_data', conn, if_exists='replace', index=False)


16944

In [11]:
# Query data from the SQL database
query = "SELECT * FROM diabetes_data"
df_sql = pd.read_sql_query(query, conn)

# Display the first few rows to verify
print(df_sql.head())


   Diabetes_binary  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0              0.0     0.0       1.0        0.0  25.0     1.0     0.0   
1              0.0     0.0       0.0        4.0  23.0     0.0     0.0   
2              0.0     1.0       1.0        0.0  33.0     1.0     0.0   
3              0.0     1.0       0.0        0.0  31.0     0.0     0.0   
4              0.0     0.0       0.0        0.0  25.0     0.0     0.0   

   HeartDiseaseorAttack  PhysActivity  HvyAlcoholConsump  AnyHealthcare  \
0                   0.0           1.0                0.0            1.0   
1                   0.0           1.0                0.0            1.0   
2                   0.0           1.0                1.0            1.0   
3                   0.0           0.0                0.0            1.0   
4                   0.0           1.0                0.0            1.0   

   NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
0          0.0      2.0       0

In [12]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report


# Separate features and target variable
X = df_sql.drop(columns=['Diabetes_binary'])
y = df_sql['Diabetes_binary']

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Apply SMOTE to balance the training set
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Convert target variable to categorical (one-hot encoding)
y_train_smote_categorical = to_categorical(y_train_smote, num_classes=3)
y_test_categorical = to_categorical(y_test, num_classes=3)

# Compute class weights based on original distribution
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y),
    y=y
)
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Build the neural network
model = Sequential([
    Dense(128, input_dim=X_train_smote.shape[1], activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')  # 3 classes
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    X_train_smote, y_train_smote_categorical,
    epochs=20,
    batch_size=32,
    class_weight=class_weight_dict,
    validation_data=(X_test, y_test_categorical),
    verbose=1
)

# Evaluate the model
y_pred = np.argmax(model.predict(X_test), axis=-1)  # Convert probabilities to class labels
print("Classification Report:")
print(classification_report(y_test, y_pred))


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m552/552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.4936 - loss: 1.0537 - val_accuracy: 0.4032 - val_loss: 1.2580
Epoch 2/20
[1m552/552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 852us/step - accuracy: 0.5665 - loss: 0.8741 - val_accuracy: 0.4121 - val_loss: 1.1589
Epoch 3/20
[1m552/552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 809us/step - accuracy: 0.5855 - loss: 0.8361 - val_accuracy: 0.4426 - val_loss: 1.1035
Epoch 4/20
[1m552/552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 787us/step - accuracy: 0.6026 - loss: 0.7989 - val_accuracy: 0.4241 - val_loss: 1.2217
Epoch 5/20
[1m552/552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 805us/step - accuracy: 0.6149 - loss: 0.7668 - val_accuracy: 0.4514 - val_loss: 1.1364
Epoch 6/20
[1m552/552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 829us/step - accuracy: 0.6256 - loss: 0.7448 - val_accuracy: 0.4424 - val_loss: 1.2493
Epoch 7/20
[1m552/552[0m [

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report

# Separate features and target variable
X = df_sql.drop(columns=['Diabetes_binary'])
y = df_sql['Diabetes_binary']

# Step 3: Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 4: One-hot encode the target variable for multiclass classification
y_categorical = to_categorical(y, num_classes=3)

# Step 5: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_categorical, test_size=0.3, random_state=42)

# Step 6: Build the Neural Network
model = Sequential([
    Dense(128, input_dim=X_train.shape[1], activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')  # Output layer for 3 classes
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Step 7: Train the Model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test), verbose=1)

# Step 8: Evaluate the Model
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_test_classes = y_test.argmax(axis=1)

# Generate classification report
report = classification_report(y_test_classes, y_pred_classes)

report


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m371/371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5276 - loss: 0.9004 - val_accuracy: 0.5633 - val_loss: 0.8108
Epoch 2/20
[1m371/371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 858us/step - accuracy: 0.5702 - loss: 0.8189 - val_accuracy: 0.5608 - val_loss: 0.8135
Epoch 3/20
[1m371/371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 847us/step - accuracy: 0.5693 - loss: 0.8092 - val_accuracy: 0.5624 - val_loss: 0.8212
Epoch 4/20
[1m371/371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 863us/step - accuracy: 0.5753 - loss: 0.8093 - val_accuracy: 0.5578 - val_loss: 0.8131
Epoch 5/20
[1m371/371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 876us/step - accuracy: 0.5847 - loss: 0.8022 - val_accuracy: 0.5496 - val_loss: 0.8206
Epoch 6/20
[1m371/371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 887us/step - accuracy: 0.5883 - loss: 0.7917 - val_accuracy: 0.5525 - val_loss: 0.8180
Epoch 7/20
[1m371/371[0m [

'              precision    recall  f1-score   support\n\n           0       0.58      0.65      0.61      2591\n           1       0.50      0.46      0.48      1946\n           2       0.34      0.23      0.27       547\n\n    accuracy                           0.53      5084\n   macro avg       0.47      0.45      0.46      5084\nweighted avg       0.52      0.53      0.53      5084\n'