In [None]:
import sqlite3
import pandas as pd

# Load the dataset
file_path = 'data/diabetes_binary_5050split_health_indicators_BRFSS2023.csv'
df_new = pd.read_csv(file_path)

# Create an SQLite database and save the dataset
conn = sqlite3.connect('diabetes_data.db')  # Creates a file-based SQLite database
df_new.to_sql('diabetes_data', conn, if_exists='replace', index=False)


73592

In [25]:
# Query data from the SQL database
query = "SELECT * FROM diabetes_data"
df_sql = pd.read_sql_query(query, conn)

# Display the first few rows to verify
print(df_sql.head())


   Diabetes_binary  HighBP  HighChol  CholCheck  BMI  Smoker  Stroke  \
0              0.0     0.0       0.0        0.0  2.0     0.0     0.0   
1              0.0     0.0       0.0        0.0  3.0     0.0     0.0   
2              0.0     1.0       0.0        0.0  4.0     1.0     0.0   
3              0.0     0.0       0.0        4.0  3.0     1.0     0.0   
4              0.0     0.0       0.0        0.0  4.0     1.0     0.0   

   HeartDiseaseorAttack  PhysActivity  HvyAlcoholConsump  AnyHealthcare  \
0                   0.0           1.0                0.0            1.0   
1                   0.0           1.0                0.0            1.0   
2                   1.0           1.0                0.0            1.0   
3                   0.0           1.0                0.0            0.0   
4                   0.0           1.0                0.0            1.0   

   NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
0          0.0      3.0       4.0    

In [None]:
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Load the dataset
file_path = 'data/diabetes_binary_5050split_health_indicators_BRFSS2023.csv'
df = pd.read_csv(file_path)

# Separate features and target variable
X = df.drop(columns=['Diabetes_binary'])  # Replace 'Diabetes_binary' with your target column
y = df['Diabetes_binary']

# Check for categorical columns
categorical_columns = X.select_dtypes(include=['object', 'category']).columns

# One-hot encode categorical columns (if any)
if not categorical_columns.empty:
    X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)



In [27]:
# Apply SMOTETomek to balance the training set
smote_tomek = SMOTETomek(random_state=42)
X_train_balanced, y_train_balanced = smote_tomek.fit_resample(X_train, y_train)

In [28]:
# Scale the features
scaler = StandardScaler()
X_train_balanced = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)

In [29]:
# Define model parameters
input_dim = X_train_balanced.shape[1]
num_classes = len(np.unique(y))


In [30]:
# Build the neural network
model = Sequential([
    Dense(256, activation='relu', input_dim=input_dim),
    BatchNormalization(),
    Dropout(0.4),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),
    Dense(num_classes, activation='softmax')  # For multiclass classification
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [31]:
# Compile the model
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])


In [35]:
# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


In [36]:
# Train the model
history = model.fit(
    X_train_balanced, pd.get_dummies(y_train_balanced),
    epochs=100, batch_size=32,
    validation_data=(X_test_scaled, pd.get_dummies(y_test)),
    callbacks=[early_stopping],
    verbose=1
)


Epoch 1/100
[1m1536/1536[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.7660 - loss: 0.4876 - val_accuracy: 0.7403 - val_loss: 0.5231
Epoch 2/100
[1m1536/1536[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.7683 - loss: 0.4858 - val_accuracy: 0.7398 - val_loss: 0.5228
Epoch 3/100
[1m1536/1536[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.7665 - loss: 0.4888 - val_accuracy: 0.7399 - val_loss: 0.5253
Epoch 4/100
[1m1536/1536[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.7648 - loss: 0.4905 - val_accuracy: 0.7401 - val_loss: 0.5237
Epoch 5/100
[1m1536/1536[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.7612 - loss: 0.4891 - val_accuracy: 0.7406 - val_loss: 0.5241
Epoch 6/100
[1m1536/1536[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.7687 - loss: 0.4824 - val_accuracy: 0.7419 - val_loss: 0.5240
Epoch 7/10

In [37]:
# Evaluate the model
y_pred = model.predict(X_test_scaled).argmax(axis=1)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)


[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 822us/step
Accuracy: 0.7405562098016125
Classification Report:
              precision    recall  f1-score   support

         0.0       0.77      0.66      0.71     10604
         1.0       0.72      0.82      0.77     11474

    accuracy                           0.74     22078
   macro avg       0.74      0.74      0.74     22078
weighted avg       0.74      0.74      0.74     22078

