In [2]:
import sqlite3
import pandas as pd

# Load the dataset
file_path = 'Data/diabetes_binary_5050split_health_indicators_BRFSS2023.csv'
df_new = pd.read_csv(file_path)

# Create an SQLite database and save the dataset
conn = sqlite3.connect('diabetes_data.db')  # Creates a file-based SQLite database
df_new.to_sql('diabetes_data', conn, if_exists='replace', index=False)


76024

In [3]:
# Query data from the SQL database
query = "SELECT * FROM diabetes_data"
df_sql = pd.read_sql_query(query, conn)

# Display the first few rows to verify
print(df_sql.head())


   Diabetes_binary  HighBP  HighChol  CholCheck  BMI  PhysActivity  \
0              0.0     0.0       0.0        2.0  3.0           2.0   
1              0.0     0.0       0.0        3.0  3.0           1.0   
2              0.0     1.0       1.0        2.0  3.0           1.0   
3              0.0     0.0       1.0        2.0  3.0           9.0   
4              0.0     1.0       0.0        2.0  2.0           1.0   

   LastCheckup  Smoker  CHCKDNY2  Stroke  ...  AnyHealthcare  NoDocbcCost  \
0          1.0     4.0       2.0     0.0  ...            1.0          0.0   
1          1.0     4.0       2.0     0.0  ...            1.0          0.0   
2          1.0     3.0       2.0     0.0  ...            1.0          0.0   
3          1.0     4.0       2.0     0.0  ...            1.0          0.0   
4          1.0     1.0       2.0     0.0  ...            1.0          0.0   

   GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  Income  
0      2.0       3.0       0.0       0.0  1

In [4]:
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import numpy as np

# Load the dataset
file_path = 'Data/diabetes_binary_5050split_health_indicators_BRFSS2023.csv'
df = pd.read_csv(file_path)

# Separate features and target variable
X = df.drop(columns=['Diabetes_binary'])  # Replace 'Diabetes_binary' with your target column
y = df['Diabetes_binary']


In [5]:
# Check for categorical columns
categorical_columns = X.select_dtypes(include=['object', 'category']).columns

# One-hot encode categorical columns (if any)
if not categorical_columns.empty:
    X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)


In [6]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


In [7]:
# Apply SMOTETomek to balance the training data
smote_tomek = SMOTETomek(random_state=42)
X_train_balanced, y_train_balanced = smote_tomek.fit_resample(X_train, y_train)


In [8]:
# Scale the features
scaler = StandardScaler()
X_train_balanced = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Build the neural network
model = Sequential([
    Dense(256, activation='relu', input_dim=X_train_balanced.shape[1]),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification output
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [10]:
# Train the model
history = model.fit(
    X_train_balanced, y_train_balanced,
    epochs=20, batch_size=32, validation_data=(X_test_scaled, y_test),
    verbose=1
)

Epoch 1/20
[1m1457/1457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7260 - loss: 0.5627 - val_accuracy: 0.7481 - val_loss: 0.5152
Epoch 2/20
[1m1457/1457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.7567 - loss: 0.5050 - val_accuracy: 0.7486 - val_loss: 0.5139
Epoch 3/20
[1m1457/1457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.7613 - loss: 0.4963 - val_accuracy: 0.7482 - val_loss: 0.5107
Epoch 4/20
[1m1457/1457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.7609 - loss: 0.4911 - val_accuracy: 0.7514 - val_loss: 0.5130
Epoch 5/20
[1m1457/1457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.7705 - loss: 0.4852 - val_accuracy: 0.7509 - val_loss: 0.5108
Epoch 6/20
[1m1457/1457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.7660 - loss: 0.4894 - val_accuracy: 0.7502 - val_loss: 0.5096
Epoch 7/20
[1m1

In [11]:
# Evaluate the model
y_pred = (model.predict(X_test_scaled) > 0.5).astype('int32')
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)


[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 636us/step
Accuracy: 0.7514030164854437
Classification Report:
              precision    recall  f1-score   support

         0.0       0.78      0.70      0.74     11404
         1.0       0.73      0.80      0.76     11404

    accuracy                           0.75     22808
   macro avg       0.75      0.75      0.75     22808
weighted avg       0.75      0.75      0.75     22808

