In [2]:
import sqlite3
import pandas as pd

# Load the dataset
file_path = 'data/diabetes_binary_health_indicators_BRFSS2023.csv'
df_new = pd.read_csv(file_path)

# Create an SQLite database and save the dataset
conn = sqlite3.connect('diabetes_data.db')  # Creates a file-based SQLite database
df_new.to_sql('diabetes_data', conn, if_exists='replace', index=False)


110599

In [3]:
# Query data from the SQL database
query = "SELECT * FROM diabetes_data"
df_sql = pd.read_sql_query(query, conn)

# Display the first few rows to verify
print(df_sql.head())


   Diabetes_binary  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0              0.0     1.0       1.0        0.0  22.0     1.0     0.0   
1              0.0     1.0       0.0        0.0  26.0     0.0     0.0   
2              0.0     1.0       1.0        0.0  30.0     0.0     0.0   
3              0.0     1.0       1.0        0.0  33.0     0.0     0.0   
4              0.0     1.0       1.0        0.0  23.0     1.0     0.0   

   HeartDiseaseorAttack  PhysActivity  HvyAlcoholConsump  AnyHealthcare  \
0                   0.0           1.0                0.0            1.0   
1                   0.0           1.0                0.0            1.0   
2                   0.0           1.0                0.0            1.0   
3                   0.0           0.0                0.0            1.0   
4                   1.0           1.0                0.0            1.0   

   NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
0          1.0      4.0       2

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Separate features and target variable
X_new = df_sql.drop(columns=['Diabetes_binary'])
y_new = df_sql['Diabetes_binary']

# Normalize and standardize the features
scaler = StandardScaler()
X_new_scaled = scaler.fit_transform(X_new)

# Train-test split
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new_scaled, y_new, test_size=0.3, random_state=42)

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_new, y_train_new)

# Initialize and train the Gradient Boosting Classifier
gb_model_smote = GradientBoostingClassifier(random_state=42, n_estimators=100, learning_rate=0.1, max_depth=3)
gb_model_smote.fit(X_train_smote, y_train_smote)

# Make predictions
gb_y_pred_smote = gb_model_smote.predict(X_test_new)

# Evaluate the model
gb_accuracy_smote = accuracy_score(y_test_new, gb_y_pred_smote)
gb_classification_report_smote = classification_report(y_test_new, gb_y_pred_smote)

# Print the results
print(f"Accuracy: {gb_accuracy_smote}")
print("Classification Report:")
print(gb_classification_report_smote)


Accuracy: 0.9204339963833634
Classification Report:
              precision    recall  f1-score   support

         0.0       0.97      0.95      0.96     32085
         1.0       0.09      0.02      0.03       843
         2.0       0.03      0.21      0.06       252

    accuracy                           0.92     33180
   macro avg       0.36      0.39      0.35     33180
weighted avg       0.94      0.92      0.93     33180

