In [1]:
# Importing modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from pyspark.sql import SparkSession
import findspark
import joblib
from sqlalchemy import create_engine

In [2]:
# Open the SQLite database that holds the diabetes health-indicator table
# and pull the entire table into a pandas DataFrame.
engine = create_engine(
    'sqlite:///../Resources/Data/diabetes_health_indicators.db'
)
query = "SELECT * FROM diabetes_indicators;"
diabetes_df = pd.read_sql(sql=query, con=engine)

In [3]:
# Preview the first five rows of the loaded table.
diabetes_df.head(n=5)

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [18]:
# Show the column labels of the loaded frame.
# NOTE(review): this cell's execution count (18) is out of sequence with its
# neighbours - re-run the notebook top to bottom to confirm the listing.
diabetes_df.columns

Index(['Diabetes_012', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

In [5]:
# Separate the label column from the predictor columns.
target_column = 'Diabetes_012'

X = diabetes_df.drop(target_column, axis=1)
y = diabetes_df[target_column]

In [6]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics (mean / std) from the training rows only, so
# that nothing about the held-out test rows leaks into preprocessing.
# The split arguments below must stay identical to the ones used for the real
# train/test split of X_scaled (test_size=0.2, random_state=42, stratify=y) so
# that the same rows are selected here and there.
X_fit_rows, _ = train_test_split(X, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
scaler.fit(X_fit_rows)

# Apply the train-derived scaling to every row; the train/test split of
# X_scaled then yields train and test sets that were both transformed with
# training-only statistics.
X_scaled = scaler.transform(X)

In [7]:
# Hold out 20% of the rows for testing; stratify on the label so each class
# keeps the same proportion in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [8]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly under-sample the training partition only (seeded for
# reproducibility); the test partition is left untouched.
under_sampler = RandomUnderSampler(random_state=42)
resampled_training_set = under_sampler.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_training_set


In [9]:
# Multiclass logistic regression.
# `multi_class='multinomial'` is no longer passed: the argument is deprecated
# in recent scikit-learn releases (it raises a FutureWarning), and with the
# lbfgs solver on multiclass labels the default already fits a multinomial
# model.
# NOTE(review): class_weight='balanced' is presumably redundant when the model
# is fitted on an under-sampled (class-balanced) training set - confirm before
# removing it.
logistic_regression_model = LogisticRegression(
    solver='lbfgs',
    max_iter=500,
    class_weight='balanced',
    random_state=42,
)

In [10]:
# Fit on the under-sampled training data. `fit` returns the estimator itself,
# so `lr_model` is another handle on `logistic_regression_model`.
lr_model = logistic_regression_model.fit(
    X=X_train_resampled,
    y=y_train_resampled,
)

In [11]:
# Predict on the full (not under-sampled) training partition ...
training_predictions = lr_model.predict(X_train)

# ... and on the held-out test partition, using the same fitted estimator
# (`lr_model` and `logistic_regression_model` are the same object).
testing_predictions = lr_model.predict(X_test)

In [12]:
# Confusion matrix for the training partition (rows: true class, columns:
# predicted class).
training_matrix = confusion_matrix(
    y_true=y_train,
    y_pred=training_predictions,
)
print(training_matrix)

[[113275  29016  28671]
 [  1055   1112   1538]
 [  4814   6624  16839]]


In [13]:
# Confusion matrix for the held-out test partition (rows: true class,
# columns: predicted class).
test_matrix = confusion_matrix(
    y_true=y_test,
    y_pred=testing_predictions,
)
print(test_matrix)

[[28418  7101  7222]
 [  243   301   382]
 [ 1237  1719  4113]]


In [14]:
# Per-class precision / recall / F1 on the training partition.
training_report = classification_report(
    y_true=y_train,
    y_pred=training_predictions,
)
print(training_report)

              precision    recall  f1-score   support

         0.0       0.95      0.66      0.78    170962
         1.0       0.03      0.30      0.05      3705
         2.0       0.36      0.60      0.45     28277

    accuracy                           0.65    202944
   macro avg       0.45      0.52      0.43    202944
weighted avg       0.85      0.65      0.72    202944



In [15]:
# Per-class precision / recall / F1 on the held-out test partition.
testing_report = classification_report(
    y_true=y_test,
    y_pred=testing_predictions,
)
print(testing_report)

              precision    recall  f1-score   support

         0.0       0.95      0.66      0.78     42741
         1.0       0.03      0.33      0.06       926
         2.0       0.35      0.58      0.44      7069

    accuracy                           0.65     50736
   macro avg       0.44      0.52      0.43     50736
weighted avg       0.85      0.65      0.72     50736



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest comparison model.
# NOTE(review): this model is fitted on the full, imbalanced training
# partition (relying on class_weight="balanced"), whereas the logistic
# regression is fitted on the under-sampled partition - keep that difference
# in mind when comparing the two reports.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight="balanced",
    n_jobs=-1,  # build trees on all available cores; the seed is unchanged
)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

# zero_division=0 reports 0.0 for a class that is never predicted without
# emitting UndefinedMetricWarning; the reported numbers are the same as with
# the default setting.
print(classification_report(y_test, y_pred, zero_division=0))


              precision    recall  f1-score   support

         0.0       0.86      0.97      0.91     42741
         1.0       0.00      0.00      0.00       926
         2.0       0.47      0.16      0.24      7069

    accuracy                           0.84     50736
   macro avg       0.44      0.38      0.38     50736
weighted avg       0.79      0.84      0.80     50736

