# Diabetes Classifier
## by: Joseph Curtis

In [3]:
import pandas as pd
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
import matplotlib.pyplot as plt
from sklearn import linear_model, metrics, model_selection
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import KFold, cross_val_score

filepath_2015 = "diabetes_binary_5050split_health_indicators_BRFSS2015.csv"
filepath_2021 = "diabetes_binary_5050split_health_indicators_BRFSS2021.csv"
df1 = pd.read_csv(filepath_2015)
df2 = pd.read_csv(filepath_2021)

# Combine the two DataFrames
combined_df = pd.concat([df1, df2], axis=0).reset_index(drop=True)

# Display the first few rows of the combined dataframe and its shape to verify 
# the combination
combined_df_info = combined_df.head(), combined_df.shape

combined_df_info

(   Diabetes_binary  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
 0              0.0     1.0       0.0        1.0  26.0     0.0     0.0   
 1              0.0     1.0       1.0        1.0  26.0     1.0     1.0   
 2              0.0     0.0       0.0        1.0  26.0     0.0     0.0   
 3              0.0     1.0       1.0        1.0  28.0     1.0     0.0   
 4              0.0     0.0       0.0        1.0  29.0     1.0     0.0   
 
    HeartDiseaseorAttack  PhysActivity  Fruits  ...  AnyHealthcare  \
 0                   0.0           1.0     0.0  ...            1.0   
 1                   0.0           0.0     1.0  ...            1.0   
 2                   0.0           1.0     1.0  ...            1.0   
 3                   0.0           1.0     1.0  ...            1.0   
 4                   0.0           1.0     1.0  ...            1.0   
 
    NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
 0          0.0      3.0       5.0      30.0       0

In [4]:
# Remove irrelevant features from the combined dataset
columns_to_remove = ['CholCheck', 'AnyHealthcare', 'NoDocbcCost', 'Education', 'Income']
reduced_df = combined_df.drop(columns=columns_to_remove)

# Display the first few rows of the reduced dataframe to verify the removal
reduced_df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age
0,0.0,1.0,0.0,26.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0
1,0.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0
2,0.0,0.0,0.0,26.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0
3,0.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0
4,0.0,0.0,0.0,29.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0


In [5]:
# Check for missing values in the reduced dataset
missing_values = reduced_df.isnull().sum()

missing_values ## no missing values found

Diabetes_binary         0
HighBP                  0
HighChol                0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
dtype: int64

In [6]:
# Check the range of values for the specified features to determine suitable data types
features_to_optimize = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age']
data_types_optimization = reduced_df[features_to_optimize].describe().loc[['min', 'max']]

# Memory used before reducing data types
memory_before = reduced_df.memory_usage(index=True).sum()

data_types_optimization

Unnamed: 0,BMI,GenHlth,MentHlth,PhysHlth,Age
min,12.0,1.0,0.0,0.0,1.0
max,99.0,5.0,30.0,30.0,13.0


In [7]:
binary_columns = ['Diabetes_binary', 'HighBP', 'HighChol', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'DiffWalk', 'Sex']
for column in binary_columns:
    reduced_df[column] = reduced_df[column].astype('bool')

# After data types reduces memory size
memory_after = reduced_df.memory_usage(index=True).sum()

print("Dataframe memory used before:", memory_before)
print("Dataframe memory used after:  ", memory_after)

Dataframe memory used before: 18744740
Dataframe memory used after:   7167188


In [9]:
###### Logistic Regression algorithm ######

# Selecting numerical columns (excluding binary/boolean columns)
numerical_columns = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age']

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the numerical features
reduced_df[numerical_columns] = scaler.fit_transform(reduced_df[numerical_columns])

mylog_model = linear_model.LogisticRegression(solver='saga', max_iter=1000)

# 'X' is the feature set and 'y' is the target variable
X_log = reduced_df.drop('Diabetes_binary', axis=1)
y_log = reduced_df['Diabetes_binary'].astype('bool')  # Ensuring the target is boolean

# Splitting the dataset into the Training set and Test set
X_log_train, X_log_test, y_log_train, y_log_test = model_selection.train_test_split(X_log, y_log, test_size=0.25, random_state=42)

mylog_model.fit(X_log_train, y_log_train)

y_pred_log = mylog_model.predict(X_log_test)

# Evaluate the model
k_folds = KFold(n_splits = 5, shuffle=True)
# The number of folds determines the test/train split for each iteration. 
# So 5 folds has 5 different mutually exclusive training sets. 
# That's a 1 to 4 (or .20 to .80) testing/training split for each of the 5 iterations.

log_scores = cross_val_score(mylog_model, X_log, y_log)
# This shows the average score. Print 'scores' to see an array of individual iteration scores.
print("Logistic Regression Average Prediction Score: ", round(log_scores.mean()*100, 2), "%")

Logistic Regression Average Prediction Score:  74.26 %
