In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [2]:
#  Import and read the diabetes_binary_health_indicators_BRFSS2015.csv
dataset_path = 'diabetes_binary_health_indicators_BRFSS2015.csv'
diabetes_df = pd.read_csv(dataset_path)
diabetes_df.head()

diabetes_df.insert(0, 'id', range(1, len(diabetes_df) + 1))

In [3]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
diabetes_df = diabetes_df.drop(columns=['AnyHealthcare','NoDocbcCost','MentHlth','DiffWalk','Education','Fruits','Veggies','Income','PhysHlth','PhysActivity','Stroke','HeartDiseaseorAttack','HighChol','Smoker','GenHlth','HvyAlcoholConsump','CholCheck'])


In [4]:
diabetes_df.head()

Unnamed: 0,id,Diabetes_binary,HighBP,BMI,Sex,Age
0,1,0.0,1.0,40.0,0.0,9.0
1,2,0.0,0.0,25.0,0.0,7.0
2,3,0.0,1.0,28.0,0.0,9.0
3,4,0.0,1.0,27.0,0.0,11.0
4,5,0.0,1.0,24.0,0.0,11.0


In [5]:
diabetes_df.to_csv('modified.csv', index=False)

In [6]:
# Determine the number of unique values in each column.
diabetes_df.nunique()

id                 253680
Diabetes_binary         2
HighBP                  2
BMI                    84
Sex                     2
Age                    13
dtype: int64

In [7]:
# 
X = diabetes_df.drop('Diabetes_binary', axis=1)
y = diabetes_df['Diabetes_binary']


In [8]:
X.head()

Unnamed: 0,id,HighBP,BMI,Sex,Age
0,1,1.0,40.0,0.0,9.0
1,2,0.0,25.0,0.0,7.0
2,3,1.0,28.0,0.0,9.0
3,4,1.0,27.0,0.0,11.0
4,5,1.0,24.0,0.0,11.0


In [9]:
y.value_counts

<bound method IndexOpsMixin.value_counts of 0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
253675    0.0
253676    1.0
253677    0.0
253678    0.0
253679    1.0
Name: Diabetes_binary, Length: 253680, dtype: float64>

In [10]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a Random Forest classifier
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)


In [12]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)


# Fit the model using training data
classifier.fit(X_train, y_train)

In [13]:
# Make predictions on the test set
y_pred = classifier.predict(X_test)


In [14]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [15]:
# Generate a confusion matrix for the model
cm_df = pd.DataFrame(confusion_matrix(y_test, y_pred), index = ["No Diabetes", "High-Risk"], columns = ["Predicted No Diabetes", "Predicted High-Risk for Diabetes"])
cm_df

Unnamed: 0,Predicted No Diabetes,Predicted High-Risk for Diabetes
No Diabetes,43400,339
High-Risk,6744,253


In [16]:
# Display results
print(f"Accuracy: {accuracy:.2f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_rep)

Accuracy: 0.86

Confusion Matrix:
[[43400   339]
 [ 6744   253]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.87      0.99      0.92     43739
         1.0       0.43      0.04      0.07      6997

    accuracy                           0.86     50736
   macro avg       0.65      0.51      0.50     50736
weighted avg       0.81      0.86      0.81     50736

