In [19]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [23]:
# Read the CSV file from the Resources folder into a Pandas DataFrame

data_df = pd.read_csv("Resources/diabetes_prediction_dataset.csv")

# Review the DataFrame

data_df.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
5,Female,20.0,0,0,never,27.32,6.6,85,0
6,Female,44.0,0,0,never,19.31,6.5,200,1
7,Female,79.0,0,0,No Info,23.86,5.7,85,0
8,Male,42.0,0,0,never,33.64,4.8,145,0
9,Female,32.0,0,0,never,27.32,5.0,100,0


In [24]:
data_df.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [40]:

diabetes_data_df = data_df.loc[data_df["age"] >= 3].reset_index().drop(columns="index")

diabetes_data_df 



Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
96708,Female,36.0,0,0,No Info,24.60,4.8,145,0
96709,Female,80.0,0,0,No Info,27.32,6.2,90,0
96710,Male,66.0,0,0,former,27.83,5.7,155,0
96711,Female,24.0,0,0,never,35.42,4.0,100,0


In [41]:
diabetes_data_df["smoking_history"].value_counts()

never          34824
No Info        32840
former          9352
current         9276
not current     6417
ever            4004
Name: smoking_history, dtype: int64

In [46]:
diabetes_data_df["smoking_history"] = diabetes_data_df["smoking_history"].replace('not current', 'former')
diabetes_data_df["smoking_history"] = diabetes_data_df["smoking_history"].replace('ever', 'occasional')
diabetes_data_df["smoking_history"].value_counts()


never         34824
No Info       32840
former        15769
current        9276
occasional     4004
Name: smoking_history, dtype: int64

In [68]:
diabetes_data_df = diabetes_data_df[diabetes_data_df["gender"] != "Other"]




In [69]:
diabetes_data_df["gender"].value_counts()


Female    56951
Male      39744
Name: gender, dtype: int64

In [70]:
from sklearn.preprocessing import OneHotEncoder

# Initialize the OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')

# Fit and transform the 'Status' column
X = enc.fit_transform(diabetes_data_df[['smoking_history']])

# The result is a sparse matrix, you can convert it to a DataFrame if needed
result_df = pd.DataFrame(X.toarray(), columns=enc.get_feature_names_out(['smoking_history']))

# Concatenate the result with the original DataFrame, dropping the original 'Status' column
diabetes_data_encoded_df = pd.concat([diabetes_data_df.drop(columns=['smoking_history']), result_df], axis=1)

# Print the updated DataFrame
diabetes_data_encoded_df

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,smoking_history_No Info,smoking_history_current,smoking_history_former,smoking_history_never,smoking_history_occasional
0,Female,80.0,0.0,1.0,25.19,6.6,140.0,0.0,0.0,0.0,0.0,1.0,0.0
1,Female,54.0,0.0,0.0,27.32,6.6,80.0,0.0,1.0,0.0,0.0,0.0,0.0
2,Male,28.0,0.0,0.0,27.32,5.7,158.0,0.0,0.0,0.0,0.0,1.0,0.0
3,Female,36.0,0.0,0.0,23.45,5.0,155.0,0.0,0.0,1.0,0.0,0.0,0.0
4,Male,76.0,1.0,1.0,20.14,4.8,155.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
64986,,,,,,,,,0.0,0.0,0.0,1.0,0.0
66223,,,,,,,,,0.0,0.0,0.0,1.0,0.0
68797,,,,,,,,,0.0,1.0,0.0,0.0,0.0
72832,,,,,,,,,0.0,0.0,0.0,1.0,0.0


In [71]:

# Specify the column(s) you want to one-hot encode
categorical_columns = ["gender"]

# Use pd.get_dummies to one-hot encode the specified columns
diabetes_data_encoded_df = pd.get_dummies(diabetes_data_encoded_df, columns=categorical_columns)
diabetes_data_encoded_df

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,smoking_history_No Info,smoking_history_current,smoking_history_former,smoking_history_never,smoking_history_occasional,gender_Female,gender_Male
0,80.0,0.0,1.0,25.19,6.6,140.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0
1,54.0,0.0,0.0,27.32,6.6,80.0,0.0,1.0,0.0,0.0,0.0,0.0,1,0
2,28.0,0.0,0.0,27.32,5.7,158.0,0.0,0.0,0.0,0.0,1.0,0.0,0,1
3,36.0,0.0,0.0,23.45,5.0,155.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0
4,76.0,1.0,1.0,20.14,4.8,155.0,0.0,0.0,1.0,0.0,0.0,0.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64986,,,,,,,,0.0,0.0,0.0,1.0,0.0,0,0
66223,,,,,,,,0.0,0.0,0.0,1.0,0.0,0,0
68797,,,,,,,,0.0,1.0,0.0,0.0,0.0,0,0
72832,,,,,,,,0.0,0.0,0.0,1.0,0.0,0,0
