In [2]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the CSV file from the Resources folder into a Pandas DataFrame

data_df = pd.read_csv("Resources/diabetes_prediction_dataset.csv")

# Review the DataFrame

data_df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [4]:
data_df.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [5]:

diabetes_data_df = data_df.loc[data_df["age"] >= 3].reset_index().drop(columns="index")

diabetes_data_df 



Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
96708,Female,36.0,0,0,No Info,24.60,4.8,145,0
96709,Female,80.0,0,0,No Info,27.32,6.2,90,0
96710,Male,66.0,0,0,former,27.83,5.7,155,0
96711,Female,24.0,0,0,never,35.42,4.0,100,0


In [6]:
diabetes_data_df["smoking_history"].value_counts()

never          34824
No Info        32840
former          9352
current         9276
not current     6417
ever            4004
Name: smoking_history, dtype: int64

In [7]:
diabetes_data_df["smoking_history"] = diabetes_data_df["smoking_history"].replace('not current', 'former')
diabetes_data_df["smoking_history"] = diabetes_data_df["smoking_history"].replace('ever', 'occasional')
diabetes_data_df["smoking_history"].value_counts()


never         34824
No Info       32840
former        15769
current        9276
occasional     4004
Name: smoking_history, dtype: int64

In [8]:
diabetes_data_df = diabetes_data_df[diabetes_data_df["gender"] != "Other"]
diabetes_data_df



Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
96708,Female,36.0,0,0,No Info,24.60,4.8,145,0
96709,Female,80.0,0,0,No Info,27.32,6.2,90,0
96710,Male,66.0,0,0,former,27.83,5.7,155,0
96711,Female,24.0,0,0,never,35.42,4.0,100,0


In [9]:
diabetes_data_df.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [10]:
diabetes_data_df["gender"].value_counts()


Female    56951
Male      39744
Name: gender, dtype: int64

In [11]:
from sklearn.preprocessing import OneHotEncoder

# # Initialize the OneHotEncoder
# enc = OneHotEncoder(handle_unknown='ignore')

# # Fit and transform the 'Status' column
# X = enc.fit_transform(diabetes_data_df[['smoking_history']])



In [12]:
# The result is a sparse matrix, you can convert it to a DataFrame if needed
# result_df = pd.DataFrame(X.toarray(), columns=enc.get_feature_names_out(['smoking_history']))
# result_df

In [13]:
diabetes_data_df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
96708,Female,36.0,0,0,No Info,24.60,4.8,145,0
96709,Female,80.0,0,0,No Info,27.32,6.2,90,0
96710,Male,66.0,0,0,former,27.83,5.7,155,0
96711,Female,24.0,0,0,never,35.42,4.0,100,0


In [14]:

# # Concatenate the result with the original DataFrame, dropping the original 'Status' column
# diabetes_data_encoded_df = pd.concat([diabetes_data_df.drop(columns=['smoking_history']), result_df], axis=1)

# # Print the updated DataFrame
# diabetes_data_encoded_df

In [15]:

# Specify the column(s) you want to one-hot encode
categorical_columns = ["gender", "smoking_history"]

# Use pd.get_dummies to one-hot encode the specified columns
diabetes_data_encoded_df = pd.get_dummies(diabetes_data_df, columns=categorical_columns)
diabetes_data_encoded_df

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,smoking_history_No Info,smoking_history_current,smoking_history_former,smoking_history_never,smoking_history_occasional
0,80.0,0,1,25.19,6.6,140,0,1,0,0,0,0,1,0
1,54.0,0,0,27.32,6.6,80,0,1,0,1,0,0,0,0
2,28.0,0,0,27.32,5.7,158,0,0,1,0,0,0,1,0
3,36.0,0,0,23.45,5.0,155,0,1,0,0,1,0,0,0
4,76.0,1,1,20.14,4.8,155,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96708,36.0,0,0,24.60,4.8,145,0,1,0,1,0,0,0,0
96709,80.0,0,0,27.32,6.2,90,0,1,0,1,0,0,0,0
96710,66.0,0,0,27.83,5.7,155,0,0,1,0,0,1,0,0
96711,24.0,0,0,35.42,4.0,100,0,1,0,0,0,0,1,0


In [16]:
# diabetes_data_encoded_df.to_csv('diabetes.csv', index=False)

In [17]:
# Seperate the features, X,  from the target variable, y
y = diabetes_data_encoded_df['diabetes']
X = diabetes_data_encoded_df.drop(columns='diabetes')

In [18]:
# Review the y variable Series
y.head(3)

0    0
1    0
2    0
Name: diabetes, dtype: int64

In [19]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,gender_Female,gender_Male,smoking_history_No Info,smoking_history_current,smoking_history_former,smoking_history_never,smoking_history_occasional
0,80.0,0,1,25.19,6.6,140,1,0,0,0,0,1,0
1,54.0,0,0,27.32,6.6,80,1,0,1,0,0,0,0
2,28.0,0,0,27.32,5.7,158,0,1,0,0,0,1,0
3,36.0,0,0,23.45,5.0,155,1,0,0,1,0,0,0
4,76.0,1,1,20.14,4.8,155,0,1,0,1,0,0,0


In [20]:
# Check the balance of our target values
y.value_counts()

0    88195
1     8500
Name: diabetes, dtype: int64

In [21]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [22]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
45593,0,0
52405,0,0
28793,0,0
39868,0,0
62951,0,0
...,...,...
79507,0,0
95439,0,0
31463,0,0
46757,0,0


In [24]:
# Print the balanced_accuracy score of the model
from sklearn.metrics import accuracy_score
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

0.9571440390502193

In [25]:
# Generate a confusion matrix for the model
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)

array([[21790,   232],
       [  804,  1348]], dtype=int64)

In [26]:
# Print the classification report for the model
from sklearn.metrics import classification_report
target_names = ["No Diabetes", "Diabetes"]
print(classification_report(y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

 No Diabetes       0.96      0.99      0.98     22022
    Diabetes       0.85      0.63      0.72      2152

    accuracy                           0.96     24174
   macro avg       0.91      0.81      0.85     24174
weighted avg       0.95      0.96      0.95     24174



In [27]:
# from sklearn.preprocessing import StandardScaler

# # Instantiate a StandardScaler instance
# scaler = StandardScaler()

# # Fit the training data to the standard scaler
# X_scaler = scaler.fit(X_train)

# # Transform the training data using the scaler
# X_train_scaled = X_scaler.transform(X_train)

# # Transform the testing data using the scaler
# X_test_scaled = X_scaler.transform(X_test)

In [28]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler
# Instantiate the random oversampler model
# Assign a random_state parameter of 1 to the model
over_sampler = RandomOverSampler(random_state=1)
# Fit the original training data to the random_oversampler model
X_train_resampled, y_train_resampled = over_sampler.fit_resample(X_train, y_train)

In [29]:
# Count the distinct values of the resampled labels data
y_train_resampled.value_counts()

0    66173
1    66173
Name: diabetes, dtype: int64

In [30]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using the resampled training data
classifier.fit(X_train_resampled, y_train_resampled)
# Make a prediction using the testing data
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Prediction,Actual
45593,0,0
52405,0,0
28793,0,0
39868,0,0
62951,0,0
...,...,...
79507,0,0
95439,0,0
31463,0,0
46757,1,0


In [31]:
# Print the balanced_accuracy score of the model 
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

0.8857863820633739

In [32]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)

array([[19511,  2511],
       [  250,  1902]], dtype=int64)

In [33]:
# Print the classification report for the model
target_names = ["no diabetes", "diabetes"]
print(classification_report(y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

 no diabetes       0.99      0.89      0.93     22022
    diabetes       0.43      0.88      0.58      2152

    accuracy                           0.89     24174
   macro avg       0.71      0.88      0.76     24174
weighted avg       0.94      0.89      0.90     24174



In [34]:
# Split our preprocessed data into our features and target arrays
y = diabetes_data_encoded_df['diabetes'].values
X = diabetes_data_encoded_df.drop(columns=['diabetes']).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [35]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [36]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
import tensorflow as tf
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  80
hidden_nodes_layer2 = 30

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim = number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 80)                1120      
                                                                 
 dense_1 (Dense)             (None, 30)                2430      
                                                                 
 dense_2 (Dense)             (None, 1)                 31        
                                                                 
Total params: 3581 (13.99 KB)
Trainable params: 3581 (13.99 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [37]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [38]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [39]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

756/756 - 1s - loss: 0.0931 - accuracy: 0.9695 - 802ms/epoch - 1ms/step
Loss: 0.09308144450187683, Accuracy: 0.9695127010345459


In [40]:
# # Export our model to HDF5 file
# nn.save("DiabetesNN1.h5")

In [43]:
diabetes_data_encoded_df = diabetes_data_encoded_df.rename(columns={'smoking_history_No Info': 'smoking_history_No_Info'})
diabetes_data_encoded_df


Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male,smoking_history_No_Info,smoking_history_current,smoking_history_former,smoking_history_never,smoking_history_occasional
0,80.0,0,1,25.19,6.6,140,0,1,0,0,0,0,1,0
1,54.0,0,0,27.32,6.6,80,0,1,0,1,0,0,0,0
2,28.0,0,0,27.32,5.7,158,0,0,1,0,0,0,1,0
3,36.0,0,0,23.45,5.0,155,0,1,0,0,1,0,0,0
4,76.0,1,1,20.14,4.8,155,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96708,36.0,0,0,24.60,4.8,145,0,1,0,1,0,0,0,0
96709,80.0,0,0,27.32,6.2,90,0,1,0,1,0,0,0,0
96710,66.0,0,0,27.83,5.7,155,0,0,1,0,0,1,0,0
96711,24.0,0,0,35.42,4.0,100,0,1,0,0,0,0,1,0


In [44]:
diabetes_data_encoded_df.to_csv('diabetes.csv', index=False)