In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Read the raw dataset from a CSV expected in the notebook's working directory.
DATASET_PATH = 'diabetes_prediction_dataset.csv'
data = pd.read_csv(DATASET_PATH)

In [3]:
# List each column's dtype; the `object` columns (gender, smoking_history)
# hold text labels and are encoded numerically in the cells below.
data.dtypes

gender                  object
age                    float64
hypertension             int64
heart_disease            int64
smoking_history         object
bmi                    float64
HbA1c_level            float64
blood_glucose_level      int64
diabetes                 int64
dtype: object

In [4]:
# Count rows per gender label to see which categories exist before encoding.
data["gender"].value_counts()

gender
Female    58552
Male      41430
Other        18
Name: count, dtype: int64

In [5]:
# Drop the rows whose gender is 'Other'.
# Take an explicit copy so the column assignments in later cells write to an
# independent frame instead of a slice of the original, which would otherwise
# trigger pandas' SettingWithCopyWarning.
data = data[data['gender'] != 'Other'].copy()

In [6]:
# Encode gender as a binary feature: Female -> 0, Male -> 1.
# Any label missing from the mapping becomes NaN.
gender_map = dict(Female=0, Male=1)
data['gender'] = data['gender'].map(gender_map)




In [7]:
# Confirm that only the two encoded gender values (0 and 1) remain.
data["gender"].value_counts()

gender
0    58552
1    41430
Name: count, dtype: int64

In [8]:
# Count rows per smoking-history label before encoding the column.
data["smoking_history"].value_counts()

smoking_history
No Info        35810
never          35092
former          9352
current         9286
not current     6439
ever            4003
Name: count, dtype: int64

In [9]:
# Encode smoking history as integer codes. The code is the label's position in
# this list, so the list order must not change once a model has been trained.
smoking_levels = ['No Info', 'never', 'former', 'current', 'not current', 'ever']
smoking_map = {level: code for code, level in enumerate(smoking_levels)}
data['smoking_history'] = data['smoking_history'].map(smoking_map)



In [10]:
# Re-check dtypes: gender and smoking_history should now be integer-coded.
data.dtypes


gender                   int64
age                    float64
hypertension             int64
heart_disease            int64
smoking_history          int64
bmi                    float64
HbA1c_level            float64
blood_glucose_level      int64
diabetes                 int64
dtype: object

In [11]:
# Replace raw age with a band code:
#   0: < 20, 1: [20, 40), 2: [40, 60), 3: [60, 80), 4: >= 80
# All masks are built from the raw ages before anything is overwritten.
# NOTE: this overwrites `age` in place, so the cell must run exactly once per
# freshly loaded frame.
raw_age = data['age']
age_bands = [
    raw_age < 20,
    (raw_age >= 20) & (raw_age < 40),
    (raw_age >= 40) & (raw_age < 60),
    (raw_age >= 60) & (raw_age < 80),
    raw_age >= 80,
]
for band_code, in_band in enumerate(age_bands):
    data.loc[in_band, 'age'] = band_code



In [12]:
# Replace raw BMI with a category code using half-open bins:
#   0: < 18.5, 1: [18.5, 25), 2: [25, 30), 3: >= 30
# Half-open intervals leave no gaps, so a value such as 24.995 is still binned
# (the earlier `<= 24.99` / `>= 25` ranges would have left it untouched).
# The codes are kept as floats so the column dtype does not change.
# NOTE: this overwrites `bmi` in place, so the cell must run exactly once per
# freshly loaded frame.
bmi_edges = [-np.inf, 18.5, 25, 30, np.inf]
data['bmi'] = pd.cut(data['bmi'], bins=bmi_edges, right=False, labels=False).astype(float)
data["bmi"].value_counts()


bmi
2.0    45744
3.0    23530
1.0    22216
0.0     8492
Name: count, dtype: int64

In [13]:
# Replace raw HbA1c with one of three codes using half-open bins:
#   0: < 5.7, 1: [5.7, 6.5), 2: >= 6.5
# Half-open intervals leave no gap between 6.4 and 6.5.
# The codes are kept as floats so the column dtype does not change.
# NOTE: this overwrites `HbA1c_level` in place, so the cell must run exactly
# once per freshly loaded frame.
hba1c_edges = [-np.inf, 5.7, 6.5, np.inf]
data['HbA1c_level'] = pd.cut(
    data['HbA1c_level'], bins=hba1c_edges, right=False, labels=False
).astype(float)

data["HbA1c_level"].value_counts()

HbA1c_level
1.0    41338
0.0    37850
2.0    20794
Name: count, dtype: int64

In [14]:
# Replace raw blood glucose with a band code:
#   0: < 140, 1: 140-199, 2: >= 200
# All masks are built from the raw readings before anything is overwritten.
# NOTE: this overwrites the column in place, so the cell must run exactly once
# per freshly loaded frame.
raw_glucose = data['blood_glucose_level']
glucose_bands = [
    raw_glucose < 140,
    (raw_glucose >= 140) & (raw_glucose <= 199),
    raw_glucose >= 200,
]
for band_code, in_band in enumerate(glucose_bands):
    data.loc[in_band, 'blood_glucose_level'] = band_code

data["blood_glucose_level"].value_counts()


blood_glucose_level
1    45472
0    43634
2    10876
Name: count, dtype: int64

In [15]:
# Check the balance of the target classes (0 = no diabetes, 1 = diabetes).
data["diabetes"].value_counts()



diabetes
0    91482
1     8500
Name: count, dtype: int64

In [16]:
# do classification using random forest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Separate the target column from the feature columns.
y = data['diabetes']
X = data.drop(columns='diabetes')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [17]:
# Fit a 100-tree random forest (fixed seed) and score it on the held-out split.
# NOTE(review): the target is heavily imbalanced, so accuracy alone says little
# about how well the positive class is detected.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9337400610091514

In [18]:
# save the model using pickle
import pickle
filename = 'diabetes_model.sav'
# A context manager guarantees the file is flushed and closed, even if
# pickling fails part-way through.
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)


In [31]:
# Show the distribution of the binary heart_disease flag.
data["heart_disease"].value_counts()

heart_disease
0    96040
1     3942
Name: count, dtype: int64

In [38]:
import pickle
import gradio as gr
import pandas as pd
import numpy as np

# Load the model
# NOTE: unpickling can execute arbitrary code, so only load a file that was
# produced by this notebook or another trusted source.
# The context manager ensures the file handle is closed after loading.
with open('diabetes_model.sav', 'rb') as model_file:
    model = pickle.load(model_file)

# Define the smoking map (must stay identical to the encoding applied to the
# training data).
smoking_map = {'No Info': 0, 'never': 1, 'former': 2, 'current': 3, 'not current': 4, 'ever' : 5}

def preprocess_data(gender, age, hypertension, heart_disease, smoking_history, bmi, HbA1c_level, blood_glucose_level):
    """Encode one set of raw form inputs the same way the training data was encoded.

    Parameters
    ----------
    gender : str
        'Female' or 'Male'.
    age, bmi, HbA1c_level, blood_glucose_level : float or None
        Raw measurements; None is treated as 0 and so falls in the lowest bin.
    hypertension, heart_disease : bool or int
        Flags; stored as 0/1.
    smoking_history : str
        A key of ``smoking_map``; unknown labels fall back to code 0 ('No Info').

    Returns
    -------
    pandas.DataFrame
        A single row whose columns are in the order: gender, age, hypertension,
        heart_disease, smoking_history, bmi, HbA1c_level, blood_glucose_level.

    Raises
    ------
    ValueError
        If ``gender`` is not 'Female' or 'Male' (for example, nothing selected).
    """
    # Female -> 0, Male -> 1.
    gender_map = {'Female': 0, 'Male': 1}
    if gender not in gender_map:
        # Fail loudly instead of letting a NaN feature reach the model.
        raise ValueError(f"gender must be one of {sorted(gender_map)}, got {gender!r}")

    def bin_code(value, edges):
        # right=False makes each lower edge inclusive (edges[i-1] <= x < edges[i]),
        # matching the `>= lower` thresholds used on the training data, so
        # boundary inputs such as age 20 or glucose 140 get the same code there.
        return int(np.digitize([value or 0], edges, right=False)[0])

    return pd.DataFrame({
        'gender': [gender_map[gender]],
        'age': [bin_code(age, [20, 40, 60, 80])],
        'hypertension': [int(bool(hypertension))],
        'heart_disease': [int(bool(heart_disease))],
        'smoking_history': [smoking_map.get(smoking_history, 0)],
        'bmi': [bin_code(bmi, [18.5, 25, 30])],
        'HbA1c_level': [bin_code(HbA1c_level, [5.7, 6.5])],
        'blood_glucose_level': [bin_code(blood_glucose_level, [140, 200])],
    })

# Define prediction function
def predict_diabetes(gender, age, hypertension, heart_disease, smoking_history, bmi, HbA1c_level, blood_glucose_level):
    """Return "Diabetes" or "No Diabetes" for one set of raw form inputs.

    The inputs are encoded with ``preprocess_data`` and passed to the loaded
    ``model``; a predicted class of 0 is reported as "No Diabetes" and any
    other class as "Diabetes".
    """
    # Preprocess the input data
    input_data = preprocess_data(gender, age, hypertension, heart_disease, smoking_history, bmi, HbA1c_level, blood_glucose_level)

    # Pass the DataFrame itself rather than `.values`: the model was fitted on
    # a DataFrame, so keeping the column names avoids scikit-learn's
    # "X does not have valid feature names" warning and avoids building an
    # object-dtype array from mixed bool/int columns.
    prediction = model.predict(input_data)

    # Compare the label numerically instead of through its string form.
    return "No Diabetes" if int(prediction[0]) == 0 else "Diabetes"

# Gradio Interface
# One widget per argument of predict_diabetes, in the same order as its signature.
input_components = [
    gr.Radio(choices=['Male', 'Female'], label="Gender"),
    gr.Slider(minimum=1, maximum=100, label="Age"),
    gr.Checkbox(label="Hypertension"),
    gr.Checkbox(label="Heart Disease"),
    gr.Dropdown(choices=list(smoking_map), label="Smoking History"),
    gr.Slider(minimum=10, maximum=50, label="BMI"),
    gr.Slider(minimum=4, maximum=8, label="HbA1c Level"),
    gr.Slider(minimum=70, maximum=300, label="Blood Glucose Level"),
]

iface = gr.Interface(fn=predict_diabetes, inputs=input_components, outputs=gr.Label())

# Start the local web server that hosts the form.
iface.launch()


Running on local URL:  http://127.0.0.1:7874

To create a public link, set `share=True` in `launch()`.




