In [72]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Load the raw dataset from the CSV file next to the notebook.
df = pd.read_csv('diabetes.csv')
# drop_duplicates() returns a new frame rather than modifying in place, so its
# result must be bound to a name; the raw `df` is left untouched.
df_no_dup = df.drop_duplicates()
#replace zeros with nan
# Columns in which a stored 0 is treated as "no measurement" and turned into NaN
# so that the imputation step can fill it.
zero_as_missing_cols = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
df_nan = df_no_dup.copy()
# np.nan (lowercase) is used because the np.NaN alias no longer exists in NumPy 2.0+.
df_nan[zero_as_missing_cols] = df_nan[zero_as_missing_cols].replace(0, np.nan)
#impute missing values
# NOTE(review): the medians are learned from every row of df_nan, including rows
# that are later held out as validation/test data - confirm this is acceptable.
imputed_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
                'BMI', 'DiabetesPedigreeFunction', 'Age']
df_imputed = df_nan.copy()
# The variable keeps the name `mean_imputer` because later code refers to it,
# but the fill value is the per-column median.
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# Fit on a plain ndarray, exactly like every transform() call does, so the
# imputer is not fitted with feature names and then fed unnamed arrays.
mean_imputer = mean_imputer.fit(df_imputed[imputed_cols].values)
df_imputed[imputed_cols] = mean_imputer.transform(df_imputed[imputed_cols].values)
#log transform skewed columns
from scipy.stats import skew
# A column whose sample skewness exceeds this value is replaced by log1p of itself.
SKEW_THRESHOLD = 0.75
# Names of the columns that were log-transformed; the prediction form replays
# the same transform on user input using this list.
logged_cols = []
cols = df_imputed.columns
df_not_skewed = df_imputed.copy()
for col in cols:
    # 'Outcome' is the label, not a feature: transforming it would corrupt the
    # target, and the prediction form has no such column to transform.
    if col == 'Outcome':
        continue
    skewed = skew(df_not_skewed[col].dropna().astype(float))
    if skewed > SKEW_THRESHOLD:
        logged_cols.append(col)
        df_not_skewed[col] = np.log1p(df_not_skewed[col])
#train val test split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Reproducibly shuffle every row before splitting.
df_shuffle = df_not_skewed.sample(frac=1, random_state=1234)
# Separate the label column from the remaining columns.
y = df_shuffle['Outcome']
X = df_shuffle.drop(columns=['Outcome'])
# First split: 20% of all rows are held out as the test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)
# Second split: 20% of the remaining rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.2, random_state=1234
)
#standardise
# The first eight columns of the frame are the ones that get scaled
# (presumably every feature column - verify against the CSV layout).
num_feat = df_not_skewed.columns[:8]
# Work on copies so the unscaled splits stay available.
X_train_stand = X_train.copy()
X_trainval_stand = X_trainval.copy()
X_val_stand = X_val.copy()
X_test_stand = X_test.copy()
# Scaling statistics are learned from the training split only and then
# applied unchanged to every split.
scaler = StandardScaler().fit(X_train[num_feat])
for split_frame in (X_train_stand, X_val_stand, X_trainval_stand, X_test_stand):
    split_frame[num_feat] = scaler.transform(split_frame[num_feat])
#train best model
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
# Base classifier: logistic regression with C=10000 and up to 1000 solver iterations.
final_logreg = LogisticRegression(max_iter=1000, C=10000)
# Wrap it in 3-fold isotonic probability calibration and fit on the
# standardised train+validation rows; fit() returns the fitted estimator.
final_isotonic = CalibratedClassifierCV(
    final_logreg, method='isotonic', cv=3
).fit(X_trainval_stand, y_trainval)
#results
import sklearn.metrics as metrics
# Predicted class probabilities for the held-out test rows; column 1 is the
# second class known to the model (presumably Outcome == 1 - confirm).
final_y_test_pred_prob = final_isotonic.predict_proba(X_test_stand)
final_fpr, final_tpr, final_threshold = metrics.roc_curve(y_test, final_y_test_pred_prob[:,1])
# Area under the ROC curve on the test set.
final_roc_auc = metrics.auc(final_fpr, final_tpr)
# Show the score explicitly so a fresh run reproduces the recorded cell output.
print(f'Final score: {final_roc_auc}')

Final score: 0.8252551020408163


<h1 align="center">Check if you have Diabetes</h1>

In [75]:
from ipywidgets import Label, FloatText, VBox, HBox, Layout, Button
import pandas as pd
import numpy as np
def on_button_clicked(button):
    """Predict from the form values and write the result into ``values_label``.

    Click handler for the Submit button. It reads the current value of each
    input widget, replays the training-time preprocessing with the objects
    fitted earlier in the notebook (``mean_imputer``, ``logged_cols``,
    ``scaler``/``num_feat``) and asks ``final_isotonic`` for class
    probabilities.

    Parameters
    ----------
    button
        The widget that fired the click event. It is not used.

    Returns
    -------
    None
        The outcome is reported by setting ``values_label.value``.
    """
    # Column order handed to the imputer and, afterwards, to the model.
    feature_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
    # Inputs for which an entered 0 is treated as "not provided".
    zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

    input_data = {
        'Pregnancies': pregnancies.value,
        'Glucose': glucose.value,
        'BloodPressure': blood_pressure.value,
        'SkinThickness': skin_thickness.value,
        'Insulin': insulin.value,
        'BMI': bmi.value,
        'DiabetesPedigreeFunction': diabetes_pedigree_function.value,
        'Age': age.value,
    }
    # Single-row frame; `columns=` fixes the column order explicitly.
    input_df = pd.DataFrame([input_data], columns=feature_cols)

    # Convert "missing" zeros to NaN (np.NaN no longer exists in NumPy 2.0+).
    input_df[zero_as_missing_cols] = input_df[zero_as_missing_cols].replace(0, np.nan)
    # Fill the NaNs with the values the imputer learned earlier; a plain
    # ndarray is passed, as in the training cell's transform call.
    input_df[feature_cols] = mean_imputer.transform(input_df[feature_cols].values)
    # Apply log1p to the same columns that were log-transformed for training.
    for col in logged_cols:
        input_df[col] = np.log1p(input_df[col])
    # Standardise with the already-fitted scaler.
    input_df[num_feat] = scaler.transform(input_df[num_feat])

    # Row 0 is the only row; column 1 is the second class known to the model
    # (presumably Outcome == 1 - confirm).
    result = final_isotonic.predict_proba(input_df)
    result = round(result[0][1] * 100, 2)
    values_label.value = 'You are ' + str(result) + '% ' + 'likely to have diabetes'
# --- input widgets: one numeric box per model feature -----------------------
pregnancies = FloatText(description='Pregnancies:')
glucose = FloatText(description='Glucose:')
blood_pressure = FloatText(description='BloodPressure:')
skin_thickness = FloatText(description='SkinThickness:')
insulin = FloatText(description='Insulin:')
bmi = FloatText(description='BMI:')
diabetes_pedigree_function = FloatText(description='DiabetesPedigreeFunction:')
age = FloatText(description='Age:')

# Label that receives the prediction text, and the button that triggers it.
values_label = Label()
button = Button(description='Submit')
button.on_click(on_button_clicked)

_input_boxes = [pregnancies, glucose, blood_pressure, skin_thickness,
                insulin, bmi, diabetes_pedigree_function, age]

# --- sizing -------------------------------------------------------------------
for widget in _input_boxes + [values_label]:
    widget.layout.width = 'auto'
    widget.layout.margin = 'auto'

button.layout.margin = 'auto'
button.layout.width = '10%'

# --- colours / description styling --------------------------------------------
# NOTE(review): keys other than 'description_width' may not be recognised by
# the style object of a FloatText - confirm they have any effect.
style = {'description_width': 'initial',
         'font_weight': 'bold',
         'width': '100px',
         'background-color': '#F0F0F0'}
for widget in _input_boxes:
    widget.style = style
button.style.button_color = '#3CB371'


def _spaced_row(children):
    """Return an HBox of ``children`` with a 100px top margin."""
    return HBox(children, layout=Layout(margin='100px 0 0 0'))


# --- arrangement: two rows of four inputs, then the button, then the result ---
row1 = _spaced_row([pregnancies, glucose, blood_pressure, skin_thickness])
row2 = _spaced_row([insulin, bmi, diabetes_pedigree_function, age])
row3 = _spaced_row([button])
fields = VBox([row1, row2, row3, values_label], layout=Layout(overflow='hidden'))

# Last expression of the cell: renders the form.
fields



VBox(children=(HBox(children=(FloatText(value=0.0, description='Pregnancies:'), FloatText(value=0.0, descripti…