In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight
import pickle

In [2]:
diabetes_data = pd.read_csv('Datasets/diabetes.csv')
diabetes_data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
diabetes_data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
numerical_columns = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']
z_scores = diabetes_data[numerical_columns].apply(zscore)
z_scores

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.639947,0.848324,0.149641,0.907270,-0.692891,0.204013,0.468492,1.425995
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672
2,1.233880,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549
4,-1.141852,0.504055,-1.504687,0.907270,0.765836,1.409746,5.484909,-0.020496
...,...,...,...,...,...,...,...,...
763,1.827813,-0.622642,0.356432,1.722735,0.870031,0.115169,-0.908682,2.532136
764,-0.547919,0.034598,0.046245,0.405445,-0.692891,0.610154,-0.398282,-0.531023
765,0.342981,0.003301,0.149641,0.154533,0.279594,-0.735190,-0.685193,-0.275760
766,-0.844885,0.159787,-0.470732,-1.288212,-0.692891,-0.240205,-0.371101,1.170732


In [9]:
threshold = 3
outliers = (z_scores.abs() > threshold).any(axis=1)
outliers

0      False
1      False
2      False
3      False
4       True
       ...  
763    False
764    False
765    False
766    False
767    False
Length: 768, dtype: bool

In [11]:
data_without_outliers = diabetes_data[~outliers]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data_without_outliers[numerical_columns])
scaled_data = pd.DataFrame(scaled_features, columns = [col + '_scaled' for col in numerical_columns])
categorical_columns = [col for col in diabetes_data.columns if col not in numerical_columns]
final_data = pd.concat([data_without_outliers[categorical_columns].reset_index(drop =True), scaled_data], axis=1)
final_data

Unnamed: 0,Outcome,Pregnancies_scaled,Glucose_scaled,BloodPressure_scaled,SkinThickness_scaled,Insulin_scaled,BMI_scaled,DiabetesPedigreeFunction_scaled,Age_scaled
0,1,0.657355,0.924040,-0.028115,0.923219,-0.805266,0.210285,0.606516,1.479220
1,0,-0.868490,-1.177082,-0.515765,0.533462,-0.805266,-0.848063,-0.364220,-0.183265
2,1,1.267694,2.091330,-0.678315,-1.350366,-0.805266,-1.346999,0.764788,-0.095766
3,0,-0.868490,-1.043678,-0.515765,0.143704,0.238698,-0.621274,-1.011378,-1.058257
4,0,0.352186,-0.143197,0.134435,-1.350366,-0.805266,-0.999256,-0.891795,-0.270764
...,...,...,...,...,...,...,...,...,...
683,0,1.878032,-0.643464,0.296984,1.767693,1.193814,0.104450,-0.997309,2.616709
684,0,-0.563321,0.056910,-0.190665,0.403542,-0.805266,0.694102,-0.402909,-0.533262
685,0,0.352186,0.023559,-0.028115,0.143704,0.438606,-0.908540,-0.737040,-0.270764
686,1,-0.868490,0.190315,-1.003415,-1.350366,-0.805266,-0.318889,-0.371255,1.216722


In [13]:
X = final_data.drop(columns ='Outcome',axis=1)
Y=final_data['Outcome']
X,Y

(     Pregnancies_scaled  Glucose_scaled  BloodPressure_scaled  \
 0              0.657355        0.924040             -0.028115   
 1             -0.868490       -1.177082             -0.515765   
 2              1.267694        2.091330             -0.678315   
 3             -0.868490       -1.043678             -0.515765   
 4              0.352186       -0.143197              0.134435   
 ..                  ...             ...                   ...   
 683            1.878032       -0.643464              0.296984   
 684           -0.563321        0.056910             -0.190665   
 685            0.352186        0.023559             -0.028115   
 686           -0.868490        0.190315             -1.003415   
 687           -0.868490       -0.910273             -0.190665   
 
      SkinThickness_scaled  Insulin_scaled  BMI_scaled  \
 0                0.923219       -0.805266    0.210285   
 1                0.533462       -0.805266   -0.848063   
 2               -1.350366      

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, stratify=Y, random_state=2)

In [17]:
# print("Class Distribution Before SMOTE:", np.unique(Y_train, return_counts=True))
# smote = SMOTE(random_state=42)
# X_train_resampled, Y_train_resampled = smote.fit_resample(X_train, Y_train)
# print("Class Distribution After SMOTE:", np.unique(Y_train_resampled, return_counts=True))

In [19]:
scaler= StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [21]:
base_estimators =[
    ('rf',RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)),
    ('svm', SVC(kernel='rbf', C=100, gamma=0.001, probability=True, class_weight='balanced'))
]

model = StackingClassifier(estimators=base_estimators, final_estimator=LogisticRegression(class_weight='balanced', random_state=42))
model.fit(X_train_scaled, Y_train)
print("Train Accuracy:", model.score(X_train_scaled, Y_train))
print("Test Accuracy:", model.score(X_test_scaled, Y_test))
print("Classification Report:\n", classification_report(Y_test, model.predict(X_test_scaled)))

Train Accuracy: 0.9327272727272727
Test Accuracy: 0.7753623188405797
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.74      0.81        92
           1       0.62      0.85      0.72        46

    accuracy                           0.78       138
   macro avg       0.76      0.79      0.76       138
weighted avg       0.81      0.78      0.78       138



In [23]:
def predict_heart_disease(input_data):
    np_array = np.asarray(input_data)
    input_data_reshaped = np_array.reshape(1, -1)
    scaled_data = scaler.transform(input_data_reshaped)
    prediction = model.predict(scaled_data)
    probabilities = model.predict_proba(scaled_data)
    print(f"Prediction: {'Diabetic' if prediction[0] == 1 else 'Not diabetic'}")
    print(f"Prediction Probabilities: {probabilities}")

In [25]:
input_data = (5, 160, 90, 35, 200, 33.0, 1.5, 50)
predict_heart_disease(input_data)

Prediction: Not diabetic
Prediction Probabilities: [[0.71446007 0.28553993]]




In [27]:
  input_data1 = [4,   # Pregnancies
    160, # Glucose
    85,  # BloodPressure
    30,  # SkinThickness
    250, # Insulin
    32,  # BMI
    0.6, # DiabetesPedigreeFunction
    55, #age
                ]                
predict_heart_disease(input_data1)

Prediction: Not diabetic
Prediction Probabilities: [[0.70529583 0.29470417]]




In [29]:
filename = 'Models/diabetes_model.sav'
pickle.dump(model, open(filename,'wb'))