# **ENV Setup**


In [None]:
# Wipe the current working directory so the run starts from a clean slate.
# NOTE(review): `rm -rf *` irreversibly deletes everything in the working
# directory -- only safe in a throwaway (e.g. hosted) runtime; confirm before
# running this cell locally.
!rm -rf *
# Reinstall scikit-learn at whatever the latest release is.
# NOTE(review): the version is not pinned, so results may change between runs.
!pip uninstall scikit-learn -y --quiet
!pip install -U scikit-learn --quiet
# Install imbalanced-learn straight from the master branch of its GitHub
# repository; stdout and stderr are discarded, so install errors are hidden.
!pip install git+https://github.com/scikit-learn-contrib/imbalanced-learn.git@master > /dev/null 2>&1

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25h

# **Imports**

In [None]:
import pandas as pd
import requests
import math
import random
import numpy as np
import json
import pickle


from os import path,getcwd,makedirs

# SKLearn Imports
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures,RobustScaler

# Imbalanced Learn
from imblearn.over_sampling import *
import seaborn as sns

import matplotlib.pyplot as plt
import seaborn as sns

# **Constants**

In [None]:
# File Constants
# Remote location of the dataset and the local filename it is saved under.
CSV_URL = "https://res.cloudinary.com/ddfaksud1/raw/upload/v1719156345/Final%20Year%20Project/Diabetes-data.csv"
LOCAL_CSV_NAME = "diabetes.csv"

# Training - Testing Constants
# Fraction of the dataset held out as the test split.
TEST_SIZE = 0.2

# Data Transformation Constants
# Passed to SMOTE as `sampling_strategy` and `random_state` respectively.
OVERSAMPLING_STRATEGY = "minority"
# NOTE(review): drawn at random on every run, so the oversampled training set
# differs between runs; the drawn value is written to `logs` later on.
OVERSAMPLING_RANDOM_STATE = random.randint(1,100)

# Model Constants
# Hyper-parameters handed to the SVC, LogisticRegression and
# KNeighborsClassifier constructors in the "Models" cell.
SVM_KERNEL = "linear"
SVM_RANDOM_STATE = 0
LOGISTIC_REGRESSION_RANDOM_STATE = 0
KNN_NUM_NEIGHBOURS = 5
KNN_METRIC = "minkowski"
# Passed to KNeighborsClassifier as `p`.
KNN_POWER_PARAMETER = 2

# **Presets**

In [None]:
# Per-model results, keyed by model name; filled in by ClassifierModel.
model_summary = {}
# Run configuration (split size, oversampling and model hyper-parameters);
# filled in by the "Logging" cell and written to BackendAssets/logs.json.
logs = {}

# **Download CSV File**

In [None]:
def download_csv_file(url=None, destination=None, timeout=30):
  """Stream a remote CSV file to disk.

  Args:
    url: Address to download from. Defaults to the notebook constant CSV_URL.
    destination: Local path to write to. Defaults to LOCAL_CSV_NAME.
    timeout: Seconds to wait for the server before giving up.

  Raises:
    requests.HTTPError: if the server answers with a 4xx/5xx status.
    requests.RequestException: on connection problems or timeouts.
  """
  url = CSV_URL if url is None else url
  destination = LOCAL_CSV_NAME if destination is None else destination

  with requests.get(url, stream=True, timeout=timeout) as response:
    # Fail loudly *before* touching the local file: otherwise an HTML error
    # page would be saved as the CSV and only surface later as a parsing error.
    response.raise_for_status()
    with open(destination, 'wb') as file:
      for chunk in response.iter_content(chunk_size=8192):
        file.write(chunk)

download_csv_file()

# **Local Setup**

In [None]:
# Create the output folder tree (no error if a folder already exists).
for output_dir in (
  "BackendAssets",
  "BackendAssets/Models",
  "BackendAssets/DataInsights",
):
  makedirs(output_dir, exist_ok=True)

# **Reading the CSV**

In [None]:
# Load the downloaded dataset; sample(frac=1) returns every row in shuffled order.
df = pd.read_csv(LOCAL_CSV_NAME).sample(frac=1)
print(df.head(1))

# Class balance: number of diabetic (1) rows first, then non-diabetic (0).
for label in (1, 0):
  print(len(df[df["diabetes"] == label]))

      gender   age  hypertension  heart_disease smoking_history    bmi  \
74298   Male  41.0             0              0         No Info  27.32   

       HbA1c_level  blood_glucose_level  diabetes  
74298          5.8                  100         0  
8500
91500


# **Data Transformation**

In [None]:
# Integer codes for the two categorical columns. Any value that is not listed
# falls back to a catch-all code (5 for smoking_history, 2 for gender).
SMOKING_HISTORY_CODES = {
    'never': 0,
    'No Info': 1,
    'current': 2,
    'former': 3,
    'not current': 4,
}
GENDER_CODES = {'Male': 0, 'Female': 1}


def _encode_categories(column, codes, fallback):
  """Return `column` with each value replaced by its integer code.

  Values missing from `codes` become `fallback`. A column that is already
  numeric is returned unchanged: without this guard, re-running the cell would
  look up the integer codes themselves, miss every key, and overwrite the
  whole column with `fallback`.
  """
  if pd.api.types.is_numeric_dtype(column):
    return column
  return column.apply(lambda value: codes.get(value, fallback))


df['smoking_history'] = _encode_categories(df['smoking_history'], SMOKING_HISTORY_CODES, 5)
df['gender'] = _encode_categories(df['gender'], GENDER_CODES, 2)

# **Correlation Matrix**

In [None]:
# Pairwise correlation of all (now numeric) columns, drawn as an annotated
# heatmap, saved for the backend and then displayed inline.
fig, ax = plt.subplots(figsize=(15,10))
sns.heatmap(df.corr(), linewidth=.01, annot=True, cmap="winter", ax=ax)
fig.savefig("BackendAssets/DataInsights/CorrelationMatrix.png")
plt.show()

# **Train-Test Split + Oversampling**

In [None]:
# Extracting the dependent and independent features
X = df.drop(columns="diabetes")
Y = df["diabetes"]

# Train-Test Split
# Split BEFORE scaling so that no statistic of the test rows can influence
# anything the models are trained on.
x_train, x_test, y_train, y_test = train_test_split(
  X, Y, test_size=TEST_SIZE, stratify=Y
)

# Scaling the Data
# The scaler learns mean/variance from the training split only; the test split
# is transformed with those same parameters. Fitting on the full dataset (as
# was done previously) leaks test-set information into training.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Keep `X` as the scaled full feature matrix (a NumPy array), as before, in
# case a later cell reads it.
X = scaler.transform(X)

# Oversampling
# Applied to the training split only, so the test split keeps the real class
# distribution.
ros = SMOTE(sampling_strategy=OVERSAMPLING_STRATEGY,random_state=OVERSAMPLING_RANDOM_STATE)
x_train, y_train = ros.fit_resample(x_train, y_train)

# **Model Class**

In [None]:
class ClassifierModel:
  """Train, evaluate, record and persist one classifier in a single step.

  Constructing an instance fits `model` on the notebook-level `x_train` /
  `y_train`, scores it on `x_test` / `y_test`, stores the results in the
  notebook-level `model_summary` dict under `name`, and pickles the fitted
  estimator into BackendAssets/Models/.
  """

  def __init__(self,model,name):
    self.model_instance = model
    self.model_name = name

    # Order matters: every stage relies on attributes set by the one before.
    pipeline = (
      self.prepare_model,
      self.calculate_metrics,
      self.log_model_insights,
      self.save_model,
    )
    for stage in pipeline:
      stage()

  def prepare_model(self):
    """Fit the estimator on the training split and predict the test split."""
    flat_targets = np.ravel(y_train)
    self.model_instance.fit(x_train, flat_targets)
    self.y_pred = self.model_instance.predict(x_test)

  def calculate_metrics(self):
    """Compute test accuracy (as a percentage) and the confusion matrix."""
    fraction_correct = metrics.accuracy_score(y_test, self.y_pred)
    self.accuracy = fraction_correct * 100
    self.confusion_matrix = metrics.confusion_matrix(y_test, self.y_pred)

  def log_model_insights(self):
    """Store the estimator and its metrics in `model_summary` under the model name."""
    model_summary[self.model_name] = dict(
      model=self.model_instance,
      accuracy=self.accuracy,
      # Converted to nested lists so the matrix can be serialised to JSON.
      confusion_matrix=self.confusion_matrix.tolist(),
    )

  def save_model(self):
    """Pickle the fitted estimator; spaces in the name become underscores."""
    file_stem = self.model_name.replace(" ","_")
    target_path = f'BackendAssets/Models/{file_stem}.pkl'
    with open(target_path,"wb") as file:
      pickle.dump(self.model_instance,file)

# **Models**

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Each ClassifierModel(...) call trains, evaluates, logs and pickles its
# estimator immediately, so the order below is also the order of the entries
# in `model_summary`.
svm_model = ClassifierModel(
  model=SVC(kernel=SVM_KERNEL, random_state=SVM_RANDOM_STATE),
  name="SVM",
)
lr_model = ClassifierModel(
  model=LogisticRegression(random_state=LOGISTIC_REGRESSION_RANDOM_STATE),
  name="Logistic Regression",
)
rfc_model = ClassifierModel(
  model=RandomForestClassifier(),
  name="Random Forest",
)
gnb_model = ClassifierModel(
  model=GaussianNB(),
  name="Gaussian Naive Bayes",
)
knn_model = ClassifierModel(
  model=KNeighborsClassifier(
    n_neighbors=KNN_NUM_NEIGHBOURS,
    metric=KNN_METRIC,
    p=KNN_POWER_PARAMETER,
  ),
  name="KNN",
)
dt_model = ClassifierModel(
  model=DecisionTreeClassifier(),
  name="Decision Tree",
)

In [None]:
# Display the collected per-model results: the fitted estimator, its test
# accuracy in percent, and its confusion matrix as nested lists.
model_summary

{'SVM': {'model': SVC(kernel='linear', random_state=0),
  'accuracy': 88.505,
  'confusion_matrix': [[16199, 2101], [198, 1502]]},
 'Logistic Regression': {'model': LogisticRegression(random_state=0),
  'accuracy': 88.545,
  'confusion_matrix': [[16212, 2088], [203, 1497]]},
 'Random Forest': {'model': RandomForestClassifier(),
  'accuracy': 95.985,
  'confusion_matrix': [[17941, 359], [444, 1256]]},
 'Gaussian Naive Bayes': {'model': GaussianNB(),
  'accuracy': 88.675,
  'confusion_matrix': [[16427, 1873], [392, 1308]]},
 'KNN': {'model': KNeighborsClassifier(),
  'accuracy': 91.295,
  'confusion_matrix': [[16879, 1421], [320, 1380]]},
 'Decision Tree': {'model': DecisionTreeClassifier(),
  'accuracy': 94.66,
  'confusion_matrix': [[17663, 637], [431, 1269]]}}

# **Logging**

In [None]:
# Logging the Basic Insights of the whole Training Process

# Record the configured constant rather than a literal, so the log cannot
# drift from the split size that was actually used.
logs["Test_size"] = TEST_SIZE
logs["Oversampling_strategy"] = OVERSAMPLING_STRATEGY
# The oversampling seed is drawn at random per run; logging it is what makes
# the resampling step reproducible afterwards.
logs["Oversampling_random_state"] = OVERSAMPLING_RANDOM_STATE

# Hyper-parameters of the models that were configured through constants.
logs["SVM"] = {
  "Kernel": SVM_KERNEL,
  "Random_State": SVM_RANDOM_STATE,
}

logs["Logistic Regression"] = {
  "Random_State": LOGISTIC_REGRESSION_RANDOM_STATE,
}

logs["KNN"] = {
  "Num_Neighbours": KNN_NUM_NEIGHBOURS,
  "Metric": KNN_METRIC,
  "Power_Parameter": KNN_POWER_PARAMETER,
}

# **Testing the Category Based Accuracy**

In [None]:
def test_category_based_accuracy_of_model(category,model):
  cnt = 0
  filtered_test_set = x_test[y_test == category]
  for test_set in filtered_test_set:
    if int(model.predict([test_set])[0]) == category:
      cnt += 1
  return (cnt/len(filtered_test_set)) * 100

In [None]:
# Add the per-class accuracy (class 0 and class 1) to every model's summary.
models = list(model_summary.keys())

for model in models:
  summary_entry = model_summary[model]
  fitted_estimator = summary_entry["model"]

  zero_acc = test_category_based_accuracy_of_model(0, fitted_estimator)
  one_acc = test_category_based_accuracy_of_model(1, fitted_estimator)

  summary_entry["0_accuracy"] = zero_acc
  summary_entry["1_accuracy"] = one_acc

print(model_summary)


{'SVM': {'model': SVC(kernel='linear', random_state=0), 'accuracy': 88.505, 'confusion_matrix': [[16199, 2101], [198, 1502]], '0_accuracy': 88.51912568306011, '1_accuracy': 88.3529411764706}, 'Logistic Regression': {'model': LogisticRegression(random_state=0), 'accuracy': 88.545, 'confusion_matrix': [[16212, 2088], [203, 1497]], '0_accuracy': 88.59016393442623, '1_accuracy': 88.05882352941177}, 'Random Forest': {'model': RandomForestClassifier(), 'accuracy': 95.985, 'confusion_matrix': [[17941, 359], [444, 1256]], '0_accuracy': 98.03825136612022, '1_accuracy': 73.88235294117646}, 'Gaussian Naive Bayes': {'model': GaussianNB(), 'accuracy': 88.675, 'confusion_matrix': [[16427, 1873], [392, 1308]], '0_accuracy': 89.76502732240436, '1_accuracy': 76.94117647058823}, 'KNN': {'model': KNeighborsClassifier(), 'accuracy': 91.295, 'confusion_matrix': [[16879, 1421], [320, 1380]], '0_accuracy': 92.23497267759562, '1_accuracy': 81.17647058823529}, 'Decision Tree': {'model': DecisionTreeClassifier(

# **Dumping all the Insights**

In [None]:
# Persist the run configuration.
with open("BackendAssets/logs.json",'w') as fp:
  fp.write(json.dumps(logs))

# Persist the per-model metrics. The fitted estimator under "model" is left
# out: only the four metric fields below are written.
with open("BackendAssets/models.json",'w') as fp:
  exported_fields = ("accuracy", "confusion_matrix", '0_accuracy', '1_accuracy')
  model_summary_writable = {
      model_name: {field: summary[field] for field in exported_fields}
      for model_name, summary in model_summary.items()
  }

  fp.write(json.dumps(model_summary_writable))

In [None]:
# NOTE(review): `test_vector` is not defined in any cell visible in this
# notebook; on a fresh kernel this call would raise NameError unless it is
# defined elsewhere -- confirm, or remove this cell.
test_vector()