In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
import pickle

In [2]:
# Load the diabetes records; the path is relative to the notebook's working directory.
diabetes_dataset = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [3]:
# Target vector: the 'Outcome' column.
Y = diabetes_dataset['Outcome']
# Feature matrix: every remaining column (`columns=` already implies the column axis).
X = diabetes_dataset.drop(columns=['Outcome'])

In [4]:
# Hold out 20% of the rows for evaluation. Stratifying on Y keeps the class
# proportions the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [5]:
# Linear-kernel support vector classifier.
# probability=True enables predict_proba(); scikit-learn calibrates those
# probabilities with an internal, randomised cross-validation, so a fixed
# random_state is required for the probabilities to be reproducible on re-run.
classifier = svm.SVC(kernel='linear', probability=True, random_state=2)
classifier.fit(X_train, Y_train)

In [6]:
# Predicted labels for the training rows (despite the "X_" prefix these are labels, not features).
X_train_prediction = classifier.predict(X_train)
# accuracy_score expects (y_true, y_pred); pass the ground truth first so the call
# stays correct if it is later swapped for an asymmetric metric.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print("\nAccuracy score on the training data:", training_data_accuracy)


Accuracy score on the training data: 0.7833876221498371


In [7]:
# Predicted labels for the held-out rows (despite the "X_" prefix these are labels, not features).
X_test_prediction = classifier.predict(X_test)
# accuracy_score expects (y_true, y_pred); pass the ground truth first so the call
# stays correct if it is later swapped for an asymmetric metric.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print("Accuracy score on the test data:", test_data_accuracy)

Accuracy score on the test data: 0.7727272727272727


In [8]:
def classify_disease_level(probability, low_threshold=0.4, high_threshold=0.7):
    """Map a predicted positive-class probability to a coarse risk label.

    Parameters
    ----------
    probability : float
        Predicted probability of the positive class. Must lie in [0, 1].
    low_threshold : float, default 0.4
        Probabilities strictly below this value are labelled "Low Risk".
    high_threshold : float, default 0.7
        Probabilities at or above this value are labelled "High Risk";
        values in [low_threshold, high_threshold) are "Moderate Risk".

    Returns
    -------
    str
        One of "Low Risk", "Moderate Risk" or "High Risk".

    Raises
    ------
    ValueError
        If the thresholds are not ordered within [0, 1], or if ``probability``
        is NaN or outside [0, 1].
    """
    if not 0.0 <= low_threshold <= high_threshold <= 1.0:
        raise ValueError(
            "thresholds must satisfy 0 <= low_threshold <= high_threshold <= 1, "
            f"got low_threshold={low_threshold!r}, high_threshold={high_threshold!r}"
        )
    # A chained comparison is False for NaN, so this also rejects NaN, which
    # would otherwise fall through every branch and be reported as "High Risk".
    if not 0.0 <= probability <= 1.0:
        raise ValueError(f"probability must be in [0, 1], got {probability!r}")

    if probability < low_threshold:
        return "Low Risk"
    if probability < high_threshold:
        return "Moderate Risk"
    return "High Risk"

In [10]:
# Second column of predict_proba: the classifier's probability for class '1'.
test_probabilities = classifier.predict_proba(X_test)[:, 1]
# Bucket each probability into its risk label, preserving row order.
test_disease_levels = list(map(classify_disease_level, test_probabilities))

In [11]:
# Tabulate each test row's predicted probability alongside its risk label.
results_df = pd.DataFrame(
    data={
        "Predicted Probability": test_probabilities,
        "Disease Level": test_disease_levels,
    },
    columns=["Predicted Probability", "Disease Level"],
)

In [12]:
# Heading, then the (truncated) frame on the following line.
print("\nDisease Levels for Test Set:", results_df, sep="\n")


Disease Levels for Test Set:
     Predicted Probability  Disease Level
0                 0.094510       Low Risk
1                 0.115917       Low Risk
2                 0.598868  Moderate Risk
3                 0.678228  Moderate Risk
4                 0.122585       Low Risk
..                     ...            ...
149               0.214508       Low Risk
150               0.949067      High Risk
151               0.190468       Low Risk
152               0.059776       Low Risk
153               0.463684  Moderate Risk

[154 rows x 2 columns]


In [15]:
filename = 'diabetes_model.sav'
# Persist the fitted classifier. The context manager guarantees the file handle
# is flushed and closed even if pickling fails, instead of leaking it.
# NOTE: only unpickle this file from a trusted source; pickle.load can execute arbitrary code.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)