In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv("diabetes.csv")

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
df.shape

(768, 9)

In [8]:
df.isna().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [9]:
df["Age"].value_counts()

Unnamed: 0_level_0,count
Age,Unnamed: 1_level_1
22,72
21,63
25,48
24,46
23,38
28,35
26,33
27,32
29,29
31,24


In [10]:
df["Outcome"].value_counts()

Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,500
1,268


In [11]:
df.groupby("Outcome").mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [12]:
x=df.drop(columns="Outcome",axis=1)
y=df["Outcome"]

Standarizing the Data

In [16]:
from sklearn.preprocessing import MinMaxScaler

In [24]:
scaler=MinMaxScaler()

In [26]:
scaler.fit(x)

In [27]:
scaled_x=scaler.transform(x)

In [29]:
scaled_x

array([[0.35294118, 0.74371859, 0.59016393, ..., 0.50074516, 0.23441503,
        0.48333333],
       [0.05882353, 0.42713568, 0.54098361, ..., 0.39642325, 0.11656704,
        0.16666667],
       [0.47058824, 0.91959799, 0.52459016, ..., 0.34724292, 0.25362938,
        0.18333333],
       ...,
       [0.29411765, 0.6080402 , 0.59016393, ..., 0.390462  , 0.07130658,
        0.15      ],
       [0.05882353, 0.63316583, 0.49180328, ..., 0.4485842 , 0.11571307,
        0.43333333],
       [0.05882353, 0.46733668, 0.57377049, ..., 0.45305514, 0.10119556,
        0.03333333]])

In [30]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report


In [31]:
X=scaled_x
Y=df["Outcome"]

In [32]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [33]:
model=svm.SVC(kernel="linear")

In [35]:
model.fit(X_train,Y_train)

In [36]:
y_pred=model.predict(X_test)

In [38]:
print(classification_report(Y_test,y_pred))

              precision    recall  f1-score   support

           0       0.76      0.92      0.83       100
           1       0.76      0.46      0.57        54

    accuracy                           0.76       154
   macro avg       0.76      0.69      0.70       154
weighted avg       0.76      0.76      0.74       154



In [40]:
y_pred_train=model.predict(X_train)

In [42]:
print(classification_report(Y_train,y_pred_train))

              precision    recall  f1-score   support

           0       0.79      0.91      0.84       400
           1       0.75      0.55      0.63       214

    accuracy                           0.78       614
   macro avg       0.77      0.73      0.74       614
weighted avg       0.78      0.78      0.77       614



In [46]:
input_data=(3,162,52,38,0,37.2,0.652,24)
input_data_as_numpy_array=np.asarray(input_data)
input_data_reshaped=input_data_as_numpy_array.reshape(1,-1)
std_data=scaler.transform(input_data_reshaped)
pred=model.predict(std_data)
print(pred)
if(pred[0]==0):
    print("Non-Diabetic")
else:
    print("Diabetic")

[1]
Diabetic




Fine Tuning for the model

In [47]:
from sklearn.model_selection import GridSearchCV

In [53]:
param_grid = {
    'C': [0.1,0.2,0.3, 1, 10, 100,200,300],               # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly'],  # Kernel types
    'degree': [2, 3, 4],                  # Degree for 'poly' kernel
    'gamma': ['scale', 'auto'],           # Kernel coefficient
    'class_weight': [None, 'balanced']    # Handles imbalanced datasets
}

# Step 4: Perform Grid Search with F1-score
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, Y_train)

# Step 5: Evaluate the best model
print("Best Parameters:", grid_search.best_params_)
print("Best F1 Score (on train set):", grid_search.best_score_)

Best Parameters: {'C': 0.1, 'class_weight': None, 'degree': 2, 'gamma': 'scale', 'kernel': 'poly'}
Best F1 Score (on train set): 0.7703411445879165


In [54]:
best_model=svm.SVC(C=1,class_weight=None,degree=2,gamma='scale',kernel='poly')
best_model.fit(X_train,Y_train)

In [55]:
y_pred=best_model.predict(X_test)

print(classification_report(Y_test,y_pred))

              precision    recall  f1-score   support

           0       0.76      0.93      0.83       100
           1       0.77      0.44      0.56        54

    accuracy                           0.76       154
   macro avg       0.77      0.69      0.70       154
weighted avg       0.76      0.76      0.74       154

