In [None]:
#Importing dependencies

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import probplot
from sklearn.compose import ColumnTransformer



In [88]:
#Read the diabetes records from the CSV file into a DataFrame

csv_path = "diabetes.csv"
dataset = pd.read_csv(csv_path)

In [89]:
#Preview the first rows to check that the columns were parsed as expected
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
#Data visualisation: QQ plots of every feature, one figure per outcome class

X = dataset.drop(columns = 'Outcome', axis = 1)
y = dataset["Outcome"]

# Group the features by target
features_by_target = X.groupby(y)

# Loop over each group of features and create QQ plots
for target, features in features_by_target:
    # One subplot per feature column. len(features) would be the number of
    # rows in the group, so the columns are counted explicitly.
    # squeeze=False keeps `axes` two-dimensional even with a single feature.
    n_features = len(features.columns)
    fig, axes = plt.subplots(nrows=1, ncols=n_features, figsize=(16, 4), squeeze=False)

    # Loop over each feature and create a QQ plot on its own axis
    for ax, feature in zip(axes.ravel(), features.columns):
        # Select the data for the current feature and target
        data = features[feature]

        # With the default fit=True, probplot also returns (slope, intercept, r)
        # for the least-squares line through the QQ points. With fit=False it
        # returns only the quantile arrays, so no correlation coefficient exists.
        _, (_, _, r) = probplot(data, plot=ax)

        # Set the title of the subplot to the name of the feature
        ax.set_title(feature)

        # Add a label to the plot indicating the correlation coefficient
        ax.text(0.1, 0.9, f"r={r:.2f}", transform=ax.transAxes)

    # Set the title of the figure to the name of the target
    fig.suptitle(f"QQ Plots for {target}", fontsize=16, fontweight='bold')

# Show all figures
plt.show()

In [84]:
dataset.shape  #(rows, columns) of the loaded dataset

(768, 9)

In [10]:
#Summary statistics for every numeric column
#NOTE(review): the minimum of 0.0 reported for Glucose, BloodPressure,
#SkinThickness, Insulin and BMI looks like a placeholder for missing values
#rather than a real measurement -- confirm before modelling
dataset.describe()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [11]:
#Count how many rows fall into each outcome class
#Note 0 --> non-Diabetic
#     1 --> Diabetic

dataset["Outcome"].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

**Important**



In [13]:
#Per-outcome summary of every feature. The mean is used here; describe() on the
#same grouping would give the full set of statistics instead.

outcome_groups = dataset.groupby(by="Outcome")
outcome_groups.mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [15]:
#Split the frame into the label vector and the feature matrix

y = dataset["Outcome"]
X = dataset.drop(columns="Outcome")

**Important**: Data Standardization


In [19]:
#We standardize because the features span widely different value ranges, and
#putting them on a common scale makes prediction easier
scaler = StandardScaler()

In [25]:
#Learn the scaling statistics from X, then apply them to X
standardized_data = scaler.fit(X).transform(X)

In [28]:
#Rebind X to the standardized values; from here on X no longer holds the raw
#feature DataFrame created when the labels were separated
X = standardized_data

In [38]:
#Split the raw (unscaled) features with train_test_split, then fit the scaler on
#the training rows only. Fitting the scaler on the whole dataset before the
#split lets the test rows influence the scaling statistics (data leakage),
#which can bias the test score.

features_raw = dataset.drop(columns="Outcome")
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features_raw, y, test_size=0.2, stratify=y, random_state=2
)

#Re-fit the shared scaler on the training split; the prediction cell reuses it
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [46]:
#Create the support vector machine classifier; the kernel argument selects the
#kind of model, in this case a linear one
classifier_diabetes = svm.SVC(kernel="linear")

In [52]:
#Fit the support vector machine classifier on the training split
classifier_diabetes.fit(X=X_train, y=y_train)


SVC(kernel='linear')

In [54]:
#Model evaluation on training data
X_train_prediction = classifier_diabetes.predict(X_train)
#accuracy_score takes (y_true, y_pred): the ground-truth labels come first.
#Keywords make the roles explicit so the call stays correct if the metric is
#later swapped for an asymmetric one.
acc = accuracy_score(y_true=y_train, y_pred=X_train_prediction)
print("Accuracy Score on training data:", acc)

Accuracy Score on training data: 0.7866449511400652


In [58]:
#Model evaluation on test data
X_test_prediction = classifier_diabetes.predict(X_test)
#accuracy_score takes (y_true, y_pred): the ground-truth labels come first.
#Keywords make the roles explicit so the call stays correct if the metric is
#later swapped for an asymmetric one.
acc = accuracy_score(y_true=y_test, y_pred=X_test_prediction)
print("Accuracy Score on test data:", acc)

Accuracy Score on test data: 0.7727272727272727


**Saving Model in pickle file**

In [69]:
#TODO: save the trained classifier (and the fitted scaler) to a pickle file

**Making a predictive system**


In [82]:
#Feature values for one person, in the same order as the dataset's feature columns
input_data = (8,183,64,0,0,23.3,0.672,32)

#Feature columns are every dataset column except the label
feature_names = dataset.columns.drop("Outcome")
if len(input_data) != len(feature_names):
    raise ValueError(
        f"Expected {len(feature_names)} feature values, got {len(input_data)}"
    )

#Build a one-row DataFrame with named columns: the scaler was fitted on a
#DataFrame, so passing the same column names keeps the features aligned by
#name instead of relying on positional order in a bare array
input_frame = pd.DataFrame([input_data], columns=feature_names)

#Standardize the input data with the already-fitted scaler
st_data = scaler.transform(input_frame)

prediction = classifier_diabetes.predict(st_data)
print(prediction)

#Class 0 is non-diabetic, class 1 is diabetic
if prediction[0] == 0:
    print("The person is not diabetic")
else:
    print("The person is diabetic")


[1]
The person is diabetic


