## Import Libraries

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import  accuracy_score


In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# load dataset
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Diabetics.csv')
print(df.head())


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148            148             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [8]:
print(df.tail())

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
762           10      101             76             48      180  32.9   
763            2      122             70             27        0  36.8   
764            5      121             72             23      112  26.2   
765            1      126             60              0        0  30.1   
766            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
762                     0.171   63        0  
763                     0.340   27        0  
764                     0.245   30        0  
765                     0.349   47        1  
766                     0.315   23        0  


In [9]:
# number of rows and columns
print(df.shape)

(767, 9)


In [10]:
df.describe()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,767.0,767.0,767.0,767.0,767.0,767.0,767.0,767.0,767.0
mean,3.846154,120.950456,69.229465,20.521512,79.788787,31.993872,0.472168,33.250326,0.34811
std,3.371638,31.955871,19.564235,15.957245,115.318821,7.889223,0.331446,11.764993,0.476682
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,63.0,0.0,0.0,27.3,0.2435,24.0,0.0
50%,3.0,117.0,72.0,23.0,29.0,32.0,0.374,29.0,0.0
75%,6.0,140.5,80.0,32.0,127.5,36.6,0.6265,41.0,1.0
max,17.0,199.0,148.0,99.0,846.0,67.1,2.42,81.0,1.0


In [11]:
print("Data types of each column:")
print(df.dtypes)

Data types of each column:
Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object


In [12]:
print("Missing values in each column:")
print(df.isnull().sum())

Missing values in each column:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [13]:
df['Outcome'].value_counts()

Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,500
1,267


In [14]:
df.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.872659,141.494382,71.187266,22.127341,100.382022,35.158052,0.551633,37.108614


In [15]:
# separating the data and labels
X=df.drop(columns='Outcome',axis=1)
Y=df['Outcome']

In [16]:
print(X)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148            148             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
762           10      101             76             48      180  32.9   
763            2      122             70             27        0  36.8   
764            5      121             72             23      112  26.2   
765            1      126             60              0        0  30.1   
766            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [17]:
print(Y)

0      1
1      0
2      1
3      0
4      1
      ..
762    0
763    0
764    0
765    1
766    0
Name: Outcome, Length: 767, dtype: int64


# data preprocessing

In [18]:
# data standerdization
scalar=StandardScaler()

In [19]:
scalar.fit(X)

In [20]:
standardized_data=scalar.transform(X)

In [21]:
X=standardized_data
Y=df['Outcome']

In [22]:
print(X)
print(Y)

[[ 6.39229762e-01  8.47017882e-01  4.02887897e+00 ...  2.03717875e-01
   4.67445242e-01  1.42461654e+00]
 [-8.44696471e-01 -1.12573727e+00 -1.65177569e-01 ... -6.84147432e-01
  -3.65813020e-01 -1.91397847e-01]
 [ 1.23280026e+00  1.94299297e+00 -2.67471631e-01 ... -1.10271250e+00
   6.03302567e-01 -1.06344459e-01]
 ...
 [ 3.42444515e-01  1.55138958e-03  1.41704616e-01 ... -7.34882592e-01
  -6.85832497e-01 -2.76451236e-01]
 [-8.44696471e-01  1.58119259e-01 -4.72059755e-01 ... -2.40214778e-01
  -3.71851123e-01  1.16945637e+00]
 [-8.44696471e-01 -8.75228677e-01  3.94105545e-02 ... -2.02163408e-01
  -4.74498880e-01 -8.71824957e-01]]
0      1
1      0
2      1
3      0
4      1
      ..
762    0
763    0
764    0
765    1
766    0
Name: Outcome, Length: 767, dtype: int64


## Train test

In [25]:
# Split the scaled data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,stratify=Y, random_state=2)

print("Data split into training and testing sets.")
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of Y_train:", Y_train.shape)
print("Shape of Y_test:", Y_test.shape)

Data split into training and testing sets.
Shape of X_train: (613, 8)
Shape of X_test: (154, 8)
Shape of Y_train: (613,)
Shape of Y_test: (154,)


## Training Model


In [26]:
classifier=svm.SVC(kernel='linear')

In [27]:
# training the support vector Machine Classifier
classifier.fit(X_train,Y_train)

In [30]:
# model eavaluation
# accuracy score on the training data
X_train_prediction=classifier.predict(X_train)
training_data_accuracy=accuracy_score(X_train_prediction,Y_train)

In [31]:
print('Accuracy of trainning data: ',training_data_accuracy)

Accuracy of trainning data:  0.7862969004893964


In [32]:
# accuracy score on the training data
X_test_prediction=classifier.predict(X_test)
testing_data_accuracy=accuracy_score(X_test_prediction,Y_test)

In [33]:
print('Accuracy of testing data: ',testing_data_accuracy)

Accuracy of testing data:  0.7857142857142857


# making prediction System

In [40]:
input_data=(1	,85,	66,	29,	0	,26.6,	0.351,	31	)
# convert in np array
check_data=np.asarray(input_data)
#reshape
check_data_reshape=check_data.reshape(1,-1)
# standardize the input data
std_data=scalar.transform(check_data_reshape)
print(std_data)
prediction=classifier.predict(std_data)
print(prediction)
if(prediction[0]==0):
   print("The persion is not diabetic")
else:
  print("The person is diabetic")

[[-0.84469647 -1.12573727 -0.16517757  0.53167197 -0.69234884 -0.68414743
  -0.36581302 -0.19139785]]
[0]
The persion is not diabetic


