##**Step-1: Install/ Import the required Python Packages/ Libraries, Mount the Google Drive and read and check the Data and Customer files**

**1) Install/ Import the required Python Packages/ Libraries**

In [159]:
#Import required python packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [160]:
pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


**2) Mounting the Google Drive**

In [161]:
# Mount the Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


**3) Read the Data file and Customer file and check**

In [162]:
# Read the Diabetes Data from .csv file and check the data shape (number of Rows and Columns)
train_df = pd.read_csv('gdrive/My Drive/NCJ-MLP-Training-2022/NCJ-MLP-Projects-Latest/03-Diabetes-Project/Data-Files/diabetes-train.csv')
test_df = pd.read_csv('gdrive/My Drive/NCJ-MLP-Training-2022/NCJ-MLP-Projects-Latest/03-Diabetes-Project/Data-Files/diabetes-customer.csv')
print(train_df.shape)
print(test_df.shape)

(700, 9)
(68, 8)


In [163]:
train_df['Outcome'].value_counts()

0    459
1    241
Name: Outcome, dtype: int64

##**Step-2: Combine the Train and Test File**

In [164]:
train_df['train']=1
test_df['test'] = 0

In [165]:
print(train_df.shape)
print(test_df.shape)

(700, 10)
(68, 9)


In [166]:
test_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,test
0,2,122,76,27,200,35.9,0.483,26,0
1,6,125,78,31,0,27.6,0.565,49,0
2,1,168,88,29,0,35.0,0.905,52,0
3,2,129,0,0,0,38.5,0.304,41,0
4,4,110,76,20,100,28.4,0.118,27,0


In [167]:
train_df.info()
print()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               700 non-null    int64  
 1   Glucose                   700 non-null    int64  
 2   BloodPressure             700 non-null    int64  
 3   SkinThickness             700 non-null    int64  
 4   Insulin                   700 non-null    int64  
 5   BMI                       700 non-null    float64
 6   DiabetesPedigreeFunction  700 non-null    float64
 7   Age                       700 non-null    int64  
 8   Outcome                   700 non-null    int64  
 9   train                     700 non-null    int64  
dtypes: float64(2), int64(8)
memory usage: 54.8 KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------               

In [168]:
combined_df  = pd.concat([train_df, test_df])
combined_df.shape

(768, 11)

In [169]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 768 entries, 0 to 67
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   700 non-null    float64
 9   train                     700 non-null    float64
 10  test                      68 non-null     float64
dtypes: float64(5), int64(6)
memory usage: 72.0 KB


##**Step-3: Check the Data Types of the Columns as well as Missing Data**

**1) Execute the "info()" command and check datatypes of the Columns and Missing Data**

In [170]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 768 entries, 0 to 67
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   700 non-null    float64
 9   train                     700 non-null    float64
 10  test                      68 non-null     float64
dtypes: float64(5), int64(6)
memory usage: 72.0 KB


##**Step-4: Check on Data Preprocessing applicability (Initial)**


###**1) Checking the Missing Values and its Handling**

**a) Check the Missing Values, if any**

In [171]:
combined_df.isnull().sum()

Pregnancies                   0
Glucose                       0
BloodPressure                 0
SkinThickness                 0
Insulin                       0
BMI                           0
DiabetesPedigreeFunction      0
Age                           0
Outcome                      68
train                        68
test                        700
dtype: int64

In [172]:
combined_df.shape

(768, 11)

In [173]:
combined_df.drop_duplicates(inplace = True)
combined_df.shape

(768, 11)

In [174]:
combined_df["Outcome"].value_counts()

0.0    459
1.0    241
Name: Outcome, dtype: int64

##**Conclusion: It is a Binary Classification with imbalanced Classes**

In [175]:
combined_df['Pregnancies'].value_counts()

1     135
0     111
2     103
3      75
4      68
5      57
6      50
7      45
8      38
9      28
10     24
11     11
13     10
12      9
14      2
15      1
17      1
Name: Pregnancies, dtype: int64

In [176]:
combined_df['Glucose'].value_counts()

99     17
100    17
111    14
129    14
125    14
       ..
191     1
177     1
44      1
62      1
190     1
Name: Glucose, Length: 136, dtype: int64

In [177]:
combined_df['BloodPressure'].value_counts()

70     57
74     52
78     45
68     45
72     44
64     43
80     40
76     39
60     37
0      35
62     34
66     30
82     30
88     25
84     23
90     22
86     21
58     21
50     13
56     12
52     11
54     11
75      8
92      8
65      7
85      6
94      6
48      5
96      4
44      4
100     3
106     3
98      3
110     3
55      2
108     2
104     2
46      2
30      2
122     1
95      1
102     1
61      1
24      1
38      1
40      1
114     1
Name: BloodPressure, dtype: int64

In [178]:
combined_df['SkinThickness'].value_counts()

0     227
32     31
30     27
27     23
23     22
33     20
28     20
18     20
31     19
19     18
39     18
29     17
40     16
25     16
26     16
22     16
37     16
41     15
35     15
36     14
15     14
17     14
20     13
24     12
42     11
13     11
21     10
46      8
34      8
12      7
38      7
11      6
43      6
16      6
45      6
14      6
44      5
10      5
48      4
47      4
49      3
50      3
8       2
7       2
52      2
54      2
63      1
60      1
56      1
51      1
99      1
Name: SkinThickness, dtype: int64

In [179]:
combined_df['Insulin'].value_counts()

0      374
105     11
130      9
140      9
120      8
      ... 
73       1
171      1
255      1
52       1
112      1
Name: Insulin, Length: 186, dtype: int64

In [180]:
combined_df['BMI'].value_counts()

32.0    13
31.6    12
31.2    12
0.0     11
32.4    10
        ..
36.7     1
41.8     1
42.6     1
42.8     1
46.3     1
Name: BMI, Length: 248, dtype: int64

In [181]:
combined_df['DiabetesPedigreeFunction'].value_counts()

0.258    6
0.254    6
0.268    5
0.207    5
0.261    5
        ..
1.353    1
0.655    1
0.092    1
0.926    1
0.171    1
Name: DiabetesPedigreeFunction, Length: 517, dtype: int64

In [182]:
combined_df['Age'].value_counts()

22    72
21    63
25    48
24    46
23    38
28    35
26    33
27    32
29    29
31    24
41    22
30    21
37    19
42    18
33    17
38    16
36    16
32    16
45    15
34    14
46    13
43    13
40    13
39    12
35    10
50     8
51     8
52     8
44     8
58     7
47     6
54     6
49     5
48     5
57     5
53     5
60     5
66     4
63     4
62     4
55     4
67     3
56     3
59     3
65     3
69     2
61     2
72     1
81     1
64     1
70     1
68     1
Name: Age, dtype: int64

##**Step-7: Seggregate the Train and Test Data**

In [183]:
train_df1 = combined_df[combined_df["train"] == 1]
test_df1 = combined_df[combined_df["test"] == 0]
train_df1.drop(["train", "test"], axis=1, inplace=True)
test_df1.drop(["test", "train", "Outcome"], axis=1, inplace=True)

In [184]:
train_df1.shape

(700, 9)

In [185]:
train_df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 700 entries, 0 to 699
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               700 non-null    int64  
 1   Glucose                   700 non-null    int64  
 2   BloodPressure             700 non-null    int64  
 3   SkinThickness             700 non-null    int64  
 4   Insulin                   700 non-null    int64  
 5   BMI                       700 non-null    float64
 6   DiabetesPedigreeFunction  700 non-null    float64
 7   Age                       700 non-null    int64  
 8   Outcome                   700 non-null    float64
dtypes: float64(3), int64(6)
memory usage: 54.7 KB


In [186]:
train_df1['Outcome'] = train_df1['Outcome'].astype('int')
train_df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 700 entries, 0 to 699
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               700 non-null    int64  
 1   Glucose                   700 non-null    int64  
 2   BloodPressure             700 non-null    int64  
 3   SkinThickness             700 non-null    int64  
 4   Insulin                   700 non-null    int64  
 5   BMI                       700 non-null    float64
 6   DiabetesPedigreeFunction  700 non-null    float64
 7   Age                       700 non-null    int64  
 8   Outcome                   700 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.7 KB


In [187]:
test_df1.shape

(68, 8)

##**Step-8: Slice X and y Values**

In [188]:
X = train_df1.drop(['Outcome'], axis = 1)
y = train_df1['Outcome']
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [189]:
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [190]:
X.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [191]:
columnNames = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']

In [192]:
min_max_scaler_object = preprocessing.MinMaxScaler()
X1 = min_max_scaler_object.fit_transform(X)
X1 = pd.DataFrame(X1 , columns = columnNames)
X1.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.352941,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,0.483333
1,0.058824,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,0.166667
2,0.470588,0.919598,0.52459,0.0,0.0,0.347243,0.253629,0.183333
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.0
4,0.0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.2


##**Step-9: Execute Train-Test-Split Command and Verify**

In [193]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size = 0.2, random_state = 66)

In [194]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(560, 8)
(560,)
(140, 8)
(140,)


##**Step-10: Learn the Data and Predict the dependent Variable values for the "X_test"data using "SVC()" algorithm**

In [195]:
from sklearn.svm import SVC
svc_clf = SVC(kernel = 'rbf', random_state = 0)
svc_clf.fit(X_train, y_train)

SVC(random_state=0)

In [196]:
#predictions
y_pred = svc_clf.predict(X_test)

In [197]:
svc_Train_acc=svc_clf.score(X_train,y_train)
svc_Test_acc=svc_clf.score(X_test,y_test)

##**Step-11: Calculate the Accuracy of the Model**

In [198]:
print('Accuracy on training set:',svc_Train_acc)
print('Accuracy on test set:',svc_Test_acc)

Accuracy on training set: 0.7928571428571428
Accuracy on test set: 0.7857142857142857


##**Step-12: Display the Confusion Matrix and Classification Report of the Model**

In [199]:
from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  

[[84 13]
 [17 26]]
              precision    recall  f1-score   support

           0       0.83      0.87      0.85        97
           1       0.67      0.60      0.63        43

    accuracy                           0.79       140
   macro avg       0.75      0.74      0.74       140
weighted avg       0.78      0.79      0.78       140



##**Step-13: SVC Algorithm Parameters Fine Tuning using GridSearch CV Method**

In [200]:
model_params = {
     'svc': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    },
 }

In [201]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X1, y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svc,0.774286,"{'C': 20, 'kernel': 'rbf'}"


In [202]:
#SV Classifier
svc_grid_acc = cross_val_score(SVC(C=20, kernel='rbf', gamma = 'auto'),X1, y, cv=5)
print("svc_grid_acc (CV_based) :", svc_grid_acc)
svc_grid_acc_avg=np.average(svc_grid_acc)
print()
print("svc_grid_acc_avg : ", svc_grid_acc_avg)

svc_grid_acc (CV_based) : [0.75714286 0.76428571 0.79285714 0.77142857 0.78571429]

svc_grid_acc_avg :  0.7742857142857142


In [203]:
min_max_scaler_object = preprocessing.MinMaxScaler()
test_df2 = min_max_scaler_object.fit_transform(test_df1)
test_df3 = pd.DataFrame(test_df2 , columns = columnNames)
test_df3.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.153846,0.456,0.808511,0.5625,0.392157,0.728195,0.343045,0.111111
1,0.461538,0.48,0.829787,0.645833,0.0,0.559838,0.420113,0.622222
2,0.076923,0.824,0.93617,0.604167,0.0,0.709939,0.739662,0.688889
3,0.153846,0.512,0.0,0.0,0.0,0.780933,0.174812,0.444444
4,0.307692,0.36,0.808511,0.416667,0.196078,0.576065,0.0,0.133333


In [204]:
#predictions for Customer Data
cust_data_pred = svc_clf.predict(test_df3)

In [205]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               68 non-null     int64  
 1   Glucose                   68 non-null     int64  
 2   BloodPressure             68 non-null     int64  
 3   SkinThickness             68 non-null     int64  
 4   Insulin                   68 non-null     int64  
 5   BMI                       68 non-null     float64
 6   DiabetesPedigreeFunction  68 non-null     float64
 7   Age                       68 non-null     int64  
 8   test                      68 non-null     int64  
dtypes: float64(2), int64(7)
memory usage: 4.9 KB


In [206]:
test_df.drop(["test"], axis=1, inplace=True)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               68 non-null     int64  
 1   Glucose                   68 non-null     int64  
 2   BloodPressure             68 non-null     int64  
 3   SkinThickness             68 non-null     int64  
 4   Insulin                   68 non-null     int64  
 5   BMI                       68 non-null     float64
 6   DiabetesPedigreeFunction  68 non-null     float64
 7   Age                       68 non-null     int64  
dtypes: float64(2), int64(6)
memory usage: 4.4 KB


In [207]:
test_df["Predicted_Outcome"]=cust_data_pred
print(test_df.shape)
test_df.head()

(68, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Predicted_Outcome
0,2,122,76,27,200,35.9,0.483,26,0
1,6,125,78,31,0,27.6,0.565,49,1
2,1,168,88,29,0,35.0,0.905,52,1
3,2,129,0,0,0,38.5,0.304,41,1
4,4,110,76,20,100,28.4,0.118,27,0


In [208]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               68 non-null     int64  
 1   Glucose                   68 non-null     int64  
 2   BloodPressure             68 non-null     int64  
 3   SkinThickness             68 non-null     int64  
 4   Insulin                   68 non-null     int64  
 5   BMI                       68 non-null     float64
 6   DiabetesPedigreeFunction  68 non-null     float64
 7   Age                       68 non-null     int64  
 8   Predicted_Outcome         68 non-null     int64  
dtypes: float64(2), int64(7)
memory usage: 4.9 KB


In [209]:
test_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Predicted_Outcome
0,2,122,76,27,200,35.9,0.483,26,0
1,6,125,78,31,0,27.6,0.565,49,1
2,1,168,88,29,0,35.0,0.905,52,1
3,2,129,0,0,0,38.5,0.304,41,1
4,4,110,76,20,100,28.4,0.118,27,0


In [210]:
from google.colab import files
test_df.to_csv("gdrive/My Drive/NCJ-MLP-Training-2022/NCJ-MLP-Projects-Latest/03-Diabetes-Project/Output-Files/diabetes-customer-data-with-predicted-Outcome-values2.csv", index = False)

In [211]:
import pickle
#now you can save it to a file
file = '/content/gdrive/My Drive/NCJ-MLP-Training-2022/NCJ-MLP-Projects-Latest/03-Diabetes-Project/Pickle-File/ML_Model_Diabetics.pkl'
with open(file, 'wb') as f:
    pickle.dump(svc_clf, f)

In [212]:
with open(file, 'rb') as f:
    k = pickle.load(f)

In [213]:
cy = k.predict([[6,	125,	78,	31,	0,	27.6,	0.565,	49]])
#6	125	78	31	0	27.6	0.565	49	
print(cy)

[1]
