#### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import ADASYN
%matplotlib inline

#### Reading the Dataset

In [5]:
data = pd.read_csv('pima-indians-diabetes.csv')

In [13]:
cols = ['NO_PREG','PLASMA_GLUCOSE','DIASTOLIC_BP',
        'SKIN_THICKNESS','SERUM_INSULIN','BMI','DIA_PEDI_FUNC','AGE','TARGET']

In [14]:
data.columns = cols

In [15]:
data.head()

Unnamed: 0,NO_PREG,PLASMA_GLUCOSE,DIASTOLIC_BP,SKIN_THICKNESS,SERUM_INSULIN,BMI,DIA_PEDI_FUNC,AGE,TARGET
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [11]:
data.columns

Index(['NO_PREG', 'PLASMA_GLUCOSE', 'DIASTOLIC_BP', 'SKIN_THICKNESS',
       'SERUM_INSULIN', 'BMI', 'DIA_PEDI_FUNC', 'AGE', 'TARGET'],
      dtype='object')

In [16]:
data.shape

(768, 9)

In [12]:
data['TARGET'].value_counts()

0    500
1    268
Name: TARGET, dtype: int64

#### Over-Sampling

In [17]:
X = data.iloc[:,:-1].values
y = data.iloc[:,-1].values

In [18]:
print('Shape of Feature Matrix:', X.shape)
print('Shape of Target Vector:', y.shape)

Shape of Feature Matrix: (768, 8)
Shape of Target Vector: (768,)


In [19]:
from collections import Counter

In [20]:
print('Original Target Variable Distribution:', Counter(y))

Original Target Variable Distribution: Counter({0: 500, 1: 268})


In [36]:
ada = ADASYN(sampling_strategy='minority', random_state= 420 , n_neighbors = 5)

In [37]:
X_res, y_res = ada.fit_resample(X,y)

In [38]:
print('Oversampled Target Variable Distribution:', Counter(y_res))

Oversampled Target Variable Distribution: Counter({0: 500, 1: 474})


#### Model Fitting

In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score

models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))


results = []
names = []
scoring = 'accuracy'

for name, model in models:
    kfold = KFold(n_splits=10, random_state=7)
    cv_results = cross_val_score(model, X_res, y_res, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)



LR: 0.694940 (0.106931)
LDA: 0.694908 (0.115581)
KNN: 0.696213 (0.063426)
CART: 0.725973 (0.052967)
NB: 0.675416 (0.108062)




SVM: 0.656827 (0.162346)


