###  Predict the diabetes status (label) of a female patient given their health measurements

- Attribute Information:

   -  pregnant -  Number of times pregnant 
   -  glucose  -  Plasma glucose concentration at 2 hours in an oral glucose tolerance test 
   -  bp       -  Diastolic blood pressure (mm Hg) 
   -  skin     -  Triceps skin fold thickness (mm) 
   -  insulin  -  2-Hour serum insulin (mu U/ml) 
   -  bmi      -  Body mass index (weight in kg/(height in m)^2) 
   -  pedigree -  Diabetes pedigree function 
   -  age      -  Age (years) 
   -  diabetes - Class variable (0 or 1) 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics

%matplotlib inline

In [2]:
from sklearn import svm



In [3]:
# Column labels in file order. The CSV is read with header=None, so these names
# are applied as the header instead of anything taken from the file itself.
col_names = [
    'pregnant', 'glucose', 'bp', 'skin',
    'insulin', 'bmi', 'pedigree', 'age',
    'diabetes',
]

# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# this exact file exists. Consider a path relative to the notebook.
DATA_PATH = 'C:\\Users\\jp\\Desktop\\testData\\pima-indians-diabetes.csv'
pima_data = pd.read_csv(DATA_PATH, names=col_names, header=None)

In [4]:
# Preview the first rows to check that the column names line up with the values.
pima_data.head()

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# (rows, columns) of the loaded frame.
pima_data.shape

(768, 9)

In [6]:
# Split the frame into the feature matrix X (the eight measurement columns)
# and the response vector y (the 'diabetes' class label).
feature_cols = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age']
X = pima_data.loc[:, feature_cols]
y = pima_data.loc[:, 'diabetes']

In [7]:
from sklearn.model_selection import train_test_split

# Hold out a test set. test_size=0.25 is scikit-learn's default, written out
# here so the split ratio is visible; random_state pins the shuffle so the
# split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

In [8]:
# Baseline classifier: an SVC built with every constructor argument left at its default.
svm_model = svm.SVC()
# Bare name as the last expression so the notebook echoes the estimator's parameters.
svm_model

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [9]:
# Fit the baseline on the training split only; the test split stays held out for scoring.
svm_model.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [10]:
# Score the baseline model on the held-out test split.
y_pred = svm_model.predict(x_test)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
# Accuracy is symmetric, so the value is unchanged, but the correct order
# matters as soon as an asymmetric metric (precision, recall, ...) is swapped in.
metrics.accuracy_score(y_test, y_pred)

0.640625

In [11]:
# Hyper-parameter tuning for the SVM: exhaustive grid search, 5-fold
# cross-validation on the training split, candidates ranked by accuracy.
svm_model = svm.SVC()

# 3 C values x 2 kernels x 4 gamma values = 24 candidate settings.
# NOTE(review): per the scikit-learn SVC docs, gamma applies only to the
# 'rbf', 'poly' and 'sigmoid' kernels, so the four 'linear' candidates per C
# should be duplicates -- confirm, and consider a list of per-kernel grids.
param_grid = dict(
    C=[2, 3, 4],
    kernel=['rbf', 'linear'],
    gamma=[.001, .01, .1, 1],
)

optimized_svm = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
optimized_svm.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [2, 3, 4], 'kernel': ['rbf', 'linear'], 'gamma': [0.001, 0.01, 0.1, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [12]:
# Show the estimator configured with the best-scoring parameter combination from the grid search.
optimized_svm.best_estimator_

SVC(C=3, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [18]:
# Cross-validated accuracy achieved by the best parameter combination (scoring='accuracy', cv=5).
optimized_svm.best_score_

0.77256944444444442

In [19]:
# Build the final model from the grid search's winning parameters instead of
# re-typing them by hand, so this cell stays in sync if the grid or the data
# change. With the search results recorded in this notebook, this expands to
# C=3, kernel='linear', gamma=0.001.
svm_model_final = svm.SVC(**optimized_svm.best_params_)
svm_model_final.fit(x_train, y_train)

SVC(C=3, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [20]:
# Score the tuned model on the held-out test split.
y_pred = svm_model_final.predict(x_test)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
# Accuracy is symmetric, so the value is unchanged, but the correct order
# matters as soon as an asymmetric metric (precision, recall, ...) is swapped in.
metrics.accuracy_score(y_test, y_pred)

0.78125