# Processing Diabetes DataSet with Support Vector Machines

In [12]:
from sklearn import svm
from featureNormalize import featureNormalize
import numpy as np
import pandas as pd
import scipy.optimize as op
# used to split data in train and test sets
from sklearn.model_selection import train_test_split
# for accuracy metrics
from sklearn.metrics import accuracy_score

# for features scaling
from sklearn.preprocessing import StandardScaler, MinMaxScaler, normalize

# precision, recall & f1 score
from sklearn.metrics import precision_recall_fscore_support

## Loading Data and Normalizing Features

In [117]:
# Load an example dataset that we will be using
file_path = '../../data/1_diabetes.csv'
df = pd.read_csv(file_path)

# describe data
# print(df.describe())

features = df.drop('Outcome', axis=1)
labels = df['Outcome']

X = features.as_matrix()
y = labels.as_matrix().ravel()


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True)

#  Z-Score or mean normalise features
X_norm1, mu, sigma = featureNormalize(X_train)

#Min-Max Scaling
std_scaler = MinMaxScaler().fit(X_train)
X_norm2 = std_scaler.transform(X_train)


X_norm3 = normalize(X_train)

print(X_norm1[0])
print(X_norm2[0])
print(X_norm3[0])

[-0.66603015  2.33055112  0.68330762 -0.78869722 -0.78869722 -0.08949492
 -0.78289682 -0.07021867]
[ 0.41176471  0.89447236  0.68852459  0.          0.          0.49503722
  0.1057363   0.33333333]
[ 0.03413127  0.86790947  0.40957525  0.          0.          0.19454825
  0.00161392  0.19991173]


## Training SVM with RBF Kernel

In [116]:
print('\nTraining SVM with RBF Kernel...\n')

# SVM Parameters
C = 1

model = svm.LinearSVC(C=C, tol=1e-3, max_iter=200)
model.fit(X_norm1, y_train)
print('\nTraining Done\n')


Training SVM with RBF Kernel...


Training Done



## Make prediction

In [115]:
p = model.predict(X_norm1)
print('\nTraining Set Accuracy: ', accuracy_score(y_train, p) * 100, '\n')


Training Set Accuracy:  78.2835820896 



## Test Accuracy 

In [114]:
normalized_test_data = np.divide(X_test - mu, sigma)
p = model.predict(normalized_test_data)
print('\nTest Set Accuracy: ', accuracy_score(y_test, p) * 100, '\n')
precision,  recall,  f1_score,  support = precision_recall_fscore_support(y_test, p, average='macro')
print("F1 Score = ", f1_score)


Test Set Accuracy:  77.1212121212 

F1 Score =  0.728025194375
