# Processing the Diabetes Dataset with Support Vector Machines

In [129]:
from sklearn import svm
from featureNormalize import featureNormalize
import numpy as np
import pandas as pd
import scipy.optimize as op
# used to split data in train and test sets
from sklearn.model_selection import train_test_split
# for accuracy metrics
from sklearn.metrics import accuracy_score

# for features scaling
# from sklearn.preprocessing import StandardScaler, MinMaxScaler, normalize

# precision, recall & f1 score
from sklearn.metrics import precision_recall_fscore_support

## Loading Data and Normalizing Features

In [130]:
# Load the diabetes dataset; every column except 'Outcome' is used as a feature
# and 'Outcome' is the label to predict.
file_path = '../../data/1_diabetes.csv'
df = pd.read_csv(file_path)

features = df.drop('Outcome', axis=1)
labels = df['Outcome']

# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0.
# `.values` returns the same ndarray and works on both old and new pandas.
X = features.values
y = labels.values.ravel()

# Pin the shuffle seed so the train/test split -- and every metric derived
# from it -- is reproducible after Restart Kernel -> Run All.
# NOTE: previously recorded cell outputs came from an unseeded split and will
# differ from a fresh run.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, shuffle=True, random_state=SPLIT_SEED)

# Z-score / mean normalise the features using the training split only.
# featureNormalize is a project-local helper; it presumably returns the scaled
# matrix plus the per-feature mean (mu) and spread (sigma) -- confirm in
# featureNormalize.py. mu and sigma are kept so unseen data can be scaled with
# the same parameters.
X_norm, mu, sigma = featureNormalize(X_train)

## Training SVM with Linear Kernel

In [132]:
print('\nTraining SVM with Linear Kernel...\n')

# SVM Parameters
C = 1  # regularisation parameter handed to LinearSVC
# LinearSVC's default dual solver draws on a random number generator; fixing
# the seed makes the fitted model identical across Restart Kernel -> Run All.
SVM_SEED = 0

# tol and max_iter bound the optimiser; scikit-learn emits a
# ConvergenceWarning if max_iter is reached before tol is satisfied.
model = svm.LinearSVC(C=C, tol=1e-3, max_iter=200, random_state=SVM_SEED)
model.fit(X_norm, y_train)
print('\nTraining Done\n')


Training SVM with Linear Kernel...


Training Done



## Training Accuracy

In [133]:
# Predict on the same normalised training data the model was fitted on, then
# report the share of correct labels as a percentage.
p = model.predict(X_norm)
train_accuracy_pct = accuracy_score(y_train, p) * 100
print('\nTraining Set Accuracy: ', train_accuracy_pct, '\n')


Training Set Accuracy:  76.1194029851 



## Test Accuracy 

In [134]:
# Scale the held-out split with the mu / sigma obtained from the training
# split, so no test-set statistics are used for normalisation.
normalized_test_data = (X_test - mu) / sigma

# Predict on the held-out split and report accuracy as a percentage.
p = model.predict(normalized_test_data)
test_accuracy_pct = accuracy_score(y_test, p) * 100
print('\nTest Set Accuracy: ', test_accuracy_pct, '\n')

# Macro averaging: the per-class scores are averaged without class weighting.
precision, recall, f1_score, support = precision_recall_fscore_support(
    y_test,
    p,
    average='macro',
)
print("F1 Score = ", f1_score)


Test Set Accuracy:  78.4848484848 

F1 Score =  0.735958348359
