# Processing Diabetes DataSet with K-Means Clustering

In [2]:
from findClosestCentroids import find_closest_centroids
from kMeansInitCentroids import kmeans_init_centroids
from runKMeans import run_kmeans
from featureNormalize import featureNormalize

import numpy as np
import pandas as pd
# used to split data in train and test sets
from sklearn.model_selection import train_test_split
#for metrics
from sklearn.metrics import accuracy_score

## Loading and normalizing features


In [3]:
# Load an example dataset that we will be using
file_path = '../../data/1_diabetes.csv'
df = pd.read_csv(file_path)

features = df.drop('Outcome', axis=1)
labels = df['Outcome']

X = features.as_matrix()
y = labels.as_matrix().flatten()


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

#normalise features
X_norm, mu, sigma = featureNormalize(X_train)

## Running K-Means Clustering

In [8]:
print('\nRunning K-Means clustering on example dataset.\n\n')

# Settings for running K-Means
K = 2
max_iters = 10
initial_centroids = kmeans_init_centroids(X_norm, K)
# Run K-Means algorithm. The 'true' at the end tells our function to plot
# the progress of K-Means
centroids, idx = run_kmeans(X_norm, initial_centroids, max_iters, False)
print('\nK-Means Done.\n\n')

idx = idx.flatten().astype(int)

print('\nTraining Set Accuracy: ', accuracy_score(y_train, idx) * 100, '\n')


Running K-Means clustering on example dataset.


K-Means iteration 0/10...
K-Means iteration 1/10...
K-Means iteration 2/10...
K-Means iteration 3/10...
K-Means iteration 4/10...
K-Means iteration 5/10...
K-Means iteration 6/10...
K-Means iteration 7/10...
K-Means iteration 8/10...
K-Means iteration 9/10...

K-Means Done.



Training Set Accuracy:  67.7611940299 



## Test 

In [9]:
normalized_test_data = np.divide(X_test - mu, sigma)
idx = find_closest_centroids(normalized_test_data, centroids)
print('\nTest Set Accuracy: ', accuracy_score(y_test, idx) * 100, '\n')



Test Set Accuracy:  61.6666666667 

