# K-means Example

##### Using the open-source Iris dataset, this notebook demonstrates clustering and correct identification of about 78% of the test samples with only one labeled sample of each type of iris.

In [1]:
# Import packages
import warnings
# NOTE(review): this hides every warning from this point on, not just one
# specific category; deprecation or convergence warnings raised by the
# libraries below will not be shown.
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd

# Label encoding, train/test splitting and feature standardisation helpers
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Create dataset with missing target data.

In [2]:
# Names of the four measurement columns (the file is read with header=None,
# so the column names are supplied here)
feature_names = ['sepal length', 'sepal width', 'petal length', 'petal width']

# Read the Iris measurements from the UCI repository
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
    header=None,
    names=feature_names + ['target'],
)

# Feature matrix and raw target column
X = df[feature_names].values
y = df['target'].values

# Replace the target values with integer codes
le = LabelEncoder()
y = le.fit_transform(y)

# Stratified 70/30 split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=0
)

# Show the fully labelled training targets before any label is hidden
print('y_train before removing the class labels:')
print(y_train)

# Learn the scaling parameters on the training set only, then apply the same
# transformation to both sets
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Position of the first occurrence of each class label in y_train
classes, indices = np.unique(y_train, return_index=True)

# Keep the true label only at those first occurrences and replace every other
# entry with -1, so exactly one labelled sample per class remains.
y_train = np.where(np.isin(np.arange(y_train.size), indices), y_train, -1)

# Show the masked training targets
print('y_train after removing the labels:')
print(y_train)

y_train before removing the class labels:
[2 2 2 0 0 0 1 2 1 2 0 1 1 1 0 0 2 1 1 2 2 1 0 0 1 1 0 1 2 2 2 1 2 2 0 0 0
 1 0 0 2 1 2 0 0 0 1 1 0 1 1 1 2 0 1 1 1 1 2 0 1 2 1 1 2 1 2 0 1 2 2 2 2 0
 2 0 0 2 1 0 0 0 0 0 1 2 2 2 0 2 0 0 1 1 1 1 0 2 2 0 2 1 0 2 2]
y_train after removing the labels:
[ 2 -1 -1  0 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1]


## Simple K-Means model

1. Fitting KMeans on the training data gives a cluster index for every training sample. A few of those samples still carry a real class label (anything other than -1), so we can build a map (a dictionary) from cluster index to class.
2. Applying the fitted KMeans to the testing data gives a cluster index for every test sample. Passing these indices through the map above yields the predicted classes.

Choose `n_clusters` for KMeans to match the number of distinct class labels in the data (three for Iris).

In [3]:
from sklearn.cluster import KMeans

# One cluster per class label in the data.
# n_init is pinned to 10, the default of older scikit-learn releases: newer
# releases default to n_init='auto', which runs the default k-means++
# initialisation only once and can therefore produce a different clustering
# for the same random_state.
km = KMeans(n_clusters=3, n_init=10, random_state=0)

#### On the training data, compute cluster centers and predict cluster indices

In [4]:
# Fit the cluster centers on the training features; labels_ holds the cluster
# index assigned to each training sample during that fit.
y_train_pred = km.fit(X_train).labels_
print(y_train_pred)

[0 0 0 1 1 1 2 0 2 0 1 0 2 0 1 1 0 2 2 2 0 2 1 1 2 2 1 2 0 0 0 2 2 0 1 1 1
 2 1 1 0 2 0 1 1 1 0 0 1 2 2 2 0 1 0 0 2 2 0 1 2 2 2 2 0 2 0 1 2 0 2 0 0 1
 0 1 1 0 2 1 1 1 1 1 2 0 0 0 1 2 1 1 0 2 2 2 1 0 2 1 0 2 1 0 0]


#### Create the map from cluster index to class label (that is not -1)

In [5]:
# Pair every training sample's cluster index with its class label and keep
# only the samples whose label was not removed (-1). If several labelled
# samples share a cluster, the one that appears last in y_train wins.
dict_ = {
    cluster: label
    for cluster, label in zip(y_train_pred, y_train)
    if label != -1
}

#### On the testing data, predict cluster indices

In [6]:
# Cluster index for each test sample from the fitted model; these are cluster
# ids, not class labels yet.
y_test_pred = km.predict(X_test)

#### On the testing data, transform the cluster indices into class labels

In [7]:
# Recompute the cluster indices from the fitted model instead of reading them
# back out of y_test_pred: this cell overwrites y_test_pred with class labels,
# so running it a second time would otherwise look up class labels in a map
# that is keyed by cluster index.
# A cluster that contains no labelled training sample has no entry in dict_;
# its samples are reported as -1 (the marker this notebook uses for "no label")
# instead of raising KeyError.
y_test_pred = np.array([dict_.get(cluster, -1) for cluster in km.predict(X_test)])

#### Print the accuracy

In [8]:
# precision_recall_fscore_support is no longer used in this cell; its import is
# kept in case other cells rely on it.
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Fraction of test samples whose mapped cluster label equals the true label.
# accuracy_score expects the ground truth first and the predictions second.
print('Accuracy:', accuracy_score(y_test, y_test_pred))

Accuracy: 0.7777777777777778
