## Sklearn k-means example

### Imports

In [1]:
import numpy as np
import random
from time import time

In [2]:
import sklearn as sk
from sklearn import datasets
from sklearn.datasets import load_digits
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
from sklearn import metrics

### Functions

In [3]:
def create_bag(data, bag_size=1.0, with_repeats=True):
    data_length = data.shape[0]
    bag_length = int(data_length*bag_size)
    
    bag = np.empty((0,data.shape[1]))
    if with_repeats:
        # just draw randomly
#         print('Bagging with repeats')
        for i in range(0, bag_length):
            r = np.random.randint(0, data_length-1)
            bag = np.vstack([bag, data[r]])
            
        return bag
        
    else:
        # remove values from original set after selected
#         print('Bagging with no repeats')
        a = np.arange(0, data.shape[0])
        sample = np.random.choice(a, bag_length)
        
        return data[sample, :]

In [4]:
def print_class_ratios(y):
    l = len(y)
    
    unique, counts = np.unique(y, return_counts=True)
    print('Class \t Count \t Percentage')
    for i in range(0, len(unique)):
        perc = 100 * counts[i]/l
        print('{} \t {} \t {:.2f} %'.format(int(unique[i]), counts[i], perc))

In [5]:
def explain_data(X, y=None, problem_type='classification'):
    print('X.shape: {}'.format(X.shape))
    print('X first record: {}'.format(X[0]))
    print()
    
    if y is not None:
        print('y.shape: {}'.format(y.shape))
        print('y first record: {}'.format(y[0]))
        print()
        
        if problem_type == 'classification':
            print_class_ratios(y)

### Data preparation

In [6]:
np.random.seed(42)

digits = load_digits()
data = scale(digits.data)

In [7]:
n_samples, n_features = data.shape
n_digits = len(np.unique(digits.target))
labels = digits.target

In [8]:
explain_data(data, labels)

X.shape: (1797, 64)
X first record: [ 0.         -0.33501649 -0.04308102  0.27407152 -0.66447751 -0.84412939
 -0.40972392 -0.12502292 -0.05907756 -0.62400926  0.4829745   0.75962245
 -0.05842586  1.12772113  0.87958306 -0.13043338 -0.04462507  0.11144272
  0.89588044 -0.86066632 -1.14964846  0.51547187  1.90596347 -0.11422184
 -0.03337973  0.48648928  0.46988512 -1.49990136 -1.61406277  0.07639777
  1.54181413 -0.04723238  0.          0.76465553  0.05263019 -1.44763006
 -1.73666443  0.04361588  1.43955804  0.         -0.06134367  0.8105536
  0.63011714 -1.12245711 -1.06623158  0.66096475  0.81845076 -0.08874162
 -0.03543326  0.74211893  1.15065212 -0.86867056  0.11012973  0.53761116
 -0.75743581 -0.20978513 -0.02359646 -0.29908135  0.08671869  0.20829258
 -0.36677122 -1.14664746 -0.5056698  -0.19600752]

y.shape: (1797,)
y first record: 0

Class 	 Count 	 Percentage
0 	 178 	 9.91 %
1 	 182 	 10.13 %
2 	 177 	 9.85 %
3 	 183 	 10.18 %
4 	 181 	 10.07 %
5 	 182 	 10.13 %
6 	 181 	 10.07

In [9]:
# KMeans?

In [10]:
def create_bagged_classifier_list(data, labels, n_iter=10):
    labels = np.reshape(labels, (labels.shape[0], 1))
    full_set = np.hstack([data, labels])
    clf_config = {
        'algorithm' : 'auto',
        'copy_x' : True,
        'init' : 'k-means++',
        'max_iter' : 300,
        'n_clusters' : 10,
        'n_init' : 10,
        'n_jobs' : None,
        'precompute_distances' : 'auto',
        'random_state' : None,
        'tol' : 0.0001,
        'verbose' : 0
    }
    
    classifiers = []
    bags = []
    predicted_labels = []
    for i in range(0, n_iter):
        # Ensure all models get trained in the same way
        np.random.seed(42)
        
        # Create n_iter number of untrained k-means classifiers with same hyperparameters
        classifiers.append(KMeans(algorithm=clf_config['algorithm'], copy_x=clf_config['copy_x'],
                                  init=clf_config['init'], max_iter=clf_config['max_iter'], n_clusters=clf_config['n_clusters'],
                                  n_init=clf_config['n_init'], n_jobs=clf_config['n_jobs'],
                                  precompute_distances=clf_config['precompute_distances'], 
                                  random_state=clf_config['random_state'], tol=clf_config['tol'],
                                  verbose=clf_config['verbose']))
        
        # Create n_iter number of bags from the original data
        bags.append(create_bag(full_set))
        
        # Split the data back into features/labels
        features = bags[i][:,:-1]
        labels = bags[i][:,-1]
        
        # Train classifier on one of the bags
        classifiers[i].fit(features)
        
    return classifiers

### Training & Testing

#### Split train/test data

In [11]:
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(data, labels, test_size=0.3, random_state=42)

In [12]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1257, 64)
(540, 64)
(1257,)
(540,)


### Fit OG model

In [13]:
og_clf = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
og_clf.fit(X_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [14]:
og_train_acc = metrics.accuracy_score(y_true=y_train, y_pred=og_clf.labels_, normalize=True, sample_weight=None)
print('OG training accuracy: {:.3f} %'.format(og_train_acc*100))

OG training accuracy: 12.490 %


In [15]:
og_predictions = og_clf.predict(X_test)

og_test_acc = metrics.accuracy_score(y_true=y_test, y_pred=og_predictions, normalize=True, sample_weight=None)
print('OG test accuracy: {:.3f} %'.format(og_test_acc*100))

OG test accuracy: 11.852 %


### Get bagging results

In [16]:
clf_list = create_bagged_classifier_list(X_train, y_train, n_iter=50)

In [17]:
def get_bagged_predictions(clf_list, X_test, y_test, verbose=False):
    pred_list = np.empty((X_test.shape[0], 0), dtype=int)
    bagged_predictions = []
    
    # Get predicted classes for every model
    for i in range(0, len(clf_list)):
        preds = np.array(clf_list[i].predict(X_test)).reshape((len(X_test), 1))
        
        if verbose:
            print('pred_list.shape: {}'.format(pred_list.shape))
    #         print(pred_list)
    #         print('preds.shape: {}'.format(preds.shape))
    #         print(preds)
        
        pred_list = np.hstack([pred_list, preds])
        
        if y_test is not None and verbose:
            print('This model\'s prediction accuracy: {:.2f}%'.format(
                   metrics.accuracy_score(y_true=y_test, y_pred=preds, normalize=True, sample_weight=None)*100))
        
    # For every record, vote between models and pick the most popular answer (could be improved)
    for i in range(0, pred_list.shape[0]):
        counts = np.bincount(pred_list[i, :])
        bagged_predictions.append(np.argmax(counts))
        
    return bagged_predictions

In [18]:
bagged_predictions = get_bagged_predictions(clf_list, X_test, y_test, verbose=False)

In [19]:
bagged_predictions = np.array(bagged_predictions)
bagged_predictions.shape

(540,)

In [20]:
y_test.shape

(540,)

In [21]:
bagged_test_acc = metrics.accuracy_score(y_true=y_test, y_pred=bagged_predictions, normalize=True, sample_weight=None)
print('Bagged test accuracy: {:.3f} %'.format(bagged_test_acc*100))

Bagged test accuracy: 39.074 %


In [22]:
print(bagged_predictions[0:15])
print(y_test[0:15])

[7 3 3 9 5 2 6 6 6 5 1 6 4 0 4]
[6 9 3 7 2 1 5 2 5 2 1 9 4 0 4]


In [23]:
print('Single model k-means accuracy: \t\t{:.3f} %'.format(og_test_acc*100))
print('vs.')
print('Multiple models trained with bagging: \t{:.3f} %'.format(bagged_test_acc*100))

Single model k-means accuracy: 		11.852 %
vs.
Multiple models trained with bagging: 	39.074 %
