# Import Packages Etc

In [1]:
from IPython.display import display, HTML, Image

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn import metrics
from scipy.spatial import distance 

%matplotlib inline
#%qtconsole

# Define the TemplateMatchClassifier Class

In [2]:
class TemplateMatchClassifier(BaseEstimator, ClassifierMixin):
    """
    Parameters
    ----------
    distance_metric string, optional (default = 'euclidean')
    Other type of distance metrics: manhattan, chebyshev, and mahalanobis distance.

    Attributes
    ----------
    classes_ : array of shape = [n_classes] 
               The class labels (single output problem).
    templates_: dict
               A dictionary of the templates used for each class 
              {classes:templates}
    Notes
    -----


    See also
    --------
    
    
    ----------

    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.model_selection import cross_val_score
    >>> clf = TemplateMatchClassifier()
    >>> iris = load_iris()
    >>> cross_val_score(clf, iris.data, iris.target, cv=10)
    
    """
    
    # Constructor for the classifier object
    def __init__(self, distance_metric = 'euclidean'):    #default is euclidean
        self.distance_metric = distance_metric
        
    # The fit function to train a classifier
    def fit(self, X, y):
        
        """Predict class probabilities of the input samples X.
        Parameters
        ----------
        X : array-like matrix of shape = [n_samples, n_features]
            The input samples. 
        Returns
        -------
        p : array of shape = [n_samples, n_labels].
            The predicted class label probabilities of the input samples. 
        """
        
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        
        # Check that the input features match the type and shape of the training features
        X = check_array(X)
        
        # Obtain the distance matrix
        if self.distance_metric == 'euclidean':
            self.distance_fn = distance.euclidean
        elif self.distance_metric == 'cosine':
            self.distance_fn = distance.cosine
        elif self.distance_metric == 'minkowski':
            self.distance_fn = distance.minkowski
        elif self.distance_metric == 'manhattan':
            self.distance_fn = distance.cityblock
        
        # Count the number of occurrences of each class in the target vector (uses mupy unique function that returns a list of unique values and their counts)
        
        #unique, counts = np.unique(y, return_counts=True)
        
        # Store the classes seen during fit
        unique = np.unique(y)
        self.classes_ = unique
        print(self.classes_)
        
        #Create dict to store template for each class..
        self.templates_ = dict()
        for c in self.classes_:
                self.templates_[c] = X[y == c].mean(axis = 0)
        print(self.templates_)
        return self
    
    
    # The predict function to make a set of predictions for a set of query instances
    def predict(self, X):
        
        """Predict class labels of the input samples X.
        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csr_matrix``.
        Returns
        -------
        p : array of shape = [n_samples, ].
            The predicted class labels of the input samples. 
        """
        
        # Check is fit had been called by confirming that the distributions_ dictionary has been set up
        check_is_fitted(self, ['templates_'])

        # Check that the input features match the type and shape of the training features
        X = check_array(X)

        # Initialise an empty list to store the predictions made
        predictions = list()
        
        # Iterate through the query instances in the query dataset 
        for instance in X:
           
            #Let best match be first template...
            globalDistance = self.distance_fn(instance, self.templates_[self.classes_[0]]) #global minimum
            
            globalClass = self.classes_[0] #global solution
            
            for template in self.templates_:
                currDistance = self.distance_fn(instance, self.templates_[template])
                
                if currDistance < globalDistance:
                    globalDistance = currDistance
                    globalClass = template
            
            predictions.append(globalClass)
            
        return np.array(predictions)    

### Test the TemplateMatchClassifier

In [3]:
a = np.array([[1,23,3,4], [5,6,7,8], [7,5,6,2], [4,9,12,43]])
y = np.array([1, 2, 2, 2])

In [4]:
my_model = TemplateMatchClassifier()

In [6]:
my_model.fit(a, y)

[1 2]
{1: array([ 1., 23.,  3.,  4.]), 2: array([ 5.33333333,  6.66666667,  8.33333333, 17.66666667])}


TemplateMatchClassifier(distance_metric='euclidean')

In [9]:
my_model.templates_

{1: array([ 1., 23.,  3.,  4.]),
 2: array([ 5.33333333,  6.66666667,  8.33333333, 17.66666667])}

In [10]:
q = np.array([[2,15,6,21], [8,9,7,6]])

In [11]:
my_model.predict(q)

array([2, 2])

# Test the TemplateMatchClassifier on MNIST

In [13]:
data_sampling_rate = 0.1
cv_folds = 10

In [16]:
dataset = pd.read_csv('fashion-mnist_train.csv')
dataset = dataset.sample(frac=data_sampling_rate) #take a sample from the dataset so everyhting runs smoothly
num_classes = 10
classes = {0: "T-shirt/top", 1:"Trouser", 2: "Pullover", 3:"Dress", 4:"Coat", 5:"Sandal", 6:"Shirt", 7:"Sneaker", 8:"Bag", 9:"Ankle boot"}
display(dataset.head())

X = dataset[dataset.columns.difference(["label"])]
Y = np.array(dataset["label"])
X = X/255
X_train_plus_valid, X_test, y_train_plus_valid, y_test \
    = train_test_split(X, Y, random_state=0, \
                                    train_size = 0.7)

X_train, X_valid, y_train, y_valid \
    = train_test_split(X_train_plus_valid, \
                                        y_train_plus_valid, \
                                        random_state=0, \
                                        train_size = 0.5/0.7)

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
30749,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2196,9,0,0,0,0,0,0,0,0,0,...,0,0,4,0,72,198,187,188,199,3
52805,3,0,0,0,0,0,0,0,1,0,...,146,0,0,0,0,0,0,0,0,0
28409,0,0,0,0,0,2,0,0,0,13,...,53,33,0,0,0,0,0,0,0,0
52640,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
model = TemplateMatchClassifier()
model.fit(X_train, y_train)

[0 1 2 3 4 5 6 7 8 9]
{0: array([0.00000000e+00, 1.58465247e-01, 5.92404404e-01, 5.81760146e-01,
       5.81317178e-01, 5.74099407e-01, 5.70438408e-01, 5.41762752e-01,
       5.05204873e-01, 4.29118624e-01, 2.36609993e-01, 4.35411374e-02,
       2.33782815e-01, 1.13347665e-02, 6.25366426e-04, 0.00000000e+00,
       7.81708032e-05, 3.25711680e-04, 8.11673507e-03, 4.60947170e-02,
       2.23490326e-01, 4.46172888e-01, 5.20200638e-01, 2.86274510e-01,
       5.41606410e-01, 5.61852648e-01, 5.86658850e-01, 5.90984301e-01,
       5.77069898e-01, 5.78476972e-01, 5.91583610e-01, 5.87440558e-01,
       5.85694743e-01, 5.81994658e-01, 2.50667709e-01, 5.84639437e-01,
       5.86020455e-01, 5.84821836e-01, 5.51742557e-01, 5.28734284e-01,
       4.85987884e-01, 3.48576640e-01, 1.11823334e-01, 1.70151782e-02,
       3.30923067e-03, 2.01498274e-01, 0.00000000e+00, 9.11992704e-05,
       4.69024819e-04, 1.39404599e-02, 8.55970295e-02, 3.16122728e-01,
       5.02051984e-01, 5.41059214e-01, 5.44524787e-

TemplateMatchClassifier(distance_metric='euclidean')

In [18]:
# Make a set of predictions for the training data
y_pred = model.predict(X_train)

# Print performance details
accuracy = metrics.accuracy_score(y_train, y_pred) # , normalize=True, sample_weight=None
print("Accuracy: " +  str(accuracy))
print(metrics.classification_report(y_train, y_pred))

# Print confusion matrix
# print(metrics.confusion_matrix(y_train, y_pred))

# Print nicer homemade confusion matrix
print("Confusion Matrix")
display(pd.crosstab(np.array(y_train), y_pred, rownames=['True'], colnames=['Predicted'], margins=True))

Accuracy: 0.691
              precision    recall  f1-score   support

           0       0.70      0.72      0.71       301
           1       0.95      0.92      0.94       283
           2       0.57      0.53      0.55       285
           3       0.73      0.82      0.77       291
           4       0.56      0.54      0.55       309
           5       0.51      0.78      0.62       330
           6       0.39      0.20      0.26       294
           7       0.73      0.80      0.76       301
           8       0.92      0.73      0.82       306
           9       0.85      0.86      0.86       300

    accuracy                           0.69      3000
   macro avg       0.69      0.69      0.68      3000
weighted avg       0.69      0.69      0.68      3000

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,217,1,1,22,5,42,9,1,3,0,301
1,7,260,4,7,1,2,2,0,0,0,283
2,1,1,151,4,53,36,36,0,3,0,285
3,15,6,0,238,9,14,8,0,1,0,291
4,1,1,59,28,168,23,25,0,4,0,309
5,0,0,0,0,0,258,0,49,0,23,330
6,65,2,44,9,60,48,58,0,8,0,294
7,0,0,0,0,0,40,0,240,0,21,301
8,2,2,8,17,4,24,9,14,224,2,306
9,0,0,0,0,0,15,2,24,0,259,300


In [19]:
# Make a set of predictions for the training data
y_pred = model.predict(X_test)

# Print performance details
accuracy = metrics.accuracy_score(y_test, y_pred) # , normalize=True, sample_weight=None
print("Accuracy: " +  str(accuracy))
print(metrics.classification_report(y_test, y_pred))

# Print confusion matrix
# print(metrics.confusion_matrix(y_train, y_pred))

# Print nicer homemade confusion matrix
print("Confusion Matrix")
pd.crosstab(np.array(y_test), y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

Accuracy: 0.6783333333333333
              precision    recall  f1-score   support

           0       0.75      0.72      0.73       197
           1       0.96      0.88      0.92       181
           2       0.60      0.49      0.54       193
           3       0.64      0.79      0.70       180
           4       0.51      0.51      0.51       167
           5       0.51      0.76      0.61       178
           6       0.31      0.19      0.23       175
           7       0.74      0.78      0.76       180
           8       0.91      0.78      0.84       177
           9       0.84      0.88      0.86       172

    accuracy                           0.68      1800
   macro avg       0.68      0.68      0.67      1800
weighted avg       0.68      0.68      0.67      1800

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,141,0,1,27,2,18,8,0,0,0,197
1,9,159,2,10,0,0,1,0,0,0,181
2,0,0,94,5,31,22,38,1,2,0,193
3,5,5,1,142,11,11,5,0,0,0,180
4,1,2,32,21,86,7,16,0,2,0,167
5,0,0,0,0,0,136,0,31,1,10,178
6,32,0,22,10,38,32,33,0,8,0,175
7,0,0,0,0,0,20,0,141,0,19,180
8,0,0,4,7,2,15,5,6,138,0,177
9,0,0,0,1,0,7,1,12,0,151,172


In [21]:
#perform crossvalidation 
my_model = TemplateMatchClassifier()
scores = cross_val_score(my_model, X_train_plus_valid, y_train_plus_valid, cv=cv_folds, n_jobs=-1)
print(scores)

[0.72813239 0.6563981  0.68957346 0.7014218  0.66587678 0.70783848
 0.71190476 0.63875598 0.67951807 0.66506024]


In [22]:
# Set up the parameter grid to seaerch
param_grid = [
 {'distance_metric': ['euclidean', 'cosine', 'manhattan']}
]

# Perform the search
tuned_model = GridSearchCV(TemplateMatchClassifier(), param_grid, cv=cv_folds, verbose = 2, n_jobs=-1)
tuned_model.fit(X_train_plus_valid, y_train_plus_valid)

# Print details
print("Best parameters set found on development set:")
print(tuned_model.best_params_)
print(tuned_model.best_score_)

Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  30 | elapsed:    0.9s remaining:    0.9s


[0 1 2 3 4 5 6 7 8 9]
{0: array([0.00000000e+00, 1.53594771e-01, 5.97292250e-01, 5.79094304e-01,
       5.79617180e-01, 5.74995331e-01, 5.66339869e-01, 5.30952381e-01,
       4.97675070e-01, 4.18281979e-01, 2.34500467e-01, 4.69841270e-02,
       2.25919701e-01, 1.31559290e-02, 1.16713352e-03, 0.00000000e+00,
       9.33706816e-05, 1.24183007e-03, 1.06255836e-02, 4.85714286e-02,
       2.19897292e-01, 4.33762838e-01, 5.11111111e-01, 2.84117647e-01,
       5.28160598e-01, 5.54920635e-01, 5.81419234e-01, 5.84920635e-01,
       5.76209150e-01, 5.80597572e-01, 5.89878618e-01, 5.90448179e-01,
       5.86162465e-01, 5.74332400e-01, 2.55023343e-01, 5.81951447e-01,
       5.81923436e-01, 5.70924370e-01, 5.39663866e-01, 5.26274510e-01,
       4.81157796e-01, 3.45294118e-01, 1.09962652e-01, 1.87394958e-02,
       3.33333333e-03, 2.00373483e-01, 0.00000000e+00, 6.53594771e-05,
       2.42763772e-03, 1.98506069e-02, 8.78711485e-02, 3.05770308e-01,
       4.91839402e-01, 5.31549953e-01, 5.31942110e-

[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    1.4s finished


In [23]:
# Make a set of predictions for the test data
y_pred = tuned_model.predict(X_test)

# Print performance details
accuracy = metrics.accuracy_score(y_test, y_pred) # , normalize=True, sample_weight=None
print("Accuracy: " +  str(accuracy))
print(metrics.classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix")
pd.crosstab(np.array(y_test), y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

Accuracy: 0.6766666666666666
              precision    recall  f1-score   support

           0       0.74      0.72      0.73       197
           1       0.96      0.88      0.92       181
           2       0.58      0.49      0.53       193
           3       0.64      0.79      0.70       180
           4       0.51      0.51      0.51       167
           5       0.51      0.76      0.61       178
           6       0.31      0.18      0.23       175
           7       0.73      0.78      0.76       180
           8       0.93      0.78      0.85       177
           9       0.84      0.87      0.85       172

    accuracy                           0.68      1800
   macro avg       0.67      0.68      0.67      1800
weighted avg       0.68      0.68      0.67      1800

Confusion Matrix


Predicted,0,1,2,3,4,5,6,7,8,9,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,141,0,4,27,2,18,5,0,0,0,197
1,9,159,2,10,0,0,1,0,0,0,181
2,0,0,94,5,31,22,39,1,1,0,193
3,5,5,1,142,11,11,5,0,0,0,180
4,1,1,33,22,86,6,17,0,1,0,167
5,0,0,0,0,0,135,0,32,1,10,178
6,34,0,23,9,37,32,32,0,8,0,175
7,0,0,0,0,0,20,0,141,0,19,180
8,0,0,5,7,2,15,4,6,138,0,177
9,0,0,0,1,0,7,1,13,0,150,172
