# 10. Learning Vector Quantization

In [1]:
from random import seed, randrange
from math import sqrt

from functions import *

### Define functions

In [2]:
# Locate the best matching unit (BMU)
def get_best_matching_unit(codebooks, test_row):
    """Return the codebook (pattern) closest to ``test_row``.

    Closeness is whatever ``euclidian_distance`` (star-imported from the
    ``functions`` module) reports for each codebook/test_row pair.

    Input:  codebooks - iterable of codebook vectors to search among
            test_row  - the row to find the BMU for
    Output: the codebook with the smallest distance to test_row. The very
            same object is returned (not a copy), so a caller can update it
            in place. On ties the first codebook encountered wins.
    Raises: IndexError if ``codebooks`` is empty.
    """
    # Materialize once so emptiness can be checked even for one-shot iterables
    candidates = list(codebooks)
    if not candidates:
        raise IndexError("cannot find a best matching unit: no codebooks given")
    # A single linear pass is enough; a full sort is not needed to find the minimum.
    # min() keeps the first of several equally close codebooks, exactly like
    # taking element 0 of a stable ascending sort.
    return min(candidates, key=lambda codebook: euclidian_distance(codebook, test_row))

# Build one randomly initialized codebook vector
def random_codebook(train):
    """Assemble a codebook by sampling each column from a random training row.

    Input:  train - the training set (list of rows) to draw values from
    Output: a new list with as many entries as the first training row; entry
            i is column i of an independently chosen random row. The last
            column is sampled the same way as every other column.
    """
    row_count = len(train)
    # Width of a record, taken from the first row
    column_count = len(train[0])
    vector = []
    for column in range(column_count):
        # A fresh random row is picked for every column
        donor = train[randrange(row_count)]
        vector.append(donor[column])
    return vector

# Predict a class from a set of codebook vectors
def predict_LVQ(codebooks, test_row):
    """Classify ``test_row`` using its best matching codebook vector.

    Input:  codebooks - codebook vectors ("blueprint" class patterns)
            test_row  - the row to predict a class for
    Output: the last element of the best matching unit, i.e. its class label
    """
    # The class label sits in the final position of the winning codebook
    return get_best_matching_unit(codebooks, test_row)[-1]

# Fit codebook vectors to a training set, with a decaying learning rate
def train_codebooks(train, n_codebooks, lrate, epochs):
    """Train ``n_codebooks`` codebook vectors on ``train``.

    Input:  train       - training rows; the last column is the class label
            n_codebooks - how many codebook vectors to create
            lrate       - initial learning rate
            epochs      - number of passes over the training data
    Output: the list of tuned codebook vectors

    One progress line is printed per epoch, showing the epoch number, the
    effective learning rate and the summed squared feature error.
    """
    # Start from randomly initialized codebooks
    codebooks = []
    for _ in range(n_codebooks):
        codebooks.append(random_codebook(train))

    for epoch in range(epochs):
        # Linear decay: the full lrate in the first epoch, shrinking every epoch after
        rate = lrate * (1.0 - (epoch / float(epochs)))
        sum_error = 0.0
        for sample in train:
            # The winning codebook is modified in place below
            bmu = get_best_matching_unit(codebooks, sample)
            # Every column except the last (the class label) is a feature
            for col in range(len(sample) - 1):
                error = sample[col] - bmu[col]
                sum_error += error ** 2
                step = rate * error
                same_class = bmu[-1] == sample[-1]
                if same_class:
                    # Correct class: pull the codebook toward the sample
                    bmu[col] += step
                else:
                    # Wrong class: push the codebook away from the sample
                    bmu[col] -= step
        print(">epoch%d, lrate=%.3f, error=%.3f" % (epoch, rate, sum_error))
    return codebooks

# Learning Vector Quantization: train on one set, predict on another
def learning_vector_quantization(train, test, n_codebooks, lrate, epochs):
    """Train codebooks on ``train`` and predict a class for each row of ``test``.

    Input:  train, test - training rows and rows to predict for
            n_codebooks, lrate, epochs - passed straight to train_codebooks
    Output: list of predicted classes, one per test row, in test order
    """
    trained = train_codebooks(train, n_codebooks, lrate, epochs)
    # Each test row is classified by its best matching trained codebook
    return [predict_LVQ(trained, sample) for sample in test]

### Testing LVQ on contrived dataset

In [3]:
# Contrived dataset for testing: ten rows, each with two numeric values followed by a 0/1 class label
dataset = [[2.7810836,2.550537003,0],
    [1.465489372,2.362125076,0],
    [3.396561688,4.400293529,0],
    [1.38807019,1.850220317,0],
    [3.06407232,3.005305973,0],
    [7.627531214,2.759262235,1],
    [5.332441248,2.088626775,1],
    [6.922596716,1.77106367,1],
    [8.675418651,-0.242068655,1],
    [7.673756466,3.508563011,1]]

# Testing Euclidean distance: print the distance from the first row to every row (itself included)
row0 = dataset[0]
for row in dataset:
    distance = euclidian_distance(row0, row)
    print(round(distance, 2))

# Testing Best Matching Unit (BMU) function, using the dataset rows themselves as the codebooks
test_row = dataset[0]
bmu = get_best_matching_unit(dataset, test_row)
print("\n First row:", dataset[0])
print(" BMU:      ", bmu)
# As expected, the codebook most similar to the first row is the first row itself.
# Making a prediction with a set of codebook vectors works the same way:
# it is a 1-nearest-neighbour search over the codebooks rather than over the full training set.

# Testing codebook training function
# Seed the random number generator so the random codebook initialization is repeatable
seed(1)
learn_rate = 0.3
n_epochs = 10
n_codebooks = 2
print("\nTesting training function:")
codebooks = train_codebooks(dataset, n_codebooks, learn_rate, n_epochs)
print("Codebooks: %s" % codebooks)

0.0
1.33
1.95
1.56
0.54
4.85
2.59
4.21
6.52
4.99

 First row: [2.7810836, 2.550537003, 0]
 BMU:       [2.7810836, 2.550537003, 0]

Testing training function:
>epoch0, lrate=0.300, error=43.270
>epoch1, lrate=0.270, error=30.403
>epoch2, lrate=0.240, error=27.146
>epoch3, lrate=0.210, error=26.301
>epoch4, lrate=0.180, error=25.537
>epoch5, lrate=0.150, error=24.789
>epoch6, lrate=0.120, error=24.058
>epoch7, lrate=0.090, error=23.346
>epoch8, lrate=0.060, error=22.654
>epoch9, lrate=0.030, error=21.982
Codebooks: [[2.432316086217663, 2.839821664184211, 0], [7.319592257892681, 1.97013382654341, 1]]


### Testing LVQ on Ionosphere dataset

In [5]:
# Evaluate Ionosphere Case Study
# Seed the random number generator so randomized steps (e.g. codebook initialization) are repeatable
seed(1)
# NOTE: this rebinds `dataset`, replacing the contrived dataset used in the earlier test cell
dataset = load_csv("data/ionosphere.csv")
# Convert string numbers to floats (every column except the last)
for i in range(len(dataset[0])-1):
    str_column_to_float(dataset, i)
# Convert class column (the last column) from string to integer
str_column_to_int(dataset, len(dataset[0])-1)

# Evaluate algorithm
print("\nIonosphere Case Study:")
n_folds = 5
learn_rate = 0.3
n_epochs = 50
n_codebooks = 20
# n_codebooks, learn_rate, and n_epochs are passed via *args
# (presumably forwarded by evaluate_algorithm to learning_vector_quantization - confirm in functions.py)
scores = evaluate_algorithm(dataset, learning_vector_quantization, n_folds, accuracy_metric, n_codebooks, learn_rate, n_epochs)
print("Scores: %s" % scores)
print("Mean Accuracy: %.3f%%" % (sum(scores)/float(len(scores))))
# Figures noted by the original author (re-run the cell to confirm):
# 87.143% is better than the baseline of 64.286%
# 20 codebooks is also far fewer than the entire dataset, which KNN would have to hold


Loaded data file data/ionosphere.csv with 351 rows and 35 columns.

Ionsophere Case Study:
>epoch0, lrate=0.300, error=2106.606
>epoch1, lrate=0.294, error=2033.558
>epoch2, lrate=0.288, error=1931.089
>epoch3, lrate=0.282, error=1899.031
>epoch4, lrate=0.276, error=1906.729
>epoch5, lrate=0.270, error=1881.332
>epoch6, lrate=0.264, error=1869.365
>epoch7, lrate=0.258, error=1857.928
>epoch8, lrate=0.252, error=1839.964
>epoch9, lrate=0.246, error=1839.005
>epoch10, lrate=0.240, error=1824.272
>epoch11, lrate=0.234, error=1825.209
>epoch12, lrate=0.228, error=1807.112
>epoch13, lrate=0.222, error=1798.466
>epoch14, lrate=0.216, error=1788.285
>epoch15, lrate=0.210, error=1776.740
>epoch16, lrate=0.204, error=1763.173
>epoch17, lrate=0.198, error=1755.526
>epoch18, lrate=0.192, error=1747.907
>epoch19, lrate=0.186, error=1740.369
>epoch20, lrate=0.180, error=1729.157
>epoch21, lrate=0.174, error=1721.589
>epoch22, lrate=0.168, error=1714.244
>epoch23, lrate=0.162, error=1706.964
>epoch2

>epoch20, lrate=0.180, error=1471.520
>epoch21, lrate=0.174, error=1465.897
>epoch22, lrate=0.168, error=1460.311
>epoch23, lrate=0.162, error=1458.982
>epoch24, lrate=0.156, error=1449.192
>epoch25, lrate=0.150, error=1439.539
>epoch26, lrate=0.144, error=1431.698
>epoch27, lrate=0.138, error=1423.412
>epoch28, lrate=0.132, error=1417.905
>epoch29, lrate=0.126, error=1417.880
>epoch30, lrate=0.120, error=1406.439
>epoch31, lrate=0.114, error=1405.912
>epoch32, lrate=0.108, error=1393.973
>epoch33, lrate=0.102, error=1391.508
>epoch34, lrate=0.096, error=1382.632
>epoch35, lrate=0.090, error=1381.079
>epoch36, lrate=0.084, error=1373.857
>epoch37, lrate=0.078, error=1370.051
>epoch38, lrate=0.072, error=1362.348
>epoch39, lrate=0.066, error=1357.132
>epoch40, lrate=0.060, error=1351.590
>epoch41, lrate=0.054, error=1346.953
>epoch42, lrate=0.048, error=1341.475
>epoch43, lrate=0.042, error=1335.893
>epoch44, lrate=0.036, error=1331.728
>epoch45, lrate=0.030, error=1325.676
>epoch46, lr