In [1]:
import numpy as np
import math
import matplotlib.pyplot as plt
from matplotlib import style
import random
import numpy.linalg as la

In [2]:
def pca(centeredData, numOfComponents):
    """Build a scaled projection matrix from the leading singular directions.

    Takes the SVD of ``centeredData``, keeps the first ``numOfComponents``
    left singular vectors and singular values, and scales each kept vector
    by its singular value raised to the power -0.5.

    Parameters
    ----------
    centeredData : 2-D array passed straight to ``numpy.linalg.svd``.
    numOfComponents : number of leading components to keep.

    Returns
    -------
    Array of shape ``(numOfComponents, centeredData.shape[0])`` whose i-th row
    is the i-th left singular vector times ``s[i] ** -0.5``.
    """
    # Only the left singular vectors and the singular values are needed.
    leftVectors, singularValues, _ = la.svd(centeredData)
    leading = slice(None, numOfComponents)
    inverseRootScale = np.diag(singularValues[leading] ** -0.5)
    return inverseRootScale @ leftVectors[:, leading].T

In [3]:
def plotter(data, gridShape=(6, 6), imageShape=(28, 28)):
    """Show the leading columns of ``data`` as a grid of images.

    Each column is reshaped to ``imageShape`` in column-major ('F') order and
    drawn with the ``afmhot`` colormap. Panels are filled row by row, so the
    panel at row i, column j shows column ``i * gridShape[1] + j``.

    Parameters
    ----------
    data : 2-D array with one vectorized image per column.
    gridShape : (rows, cols) of the subplot grid; defaults to 6 x 6.
    imageShape : shape each column is reshaped to; defaults to 28 x 28.

    If ``data`` has fewer columns than the grid has panels, the remaining
    panels are left blank instead of raising an IndexError.
    """
    numRows, numCols = gridShape
    # squeeze=False keeps axs two-dimensional even for a 1 x 1 or 1 x N grid.
    fig, axs = plt.subplots(numRows, numCols, squeeze=False)
    numImages = data.shape[1]

    # flatten() walks the grid row by row, matching the column order above.
    for dataIndex, ax in enumerate(axs.flatten()):
        if dataIndex < numImages:
            image = np.reshape(data[:, dataIndex], imageShape, order='F')
            ax.imshow(image, cmap=plt.cm.afmhot)
        ax.xaxis.set_ticks([])
        ax.yaxis.set_ticks([])

    fig.set_figheight(8)
    fig.set_figwidth(8)

In [4]:
# load data and set some helper variables
# np.load on an .npz archive returns a file-backed object that stays open
# until it is closed, so read both arrays inside a with-block. Indexing the
# archive loads each array into memory, so X and labels remain usable
# after the archive is closed.
with np.load("./Data/digits-labels.npz") as data:
    X = data["d"]
    labels = data["l"]
m, n = X.shape # m is number of features and n is number of images
k = 36 # k is the number of pc's we want

In [5]:
# Report the dataset dimensions and the target dimensionality.
summary = f"We have {n} images of vectorized length {m}. We are trying to reduce these images to be {k}-dimensional."
print(summary)
#plt.imshow(np.reshape( X[:,0],(28,28),'F'))

We have 10000 images of vectorized length 784. We are trying to reduce these images to be 36-dimensional.


In [6]:
# Stack the labels underneath the feature rows so every column carries its own
# label through the per-class sampling; the label row is split off again below.
dataSet = np.vstack((X, labels))

numClasses = 10
numFeatures = X.shape[0]  # rows above the appended label row
trainPerClass = 100       # images drawn per class for the training set

# A dedicated, seeded generator makes the split reproducible under
# Restart & Run All without touching the global `random` state.
splitSeed = 0
splitRng = random.Random(splitSeed)

trainingParts = []
testingParts = []

for i in range(numClasses):
    # flatnonzero always yields a 1-D index array, even when a class has a
    # single sample (squeeze(argwhere(...)) collapses to a 0-d index there).
    classColumns = np.flatnonzero(labels == i)
    classi = dataSet[:, classColumns]
    sizeClassi = classi.shape[1]
    random100 = splitRng.sample(range(sizeClassi), trainPerClass)
    conditionals1 = np.full(sizeClassi, False)
    conditionals1[random100] = True
    # Sampled columns go to training; every other column of the class is test data.
    trainingParts.append(classi[:, conditionals1])
    testingParts.append(classi[:, ~conditionals1])

training = np.hstack(trainingParts)
testing = np.hstack(testingParts)
trainingX = training[:numFeatures, :]
trainingY = training[numFeatures, :]
testingX = testing[:numFeatures, :]
testingY = testing[numFeatures, :]

In [8]:
#2. calculate the mean image and center all the training data
# keepdims leaves the per-feature mean as a column vector, so the subtraction
# below broadcasts it across every image (column).
meanTrainingImage = trainingX.mean(axis=1, keepdims=True)
centeredTrainingX = trainingX - meanTrainingImage

print(centeredTrainingX.shape)

(784, 1000)


In [9]:
#3. run PCA on centered training data covariance to get Wp
centeredCovariance = np.cov(centeredTrainingX)
Wp = pca(centeredCovariance, k)

# Both splits go through the same projection; the test images are centered
# with the training mean before being projected.
compressedTrainingX = np.matmul(Wp, centeredTrainingX)
centeredTestingX = testingX - meanTrainingImage
compressedTestingX = np.matmul(Wp, centeredTestingX)

# Pseudo-inverse of the projection matrix.
Wp_inv = np.linalg.pinv(Wp)
print(compressedTestingX.shape)

(36, 9000)


In [10]:
# Optional: uncomment to display the first 36 columns of Wp_inv as 28x28 images.
#plotter(Wp_inv)

In [11]:
class GaussianDiscriminant:
    """Quadratic discriminant for a single Gaussian class.

    Estimates the class mean and covariance from ``rawClassData`` and stores
    the coefficients of

        g(x) = x^T A x + b^T x + c

    with ``A = -0.5 * inv(Sigma)``, ``b = inv(Sigma) @ mu`` and
    ``c = -0.5 * (mu^T inv(Sigma) mu + log|Sigma|) + log(P)``.

    Parameters
    ----------
    rawClassData : 2-D array, one feature per row and one sample per column.
    classProbability : prior probability P of the class.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the sample covariance is singular or its determinant is not positive.
    """

    def __init__(self, rawClassData, classProbability):
        self._rawClassData = rawClassData
        # reshape((-1, 1)) keeps the mean a column vector for any number of
        # features instead of assuming exactly 36.
        self.classMean = np.mean(rawClassData, axis=1).reshape((-1, 1))
        # np.cov returns a 0-d array for a single feature row; atleast_2d
        # keeps the covariance a matrix so inv() and slogdet() accept it.
        self.classCovariance = np.atleast_2d(np.cov(rawClassData))
        self.classProbability = classProbability
        self.inverseClassCovariance = np.linalg.inv(self.classCovariance)
        self.getDiscriminantParameters()

    def getDiscriminantParameters(self):
        """Compute and store the coefficients A, b and c of the discriminant."""
        # slogdet returns log|Sigma| directly, avoiding the under/overflow of
        # forming the determinant first and taking its log afterwards.
        sign, logDeterminant = np.linalg.slogdet(self.classCovariance)
        if sign <= 0:
            raise np.linalg.LinAlgError(
                "class covariance must have a positive determinant"
            )
        self.A = (-0.5) * self.inverseClassCovariance
        self.b = self.inverseClassCovariance @ self.classMean
        self.c = (-0.5) * (self.classMean.T @ self.inverseClassCovariance @ self.classMean + logDeterminant) + np.log(self.classProbability)

    def evalGaussianDiscriminantForMany(self, X):
        """Evaluate the discriminant for every column of ``X``.

        ``X`` holds one sample per column. Returns an array of shape
        ``(1, X.shape[1])`` with g(x) for each sample.
        """
        # Row-wise sum of (X^T A) * X^T gives x^T A x for each sample.
        return np.sum((X.T @ self.A) * X.T, axis=1) + self.b.T @ X + self.c

In [12]:
class GausianClassifier:
    """Collection of one GaussianDiscriminant per class label.

    Parameters
    ----------
    rawData : 2-D array, one feature per row and one sample per column.
    labels : class label of each column of ``rawData``. Defaults to the
        module-level ``trainingY`` when omitted.
    classCount : number of classes, labelled ``0 .. classCount - 1``.
        Defaults to the module-level ``numClasses`` when omitted.

    Attributes
    ----------
    agregates : dict mapping each class label to its GaussianDiscriminant.
        (The class and attribute names keep their original spelling because
        other cells refer to them.)
    """

    def __init__(self, rawData, labels=None, classCount=None):
        # Fall back to the notebook globals so existing one-argument calls
        # behave exactly as before.
        if labels is None:
            labels = trainingY
        if classCount is None:
            classCount = numClasses
        labels = np.ravel(labels)

        self.agregates = {}
        for i in range(classCount):
            # flatnonzero stays 1-D even if a class has a single sample.
            rawClassIndex = np.flatnonzero(labels == i)
            rawClassData = rawData[:, rawClassIndex]
            # Prior = fraction of samples in this class (100/1000 for the
            # balanced split this notebook builds).
            classPrior = rawClassIndex.size / labels.size
            self.agregates[i] = GaussianDiscriminant(rawClassData, classPrior)

          
            

In [13]:
digitClassifier = GausianClassifier(compressedTrainingX)

# Evaluate each of the 10 class discriminants on the compressed test set;
# squeeze drops the singleton axis each evaluation returns.
scores = np.squeeze([
    digitClassifier.agregates[digit].evalGaussianDiscriminantForMany(compressedTestingX)
    for digit in range(10)
])
print(scores.shape)

(10, 9000)


In [14]:
# Accuracy: fraction of test images whose highest-scoring class matches the
# true label. The image count is taken from the labels rather than a literal
# 9000, so the cell stays correct if the split sizes change.
predictedDigits = np.argmax(scores, axis=0)
trueDigits = np.asarray(testingY).astype(int)
numCorrect = int(np.count_nonzero(predictedDigits == trueDigits))
print(numCorrect / trueDigits.size)

0.915
