In [49]:
from keras.datasets import mnist
from sklearn.decomposition import KernelPCA 
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import math

(trainX, trainy), (testX, testy) = mnist.load_data()

#**Part 1:**
## Implement PCA and helper functions

In [50]:
def pca(matrix):

  # Means of column
  means = get_column_mean(matrix)

  # Covariance
  center_mat = (cov_mat(matrix,means))

  # Eigen
  eigenvalue = np.linalg.eig(center_mat)

  # SVD
  u, sigma, vt = np.linalg.svd(matrix, full_matrices=False)

  return means, eigenvalue, matrix_multip(matrix,vt.T)

#----------------------------------------------------------------------

#Multiplies matrices
def matrix_multip(x, y):
  result = [[0 for x in range(len(x))] for y in range(len(y))]
  for i in range(len(x)):
    for j in range(len(y[0])):
      for k in range(len(y)):
        result[i][j] += x[i][k] * y[k][j]

#----------------------------------------------------------------------

#Gets matrix and means, returns centerized by mean.
def centerized_matrix(means,matrix):
  col = len(matrix)
  row = len(matrix[0])
  for j in range(0,row):
    total = 0
    for i in range(0,col):  
      matrix[i][j] -= means[j] 
  return matrix

#Gets matrix, returns column mean.
def get_column_mean(matrix):
  list = []
  col = len(matrix)
  row = len(matrix[0])
  for j in range(0,row):
    total = 0
    for i in range(0,col):  
      total += matrix[i][j]
    list.insert(j,total/row)
  return list

#----------------------------------------------------------------------

#Covariance
def covariance(x, y):
  xbar, ybar = x.mean(), y.mean()
  return np.sum((x - xbar)*(y - ybar))/(len(x) - 1)

# Covariance matrix
def cov_mat(matrix, means):
  return np.array([[covariance(matrix[0], matrix[0]), covariance(matrix[0], matrix[1])],[covariance(matrix[1], matrix[0]), covariance(matrix[1], matrix[1])]])

#**Part 3 (with 2) :**
## Use a non linear PCA.

In [51]:
testX = testX[:1000]
testy = testy[:1000]

trainX_splits = np.array_split(testX,10)
trainY_splits = np.array_split(testy,10)

In [52]:
t_minst = []
for i in range(0,10):
  t_minst.insert(i,np.reshape(trainX_splits[i], [trainX_splits[i].shape[0], trainX_splits[i].shape[1]*trainX_splits[i].shape[2]]))

In [53]:
kpca = KernelPCA()
trains = []
for i in range(0,10):
  trains.insert(i,kpca.fit_transform(t_minst[i]))

In [54]:
regressors = []

for i in range(0,10):
  regressors.insert(i,RandomForestRegressor(n_estimators = 100, random_state = 0))
  regressors[i].fit(trains[i], trainY_splits[i])
  print(f"{i} is done.")

0 is done.
1 is done.
2 is done.
3 is done.
4 is done.
5 is done.
6 is done.
7 is done.
8 is done.
9 is done.


In [60]:
fail = 0
for i in range(0,10):
  for j in range(0,10):
    if j == i:
      j += 1
    if j >= 10:
      break
    for k in range(0,len(trains[j])):
      res = regressors[i].predict(trains[j][k].reshape(1,-1))
      if trainY_splits[j][i] != math.floor(res) and trainY_splits[j][i] != math.ceil(res):
        #means fail
        fail += 1
      


In [64]:
print(f"Testing done. Predicted 90k times as test data and result is:"),
print(f"fail : {fail}. This means success rate is : {(90000 - fail)/90000} (max is 1)")

Testing done. Predicted 90k times as test data and result is:
fail : 7988. This means success rate is : 0.9112444444444444 (max is 1)
