In [1]:
import numpy as np
import pandas as pd

def fetch_data(path, encoding="utf-8"):
    """Load a comma-separated data file into a feature array and a label array.

    Every non-blank line is split on commas. All fields except the last are
    parsed as floats and form one feature row; the last field is parsed as
    an integer label.

    Parameters
    ----------
    path : str or path-like
        File to read.
    encoding : str, default "utf-8"
        Text encoding passed to ``open``.

    Returns
    -------
    X : numpy.ndarray
        One row of float features per data line.
    y : numpy.ndarray
        One integer label per data line.

    Raises
    ------
    ValueError
        If a feature field is not a valid float or the label field is not a
        valid integer.
    """
    X = []
    y = []
    with open(path, encoding=encoding) as file:
        for line in file:
            stripped = line.rstrip()
            # Blank lines (e.g. an empty line at the end of the file) carry
            # no sample; without this guard int("") below would raise.
            if not stripped:
                continue
            fields = stripped.split(",")
            X.append(np.array(fields[:-1], dtype=float))
            y.append(int(fields[-1]))
    return np.array(X), np.array(y)

# Read the training and test files into feature/label arrays.
X_train, y_train = fetch_data("VSTrain.dt")
X_test, y_test = fetch_data("VSTest.dt")

# Number of training samples as loaded from disk.
N = len(y_train)

In [2]:
# Tally how many training samples carry each distinct label.
counts = np.unique(y_train, return_counts=True)
labels, totals = counts

# Report the absolute count and the share of N for every class.
for class_, count in zip(labels, totals):
    freq = count / N
    print(f"Class: {class_:2}  |  Total: {count:2}  |  Frequency: {freq:f}  |  Percentage: {freq*100:9f} %")

# Retain only the labels that occur at least 65 times in the training labels.
classes = np.array([label for label, total in zip(labels, totals) if total >= 65])

Class:  0  |  Total: 68  |  Frequency: 0.088197  |  Percentage:  8.819715 %
Class:  1  |  Total: 22  |  Frequency: 0.028534  |  Percentage:  2.853437 %
Class:  2  |  Total:  1  |  Frequency: 0.001297  |  Percentage:  0.129702 %
Class:  3  |  Total: 96  |  Frequency: 0.124514  |  Percentage: 12.451362 %
Class:  4  |  Total: 17  |  Frequency: 0.022049  |  Percentage:  2.204929 %
Class:  5  |  Total: 50  |  Frequency: 0.064851  |  Percentage:  6.485084 %
Class:  6  |  Total: 60  |  Frequency: 0.077821  |  Percentage:  7.782101 %
Class:  7  |  Total: 10  |  Frequency: 0.012970  |  Percentage:  1.297017 %
Class:  8  |  Total: 27  |  Frequency: 0.035019  |  Percentage:  3.501946 %
Class:  9  |  Total: 58  |  Frequency: 0.075227  |  Percentage:  7.522698 %
Class: 10  |  Total:  9  |  Frequency: 0.011673  |  Percentage:  1.167315 %
Class: 11  |  Total: 21  |  Frequency: 0.027237  |  Percentage:  2.723735 %
Class: 12  |  Total: 19  |  Frequency: 0.024643  |  Percentage:  2.464332 %
Class: 13  |

In [3]:
def _rows_outside(labels, keep):
    """Return the positions in ``labels`` whose value is not contained in ``keep``."""
    return [position for position, label in enumerate(labels) if label not in keep]


# Positions of the samples whose label is not one of `classes`.
train_indices = _rows_outside(y_train, classes)
test_indices = _rows_outside(y_test, classes)

# Drop those samples from features and labels alike so the two stay aligned.
X_train = np.delete(X_train, train_indices, axis=0)
y_train = np.delete(y_train, train_indices)
X_test = np.delete(X_test, test_indices, axis=0)
y_test = np.delete(y_test, test_indices)

In [4]:
# Number of labels left in each split.
tuple(map(len, (y_train, y_test)))

(314, 335)

In [5]:
def normalize(data, mu, var):
    """Center ``data`` by ``mu`` and divide by ``var``, with NumPy broadcasting.

    Parameters
    ----------
    data : array-like
        Values to transform.
    mu : array-like
        Offset subtracted from ``data``.
    var : array-like
        Divisor applied after centering. Despite the name, this notebook
        passes the per-feature standard deviation here, not the variance.

    Returns
    -------
    numpy.ndarray
        ``(data - mu) / var``, except that entries of ``var`` equal to zero
        are treated as 1 so the corresponding outputs are only centered.
    """
    # A zero divisor (e.g. a feature that is constant across the samples the
    # statistics were computed from) would turn 0/0 into NaN and x/0 into
    # inf; dividing by 1 instead leaves those entries merely centered.
    scale = np.where(np.asarray(var) == 0, 1.0, var)
    return (data - mu) / scale

# Column-wise statistics are taken from the training features only and are
# reused for the test features, so both are transformed identically.
X_train_mu = np.mean(X_train, axis=0)
X_train_std = np.std(X_train, axis=0)

X_train_norm, X_test_norm = (
    normalize(split, X_train_mu, X_train_std) for split in (X_train, X_test)
)

In [9]:
# Scatter matrix of the normalized training features (features x features).
S = np.dot(X_train_norm.T, X_train_norm)

# Divide by the number of rows actually present in X_train_norm to obtain the
# empirical covariance. The notebook-level N was computed from y_train before
# the class filtering removed samples, so it no longer matches this row count
# and would scale every eigenvalue incorrectly.
n_samples = X_train_norm.shape[0]
decomp = np.linalg.eig(S / n_samples)

In [15]:
# Shapes of the two arrays returned by the eigendecomposition.
np.shape(decomp[0]), np.shape(decomp[1])

((61,), (61, 61))

In [33]:
# Reduced SVD of the normalized training features. np.linalg.svd returns the
# right singular vectors already transposed, so `v` holds V^T (one singular
# vector per row).
u, s, v = np.linalg.svd(X_train_norm, full_matrices=False)

In [35]:
# Shapes of the three SVD factors.
tuple(factor.shape for factor in (u, s, v))

((314, 61), (61,), (61, 61))

In [39]:
# Multiply the SVD factors back together: row i of v is scaled by s[i], then
# the result is left-multiplied by u.
np.matmul(u, s[:, np.newaxis] * v)

array([[ 1.31066988e+01,  2.24667775e-02,  1.08463457e+00, ...,
         3.82603733e-02,  1.63788534e-02,  8.52393075e-03],
       [-5.20455154e-01, -4.55129570e-01, -7.91425158e-01, ...,
        -8.15150359e-03,  9.66681197e-03,  3.33368784e-03],
       [-8.00135982e+00,  3.30461896e-01,  7.16637742e+00, ...,
        -1.10783557e-02, -1.48192414e-01, -2.39699509e-02],
       ...,
       [ 2.98090468e+00,  4.35071372e-01,  1.00957643e+00, ...,
        -7.42361617e-02,  3.99754942e-03,  2.41264333e-02],
       [ 3.53060093e+00,  3.09742817e+01,  2.36445295e-01, ...,
         5.08872935e-02, -5.34347906e-02, -4.93121266e-02],
       [ 3.54245423e+00, -7.14170974e-01,  1.77222035e+00, ...,
        -6.84021874e-02, -9.69401939e-03,  4.72411231e-02]])