# Feature Dimensionality Reduction

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

# ML utils
import time
import pickle
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

In [2]:
# Locations of the input .npz feature archives and of the pickled class mapping.
train_path = Path("output") / "emnist_train_features.npz"
test_path = Path("output") / "emnist_test_features.npz"
mapping_path = Path("class_mapping.pkl")

# Seed handed to `random_state` of the randomized estimators in this notebook.
rng_seed = 42

In [3]:
# Read the training features and labels from the .npz archive and cast them to
# float32 / uint8 (copy=False skips the copy when the dtype already matches).
with np.load(train_path) as train_npz:
    X_train = train_npz['X_train'].astype(np.float32, copy=False)
    y_train = train_npz['y_train'].astype(np.uint8, copy=False)

# NOTE: unpickling can execute arbitrary code; only load a mapping file that
# comes from a trusted source.
with mapping_path.open('rb') as mapping_file:
    class_mapping: dict = pickle.load(mapping_file)

In [4]:
# PCA stage: keep a fixed fraction of the input feature dimensions (at least 1).
pca_frac = 0.1
pca_n = max(1, int(pca_frac * X_train.shape[1]))
print(f"PCA n_components={pca_n} (frac={pca_frac})")

# LDA stage: scikit-learn requires n_components <= min(n_features, n_classes - 1).
# The features LDA receives are the PCA outputs, so the limit must also include
# pca_n; without that cap a narrow feature matrix makes `fit` raise.
# NOTE(review): n_classes comes from the mapping, not from y_train — this
# assumes every mapped class occurs in the training labels; confirm.
n_classes = len(class_mapping)
lda_n = max(1, min(n_classes - 1, pca_n, 64))
print(f"n_classes={n_classes}, LDA n_components={lda_n}")

# Standardize -> PCA (randomized SVD, seeded) -> supervised LDA projection.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=pca_n, svd_solver='randomized', random_state=rng_seed)),
    ('lda', LDA(n_components=lda_n)),
])

# Drop the temporaries (and the mapping) so they do not linger in the namespace.
del pca_frac, pca_n, n_classes, lda_n, class_mapping

PCA n_components=143 (frac=0.1)
n_classes=62, LDA n_components=61


In [5]:
# Fit the whole pipeline on the training data and pickle the fitted object.
pipe.fit(X_train, y_train)
with open("dim_reduction_pipeline.pkl", 'wb') as pipeline_file:
    pickle.dump(pipe, pipeline_file)

# Project the training features with the fitted pipeline and store them,
# together with the untouched labels, in a compressed archive.
reduced_train = pipe.transform(X_train).astype(np.float32, copy=False)
np.savez_compressed("output/emnist_train_reduced.npz", X_train=reduced_train, y_train=y_train)

# Release the large arrays before the next cell loads more data.
del X_train, y_train, reduced_train

In [6]:
# Read the test features and labels, cast to float32 / uint8.
with np.load(test_path) as test_npz:
    X_test = test_npz['X_test'].astype(np.float32, copy=False)
    y_test = test_npz['y_test'].astype(np.uint8, copy=False)

# Only `transform` is called here: the test data goes through the pipeline as
# it was fitted on the training data, and the result is written to disk.
reduced_test = pipe.transform(X_test).astype(np.float32, copy=False)
np.savez_compressed("output/emnist_test_reduced.npz", X_test=reduced_test, y_test=y_test)

# Release the large arrays.
del X_test, y_test, reduced_test

In [None]:
# Train a linear SVM on the reduced training features read back from disk.
svm = LinearSVC(C=1.0, max_iter=1000, random_state=rng_seed)
with np.load("output/emnist_train_reduced.npz") as data:
    X_train = data['X_train']
    y_train = data['y_train']

svm.fit(X_train, y_train)
del X_train, y_train

# Predict on the reduced test features.
with np.load("output/emnist_test_reduced.npz") as data:
    X_test = data['X_test']
    y_test = data['y_test']

y_pred = svm.predict(X_test)
del X_test
print(f"Accuracy: {np.mean(y_pred == y_test)}")
print(f"F1-score: {f1_score(y_test, y_pred, average='weighted')}")

# Confusion matrix: rows are actual labels, columns are predicted labels.
conf_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], dropna=False)
# crosstab only emits labels that actually occur on each axis, so a class that
# is never predicted (or never present in y_test) would leave the matrix
# non-square and shift the diagonal away from the per-class true positives.
# Reindex both axes to the union of labels so diag[i] always belongs to class i.
labels = np.union1d(y_test, y_pred)
conf_matrix = (
    conf_matrix
    .reindex(index=labels, columns=labels, fill_value=0)
    .rename_axis(index='Actual', columns='Predicted')
)
del y_test, y_pred, labels
print(conf_matrix)

# Per-class precision = true positives / column total (all predictions of the
# class); a class that was never predicted has a zero column total (0/0).
prec = pd.Series(np.diag(conf_matrix) / conf_matrix.sum(axis=0), name='Precision')
print(prec)
# Per-class recall = true positives / row total (all actual samples of the class).
rec = pd.Series(np.diag(conf_matrix) / conf_matrix.sum(axis=1), name='Recall')
print(rec)