# Feature Dimensionality Reduction

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

# ML utils
import time
import pickle
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Input artefacts: the class-mapping pickle sits next to the notebook, the
# feature archives live under output/.
mapping_path = Path("class_mapping.pkl")
train_path = Path("output", "emnist_train_features.npz")
test_path = Path("output", "emnist_test_features.npz")

# rng_seed is passed to PCA as random_state further down.
# NOTE(review): frac is not referenced in the visible cells — confirm whether
# it is still needed.
frac, rng_seed = 0.10, 42

In [None]:
# Pull the training features and labels out of the .npz archive, casting to
# compact dtypes; copy=False skips the copy when the stored dtype already matches.
with np.load(train_path) as train_npz:
    X_train = train_npz["X_train"].astype(np.float32, copy=False)
    y_train = train_npz["y_train"].astype(np.uint8, copy=False)

# The class mapping is a pickled object; only its length is used later on.
with mapping_path.open("rb") as mapping_file:
    class_mapping = pickle.load(mapping_file)

In [None]:
# PCA width: a fixed fraction of the input feature count, never below 1.
pca_frac = 0.1
pca_n = max(1, int(pca_frac * X_train.shape[1]))
print(f"PCA n_components={pca_n} (frac={pca_frac})")

# LDA can produce at most min(n_features, n_classes - 1) components, and the
# features it sees here are the pca_n outputs of the PCA step. Clamp to pca_n
# as well as to n_classes - 1 and the 64 cap; otherwise LDA raises ValueError
# at fit time whenever the PCA output is narrower than n_classes - 1.
# NOTE(review): n_classes comes from the mapping, not from the labels actually
# present in y_train — confirm every mapped class occurs in the training data.
n_classes = len(class_mapping)
lda_n = min(max(1, n_classes - 1), 64, pca_n)
print(f"n_classes={n_classes}, LDA n_components={lda_n}")

# Standardise -> unsupervised PCA -> supervised LDA projection.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=pca_n, svd_solver='randomized', random_state=rng_seed)),
    ('lda', LDA(n_components=lda_n)),
])

In [None]:
# Fit the whole pipeline on the training split, then project that same split.
# fit and transform stay separate calls so the training projection goes through
# the same transform path later used for the test split.
pipe.fit(X_train, y_train)
X_train_reduced = pipe.transform(X_train)
X_train_reduced = X_train_reduced.astype(np.float32, copy=False)

# Persist the reduced features together with their labels.
np.savez_compressed(
    "output/emnist_train_reduced.npz", X_train=X_train_reduced, y_train=y_train
)

# Drop the training arrays before the test split is loaded.
del X_train_reduced
del X_train, y_train

In [None]:
# Load the held-out features and labels, cast to the same dtypes as training.
with np.load(test_path) as test_npz:
    X_test = test_npz["X_test"].astype(np.float32, copy=False)
    y_test = test_npz["y_test"].astype(np.uint8, copy=False)

# Only transform here: the pipeline is applied as already fitted, never refit
# on the test split.
X_test_reduced = pipe.transform(X_test)
X_test_reduced = X_test_reduced.astype(np.float32, copy=False)

# Persist the reduced test features together with their labels.
np.savez_compressed(
    "output/emnist_test_reduced.npz", X_test=X_test_reduced, y_test=y_test
)

# Release the test arrays once they are on disk.
del X_test_reduced
del X_test, y_test