# Feature Dimensionality Reduction

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

# ML utils
import time
import pickle
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import f1_score

In [None]:
# Input artifacts: the train/test feature archives live under output/, while
# the pickled class mapping is resolved relative to the working directory.
train_path, test_path = (
    Path("output/emnist_train_features.npz"),
    Path("output/emnist_test_features.npz"),
)
mapping_path = Path("class_mapping.pkl")

In [None]:
# Read the feature matrix and label vector of each split from its .npz
# archive; the context manager closes the archive once both arrays are read.
with np.load(train_path) as data:
    X_train, y_train = data['X_train'], data['y_train']

with np.load(test_path) as data:
    X_test, y_test = data['X_test'], data['y_test']

# NOTE: pickle executes arbitrary code on load; only use a mapping file that
# was produced by a trusted step of this project.
with open(mapping_path, 'rb') as f:
    class_mapping = pickle.load(f)

# Quick sanity check of what was loaded.
print(
    f"Train set shape: {X_train.shape}, Train labels shape: {y_train.shape}"
)
print(
    f"Test set shape: {X_test.shape}, Test labels shape: {y_test.shape}"
)
print(
    f"Class mapping shape: {class_mapping.shape}"
)

## Comparación de pipelines

Este experimento compara cuatro configuraciones supervisadas. Todas comienzan con `StandardScaler` y terminan en un clasificador SVM (kernel lineal en las tres primeras, kernel RBF en la última):

- Sin reducción: `StandardScaler` → `SVM (linear)`
- PCA: `StandardScaler` → `PCA(n_components=128)` → `SVM (linear)`
- PCA + LDA: `StandardScaler` → `PCA(n_components=128)` → `LDA` → `SVM (linear)`
- PCA + LDA + Kernel RBF: `StandardScaler` → `PCA(n_components=128)` → `LDA` → `SVM (RBF)`

La celda calcula validación cruzada estratificada (5 folds) para cada pipeline y reporta accuracy y F1-macro (media y desviación estándar). Después reentrena cada pipeline con todo el conjunto de entrenamiento y lo evalúa en el conjunto de test para comparar el rendimiento final.

In [None]:
# Compare four scaler -> [PCA] -> [LDA] -> SVM pipelines.
#
# For every pipeline this cell:
#   1. runs stratified 5-fold cross-validation on the training set and records
#      the mean/std of accuracy and macro-F1 plus the wall-clock CV time;
#   2. refits the pipeline on the full training set and scores it on the test
#      set (accuracy and macro-F1).
# The per-pipeline summaries are pickled to output/dim_reduction_results.pkl
# and printed as a table.

# LDA is asked for at most (n_classes - 1) components, capped at 64 and never
# fewer than 1.
n_classes = len(class_mapping)
lda_n = min(max(1, n_classes - 1), 64)
print(f"n_classes={n_classes}, LDA n_components={lda_n}")

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'f1_macro']

# Each pipeline gets its own estimator instances so that fitting one never
# affects another.
pipelines = {
    'no_reduction': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', SVC(kernel='linear', C=1.0)),
    ]),
    'pca': Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=128, random_state=42)),
        ('clf', SVC(kernel='linear', C=1.0)),
    ]),
    'pca_lda': Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=128, random_state=42)),
        ('lda', LDA(n_components=lda_n)),
        ('clf', SVC(kernel='linear', C=1.0)),
    ]),
    'pca_lda_rbf': Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=128, random_state=42)),
        ('lda', LDA(n_components=lda_n)),
        ('clf', SVC(kernel='rbf', C=1.0, gamma='scale')),
    ]),
}

# Flattened once so the element-wise comparison below cannot broadcast if the
# labels were stored as a column vector.
y_test_flat = np.asarray(y_test).ravel()

results = {}
for name, pipe in pipelines.items():
    print(f"Evaluando: {name}")

    # perf_counter is monotonic, so the measured interval is not affected by
    # system clock adjustments during a long run.
    t0 = time.perf_counter()
    cv_res = cross_validate(
        pipe,
        X_train,
        y_train,
        cv=skf,
        scoring=scoring,
        n_jobs=-1,
        return_train_score=False,
    )
    dt = time.perf_counter() - t0

    res_summary = {
        'cv_accuracy_mean': float(np.mean(cv_res['test_accuracy'])),
        'cv_accuracy_std': float(np.std(cv_res['test_accuracy'])),
        'cv_f1_mean': float(np.mean(cv_res['test_f1_macro'])),
        'cv_f1_std': float(np.std(cv_res['test_f1_macro'])),
        'cv_time_sec': dt,
    }

    # Refit on the whole training set, then predict the test set ONCE and
    # derive both metrics from those predictions; calling pipe.score() as well
    # would run the SVM over the full test set a second time.
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    test_acc = float(np.mean(np.asarray(y_pred).ravel() == y_test_flat))
    test_f1 = f1_score(y_test, y_pred, average='macro')

    res_summary.update({'test_accuracy': float(test_acc), 'test_f1_macro': float(test_f1)})
    results[name] = res_summary
    print(f"  CV acc: {res_summary['cv_accuracy_mean']:.4f} ± {res_summary['cv_accuracy_std']:.4f}, CV f1: {res_summary['cv_f1_mean']:.4f} ± {res_summary['cv_f1_std']:.4f}, CV time: {dt:.1f}s")
    print(f"  Test acc: {test_acc:.4f}, Test f1_macro: {test_f1:.4f}\n")


results_path = Path("output/dim_reduction_results.pkl")
# Make sure the destination exists so a long experiment is not lost to a
# missing directory at the very last step.
results_path.parent.mkdir(parents=True, exist_ok=True)
with open(results_path, 'wb') as f:
    pickle.dump(results, f)

# One row per pipeline, one column per metric.
df_results = pd.DataFrame(results).T
print(df_results)