Reducción de la dimensionalidad

In [7]:
# === Configuración de rutas ===
TRAIN_PATH = "../data/dataset/X_train_scaled.csv"  
TEST_PATH  = "../data/dataset/X_test_scaled.csv"   
OUT_DIR    = "../data/output"                          
TARGET_VARIANCE = 0.95                             # 95% por defecto

In [8]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.decomposition import PCA
import pickle

In [None]:
X_train = pd.read_csv(TRAIN_PATH)
X_test  = pd.read_csv(TEST_PATH)

(6819, 6008)

In [10]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score


In [13]:
def analyze_missing_values(df):
    # Calcular valores faltantes
    missing_values = df.isnull().sum()
    missing_percentage = (df.isnull().sum() / len(df)) * 100
    
    # Crear DataFrame con el análisis
    missing_info = pd.DataFrame({
        'Missing Values': missing_values,
        'Missing Percentage': missing_percentage.round(2)
    })
    
    # Ordenar por cantidad de valores faltantes (descendente)
    missing_info = missing_info.sort_values('Missing Values', ascending=False)
    
    # Mostrar solo columnas con al menos un valor faltante
    return missing_info[missing_info['Missing Values'] > 0]

# Analizar tanto el conjunto de entrenamiento como el de prueba
print("=== Missing Values en X_train ===")
display(analyze_missing_values(X_train))

print("\n=== Missing Values en X_test ===")
display(analyze_missing_values(X_test))

# Mostrar el total de valores faltantes
print("\nTotal de valores faltantes:")
print(f"X_train: {X_train.isnull().sum().sum():,}")
print(f"X_test: {X_test.isnull().sum().sum():,}")

=== Missing Values en X_train ===


Unnamed: 0,Missing Values,Missing Percentage
genres.1,6819,100.00
title,6819,100.00
genres,6819,100.00
title.1,6819,100.00
hog_1168,4728,69.34
...,...,...
zernike_15.2,1766,25.90
zernike_14.2,1766,25.90
zernike_13.2,1766,25.90
zernike_12.2,1766,25.90



=== Missing Values en X_test ===


Unnamed: 0,Missing Values,Missing Percentage
title.1,2923,100.00
title,2923,100.00
hog_1319.1,786,26.89
hog_1733.2,786,26.89
hog_1335.1,786,26.89
...,...,...
color_2,726,24.84
color_1,726,24.84
color_6,726,24.84
color_24,726,24.84



Total de valores faltantes:
X_train: 16,513,352
X_test: 4,560,844


In [14]:
# Guardar dimensiones originales
print("Dimensiones originales:")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")

# Eliminar filas con valores faltantes
X_train_clean = X_train.dropna(axis=0, how='any')
X_test_clean = X_test.dropna(axis=0, how='any')

# Mostrar dimensiones después de la limpieza
print("\nDimensiones después de eliminar filas con NA:")
print(f"X_train: {X_train_clean.shape}")
print(f"X_test: {X_test_clean.shape}")

# Mostrar cantidad de filas eliminadas
print("\nFilas eliminadas:")
print(f"X_train: {X_train.shape[0] - X_train_clean.shape[0]:,} filas")
print(f"X_test: {X_test.shape[0] - X_test_clean.shape[0]:,} filas")

Dimensiones originales:
X_train: (6819, 6008)
X_test: (2923, 5966)

Dimensiones después de eliminar filas con NA:
X_train: (0, 6008)
X_test: (0, 5966)

Filas eliminadas:
X_train: 6,819 filas
X_test: 2,923 filas


In [None]:
    # Limpieza por si el índice está guardado como columna
for df in (X_train, X_test):
    if df.columns[0].lower() in ("unnamed: 0", "index", ""):
        df.drop(df.columns[0], axis=1, inplace=True)
    df[:] = df.apply(pd.to_numeric, errors="coerce")

print("Shapes → train:", X_train.shape, " test:", X_test.shape)


Shapes → train: (6819, 6008)  test: (2923, 5966)


In [12]:
pca_full = PCA()
pca_full.fit(X_train.values)
explained_var = pca_full.explained_variance_ratio_
cum_explained = np.cumsum(explained_var)

# Selección de k (mínimo con ≥ TARGET_VARIANCE)
k = int(np.searchsorted(cum_explained, TARGET_VARIANCE) + 1)
print(f"Componentes totales: {len(explained_var)} | k seleccionado (≥{int(TARGET_VARIANCE*100)}%): {k}")
print(f"Varianza acumulada con k={k}: {cum_explained[k-1]:.4f}")

# Tabla de varianza explicada
var_table = pd.DataFrame({
    'component': np.arange(1, len(explained_var)+1),
    'explained_variance_ratio': explained_var,
    'cumulative_variance_ratio': cum_explained
})
display(var_table.head(25))

# Gráficos
plt.figure()
plt.plot(np.arange(1, len(explained_var)+1), explained_var, marker='o')
plt.xlabel('Componente principal')
plt.ylabel('Varianza explicada (ratio)')
plt.title('Scree plot - PCA (todas las componentes)')
plt.tight_layout()
plt.show()

plt.figure()
plt.plot(np.arange(1, len(cum_explained)+1), cum_explained, marker='o')
plt.axhline(y=TARGET_VARIANCE, linestyle='--')
plt.axvline(x=k, linestyle='--')
plt.xlabel('Número de componentes')
plt.ylabel('Varianza explicada acumulada')
plt.title(f'Varianza acumulada - PCA (k={k})')
plt.tight_layout()
plt.show()

ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Refit con k componentes y transformación de train/test 
pca_k = PCA(n_components=k, svd_solver='full', random_state=0)
pca_k.fit(X_train.values)
Z_train = pca_k.transform(X_train.values)
Z_test  = pca_k.transform(X_test.values)
print("Z_train:", Z_train.shape, "Z_test:", Z_test.shape)

Z_train: (6819, 19) Z_test: (2923, 19)


In [None]:
# === Guardado de resultados y artefactos ===
out_dir = Path(OUT_DIR)
train_out = out_dir / f"X_train_pca_k{k}.csv"
test_out  = out_dir / f"X_test_pca_k{k}.csv"
var_out   = out_dir / "pca_variance_report.csv"
meta_out  = out_dir / "pca_metadata.json"
model_out = out_dir / f"pca_model_k{k}.pkl"

pd.DataFrame(Z_train).to_csv(train_out, index=False)
pd.DataFrame(Z_test).to_csv(test_out, index=False)
var_table.to_csv(var_out, index=False)

meta = {
    "train_path": TRAIN_PATH,
    "test_path": TEST_PATH,
    "train_shape": tuple(X_train.shape),
    "test_shape": tuple(X_test.shape),
    "n_features": int(X_train.shape[1]),
    "k_selected": int(k),
    "target_variance": float(TARGET_VARIANCE),
    "achieved_variance": float(np.cumsum(pca_k.explained_variance_ratio_)[-1])
}
with open(meta_out, "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

with open(model_out, "wb") as f:
    pickle.dump(pca_k, f)

print("Guardado:")
print("-", train_out)
print("-", test_out)
print("-", var_out)
print("-", meta_out)
print("-", model_out)

Guardado:
- ..\data\output\X_train_pca_k19.csv
- ..\data\output\X_test_pca_k19.csv
- ..\data\output\pca_variance_report.csv
- ..\data\output\pca_metadata.json
- ..\data\output\pca_model_k19.pkl
