Reducción de la dimensionalidad

In [1]:
# === Configuración de rutas ===
TRAIN_PATH = "../data/dataset/X_train_scaled_leonel.csv"  
TEST_PATH  = "../data/dataset/X_test_scaled_leonel.csv"   
OUT_DIR    = "../data/output"                          
TARGET_VARIANCE = 0.95                             # 95% por defecto

In [2]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.decomposition import PCA
import pickle

Antes de hacer el PCA, vamos a contar los missing values de nuestras características para ver la estrategia a seguir, ya que no podemos tener missing values para PCA.

In [13]:
X_train = pd.read_csv(TRAIN_PATH)
X_test  = pd.read_csv(TEST_PATH)

missing_cnt = X_train.isna().sum()
print(missing_cnt)
missing_cols = missing_cnt[missing_cnt > 0].sort_values(ascending=False)
print("Colunas con al menos 1 missing value:",missing_cols.shape[0], "\n")
missing_pct = X_train.isna().mean() * 100
summary = (pd.DataFrame({
    "dtype": X_train.dtypes.astype(str),
    "missing values": missing_cnt,
    "missing_%": missing_pct.round(2)
})
.sort_values("missing_%", ascending=False))

print("\n=== NA por columna (ordenado) ===")
print(summary.head(50))  # muestra las 50 con más NA
print(f"\nTotal columnas: {X_train.shape[1]}, filas: {X_train.shape[0]}")

(no genres listed)     868
Action                 868
Adventure              868
Animation              868
Children               868
                      ... 
zernike_20            1856
zernike_21            1856
zernike_22            1856
zernike_23            1856
zernike_24            1856
Length: 2024, dtype: int64
Colunas con al menos 1 missing value: 2002 


=== NA por columna (ordenado) ===
            dtype  missing values  missing_%
hog_1763  float64            1867      24.29
hog_1762  float64            1867      24.29
hog_1761  float64            1867      24.29
hog_1760  float64            1867      24.29
hog_1759  float64            1867      24.29
hog_1758  float64            1867      24.29
hog_1757  float64            1867      24.29
hog_1756  float64            1867      24.29
hog_1755  float64            1867      24.29
hog_1754  float64            1867      24.29
hog_1753  float64            1867      24.29
hog_1752  float64            1867      24.29
hog_1751  f

In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score

# --- 1) Resumen de faltantes ---
missing_cnt = X_train.isna().sum()
missing_pct = (X_train.isna().mean() * 100).round(2)

n_cols_total = X_train.shape[1]
n_cols_with_na = (missing_cnt > 0).sum()
print(f"Columnas totales: {n_cols_total}")
print(f"Columnas con ≥1 NaN: {n_cols_with_na}")

# Top columnas con mas nan
top_na = missing_pct.sort_values(ascending=False).head(30)
print("\nTOP 30 columnas con más NaN (%):")
print(top_na)

# 2) Selección de columnas por umbral
UMBRAL_DROP = 60.0  # 60% nan
drop_cols = missing_pct[missing_pct >= UMBRAL_DROP].index.tolist()
keep_cols = [c for c in X_train.columns if c not in drop_cols]

print(f"\nColumnas a eliminar (≥{UMBRAL_DROP}% NaN): {len(drop_cols)}")
if len(drop_cols) <= 20:
    print(drop_cols)

# descartar filas con muchísimos NaN (sobre keep_cols)
row_pct_na = X_train[keep_cols].isna().mean(axis=1) * 100
bad_rows = row_pct_na[row_pct_na > 50].index  # ajusta 40–50%
print(f"Filas con >50% NaN (en columnas conservadas): {len(bad_rows)}")

Xf = X_train.drop(columns=drop_cols).drop(index=bad_rows).reset_index(drop=True)


Columnas totales: 2024
Columnas con ≥1 NaN: 2002

TOP 30 columnas con más NaN (%):
hog_1763    24.29
hog_1762    24.29
hog_1761    24.29
hog_1760    24.29
hog_1759    24.29
hog_1758    24.29
hog_1757    24.29
hog_1756    24.29
hog_1755    24.29
hog_1754    24.29
hog_1753    24.29
hog_1752    24.29
hog_1751    24.29
hog_1750    24.29
hog_1749    24.29
hog_1748    24.29
hog_1747    24.29
hog_1746    24.29
hog_1745    24.29
hog_1744    24.29
hog_1743    24.29
hog_1742    24.29
hog_1741    24.29
hog_1740    24.29
hog_1739    24.29
hog_1738    24.29
hog_1737    24.29
hog_1736    24.29
hog_1735    24.29
hog_1734    24.29
dtype: float64

Columnas a eliminar (≥60.0% NaN): 0
[]
Filas con >50% NaN (en columnas conservadas): 1867


In [16]:
total_celdas = Xf.shape[0] * Xf.shape[1]

# Por columna
missing_cnt_cols = Xf.isna().sum()
missing_pct_cols = (Xf.isna().mean() * 100).round(2)

# Columnas con ≥1 NaN
missing_cols = missing_cnt_cols[missing_cnt_cols > 0].sort_values(ascending=False)

# Totales
n_missing_cells = int(missing_cnt_cols.sum())
n_cols_with_na  = int((missing_cnt_cols > 0).sum())
n_rows_with_na  = int(Xf.isna().any(axis=1).sum())
pct_cells_na    = round(100 * n_missing_cells / total_celdas, 4)

print("=== Faltantes en Xf ===")
print("Filas x Columnas:", Xf.shape)
print("Total de celdas:", total_celdas)
print("Celdas con NaN:", n_missing_cells, f"({pct_cells_na}%)")
print("Columnas con ≥1 NaN:", n_cols_with_na, "/", Xf.shape[1])
print("Filas con ≥1 NaN:", n_rows_with_na, "/", Xf.shape[0])

# (Opcional) Top columnas con más NaN
print("\nTop 20 columnas con más NaN:")
print(missing_cols.head(20))


=== Faltantes en Xf ===
Filas x Columnas: (5820, 2024)
Total de celdas: 11779680
Celdas con NaN: 13839 (0.1175%)
Columnas con ≥1 NaN: 21 / 2024
Filas con ≥1 NaN: 659 / 5820

Top 20 columnas con más NaN:
(no genres listed)    659
Action                659
Adventure             659
Animation             659
Children              659
Comedy                659
Crime                 659
Documentary           659
Drama                 659
Fantasy               659
Film-Noir             659
Horror                659
IMAX                  659
Musical               659
Mystery               659
Romance               659
Sci-Fi                659
Thriller              659
War                   659
Western               659
dtype: int64


In [None]:


    # Limpieza por si el índice está guardado como columna
for df in (X_train, X_test):
    if df.columns[0].lower() in ("unnamed: 0", "index", ""):
        df.drop(df.columns[0], axis=1, inplace=True)
    df[:] = df.apply(pd.to_numeric, errors="coerce")

print("Shapes → train:", X_train.shape, " test:", X_test.shape)


Shapes → train: (7687, 2024)  test: (3210, 2003)


In [4]:
pca_full = PCA()
pca_full.fit(X_train.values)
explained_var = pca_full.explained_variance_ratio_
cum_explained = np.cumsum(explained_var)

# Selección de k (mínimo con ≥ TARGET_VARIANCE)
k = int(np.searchsorted(cum_explained, TARGET_VARIANCE) + 1)
print(f"Componentes totales: {len(explained_var)} | k seleccionado (≥{int(TARGET_VARIANCE*100)}%): {k}")
print(f"Varianza acumulada con k={k}: {cum_explained[k-1]:.4f}")

# Tabla de varianza explicada
var_table = pd.DataFrame({
    'component': np.arange(1, len(explained_var)+1),
    'explained_variance_ratio': explained_var,
    'cumulative_variance_ratio': cum_explained
})
display(var_table.head(25))

# Gráficos
plt.figure()
plt.plot(np.arange(1, len(explained_var)+1), explained_var, marker='o')
plt.xlabel('Componente principal')
plt.ylabel('Varianza explicada (ratio)')
plt.title('Scree plot - PCA (todas las componentes)')
plt.tight_layout()
plt.show()

plt.figure()
plt.plot(np.arange(1, len(cum_explained)+1), cum_explained, marker='o')
plt.axhline(y=TARGET_VARIANCE, linestyle='--')
plt.axvline(x=k, linestyle='--')
plt.xlabel('Número de componentes')
plt.ylabel('Varianza explicada acumulada')
plt.title(f'Varianza acumulada - PCA (k={k})')
plt.tight_layout()
plt.show()

ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Refit con k componentes y transformación de train/test 
pca_k = PCA(n_components=k, svd_solver='full', random_state=0)
pca_k.fit(X_train.values)
Z_train = pca_k.transform(X_train.values)
Z_test  = pca_k.transform(X_test.values)
print("Z_train:", Z_train.shape, "Z_test:", Z_test.shape)

Z_train: (6819, 19) Z_test: (2923, 19)


In [None]:
# === Guardado de resultados y artefactos ===
out_dir = Path(OUT_DIR)
train_out = out_dir / f"X_train_pca_k{k}.csv"
test_out  = out_dir / f"X_test_pca_k{k}.csv"
var_out   = out_dir / "pca_variance_report.csv"
meta_out  = out_dir / "pca_metadata.json"
model_out = out_dir / f"pca_model_k{k}.pkl"

pd.DataFrame(Z_train).to_csv(train_out, index=False)
pd.DataFrame(Z_test).to_csv(test_out, index=False)
var_table.to_csv(var_out, index=False)

meta = {
    "train_path": TRAIN_PATH,
    "test_path": TEST_PATH,
    "train_shape": tuple(X_train.shape),
    "test_shape": tuple(X_test.shape),
    "n_features": int(X_train.shape[1]),
    "k_selected": int(k),
    "target_variance": float(TARGET_VARIANCE),
    "achieved_variance": float(np.cumsum(pca_k.explained_variance_ratio_)[-1])
}
with open(meta_out, "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

with open(model_out, "wb") as f:
    pickle.dump(pca_k, f)

print("Guardado:")
print("-", train_out)
print("-", test_out)
print("-", var_out)
print("-", meta_out)
print("-", model_out)

Guardado:
- ..\data\output\X_train_pca_k19.csv
- ..\data\output\X_test_pca_k19.csv
- ..\data\output\pca_variance_report.csv
- ..\data\output\pca_metadata.json
- ..\data\output\pca_model_k19.pkl
