In [14]:

# Input CSV locations (relative paths); both are read with pd.read_csv in a later cell.
TRAIN_PATH = "../data/dataset/X_train_scaled.csv"  
TEST_PATH  = "../data/dataset/X_test_scaled.csv"   
# NOTE(review): OUT_DIR is not referenced by any cell visible here; the export
# cells build their own output directories. Confirm whether it is still needed.
OUT_DIR    = "../data/output"  

In [15]:
import pandas as pd

In [16]:
# Load the train and test tables from the CSV paths defined in the config cell.
X_train = pd.read_csv(TRAIN_PATH)
X_test  = pd.read_csv(TEST_PATH)

In [17]:
# Quick sanity check: (rows, columns) of each loaded table.
print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")

Train shape: (6819, 2003)
Test shape: (2923, 2003)


In [18]:
def analyze_missing_values(df):
    """Summarize the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with two columns: ``'Missing Values'`` (count
        of null cells) and ``'Missing Percentage'`` (count divided by
        ``len(df)``, times 100, rounded to 2 decimals). Rows are sorted by
        count in descending order and only columns with at least one null
        cell are kept.
    """
    # Count null cells once per column and reuse the result for the percentage.
    null_counts = df.isnull().sum()
    null_share = (null_counts / len(df)) * 100

    summary = pd.DataFrame(
        {
            'Missing Values': null_counts,
            'Missing Percentage': null_share.round(2),
        }
    )

    # Sort first, then filter, so the most affected columns come first.
    ranked = summary.sort_values(by='Missing Values', ascending=False)
    has_missing = ranked['Missing Values'] > 0
    return ranked[has_missing]


In [19]:
# Per-column missing-value report, first for the training table, then for the test table.
print("=== Missing Values en X_train ===")
display(analyze_missing_values(X_train))

print("\n=== Missing Values en X_test ===")
display(analyze_missing_values(X_test))

# Grand totals: null cells summed over every column of each table.
print("\nTotal de valores faltantes:")
print(f"X_train: {X_train.isna().sum().sum():,}")
print(f"X_test: {X_test.isna().sum().sum():,}")

=== Missing Values en X_train ===


Unnamed: 0,Missing Values,Missing Percentage
hog_1763,1586,23.26
hog_1762,1586,23.26
hog_1761,1586,23.26
hog_1760,1586,23.26
hog_1759,1586,23.26
...,...,...
zernike_22,1575,23.10
color_9,1575,23.10
zernike_0,1575,23.10
zernike_23,1575,23.10



=== Missing Values en X_test ===


Unnamed: 0,Missing Values,Missing Percentage
hog_1763,700,23.95
hog_1762,700,23.95
hog_1761,700,23.95
hog_1760,700,23.95
hog_1759,700,23.95
...,...,...
zernike_22,698,23.88
color_9,698,23.88
zernike_0,698,23.88
zernike_23,698,23.88



Total de valores faltantes:
X_train: 3,139,479
X_test: 1,386,266


In [20]:
# Report the table sizes before any rows are removed.
print("Dimensiones originales:")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")

# Keep only complete rows; dropna() defaults to axis=0, how='any'.
X_train_clean = X_train.dropna()
X_test_clean = X_test.dropna()

# Report the table sizes once the incomplete rows are gone.
print("\nDimensiones después de eliminar filas con NA:")
print(f"X_train: {X_train_clean.shape}")
print(f"X_test: {X_test_clean.shape}")

# Number of rows removed from each table.
print("\nFilas eliminadas:")
print(f"X_train: {len(X_train) - len(X_train_clean):,} filas")
print(f"X_test: {len(X_test) - len(X_test_clean):,} filas")

Dimensiones originales:
X_train: (6819, 2003)
X_test: (2923, 2003)

Dimensiones después de eliminar filas con NA:
X_train: (5233, 2003)
X_test: (2223, 2003)

Filas eliminadas:
X_train: 1,586 filas
X_test: 700 filas


In [21]:
# Export the cleaned tables to CSV.
import os

# Frames to export: the row-complete tables produced by the dropna cell.
Xtr = X_train_clean
Xte = X_test_clean

# Destination folder and the two output files inside it.
out_dir = "../data/dataset/final"
path_train = os.path.join(out_dir, "X_trained_scaled_final.csv")
path_test  = os.path.join(out_dir, "X_test_scaled_final.csv")

# Create the folder on demand; exist_ok makes re-running the cell safe.
os.makedirs(out_dir, exist_ok=True)

# index=False keeps the DataFrame index out of the files.
Xtr.to_csv(path_train, index=False)
Xte.to_csv(path_test, index=False)

print("Guardado:")
print(f"- {path_train}  -> shape={Xtr.shape}")
print(f"- {path_test}   -> shape={Xte.shape}")


Guardado:
- ../data/dataset/final\X_trained_scaled_final.csv  -> shape=(5233, 2003)
- ../data/dataset/final\X_test_scaled_final.csv   -> shape=(2223, 2003)


Ahora vamos a eliminar las columnas de categorías para trabajar solo con las características en los métodos de clustering.

In [None]:
# Without the category columns, but keeping the first (id) column.
# The drop is purely positional: columns at positions 1..21 (21 columns) are removed.
# NOTE(review): the notebook text describes these as the category columns and
# column 0 as the movie id; confirm against the CSV header.
Xtr_no_cat = Xtr.drop(columns=Xtr.columns[1:22])
Xte_no_cat = Xte.drop(columns=Xte.columns[1:22])

path_train_no_cat = os.path.join(out_dir, "X_trained_scaled_final_no_cat.csv")
path_test_no_cat  = os.path.join(out_dir, "X_test_scaled_final_no_cat.csv")

# Export without the index.
Xtr_no_cat.to_csv(path_train_no_cat, index=False)
Xte_no_cat.to_csv(path_test_no_cat, index=False)

# Report the files actually written (path_*_no_cat), not path_train/path_test
# with a suffix appended after ".csv". Avoiding nested double quotes inside the
# f-string also keeps the cell valid on Python versions before 3.12.
print("Guardado:")
print(f"- {path_train_no_cat}  -> shape={Xtr_no_cat.shape}")
print(f"- {path_test_no_cat}   -> shape={Xte_no_cat.shape}")

Guardado:
- ../data/dataset/final\X_trained_scaled_final.csv  -> shape=(5233, 1982)
- ../data/dataset/final\X_test_scaled_final.csv   -> shape=(2223, 1982)


In [29]:
# Without the category columns and without the first (id) column.
PATH = "../data/dataset/final_no_movieid"
# to_csv does not create missing folders, so make sure the destination exists
# before writing; exist_ok makes re-running the cell safe.
os.makedirs(PATH, exist_ok=True)

# Drop the column named like the first column of Xtr/Xte.
# NOTE(review): presumably the movie id; confirm against the CSV header.
Xtr_no_cat3 = Xtr_no_cat.drop(columns=Xtr.columns[0])
Xte_no_cat3 = Xte_no_cat.drop(columns=Xte.columns[0])

# These names are rebound here: from now on they point at the "_noid" files.
path_train_no_cat = os.path.join(PATH, "X_trained_scaled_final_no_cat_noid.csv")
path_test_no_cat  = os.path.join(PATH, "X_test_scaled_final_no_cat_noid.csv")

# Export without the index.
Xtr_no_cat3.to_csv(path_train_no_cat, index=False)
Xte_no_cat3.to_csv(path_test_no_cat, index=False)

# Report each written file rather than the bare folder, so the two lines are
# distinguishable and match what is on disk.
print("Guardado:")
print(f"- {path_train_no_cat}  -> shape={Xtr_no_cat3.shape}")
print(f"- {path_test_no_cat}   -> shape={Xte_no_cat3.shape}")

Guardado:
- ../data/dataset/final_no_movieid  -> shape=(5233, 1981)
- ../data/dataset/final_no_movieid   -> shape=(2223, 1981)
