In [21]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from pathlib import Path
from sklearn.model_selection import train_test_split
import pickle

# Purpose of this cell: load the intermediate dataset, select the encoded
# features, make a stratified train/test split, balance ONLY the training
# portion with SMOTE, and persist both portions as pickle files.

# Project root = one level above the current working directory
# (assumes the notebook is run from its own sub-folder of the project).
root_dir = Path.cwd().parents[0]

# Input (intermediate) and output (processed) data folders.
data_intermediate = root_dir.joinpath("data", "intermediate")
data_processed = root_dir.joinpath("data", "processed")

# Show the resolved root so a wrong working directory is easy to spot.
print("Directorio raíz:", root_dir)

# -- Load the dataset --
df = pd.read_pickle(data_intermediate / "df.pkl")

# -- Feature selection --
# Encoded columns are picked up by their name suffix.
freq_cols = [name for name in df.columns if name.endswith('_freq')]
target_mean_cols = [name for name in df.columns if name.endswith('_target_mean')]
feature_cols = ['Academic_Score']

# NOTE(review): the *_target_mean columns look like target encodings computed
# upstream on the whole dataset; if so, they leak test labels into the
# features — confirm they were fitted on training rows only.
# (Alternative feature set without them: freq_cols + feature_cols.)
selected_features = [*freq_cols, *target_mean_cols, *feature_cols]
X = df[selected_features]
y = df['Performance_num']

# -- Train/test split BEFORE balancing --
# Splitting first keeps synthetic SMOTE samples out of the test set;
# stratify=y preserves the class proportions in both portions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# -- Class balancing (training portion only) --
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# -- Persist the processed data --
# Create the output folder if it does not exist yet.
data_processed.mkdir(parents=True, exist_ok=True)

train_path = data_processed / "Xy_train_resampled.pkl"
test_path = data_processed / "Xy_test.pkl"

# Each file stores an (X, y) tuple: resampled train first, untouched test second.
for out_path, payload in (
    (train_path, (X_train_res, y_train_res)),
    (test_path, (X_test, y_test)),
):
    with out_path.open("wb") as f:
        pickle.dump(payload, f)

print("Datos Entrenamiento resampleados y guardados en:", train_path)
print("Datos Testeo guardados en:", test_path)

Directorio raíz: C:\Users\Jesus Tamez\Desktop\MLOps_Proyecto\test-MLops-CEE_DATA
Datos Entrenamiento resampleados y guardados en: C:\Users\Jesus Tamez\Desktop\MLOps_Proyecto\test-MLops-CEE_DATA\data\processed\Xy_train_resampled.pkl
Datos Testeo guardados en: C:\Users\Jesus Tamez\Desktop\MLOps_Proyecto\test-MLops-CEE_DATA\data\processed\Xy_test.pkl
