In [None]:
# TFM_06_modeling_anomalies.ipynb

In [None]:
import pandas as pd
import numpy as np
import joblib
import time
import logging
import os
import gc
from collections import Counter
from tqdm.notebook import tqdm
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import (
    make_scorer, recall_score, precision_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix,
    classification_report
)
import json
import math

# Notebook-wide logging: timestamped INFO-level messages. force=True replaces
# any handlers already attached to the root logger.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    force=True,
)

# Number of folds passed to TimeSeriesSplit during cross-validation.
N_SPLITS = 5

# Output directories and the artefacts written inside them.
MODEL_DIR, RESULTS_DIR = './modelos_anomalias_clf', './resultados_anomalias_clf'
BEST_MODEL_FILE = os.path.join(MODEL_DIR, 'mejor_modelo_anomalias.joblib')
FEATURES_FILE = os.path.join(MODEL_DIR, 'columnas_relevantes_anomalias.json')
RESULTS_FILE = os.path.join(RESULTS_DIR, 'resultados_cv_anomalias.json')

# Per-symbol scalers read by the data-loading step.
SCALER_FILE = 'old_logs_jsons_and_joblib/escaladores.joblib'

# Create both output directories up-front; existing ones are left untouched.
for _directorio in (MODEL_DIR, RESULTS_DIR):
    os.makedirs(_directorio, exist_ok=True)

def mapear_prediccion_anomalia(pred):
    """Convert raw outlier-detector labels into binary anomaly labels.

    Args:
        pred: Array-like of raw predictions where ``-1`` marks an anomaly
            (any other value is treated as normal). Lists, tuples, numpy
            arrays and pandas Series are accepted.

    Returns:
        np.ndarray: Same shape as ``pred``; ``1`` where ``pred == -1`` and
        ``0`` everywhere else.
    """
    # Coerce first: comparing a plain list with -1 yields a single scalar
    # False rather than an element-wise mask.
    return np.where(np.asarray(pred) == -1, 1, 0)

def _scorer_sobre_etiquetas(metrica):
    """Wrap a label metric so raw -1/1 predictions are mapped to 0/1 first."""
    return make_scorer(
        lambda yt, yp: metrica(yt, mapear_prediccion_anomalia(yp), pos_label=1, zero_division=0)
    )


# Label-based metrics: anomaly (1) is the positive class.
recall_sc = _scorer_sobre_etiquetas(recall_score)
prec_sc = _scorer_sobre_etiquetas(precision_score)
f1_sc = _scorer_sobre_etiquetas(f1_score)

# Ranking metrics: the score handed in is negated before reaching the metric
# (presumably because the detectors score normal samples higher - confirm).
roc_auc_sc = make_scorer(
    lambda yt, ys: roc_auc_score(yt, -ys),
    needs_threshold=True,
    greater_is_better=True,
)
pr_auc_sc = make_scorer(
    lambda yt, ys: average_precision_score(yt, -ys, pos_label=1),
    needs_threshold=True,
    greater_is_better=True,
)

# Registry consumed by the cross-validation loop, keyed by metric name.
SCORERS = dict(recall=recall_sc, precision=prec_sc, f1=f1_sc, roc_auc=roc_auc_sc, pr_auc=pr_auc_sc)

# Metric used to rank configurations, and whether larger values are better.
PRIMARY_METRIC = 'pr_auc'
HIGHER_BETTER = True

logging.info("Configuración inicial lista.")

In [None]:
logging.info("Cargando datos y escaladores...")

# Defaults that later cells see when loading fails part-way through.
datos_cargados = False
escaladores = None
X_train, y_train = None, None
X_test, y_test = None, None
nombres_caracteristicas = []

try:
    # Train / test splits read from CSV files relative to the notebook.
    df_train_orig = pd.read_csv("datos_divididos/datos_financieros_train_completo.csv")
    df_test_orig = pd.read_csv("datos_divididos/datos_financieros_test_completo.csv")

    # Labels are copied out before 'target' is removed from the features.
    y_train = df_train_orig['target'].copy()
    y_test = df_test_orig['target'].copy()

    # Non-feature columns; any that are absent from a frame are skipped.
    cols_a_eliminar = ['timestamp', 'Unnamed: 0', 'symbol', 'target', 'future_max_increase_capped']
    X_train = df_train_orig.drop(columns=cols_a_eliminar, errors='ignore')
    X_test = df_test_orig.drop(columns=cols_a_eliminar, errors='ignore')
    nombres_caracteristicas = X_train.columns.tolist()

    logging.info(f"Características iniciales: {len(nombres_caracteristicas)}")

    if os.path.exists(SCALER_FILE):
        # NOTE(review): joblib.load unpickles arbitrary objects - only load
        # scaler files from a trusted source.
        escaladores = joblib.load(SCALER_FILE)
    else:
        raise FileNotFoundError(f"Archivo no encontrado: {SCALER_FILE}")
    logging.info(f"Escaladores cargados: {SCALER_FILE}")

    logging.info(f"Shapes: X_train={X_train.shape}, y_train={y_train.shape}, X_test={X_test.shape}, y_test={y_test.shape}")
    logging.info(f"Clases Train: {Counter(y_train)}")

except FileNotFoundError as err_archivo:
    # Missing CSV or scaler file: log the message and leave the flag off.
    logging.error(f"{err_archivo}")
    datos_cargados = False
except Exception as err_carga:
    # Any other failure (e.g. a missing column); the traceback is not logged.
    logging.error(f"Error carga: {err_carga}", exc_info=False)
    datos_cargados = False
else:
    datos_cargados = True

In [None]:
def aplicar_escaladores_por_grupo(df_input, escaladores_por_simbolo, columna_grupo, nombres_cols_features):
    """
    Min-max scale feature columns group by group using precomputed bounds.

    For every distinct value of ``columna_grupo`` the matching entry of
    ``escaladores_por_simbolo`` is looked up and each feature value is mapped
    to ``(value - min) / (max - min)``. A (near-)zero range maps to the
    constant 0.5. Values that cannot be scaled (unknown group, unknown
    column, NaN bounds, NaN input, missing group key) become NaN and are
    finally imputed with 0.

    Args:
        df_input (pd.DataFrame): Input frame; it is not modified.
        escaladores_por_simbolo (dict): ``{group: {column: (min, max)}}``.
        columna_grupo (str): Name of the column that identifies the group.
        nombres_cols_features (list): Columns to scale; names absent from
            ``df_input`` are skipped, and the grouping column itself is
            never scaled.

    Returns:
        pd.DataFrame: Copy of ``df_input`` with the selected columns scaled.

    Raises:
        ValueError: If ``columna_grupo`` is not a column of ``df_input``.
    """
    if columna_grupo not in df_input.columns:
        raise ValueError(f"Columna grupo '{columna_grupo}' no encontrada.")

    df_output = df_input.copy()
    cols_a_escalar = [
        col for col in nombres_cols_features
        if col in df_output.columns and col != columna_grupo
    ]

    # Promote integer/boolean feature columns to float once, up-front, so the
    # float/NaN assignments below never depend on implicit dtype upcasting.
    cols_a_promover = {
        col: 'float64'
        for col in cols_a_escalar
        if pd.api.types.is_numeric_dtype(df_output[col])
        and not pd.api.types.is_float_dtype(df_output[col])
    }
    if cols_a_promover:
        df_output = df_output.astype(cols_a_promover)

    # dropna=False keeps rows whose group key is missing; otherwise they would
    # be skipped by the loop and silently left unscaled in the output.
    grouped_df = df_output.groupby(columna_grupo, dropna=False)

    for simbolo, grupo_df in tqdm(grouped_df, total=grouped_df.ngroups, desc=f"Escalando '{columna_grupo}'", leave=False):
        if simbolo not in escaladores_por_simbolo:
            logging.warning(f"Escaladores no encontrados para '{simbolo}'. Se asigna NaN.")
            df_output.loc[grupo_df.index, cols_a_escalar] = np.nan
            continue

        scalers_simbolo_actual = escaladores_por_simbolo[simbolo]

        for col_name in cols_a_escalar:
            if col_name not in scalers_simbolo_actual:
                logging.warning(f"Escalador no encontrado para '{simbolo}' / '{col_name}'. Se asigna NaN.")
                df_output.loc[grupo_df.index, col_name] = np.nan
                continue

            min_val, max_val = scalers_simbolo_actual[col_name]
            col_original_grupo = grupo_df[col_name]

            if pd.isna(min_val) or pd.isna(max_val):
                # Unusable bounds: nothing can be scaled for this column.
                scaled_col = pd.Series(np.nan, index=col_original_grupo.index, name=col_name)
            else:
                denom = max_val - min_val
                if abs(denom) < 1e-9:
                    # Degenerate range: avoid dividing by ~0, use the midpoint.
                    scaled_col = pd.Series(0.5, index=col_original_grupo.index, name=col_name)
                else:
                    scaled_col = (col_original_grupo - min_val) / denom
                # Missing inputs stay missing even in the constant-0.5 case.
                scaled_col = scaled_col.where(col_original_grupo.notna(), np.nan)

            df_output.loc[grupo_df.index, col_name] = scaled_col

    nans_restantes = df_output[cols_a_escalar].isna().sum().sum()
    if nans_restantes > 0:
        logging.warning(f"{nans_restantes} NaNs restantes tras escalar. Imputando con 0.")
        df_output[cols_a_escalar] = df_output[cols_a_escalar].fillna(0)

    return df_output


# Outputs of this step; they keep these defaults unless scaling succeeds.
scaling_ok = False
X_train_norm = None
X_test_norm = None
X_train_normal_norm = None

if not (datos_cargados and escaladores):
    logging.info("Escalado omitido por errores previos en carga.")
    scaling_ok = False
else:
    logging.info("Aplicando escaladores por grupo...")
    try:
        columna_grupo = 'symbol'

        # When the grouping column is missing from the feature frame but
        # present in the original frame, re-attach it; otherwise work on a
        # plain copy of the feature frame.
        X_train_temp = (
            pd.concat([X_train, df_train_orig[[columna_grupo]]], axis=1)
            if columna_grupo not in X_train.columns and columna_grupo in df_train_orig.columns
            else X_train.copy()
        )
        X_test_temp = (
            pd.concat([X_test, df_test_orig[[columna_grupo]]], axis=1)
            if columna_grupo not in X_test.columns and columna_grupo in df_test_orig.columns
            else X_test.copy()
        )

        X_train_norm_con_grupo = aplicar_escaladores_por_grupo(
            X_train_temp, escaladores, columna_grupo, nombres_caracteristicas
        )
        X_test_norm_con_grupo = aplicar_escaladores_por_grupo(
            X_test_temp, escaladores, columna_grupo, nombres_caracteristicas
        )

        # The grouping column is only needed while scaling.
        X_train_norm = X_train_norm_con_grupo.drop(columns=[columna_grupo], errors='ignore')
        X_test_norm = X_test_norm_con_grupo.drop(columns=[columna_grupo], errors='ignore')

        # Subset of the training data labelled 0 (normal class).
        X_train_normal_norm = X_train_norm[y_train == 0]
        if X_train_normal_norm.empty:
            raise ValueError("No hay datos normales (clase 0) después de escalar.")

        logging.info("Escalado por grupo finalizado.")
        logging.info(f"Shapes escalados: X_train_norm={X_train_norm.shape}, X_test_norm={X_test_norm.shape}")
        logging.info(f"Shape entreno anomalías: {X_train_normal_norm.shape}")
        scaling_ok = True

    except ValueError as err_valor:
        logging.error(f"Error valor: {err_valor}")
        scaling_ok = False
    except Exception as err_escalado:
        logging.error(f"Error escalado: {err_escalado}", exc_info=False)
        scaling_ok = False

In [None]:
# Registry of candidate detectors; it stays empty when scaling failed.
modelos_anomalia = {}
# Toggles n_jobs for IsolationForest between -1 (enabled) and 1 (disabled).
ALLOW_MULTIPROCESSING = True

if not scaling_ok:
    logging.error("Modelos no definidos por errores previos.")
else:
    # Each entry: estimator class, list of hyper-parameter sets to try
    # ('grid') and keyword arguments shared by every set ('fixed').
    _grid_if = [
        {'n_estimators': 100, 'contamination': 'auto', 'max_features': 1.0},
        {'n_estimators': 300, 'contamination': 0.03, 'max_features': 0.7},
    ]
    _fixed_if = {
        'random_state': 42,
        'warm_start': False,
        'n_jobs': -1 if ALLOW_MULTIPROCESSING else 1,
    }

    _grid_ocsvm = [
        {'kernel': 'rbf', 'gamma': 'scale', 'nu': 0.03},
        {'kernel': 'rbf', 'gamma': 0.01, 'nu': 0.05},
    ]
    _fixed_ocsvm = {'max_iter': 3000, 'cache_size': 500}

    modelos_anomalia = {
        'IF': {'model': IsolationForest, 'grid': _grid_if, 'fixed': _fixed_if},
        'OCSVM': {'model': OneClassSVM, 'grid': _grid_ocsvm, 'fixed': _fixed_ocsvm},
    }
    logging.info(f"Modelos definidos: {list(modelos_anomalia.keys())}")

In [None]:
resultados_cv_anom = {}

# Metrics that must be computed from continuous anomaly scores; every other
# metric in SCORERS is computed from hard -1/1 predictions. The distinction is
# made by name because the scorer objects carry no reliable flag for it.
METRICAS_CON_SCORE = frozenset({'roc_auc', 'pr_auc'})

if scaling_ok and modelos_anomalia:
    logging.info("Iniciando Cross-Validation...")
    model_bar = tqdm(modelos_anomalia.items(), desc="Modelos")

    for nombre_mod, config_mod in model_bar:
        model_bar.set_description(f"Modelo: {nombre_mod}")
        resultados_cv_anom[nombre_mod] = {}
        param_bar = tqdm(config_mod['grid'], desc=f"Params ({nombre_mod})", leave=False)

        for i, params in enumerate(param_bar):
            param_key = f"p_{i}"
            param_bar.set_postfix_str(str(params), refresh=True)
            resultados_cv_anom[nombre_mod][param_key] = {'params': params, 'cv_scores': {}, 'status': 'Pendiente'}
            scores_fold = {m: [] for m in SCORERS}
            fits_t, preds_t = [], []
            cv_error = False
            # Defined up-front so the cleanup below works even when every
            # fold is skipped or fails before a model is built.
            m_fold = None

            cv_split = TimeSeriesSplit(n_splits=N_SPLITS)
            fold_bar = tqdm(enumerate(cv_split.split(X_train_norm)), total=N_SPLITS, desc="CV Folds", leave=False)

            for fold, (idx_train, idx_val) in fold_bar:
                fold_bar.set_description(f"Fold {fold+1}/{N_SPLITS}")
                try:
                    xt_fold = X_train_norm.iloc[idx_train]
                    yt_fold = y_train.iloc[idx_train]
                    xv_fold = X_train_norm.iloc[idx_val]
                    yv_fold = y_train.iloc[idx_val]

                    # Detectors are fitted on the normal class (label 0) only.
                    xt_fold_norm = xt_fold[yt_fold == 0]

                    if len(xt_fold_norm) < 5:
                        logging.warning(f"Fold {fold+1}: Pocos datos normales ({len(xt_fold_norm)}). Omitiendo.")
                        for m in scores_fold:
                            scores_fold[m].append(np.nan)
                        continue

                    m_fold = config_mod['model'](**config_mod.get('fixed', {}), **params)
                    t0 = time.time()
                    m_fold.fit(xt_fold_norm)
                    fits_t.append(time.time() - t0)

                    scores_v, preds_v_raw = None, None
                    t0 = time.time()
                    # Both score sources are kept in the same orientation
                    # (lower = more anomalous): the metric functions in
                    # SCORERS negate the score themselves, and the percentile
                    # threshold below flags the lowest scores.
                    if hasattr(m_fold, 'decision_function'):
                        scores_v = m_fold.decision_function(xv_fold)
                    elif hasattr(m_fold, 'score_samples'):
                        scores_v = m_fold.score_samples(xv_fold)

                    if hasattr(m_fold, 'predict'):
                        preds_v_raw = m_fold.predict(xv_fold)
                    elif scores_v is not None:
                        # No predict(): flag the lowest `pct` percent of the
                        # scores, using nu / contamination as expected rate.
                        cont = params.get('contamination', 'auto')
                        nu = params.get('nu')
                        pct = (nu * 100) if nu else (2 if cont == 'auto' else cont * 100)
                        scores_validos = scores_v[~np.isnan(scores_v)]
                        if scores_validos.size:
                            thr = np.percentile(scores_validos, pct)
                            preds_v_raw = np.where(scores_v < thr, -1, 1)
                        else:
                            preds_v_raw = np.ones_like(scores_v)

                    preds_t.append(time.time() - t0)

                    for m_name, scorer in SCORERS.items():
                        # Ranking metrics get continuous scores, the rest get
                        # the raw -1/1 predictions.
                        entrada = scores_v if m_name in METRICAS_CON_SCORE else preds_v_raw
                        funcion_metrica = getattr(scorer, '_score_func', scorer)
                        try:
                            score = funcion_metrica(yv_fold, entrada) if entrada is not None else np.nan
                        except Exception as e_m:
                            # e.g. a validation fold that contains one class only.
                            logging.warning(f"Fold {fold+1}: métrica '{m_name}' no calculable ({e_m}).")
                            score = np.nan
                        scores_fold[m_name].append(score)

                except Exception as e_f:
                    logging.error(f"Fold {fold+1} Error: {e_f}", exc_info=False)
                    cv_error = True
                    # Keep one entry per fold in every metric list.
                    for m in scores_fold:
                        if len(scores_fold[m]) == fold:
                            scores_fold[m].append(np.nan)

            resultado_actual = resultados_cv_anom[nombre_mod][param_key]
            if not cv_error:
                summary = {}
                for m, s_list in scores_fold.items():
                    valid = [s for s in s_list if not pd.isna(s)]
                    summary[f'{m}_mean'] = np.mean(valid) if valid else np.nan
                    summary[f'{m}_std'] = np.std(valid) if valid else np.nan
                # Missing values are stored as None so the JSON dump stays valid.
                resultado_actual['cv_scores'] = {k: (round(v, 5) if not pd.isna(v) else None) for k, v in summary.items()}
                resultado_actual['t_fit_m'] = round(float(np.mean(fits_t)), 3) if fits_t else None
                resultado_actual['t_pred_m'] = round(float(np.mean(preds_t)), 3) if preds_t else None
                resultado_actual['status'] = 'OK'
            else:
                resultado_actual['status'] = 'Error CV'
                # Ensure the scores dict exists even on error.
                resultado_actual['cv_scores'] = {f'{m}_mean': None for m in SCORERS}

            del m_fold
            gc.collect()

    logging.info("Cross-Validation finalizada.")
else:
    logging.warning("Cross-Validation no ejecutada por errores previos.")

def json_serializer(obj):
    """Fallback encoder for ``json.dump(..., default=json_serializer)``.

    Converts numpy / pandas objects that the standard encoder rejects:

    * numpy scalars -> native Python scalars (NaN becomes ``None``)
    * numpy arrays, pandas Series / Index -> lists
    * pandas DataFrames -> list of row dictionaries
    * classes -> their string representation
    * missing scalars (``pd.NaT``, ``pd.NA``, ...) -> ``None``
    * anything else -> ``str(obj)``

    Args:
        obj: Object the JSON encoder could not serialize.

    Returns:
        A JSON-compatible replacement for ``obj``.
    """
    if isinstance(obj, (np.number, np.bool_)):
        valor = obj.item()
        # NaN has no JSON representation; emit null instead.
        if isinstance(valor, float) and valor != valor:
            return None
        return valor
    if isinstance(obj, (np.ndarray, pd.Series, pd.Index)):
        return obj.tolist()
    if isinstance(obj, pd.DataFrame):
        return obj.to_dict('records')
    if isinstance(obj, type):
        return str(obj)
    # pd.isna returns an array for array-likes, which cannot be used in a
    # boolean test, so only scalars are checked for missingness.
    if pd.api.types.is_scalar(obj) and pd.isna(obj):
        return None
    return str(obj)

# Persist the cross-validation summary; a failure is logged, not raised.
logging.info(f"Guardando resultados CV: {RESULTS_FILE}")
try:
    with open(RESULTS_FILE, 'w') as archivo_resultados:
        json.dump(
            resultados_cv_anom,
            archivo_resultados,
            default=json_serializer,
            indent=2,
        )
except Exception as err_guardado:
    logging.error(f"Error guardando JSON: {err_guardado}")

In [None]:
# Start from the worst possible score so the first valid candidate wins.
best_score = -np.inf if HIGHER_BETTER else np.inf
best_cfg = best_name = best_param_key = best_cv_scores = None

logging.info(f"Buscando mejor configuración basada en '{PRIMARY_METRIC}'...")

if resultados_cv_anom:
    for name, p_dict in resultados_cv_anom.items():
        for p_key, res in p_dict.items():
            # Only configurations that completed CV are candidates.
            if res.get('status') != 'OK' or 'cv_scores' not in res:
                continue

            score = res['cv_scores'].get(f'{PRIMARY_METRIC}_mean')
            if score is None or pd.isna(score):
                continue

            # Strict comparison: ties keep the earlier configuration.
            is_better = (score > best_score) if HIGHER_BETTER else (score < best_score)
            if not is_better:
                continue

            best_score = score
            best_cfg = res.get('params')
            best_name = name
            best_param_key = p_key
            best_cv_scores = res.get('cv_scores')
            logging.info(f"  Nuevo mejor: {best_name} ({best_param_key}), Score: {best_score:.5f}")

    if best_cfg is None:
        logging.warning("No se encontró una configuración válida como la mejor.")
    else:
        logging.info(f"Mejor Modelo: {best_name}")
        logging.info(f"  Parámetros: {best_cfg}")
        logging.info(f"  Score CV ({PRIMARY_METRIC}): {best_score:.5f}")
        logging.info(f"  Scores CV Detalle: {best_cv_scores}")
else:
    logging.warning("No hay resultados de CV para seleccionar.")

In [None]:
# Outputs of this step, read by the feature-importance cell.
model_saved = False
final_model = None
test_metrics = {}

if scaling_ok and best_cfg is not None and best_name is not None:
    logging.info(f"Re-entrenando mejor modelo ({best_name}) con datos normales...")

    cfg_best = modelos_anomalia.get(best_name)
    if cfg_best:
        model_class = cfg_best.get('model')
        # Selected hyper-parameters override the fixed ones on key clashes.
        final_params = {**cfg_best.get('fixed', {}), **best_cfg}

        if model_class:
            final_model = model_class(**final_params)

            try:
                if X_train_normal_norm.isna().any().any():
                    logging.warning("NaNs detectados en datos de entrenamiento final. Imputando con 0.")
                    X_train_normal_norm = X_train_normal_norm.fillna(0)

                t_start_fit = time.time()
                final_model.fit(X_train_normal_norm)
                t_fit = time.time() - t_start_fit
                logging.info(f"Entrenamiento final completado ({t_fit:.2f}s).")

                logging.info("Evaluando modelo final en conjunto de test...")
                scores_t, preds_raw_t = None, None

                if X_test_norm.isna().any().any():
                    logging.warning("NaNs detectados en datos de test. Imputando con 0.")
                    X_test_norm = X_test_norm.fillna(0)

                # Both score sources are kept in the same orientation
                # (lower = more anomalous): the metrics below negate the score
                # and the percentile threshold flags the lowest scores.
                if hasattr(final_model, 'decision_function'):
                    scores_t = final_model.decision_function(X_test_norm)
                elif hasattr(final_model, 'score_samples'):
                    scores_t = final_model.score_samples(X_test_norm)

                if hasattr(final_model, 'predict'):
                    preds_raw_t = final_model.predict(X_test_norm)
                elif scores_t is not None:
                    # No predict(): flag the lowest `pct` percent of the
                    # scores, using nu / contamination as expected rate.
                    cont = best_cfg.get('contamination', 'auto')
                    nu = best_cfg.get('nu')
                    pct = (nu * 100) if nu else (2 if cont == 'auto' else cont * 100)
                    scores_validos = scores_t[~np.isnan(scores_t)]
                    if scores_validos.size:
                        thr = np.percentile(scores_validos, pct)
                        preds_raw_t = np.where(scores_t < thr, -1, 1)
                    else:
                        preds_raw_t = np.ones_like(scores_t)

                if scores_t is not None:
                    metricas_score = {
                        'roc_auc': lambda: roc_auc_score(y_test, -scores_t),
                        'pr_auc': lambda: average_precision_score(y_test, -scores_t, pos_label=1),
                    }
                    for nombre_m, calcular in metricas_score.items():
                        try:
                            test_metrics[nombre_m] = round(calcular(), 5)
                        except Exception as e_m:
                            # Best effort: the metric is left out, but the
                            # reason is no longer hidden.
                            logging.warning(f"Métrica de test '{nombre_m}' no calculable: {e_m}")

                if preds_raw_t is not None:
                    preds_map_t = mapear_prediccion_anomalia(preds_raw_t)
                    metricas_etiqueta = {
                        'recall': recall_score,
                        'precision': precision_score,
                        'f1': f1_score,
                    }
                    for nombre_m, funcion_m in metricas_etiqueta.items():
                        try:
                            test_metrics[nombre_m] = round(
                                funcion_m(y_test, preds_map_t, pos_label=1, zero_division=0), 5
                            )
                        except Exception as e_m:
                            logging.warning(f"Métrica de test '{nombre_m}' no calculable: {e_m}")

                    print(f"\nMétricas Test: {test_metrics}")
                    print("\nMatriz de Confusión Test:")
                    print(confusion_matrix(y_test, preds_map_t, labels=[0, 1]))
                    print("\nReporte de Clasificación Test:")
                    print(classification_report(y_test, preds_map_t, labels=[0, 1], target_names=['Normal', 'Anomalia'], zero_division=0))
                elif test_metrics:
                    print(f"\nMétricas Test (basadas en score): {test_metrics}")
                else:
                    print("\nNo se pudieron calcular métricas de test.")

                logging.info(f"Guardando modelo final: {BEST_MODEL_FILE}")
                try:
                    try:
                        # Re-stamp the feature names only when the estimator
                        # already tracks them and the count matches, storing
                        # them as an ndarray like scikit-learn does.
                        if (
                            hasattr(final_model, 'feature_names_in_')
                            and nombres_caracteristicas
                            and len(nombres_caracteristicas) == len(final_model.feature_names_in_)
                        ):
                            final_model.feature_names_in_ = np.asarray(nombres_caracteristicas, dtype=object)
                    except AttributeError:
                        pass

                    joblib.dump(final_model, BEST_MODEL_FILE)
                    logging.info("Modelo guardado correctamente.")
                    model_saved = True
                except Exception as e_save:
                    logging.error(f"Error guardando modelo: {e_save}")

            except Exception as e_final:
                logging.error(f"Error en re-entrenamiento/evaluación: {e_final}", exc_info=False)
        else:
            logging.error(f"Clase de modelo no encontrada para {best_name}")
    else:
        logging.error(f"Configuración no encontrada para {best_name}")

else:
    logging.warning("Re-entrenamiento no ejecutado: faltan datos escalados o mejor configuración.")


In [None]:
logging.info("Analizando importancia de características (si aplica)...")
features_dict = {}
# Maximum number of features kept in the ranking.
NUM_TOP = 15

if not (model_saved and final_model is not None and nombres_caracteristicas):
    logging.warning("Análisis de importancia omitido: modelo no guardado, no cargado o faltan nombres de características.")
else:
    logging.info(f"Calculando importancia para: {best_name}")
    try:
        imp = None
        feature_names = nombres_caracteristicas  # feature list captured at load time

        # Pick an importance source: native importances first, then absolute
        # coefficients of a linear One-Class SVM as a proxy.
        if hasattr(final_model, 'feature_importances_'):
            imp = final_model.feature_importances_
            imp_type = 'importance'
        elif isinstance(final_model, OneClassSVM) and getattr(final_model, 'kernel', '') == 'linear' and hasattr(final_model, 'coef_'):
            imp = np.abs(final_model.coef_[0])
            imp_type = 'abs_coef_proxy'
        else:
            features_dict = {"info": "Importancia no disponible para este modelo."}

        if imp is None:
            if "info" in features_dict:
                print(f"  {features_dict['info']}")
            elif "error" in features_dict:
                print(f"  Error calculando importancia.")
        elif len(imp) == len(feature_names):
            # One importance per feature: rank and keep the top NUM_TOP.
            df_imp = (
                pd.DataFrame({'feature': feature_names, imp_type: imp})
                .sort_values(imp_type, ascending=False)
                .head(NUM_TOP)
            )
            features_dict = df_imp.round(5).to_dict('records')
            print(f"\nTop {len(features_dict)} Características (Proxy):")
            for registro in features_dict:
                print(f"  - {registro}")
        else:
            features_dict = {"error": f"Discrepancia en número de importancias ({len(imp)}) y características ({len(feature_names)})."}
            logging.error(features_dict["error"])

        # Whatever was produced (ranking, info or error) is written to disk.
        if features_dict:
            try:
                with open(FEATURES_FILE, 'w') as archivo_features:
                    json.dump(features_dict, archivo_features, indent=2, default=json_serializer)
                logging.info(f"Información de importancia guardada: {FEATURES_FILE}")
            except Exception as err_json:
                logging.error(f"Error guardando JSON de importancia: {err_json}")

    except Exception as err_importancia:
        logging.error(f"Error analizando importancia: {err_importancia}", exc_info=False)

logging.info("Proceso completado.")