Dependencias

In [None]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [10]:
# Candidate locations of the exploratory-analysis folder, relative to the
# directory the notebook is executed from.
possible_paths = [
    '../analisis_exploratorios_tp3/',
    '../../analisis_exploratorios_tp3/'
]

# Keep the first candidate that actually contains train.csv (None if none does).
data_path = next(
    (path for path in possible_paths if os.path.exists(f"{path}train.csv")),
    None,
)

if data_path is None:
    # Show what is in the working directory to help locate the data, then stop:
    # every later cell needs train/validation/test, so continuing would only
    # fail further down with an unrelated NameError.
    print(" No se encontraron los archivos CSV en ninguna ruta")
    print(" Contenido del directorio actual:")
    for item in os.listdir('.'):
        print(f"    {item}")
    raise FileNotFoundError(
        f"No se encontró train.csv en ninguna de las rutas: {possible_paths}"
    )

# The first CSV column is used as the index and parsed as dates.
train = pd.read_csv(f'{data_path}train.csv', index_col=0, parse_dates=True)
validation = pd.read_csv(f'{data_path}validation.csv', index_col=0, parse_dates=True)
test = pd.read_csv(f'{data_path}test.csv', index_col=0, parse_dates=True)

print(" Datos cargados exitosamente:")
print(f"   Train: {train.shape}")
print(f"   Validation: {validation.shape}")
print(f"   Test: {test.shape}")
print(f"   Columnas disponibles: {len(train.columns)} columnas")

 Datos cargados exitosamente:
   Train: (1110, 48)
   Validation: (370, 48)
   Test: (370, 48)
   Columnas disponibles: 48 columnas


In [None]:
# Columns standardised with StandardScaler. They are kept in two groups only
# so that the output column order stays: first group, one-hot encoded
# calendar columns, second group.
market_numeric_cols = [
    'btc_h', 'btc_l', 'btc_v',
    *[f'btc_prev_c{lag}' for lag in range(1, 8)],
    'fng_value', 'bitcoin',
    'is_month_start', 'is_month_end',
    'sp500_c', 'dowj_c', 'nasdaq_c', 'eurostoxx_c', 'nikkei_c',
]
indicator_numeric_cols = [
    'RSI_14',
    'BBL_20_2.0_2.0', 'BBM_20_2.0_2.0', 'BBU_20_2.0_2.0',
    'BBB_20_2.0_2.0', 'BBP_20_2.0_2.0',
    'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_12_26_9',
    'ATRr_14',
]
# (transformer-name suffix, DataFrame column) pairs that are one-hot encoded.
# Note the year column is spelled 'Year' in the data.
calendar_cols = [('day', 'day'), ('month', 'month'), ('year', 'Year')]


def _scale_steps(columns):
    """Return one ('scale_<col>', StandardScaler(), [col]) step per column."""
    return [(f'scale_{col}', StandardScaler(), [col]) for col in columns]


def _one_hot_steps(named_columns):
    """Return one ('ohe_<name>', OneHotEncoder, [col]) step per (name, col) pair.

    Categories not seen during fit are ignored at transform time
    (handle_unknown='ignore').
    """
    return [
        (f'ohe_{name}', OneHotEncoder(handle_unknown='ignore'), [col])
        for name, col in named_columns
    ]


# Columns not listed above (including the NextClose_BTC* targets) are dropped,
# which is ColumnTransformer's default for the remainder.
mapper = ColumnTransformer([
    *_scale_steps(market_numeric_cols),
    *_one_hot_steps(calendar_cols),
    *_scale_steps(indicator_numeric_cols),
])

# NOTE(review): the recorded output of this cell shows RuntimeWarnings from the
# scaler's mean/variance update, which suggests at least one scaled column has
# no usable values in `train` - TODO confirm which column and handle it.
mapper.fit(train)

train_transformed = mapper.transform(train)
validation_transformed = mapper.transform(validation)
test_transformed = mapper.transform(test)

# Transformed training matrix as a DataFrame, for inspection.
train_df = pd.DataFrame(
    train_transformed,
    columns=mapper.get_feature_names_out(),
    index=train.index
)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [6]:
# One model per forecast horizon: NextClose_BTC1 ... NextClose_BTC7.
targets = [f"NextClose_BTC{i}" for i in range(1, 8)]
knn_models = {}
output_dir = 'src/models'

# Hyperparameters explored for the 'regressor' step of the pipeline.
param_grid = {
    'regressor__n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'regressor__weights': ['uniform', 'distance'],
    'regressor__metric': ['euclidean', 'manhattan']
}

print(" Iniciando BÚSQUEDA DE HIPERPARÁMETROS para 7 modelos KNN...")

for target in targets:
    print(f"\n--- Optimizando y Entrenando {target} ---")

    # Rows without a value for this target cannot be used to fit or score it.
    train_cleaned = train.dropna(subset=[target])
    validation_cleaned = validation.dropna(subset=[target])

    # The full DataFrame is passed in; `mapper` selects the feature columns,
    # so the target columns never reach the regressor as inputs.
    knn_pipeline = Pipeline([
        ('mapper', mapper),
        ('imputer', SimpleImputer(strategy='mean')),
        ('regressor', KNeighborsRegressor()),
    ])

    # GridSearchCV clones the pipeline for every fit, so the shared `mapper`
    # object is not modified here.
    # NOTE(review): cv=3 uses un-shuffled KFold; if the rows are in
    # chronological order, some folds are trained on data later than the
    # fold they are scored on - consider TimeSeriesSplit. TODO confirm.
    grid_search = GridSearchCV(
        estimator=knn_pipeline,
        param_grid=param_grid,
        scoring='neg_mean_absolute_error',
        cv=3,
        n_jobs=-1,
        verbose=0
    )

    grid_search.fit(train_cleaned, train_cleaned[target])

    # Best pipeline, refitted by GridSearchCV on all of train_cleaned.
    best_knn_model = grid_search.best_estimator_
    knn_models[target] = best_knn_model

    y_true = validation_cleaned[target]
    y_pred = best_knn_model.predict(validation_cleaned)

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = mean_absolute_percentage_error(y_true, y_pred) * 100

    print(f"    MEJOR K: {grid_search.best_params_['regressor__n_neighbors']}")
    # Report the whole winning combination: weights and metric are tuned too
    # and are needed to reproduce the model outside this search.
    print(f"    Mejores hiperparámetros: {grid_search.best_params_}")
    # The scorer is negated MAE, so flip the sign for display.
    print(f"    Mejor Puntuación de CV (MAE): {-grid_search.best_score_:.2f} USD")
    print(f"    RMSE en Validación: {rmse:,.2f} USD, MAPE: {mape:.2f}%")

print(f"\n✅ {len(knn_models)} modelos KNN optimizados y entrenados.")

os.makedirs(output_dir, exist_ok=True)
joblib.dump(knn_models, os.path.join(output_dir, 'knn_modelos_optimizados.joblib'))
print(f" Modelos KNN guardados en: {os.path.join(output_dir, 'knn_modelos_optimizados.joblib')}")

 Iniciando BÚSQUEDA DE HIPERPARÁMETROS para 7 modelos KNN...

--- Optimizando y Entrenando NextClose_BTC1 ---


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_samp

    MEJOR K: 3
    Mejor Puntuación de CV (MAE): 1376.35 USD
    RMSE en Validación: 1,813.05 USD, MAPE: 2.64%

--- Optimizando y Entrenando NextClose_BTC2 ---


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_samp

    MEJOR K: 3
    Mejor Puntuación de CV (MAE): 1548.14 USD
    RMSE en Validación: 2,166.62 USD, MAPE: 2.97%

--- Optimizando y Entrenando NextClose_BTC3 ---


alue is needed for imputation with strategy='mean'.
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


    MEJOR K: 3
    Mejor Puntuación de CV (MAE): 1765.86 USD
    RMSE en Validación: 2,230.06 USD, MAPE: 3.07%

--- Optimizando y Entrenando NextClose_BTC4 ---


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_samp

    MEJOR K: 3
    Mejor Puntuación de CV (MAE): 1837.08 USD
    RMSE en Validación: 2,404.25 USD, MAPE: 3.37%

--- Optimizando y Entrenando NextClose_BTC5 ---


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_samp

    MEJOR K: 3
    Mejor Puntuación de CV (MAE): 1952.77 USD
    RMSE en Validación: 2,436.80 USD, MAPE: 3.48%

--- Optimizando y Entrenando NextClose_BTC6 ---


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_samp

    MEJOR K: 3
    Mejor Puntuación de CV (MAE): 2001.25 USD
    RMSE en Validación: 2,647.89 USD, MAPE: 3.70%

--- Optimizando y Entrenando NextClose_BTC7 ---


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_samp

    MEJOR K: 3
    Mejor Puntuación de CV (MAE): 2073.17 USD
    RMSE en Validación: 2,470.32 USD, MAPE: 3.65%

✅ 7 modelos KNN optimizados y entrenados.
 Modelos KNN guardados en: src/models/knn_modelos_optimizados.joblib


In [11]:
# Number of neighbours used for every model in this cell.
# NOTE(review): only n_neighbors is fixed here; weights/metric fall back to the
# KNeighborsRegressor defaults rather than the values selected by the grid
# search. The recorded outputs show a higher validation RMSE in this cell than
# in the grid-search cell, which is presumably due to that - TODO confirm.
K = 3

targets = [f"NextClose_BTC{i}" for i in range(1, 8)]
knn_models = {}

# Load test if it is not already loaded.
# NOTE(review): this fallback reads 'test.csv' from the working directory with
# default parsing, unlike the data-loading cell - confirm it is still needed.
if 'test' not in locals():
    test = pd.read_csv('test.csv')

print(f"Entrenando 7 modelos KNN con K={K} y Evaluando...")

for target in targets:
    print(f"\n--- Analizando {target} ---")

    # 1. Prepare data: keep only rows that have a value for this target.
    train_cleaned = train.dropna(subset=[target])
    validation_cleaned = validation.dropna(subset=[target])
    test_cleaned = test.dropna(subset=[target])

    # 2. Pipeline and training.
    # Pipeline.fit fits its steps in place without cloning them, so every
    # model gets its own copy of the column transformer. Sharing the single
    # `mapper` object would leave all seven pipelines pointing at a
    # transformer refitted on the last target's rows.
    knn_model = Pipeline([
        ('mapper', clone(mapper)),
        ('imputer', SimpleImputer(strategy='mean')),
        ('regressor', KNeighborsRegressor(n_neighbors=K)),
    ])

    knn_model.fit(train_cleaned, train_cleaned[target])
    knn_models[target] = knn_model

    # 3. Validation metric (used for internal tuning).
    y_pred_val = knn_model.predict(validation_cleaned)
    rmse_val = np.sqrt(mean_squared_error(validation_cleaned[target], y_pred_val))

    # 4. Test metrics (used for the final report).
    y_pred_test = knn_model.predict(test_cleaned)
    rmse_test = np.sqrt(mean_squared_error(test_cleaned[target], y_pred_test))
    mape_test = mean_absolute_percentage_error(test_cleaned[target], y_pred_test) * 100

    print(f"   Validation RMSE: {rmse_val:,.2f}")
    print(f"   >> TEST RMSE:    {rmse_test:,.2f} USD")
    print(f"   >> TEST MAPE:    {mape_test:.2f}%")

print(f"\nModelos guardados en el diccionario 'knn_models'")

Entrenando 7 modelos KNN con K=3 y Evaluando...

--- Analizando NextClose_BTC1 ---
   Validation RMSE: 2,923.55
   >> TEST RMSE:    3,038.05 USD
   >> TEST MAPE:    3.90%

--- Analizando NextClose_BTC2 ---
   Validation RMSE: 3,116.83
   >> TEST RMSE:    3,209.63 USD
   >> TEST MAPE:    4.52%

--- Analizando NextClose_BTC3 ---
   Validation RMSE: 3,213.87
   >> TEST RMSE:    3,340.11 USD
   >> TEST MAPE:    4.83%

--- Analizando NextClose_BTC4 ---
   Validation RMSE: 3,338.36
   >> TEST RMSE:    3,584.81 USD
   >> TEST MAPE:    5.13%

--- Analizando NextClose_BTC5 ---
   Validation RMSE: 3,411.77
   >> TEST RMSE:    3,689.24 USD
   >> TEST MAPE:    5.39%

--- Analizando NextClose_BTC6 ---
   Validation RMSE: 3,473.53
   >> TEST RMSE:    3,807.22 USD
   >> TEST MAPE:    5.51%

--- Analizando NextClose_BTC7 ---
   Validation RMSE: 3,441.21
   >> TEST RMSE:    4,017.58 USD
   >> TEST MAPE:    5.92%

Modelos guardados en el diccionario 'knn_models'


In [12]:

# File names of the two artifacts written by this cell.
MODEL_FILENAME = 'knn_modelos_entrenados.joblib'
FEATURES_FILENAME = 'knn_features_por_target.joblib'

# Output folder, relative to the working directory; created if missing.
output_dir = 'modelos_entrenados'
os.makedirs(output_dir, exist_ok=True)

# Persist the {target: fitted pipeline} dictionary.
joblib.dump(knn_models, os.path.join(output_dir, MODEL_FILENAME))


def _mapper_input_columns(pipeline):
    """Return the input columns consumed by the pipeline's 'mapper' step.

    Reads the column lists declared in the ColumnTransformer's `transformers`
    specification, in declaration order.
    """
    column_transformer = pipeline.named_steps['mapper']
    return [col for _, _, cols in column_transformer.transformers for col in cols]


# Record, per target, the columns the model actually reads. Using
# list(train.columns) here would also list the NextClose_BTC* label columns,
# which are not model inputs.
features_por_target_knn = {
    target: _mapper_input_columns(knn_models[target]) for target in targets
}
joblib.dump(features_por_target_knn, os.path.join(output_dir, FEATURES_FILENAME))

print(f"\n7 Modelos KNN guardados exitosamente en:")
print(f"    Modelos: {output_dir}/{MODEL_FILENAME}")
print(f"    Features: {output_dir}/{FEATURES_FILENAME}")
print(f"\n Modelos disponibles: {list(knn_models.keys())}")


7 Modelos KNN guardados exitosamente en:
    Modelos: modelos_entrenados/knn_modelos_entrenados.joblib
    Features: modelos_entrenados/knn_features_por_target.joblib

 Modelos disponibles: ['NextClose_BTC1', 'NextClose_BTC2', 'NextClose_BTC3', 'NextClose_BTC4', 'NextClose_BTC5', 'NextClose_BTC6', 'NextClose_BTC7']
