# Cuaderno de Entrenamiento del Modelo de Machine Learning

# Fase 1: Carga y Preparación de Datos

In [1]:
import json
import os
import pickle
import sys
import warnings
from urllib.parse import quote

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sqlalchemy import create_engine
from xgboost import XGBRegressor

# Put the parent directory on the import path so the project's `src` package
# can be imported from this notebook.
sys.path.append(os.path.abspath('..'))
from src.ml.feature_engineering import crear_features_nlp, guardar_vectorizer

# Keep cell output readable: hide FutureWarning messages only.
warnings.simplefilter('ignore', FutureWarning)

# Load environment variables from the .env file one directory up
# (presumably where the DB_* settings read by get_data_from_db live).
load_dotenv('../.env')

def get_data_from_db():
    """Load the training rows from the MySQL ``propiedades`` table.

    Connection settings are read from the ``DB_USER``, ``DB_PASSWORD``,
    ``DB_HOST`` and ``DB_NAME`` environment variables.

    Returns:
        pandas.DataFrame | None: price, barrio, room counts, total surface,
        parking spaces and description for every row that has both a price
        and a total surface. ``None`` is returned when a setting is missing
        or the connection/query fails; the reason is printed so the failure
        is visible in the cell output instead of being silently swallowed.
    """
    settings = {
        name: os.getenv(name)
        for name in ("DB_USER", "DB_PASSWORD", "DB_HOST", "DB_NAME")
    }
    # An unset variable would otherwise be formatted into the URI as the
    # literal text "None" and surface later as a confusing connection error.
    missing = [name for name, value in settings.items() if value is None]
    if missing:
        print(f"Error: faltan variables de entorno de la base de datos: {', '.join(missing)}")
        return None

    engine = None
    try:
        # Percent-encode the credentials: characters such as '@', '/' or ':'
        # in a password would otherwise break the URI structure.
        db_user = quote(settings["DB_USER"], safe="")
        db_password = quote(settings["DB_PASSWORD"], safe="")
        db_uri = (
            f"mysql+mysqlconnector://{db_user}:{db_password}"
            f"@{settings['DB_HOST']}/{settings['DB_NAME']}"
        )
        engine = create_engine(db_uri)
        query = "SELECT price_usd, barrio, ambientes, dormitorios, banos, superficie_total_m2, cocheras, description FROM propiedades WHERE price_usd IS NOT NULL AND superficie_total_m2 IS NOT NULL;"
        return pd.read_sql(query, engine)
    except Exception as e:
        # Callers rely on the None return, so keep it, but report the cause.
        print(f"Error al cargar los datos desde la base de datos: {type(e).__name__}: {e}")
        return None
    finally:
        # Release pooled connections; the engine is not reused after this call.
        if engine is not None:
            engine.dispose()

# Fetch the training frame once; it is None when get_data_from_db() hit an error.
df_modelo = get_data_from_db()


# Fase 2: Pre-procesamiento y Feature Engineering con TF-IDF

In [2]:
if df_modelo is not None:
    # Derive text features from the description column (presumably TF-IDF,
    # per the section title and variable name -- confirm in crear_features_nlp).
    # The fitted vectorizer is kept so it can be saved next to the model.
    df_nlp_features, tfidf_vectorizer = crear_features_nlp(df_modelo, 'description')

    # Swap the raw text column for its derived feature columns.
    df_enriquecido = pd.concat([df_modelo.drop('description', axis=1), df_nlp_features], axis=1)
    # concat(axis=1) aligns on the index; mismatched indexes would silently
    # add rows full of NaN instead of failing, so check the row count.
    assert len(df_enriquecido) == len(df_modelo), (
        "crear_features_nlp returned rows whose index does not match df_modelo"
    )

    X = df_enriquecido.drop(['price_usd'], axis=1)
    y = df_enriquecido['price_usd']

    if 'barrio' in X.columns:
        # One-hot encode the neighbourhood; drop_first removes one redundant level.
        X_codificado = pd.get_dummies(X, columns=['barrio'], drop_first=True, dtype=int)
    else:
        # Always define X_codificado: later cells are guarded on its existence,
        # so leaving it unset would skip training silently (or reuse a stale
        # value from an earlier run).
        X_codificado = X
else:
    print("No se pudieron cargar los datos; se omite el pre-procesamiento.")


# Fase 3: Búsqueda de Hiperparámetros y Comparación de Modelos con K-Fold Cross-Validation

In [3]:
if 'X_codificado' in locals():
    # Candidate hyper-parameter values sampled for the random forest.
    param_grid_rf = dict(
        n_estimators=[100, 200, 300],
        max_depth=[10, 20, 30, None],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
        max_features=['sqrt', 'log2'],
    )

    # Candidate hyper-parameter values sampled for XGBoost.
    param_grid_xgb = dict(
        n_estimators=[100, 200, 300],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 5, 7],
        subsample=[0.7, 0.8, 1.0],
        colsample_bytree=[0.7, 0.8, 1.0],
    )

    # Both searches share the same budget, folds, metric and seed, so their
    # best scores can be compared directly afterwards.
    opciones_comunes = dict(
        n_iter=10,  # number of parameter combinations tried per model
        cv=5,
        scoring='neg_root_mean_squared_error',
        verbose=2,
        random_state=42,
        n_jobs=-1,
    )

    # Randomized search over the random-forest candidates.
    rf_random_search = RandomizedSearchCV(
        estimator=RandomForestRegressor(random_state=42),
        param_distributions=param_grid_rf,
        **opciones_comunes,
    )
    print("--- Iniciando Búsqueda para RandomForest ---")
    rf_random_search.fit(X_codificado, y)

    # Randomized search over the XGBoost candidates.
    xgb_random_search = RandomizedSearchCV(
        estimator=XGBRegressor(random_state=42),
        param_distributions=param_grid_xgb,
        **opciones_comunes,
    )
    print("--- Iniciando Búsqueda para XGBoost ---")
    xgb_random_search.fit(X_codificado, y)

    # best_score_ holds a negated RMSE, hence the sign flip when reporting.
    print("\n--- Resultados de la Búsqueda Aleatoria ---")
    print(f"RandomForest - Mejor RMSE: {-rf_random_search.best_score_:.2f} USD")
    print(f"RandomForest - Mejores Parámetros: {rf_random_search.best_params_}")
    print(f"XGBoost - Mejor RMSE: {-xgb_random_search.best_score_:.2f} USD")
    print(f"XGBoost - Mejores Parámetros: {xgb_random_search.best_params_}")


--- Iniciando Búsqueda para RandomForest ---
Fitting 5 folds for each of 10 candidates, totalling 50 fits


--- Iniciando Búsqueda para XGBoost ---
Fitting 5 folds for each of 10 candidates, totalling 50 fits



--- Resultados de la Búsqueda Aleatoria ---
RandomForest - Mejor RMSE: 141776.73 USD
RandomForest - Mejores Parámetros: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 30}
XGBoost - Mejor RMSE: 116397.99 USD
XGBoost - Mejores Parámetros: {'subsample': 1.0, 'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.1, 'colsample_bytree': 1.0}


# Fase 4: Selección y Entrenamiento del Modelo Final

In [4]:
if 'X_codificado' in locals():
    # The search scores are negated RMSE values, so the larger score (closer
    # to zero) is the better model. Ties go to RandomForest.
    if xgb_random_search.best_score_ > rf_random_search.best_score_:
        print("Modelo Ganador: XGBoost")
        final_model = xgb_random_search.best_estimator_
        best_search = xgb_random_search
        model_name = "XGBoost"
    else:
        print("Modelo Ganador: RandomForest")
        final_model = rf_random_search.best_estimator_
        best_search = rf_random_search
        model_name = "RandomForest"

    # The searches only scored RMSE, so R² is estimated with a separate
    # 5-fold cross-validation of the winning configuration.
    r2_scores = cross_val_score(final_model, X_codificado, y, cv=5, scoring='r2')

    # Cast NumPy scalars to built-in floats so the printed dict and the JSON
    # file look the same regardless of the installed NumPy version.
    final_metrics = {
        "model": model_name,
        "best_params": best_search.best_params_,
        "r2_score_mean": float(np.mean(r2_scores)),
        "r2_score_std": float(np.std(r2_scores)),
        "rmse_usd_mean": float(-best_search.best_score_),
        "rmse_usd_std": float(best_search.cv_results_['std_test_score'][best_search.best_index_]),
    }

    print(f"Métricas Finales: {final_metrics}")

    # best_estimator_ is already fitted by the search; the explicit refit on
    # the full data set is kept so the training step is visible here.
    final_model.fit(X_codificado, y)

    # Persist everything needed at inference time: the model, the exact
    # training column order, the text vectorizer and the evaluation metrics.
    model_dir = '../src/ml/'
    os.makedirs(model_dir, exist_ok=True)

    with open(os.path.join(model_dir, 'model.pkl'), 'wb') as f:
        pickle.dump(final_model, f)
    with open(os.path.join(model_dir, 'model_columns.pkl'), 'wb') as f:
        pickle.dump(list(X_codificado.columns), f)
    guardar_vectorizer(tfidf_vectorizer, os.path.join(model_dir, 'tfidf_vectorizer.pkl'))
    with open(os.path.join(model_dir, 'metrics.json'), 'w', encoding='utf-8') as f:
        json.dump(final_metrics, f, indent=4)


Modelo Ganador: XGBoost


Métricas Finales: {'model': 'XGBoost', 'best_params': {'subsample': 1.0, 'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.1, 'colsample_bytree': 1.0}, 'r2_score_mean': 0.9141472864905144, 'r2_score_std': 0.03549961541411899, 'rmse_usd_mean': 116397.9899188918, 'rmse_usd_std': 22375.878473124474}
