# Cuaderno de Entrenamiento del Modelo de Machine Learning

# Fase 1: Carga y Preparación de Datos

# In [1]:
import pandas as pd
from sqlalchemy import create_engine
import os
import sys
import json
import pickle
import warnings
import numpy as np
from dotenv import load_dotenv
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Put the parent directory (resolved to an absolute path) on sys.path before
# importing from `src`; presumably the notebook lives one level below the
# project root -- TODO confirm.
sys.path.append(os.path.abspath(os.path.join('..')))
from src.ml.feature_engineering import crear_features_nlp, guardar_vectorizer

# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load environment variables from ../.env; get_data_from_db() reads the DB_*
# settings through os.getenv.
load_dotenv(dotenv_path='../.env')

def get_data_from_db():
    """Load the training rows from the ``propiedades`` table.

    Connection settings are read from the ``DB_USER``, ``DB_PASSWORD``,
    ``DB_HOST`` and ``DB_NAME`` environment variables. Only rows with a
    non-null ``price_usd`` and ``superficie_total_m2`` are selected.

    Returns:
        A ``pandas.DataFrame`` with the selected columns, or ``None`` when a
        setting is missing or the connection/query fails. The reason for a
        failure is printed so that it is not lost silently.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote_plus

    settings = {
        name: os.getenv(name)
        for name in ("DB_USER", "DB_PASSWORD", "DB_HOST", "DB_NAME")
    }
    missing = [name for name, value in settings.items() if value is None]
    if missing:
        # Without this check the literal text "None" would end up in the URI.
        print(
            "Error de configuración: faltan variables de entorno: "
            + ", ".join(missing)
        )
        return None

    query = (
        "SELECT price_usd, barrio, ambientes, dormitorios, banos, "
        "superficie_total_m2, cocheras, description "
        "FROM propiedades "
        "WHERE price_usd IS NOT NULL AND superficie_total_m2 IS NOT NULL;"
    )

    engine = None
    try:
        # Escape the credentials: characters such as '@', ':' or '/' in a
        # password would otherwise corrupt the connection URI.
        db_user = quote_plus(settings["DB_USER"])
        db_password = quote_plus(settings["DB_PASSWORD"])
        db_uri = (
            f"mysql+mysqlconnector://{db_user}:{db_password}"
            f"@{settings['DB_HOST']}/{settings['DB_NAME']}"
        )
        engine = create_engine(db_uri)
        return pd.read_sql(query, engine)
    except Exception as e:
        # Boundary of the notebook's data-loading step: report the failure and
        # keep the "None on failure" contract that the next cell checks.
        print(f"Error al cargar datos desde la base de datos: {type(e).__name__}: {e}")
        return None
    finally:
        # Release pooled connections once the data has been read.
        if engine is not None:
            engine.dispose()

# Fetch the training data; get_data_from_db() returns None on failure, which
# the pre-processing cell checks for before continuing.
df_modelo = get_data_from_db()


# Fase 2: Pre-procesamiento y Feature Engineering con TF-IDF

# In [2]:
# Build the feature matrix (X_codificado) and target (y) used by the
# cross-validation and final-training cells.
if df_modelo is not None:
    # crear_features_nlp returns a feature frame derived from the description
    # text plus the fitted vectorizer, which is persisted in the final cell.
    df_nlp_features, tfidf_vectorizer = crear_features_nlp(df_modelo, 'description')

    # Replace the raw text column with its derived features.
    # NOTE(review): pd.concat(axis=1) aligns on the index; this assumes
    # df_nlp_features shares df_modelo's index -- confirm in crear_features_nlp.
    df_enriquecido = pd.concat([df_modelo.drop('description', axis=1), df_nlp_features], axis=1)

    X = df_enriquecido.drop(['price_usd'], axis=1)
    y = df_enriquecido['price_usd']

    if 'barrio' in X.columns:
        # One-hot encode the neighbourhood, dropping the first level.
        X_codificado = pd.get_dummies(X, columns=['barrio'], drop_first=True, dtype=int)
    else:
        # Nothing to encode: still bind X_codificado so the later cells, which
        # are gated on this name, are not silently skipped.
        X_codificado = X
else:
    print("No se pudieron cargar los datos; se omite el pre-procesamiento y el entrenamiento.")


# Fase 3: Comparación de Modelos con K-Fold Cross-Validation

# In [3]:
# Compare RandomForest and XGBoost with 5-fold cross-validation, collecting the
# per-fold R² and RMSE of each model. Runs only when the feature matrix exists.
if 'X_codificado' in locals():
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    rf_r2_scores, rf_rmse_scores = [], []
    xgb_r2_scores, xgb_rmse_scores = [], []

    # Each candidate: (display label, estimator class, R² log, RMSE log).
    # Both models are built with the same hyper-parameters on every fold.
    candidatos = (
        ("RandomForest", RandomForestRegressor, rf_r2_scores, rf_rmse_scores),
        ("XGBoost", XGBRegressor, xgb_r2_scores, xgb_rmse_scores),
    )

    for fold, (train_index, val_index) in enumerate(kf.split(X_codificado, y)):
        X_train = X_codificado.iloc[train_index]
        X_val = X_codificado.iloc[val_index]
        y_train = y.iloc[train_index]
        y_val = y.iloc[val_index]

        for _, estimator_cls, r2_log, rmse_log in candidatos:
            estimator = estimator_cls(n_estimators=100, random_state=42, n_jobs=-1)
            estimator.fit(X_train, y_train)
            y_pred = estimator.predict(X_val)
            r2_log.append(r2_score(y_val, y_pred))
            # RMSE is the square root of the mean squared error.
            rmse_log.append(np.sqrt(mean_squared_error(y_val, y_pred)))

    print("--- Resultados de Cross-Validation ---")
    for etiqueta, _, r2_log, rmse_log in candidatos:
        print(f"{etiqueta} - R² Promedio: {np.mean(r2_log):.4f} (± {np.std(r2_log):.4f})")
        print(f"{etiqueta} - RMSE Promedio: ${np.mean(rmse_log):,.2f} USD (± ${np.std(rmse_log):,.2f})")


# Fase 4: Selección y Entrenamiento del Modelo Final

# In [4]:
# Pick the model with the lower mean cross-validation RMSE (RandomForest wins
# ties), refit it on the full dataset and persist the model, its column list,
# the vectorizer and the metrics under ../src/ml/.
if 'X_codificado' in locals():
    xgb_gana = np.mean(xgb_rmse_scores) < np.mean(rf_rmse_scores)
    if xgb_gana:
        nombre_modelo, clase_modelo = "XGBoost", XGBRegressor
        r2_hist, rmse_hist = xgb_r2_scores, xgb_rmse_scores
    else:
        nombre_modelo, clase_modelo = "RandomForest", RandomForestRegressor
        r2_hist, rmse_hist = rf_r2_scores, rf_rmse_scores

    final_model = clase_modelo(n_estimators=100, random_state=42, n_jobs=-1)
    final_metrics = {
        "model": nombre_modelo,
        "r2_score_mean": np.mean(r2_hist),
        "r2_score_std": np.std(r2_hist),
        "rmse_usd_mean": np.mean(rmse_hist),
        "rmse_usd_std": np.std(rmse_hist),
    }

    # The final estimator is trained on every available row.
    final_model.fit(X_codificado, y)

    model_dir = '../src/ml/'
    os.makedirs(model_dir, exist_ok=True)

    ruta_modelo = os.path.join(model_dir, 'model.pkl')
    with open(ruta_modelo, 'wb') as f:
        pickle.dump(final_model, f)

    # The column list is stored alongside the model.
    ruta_columnas = os.path.join(model_dir, 'model_columns.pkl')
    with open(ruta_columnas, 'wb') as f:
        pickle.dump(list(X_codificado.columns), f)

    guardar_vectorizer(tfidf_vectorizer, os.path.join(model_dir, 'tfidf_vectorizer.pkl'))

    ruta_metricas = os.path.join(model_dir, 'metrics.json')
    with open(ruta_metricas, 'w') as f:
        json.dump(final_metrics, f, indent=4)
