# Algoritmo de regresión con XGBoost para 4 clusters - No muy exitoso

In [1]:
from pymongo import MongoClient
import logging
import os
import pandas as pd

In [2]:
# Basic configuration: log to the console at INFO level and read the MongoDB
# credentials from the environment (either value is None when its variable is unset).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s]: %(message)s',
    handlers=[logging.StreamHandler()],
)
mongo_user = os.getenv("MONGO_USER")
mongo_password = os.getenv("MONGO_PASSWORD")



In [3]:
# Connect to the local MongoDB instance. The credentials are passed as keyword
# arguments instead of being interpolated into the URI: inside a URI, reserved
# characters in the user or password ('@', ':', '/', '%') must be
# percent-escaped, and an unset variable would be sent as the literal "None".
client = MongoClient(
    host="localhost",
    port=27017,
    username=mongo_user,
    password=mongo_password,
)
db = client["tfm"]
collection = db["frontur_dl"]

# Fetch every document in the collection
cursor = collection.find({})
data = list(cursor)

# Convert to a DataFrame
df = pd.DataFrame(data)

# Drop the '_id' column when present; it is not used in this notebook.
# Reassigning (rather than inplace=True) keeps the cell safe to re-run.
df = df.drop(columns="_id", errors="ignore")

df.head()

Unnamed: 0,AÑO,MES,PAIS_RESIDENCIA,TIPO_VISITANTE,CCAA_DESTINO,VISITANTES
0,2020,1,Alemania,Excursionista,Andalucía,3446
1,2020,1,Alemania,Excursionista,Baleares,6103
2,2020,1,Alemania,Excursionista,Canarias,23521
3,2020,1,Alemania,Excursionista,Castilla y León,567
4,2020,1,Alemania,Excursionista,Cataluña,7139


In [4]:
# Dimensions (rows, columns) of the loaded DataFrame
df.shape

(19589, 6)

In [5]:
# Clusters obtained from the K-Means analysis.
# Declared as cluster id -> destination regions and flattened into the
# region -> cluster id lookup used to label each row.
ccaa_clusters = {
    region: cluster_id
    for cluster_id, regions in (
        (0, ("Andalucía", "Cataluña", "Baleares", "Canarias")),
        (1, ("Com.Valenciana", "País Vasco", "Galicia")),
        (2, (
            "Castilla y León",
            "Com.Madrid",
            "C.F.Navarra",
            "Castilla La Mancha",
            "Extremadura",
            "Región de Murcia",
            "Cantabria",
            "Aragón",
            "La Rioja",
            "Principado de Asturias",
        )),
        (3, ("Transito",)),
    )
    for region in regions
}

In [6]:
# Remove the rows whose destination is Ceuta or Melilla
is_ceuta_or_melilla = df["CCAA_DESTINO"].isin(["Ceuta", "Melilla"])
df = df[~is_ceuta_or_melilla]

In [7]:
# Label each row with the cluster of its destination region (ccaa_clusters).
# `assign` returns a new frame instead of writing a column into `df`, which at
# this point is the result of a boolean filter; writing into it in place
# triggers pandas' SettingWithCopyWarning. Destinations missing from the
# mapping get NaN.
df = df.assign(Cluster=df["CCAA_DESTINO"].map(ccaa_clusters))

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor
import joblib

# Feature columns, grouped by the preprocessing each group receives
columnas_categoricas = ["PAIS_RESIDENCIA", "TIPO_VISITANTE"]
columnas_numericas = ["AÑO", "MES"]

# One metrics record per cluster, turned into a table at the end of the cell
resultados = []

for cluster_id in sorted(df["Cluster"].dropna().unique()):
    # The mapped column becomes float when any destination had no cluster
    # (NaN); normalise the id so the results table and the model file names
    # read "0" rather than "0.0".
    cluster_id = int(cluster_id)

    # The split below is not shuffled, so the last 20% of the rows form the
    # test set. Sort chronologically so that this is a real temporal hold-out
    # and does not depend on the order in which the rows were loaded. The
    # stable sort keeps the existing order within each (year, month).
    # NOTE(review): assumes AÑO and MES are numeric - confirm the dtypes.
    df_cluster = df[df["Cluster"] == cluster_id].sort_values(
        ["AÑO", "MES"], kind="stable"
    )

    X = df_cluster.drop(columns=["VISITANTES", "CCAA_DESTINO", "Cluster"])
    y = df_cluster["VISITANTES"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # One-hot encode the categorical columns (categories unseen during fit are
    # ignored at predict time) and standardise the numeric ones.
    preprocessor = ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore"), columnas_categoricas),
        ("num", StandardScaler(), columnas_numericas)
    ])

    # Preprocessing and regressor live in one pipeline so the saved artefact
    # can be applied directly to raw feature rows.
    model = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", XGBRegressor(
            n_estimators=200, learning_rate=0.1,
            max_depth=6, subsample=1.0,
            colsample_bytree=0.7, random_state=42
        ))
    ])

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    resultados.append({
        "Cluster": cluster_id,
        "MAE": round(mae, 2),
        "R2": round(r2, 3)
    })

    # Persist the fitted pipeline for this cluster in the working directory
    joblib.dump(model, f"modelo_cluster_{cluster_id}.pkl")

# Show the per-cluster metrics, best R2 first
pd.DataFrame(resultados).sort_values("R2", ascending=False)


Unnamed: 0,Cluster,MAE,R2
3,3,15059.23,0.964
0,0,40370.34,0.502
1,1,34023.49,0.303
2,2,11953.66,0.119
