# Modelo de regresion con Random Forest

In [1]:
from pymongo import MongoClient
import logging
import os
import pandas as pd

In [2]:
# Configuracion basica
mongo_user = os.environ.get("MONGO_USER")
mongo_password = os.environ.get("MONGO_PASSWORD")
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s]: %(message)s', handlers=[logging.StreamHandler()])



In [3]:
client = MongoClient(f"mongodb://{mongo_user}:{mongo_password}@localhost:27017/")
db = client["tfm"]
collection = db["frontur_dl"]

# Obtener los datos
cursor = collection.find({})
data = list(cursor)

# Convertir a DataFrame
df = pd.DataFrame(data)

# Opcional: eliminar la columna '_id' si no es útil
if "_id" in df.columns:
    df.drop("_id", axis=1, inplace=True)

df.head()

Unnamed: 0,AÑO,MES,PAIS_RESIDENCIA,TIPO_VISITANTE,CCAA_DESTINO,VISITANTES
0,2016,1,Alemania,Excursionista,Andalucía,3897
1,2016,1,Alemania,Excursionista,Aragón,48
2,2016,1,Alemania,Excursionista,Baleares,4870
3,2016,1,Alemania,Excursionista,Canarias,1949
4,2016,1,Alemania,Excursionista,Castilla y León,217


In [4]:
df.shape

(36839, 6)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Selección de variables
features = ['AÑO', 'MES', 'PAIS_RESIDENCIA', 'CCAA_DESTINO']  # ajusta nombres exactos
target = 'VISITANTES'

# Eliminar filas con valores nulos en columnas clave
df.dropna(subset=features + [target], inplace=True)

X = df[features]
y = df[target]

# Codificar variables categóricas
categorical_features = ['PAIS_RESIDENCIA', 'CCAA_DESTINO']
numeric_features = ['AÑO', 'MES']

# Transformador de columnas
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'  # deja pasar año y mes sin modificar
)

# División entrenamiento/prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score

# Pipeline completo
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Entrenamiento
pipeline.fit(X_train, y_train)

# Predicción
y_pred = pipeline.predict(X_test)

# Evaluación
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

logging.info(f"Error absoluto medio (MAE): {mae:.0f} visitantes")
logging.info(f"Coeficiente de determinación (R²): {r2:.2f}")

2025-07-15 14:53:09,022 [INFO]: Error absoluto medio (MAE): 25062 visitantes
2025-07-15 14:53:09,071 [INFO]: Coeficiente de determinación (R²): 0.27
