Imports

In [1]:
import joblib
import numpy as np
import pandas as pd
from config import *
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

Preparacion del conjunto de datos

In [3]:
data = pd.read_csv(formated_train, sep=';')

# convertir fecha a datetime y luego a int
data['fechaUltimaActualizacion'] = pd.to_datetime(data['fechaUltimaActualizacion'])
data['fechaUltimaActualizacion'] = data['fechaUltimaActualizacion'].astype('int64')

features = ['terrenoEdificado', 'coordX', 'coordY', 'barrio', 'fechaUltimaActualizacion']
data_for_clustering = data[features]

scaler_kmeans = StandardScaler()
scaled_data = scaler_kmeans.fit_transform(data_for_clustering)

kmeans = joblib.load(kmeans_model)
data['clusterKM'] = kmeans.predict(scaled_data)

Modelo ensamblado inicial

In [5]:
ensamble = joblib.load(ensamble_precio_model)

features = ['terrenoEdificado', 'comisariaCercana', 'transporteCercano', 
            'saludCercana', 'coordX', 'coordY', 'clusterKM', 'barrio', 'fechaUltimaActualizacion']
data_for_train = data[features]
target_column = 'precioUSD'
target = data[target_column]

scaler_ensamble = StandardScaler()
scaled_data = scaler_ensamble.fit_transform(data_for_train)

X_train, X_test, y_train, y_test = train_test_split(data_for_train, target, train_size=0.99, random_state=33)

X_test_scaled = scaler_ensamble.transform(X_test)
all_data_predictions = ensamble.predict(X_test_scaled)

ensemble_r2 = r2_score(y_test, all_data_predictions)
ensemble_rmse = np.sqrt(mean_squared_error(y_test, all_data_predictions))

print(f'Ensamble R2: {ensemble_r2}')
print(f'Ensamble RMSE: {ensemble_rmse}')

Ensamble R2: 0.8562784673097436
Ensamble RMSE: 31030.90300354034


Confección del modelo de correccion

In [50]:
residuals = y_test - all_data_predictions

residual_boosting_model = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=5, criterion='friedman_mse'), n_estimators=50, random_state=42)
residual_boosting_model.fit(X_test_scaled, residuals)

residual_pred_ada = residual_boosting_model.predict(X_test_scaled)
final_predictions = all_data_predictions + residual_pred_ada

r2 = r2_score(y_test, final_predictions)
rmse = np.sqrt(mean_squared_error(y_test, final_predictions))

print(f"R^2 para el modelo ensamblado corregido con residuos de boosting: {r2}")
print(f"RMSE del modelo ensamblado corregido con residuos de boosting: {rmse}")

R^2 para el modelo ensamblado corregido con residuos de boosting: 0.9947212600510387
RMSE del modelo ensamblado corregido con residuos de boosting: 5947.009435498988


