In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.cluster import KMeans
from xgboost import XGBRegressor
import scipy.stats as stats

# Cargar los datos de entrenamiento
data = pd.read_csv("./Barcelona_rent_price.csv", delimiter=';')

# Preprocesamiento de los datos de entrenamiento
numeric_data = data.select_dtypes(include=['float64', 'int64'])
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(numeric_data)

kmeans = KMeans(n_clusters=4, random_state=42)
clusters = kmeans.fit_predict(data_scaled)
data['Cluster'] = clusters
data['Distance_to_Centroid'] = np.linalg.norm(data_scaled - kmeans.cluster_centers_[clusters], axis=1)

threshold = data['Distance_to_Centroid'].quantile(0.95)
data['Is_Outlier'] = data['Distance_to_Centroid'] > threshold

data_clean = data[~data['Is_Outlier']]
data_encoded = pd.get_dummies(data_clean, columns=['District', 'Neighbourhood'])

X = data_encoded.drop('Price (euro/m2)', axis=1)
y = data_encoded['Price (euro/m2)']

# Aplicar transformación de Box-Cox a la variable objetivo
pt = PowerTransformer(method='box-cox')
y_transformed = pt.fit_transform(y.values.reshape(-1, 1)).flatten()

X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)





2. Optimización de Hiperparámetros para Ridge

In [2]:
# Optimización de Hiperparámetros para Ridge
ridge_model = Ridge()
param_grid = {
    'alpha': [0.1, 1.0, 10.0, 100.0],
    'fit_intercept': [True, False],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sag']
}
grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train_scaled, y_train)

best_ridge_model = grid_search.best_estimator_


    3. Definir y Entrenar Modelos

In [3]:
# Definir y entrenar modelos
lasso_model = Lasso()
elasticnet_model = ElasticNet()
rf_model = RandomForestRegressor()
xgb_model = XGBRegressor()

# Entrenar los modelos
best_ridge_model.fit(X_train_scaled, y_train)
lasso_model.fit(X_train_scaled, y_train)
elasticnet_model.fit(X_train_scaled, y_train)
rf_model.fit(X_train_scaled, y_train)
xgb_model.fit(X_train_scaled, y_train)


4. Evaluación de los Modelos

In [4]:
# Evaluación de los modelos
models = {
    'Ridge': best_ridge_model,
    'Lasso': lasso_model,
    'ElasticNet': elasticnet_model,
    'RandomForest': rf_model,
    'XGBoost': xgb_model
}

for name, model in models.items():
    y_pred_transformed = model.predict(X_test_scaled)
    y_pred = pt.inverse_transform(y_pred_transformed.reshape(-1, 1)).flatten()
    y_test_exp = pt.inverse_transform(y_test.reshape(-1, 1)).flatten()
    
    mae = mean_absolute_error(y_test_exp, y_pred)
    mse = mean_squared_error(y_test_exp, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test_exp, y_pred)
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2')
    
    print(f'\nModel: {name}')
    print(f'MAE: {mae}, MSE: {mse}, RMSE: {rmse}, R²: {r2}')
    print(f'Cross-Validation R²: {cv_scores.mean()}, {cv_scores.std()}')



Model: Ridge
MAE: 0.4653604136352518, MSE: 0.4354874008047089, RMSE: 0.6599146920661101, R²: 0.9165749109341838
Cross-Validation R²: 0.9066066304658985, 0.018164272412134367

Model: Lasso
MAE: 1.8306683393229557, MSE: 5.223743848895104, RMSE: 2.285551103978011, R²: -0.0006978274131781426
Cross-Validation R²: -0.0015905273424927646, 0.0016290731300665426

Model: ElasticNet
MAE: 1.8306683393229557, MSE: 5.223743848895104, RMSE: 2.285551103978011, R²: -0.0006978274131781426
Cross-Validation R²: -0.0015905273424927646, 0.0016290731300665426

Model: RandomForest
MAE: 0.4267387910248906, MSE: 0.40935127387855264, RMSE: 0.6398056532092794, R²: 0.9215817348115704
Cross-Validation R²: 0.9093062563890463, 0.0185104196138005

Model: XGBoost
MAE: 0.4096108527824385, MSE: 0.3384140183547566, RMSE: 0.5817336317892895, R²: 0.9351709841198669
Cross-Validation R²: 0.9215240519484388, 0.014772281486153649


5. Combinación de Modelos (Ensemble Learning)

In [5]:
# Combinación de modelos utilizando VotingRegressor
ensemble_model = VotingRegressor(estimators=[
    ('ridge', best_ridge_model),
    ('lasso', lasso_model),
    ('elasticnet', elasticnet_model),
    ('rf', rf_model),
    ('xgb', xgb_model)
])
ensemble_model.fit(X_train_scaled, y_train)

# Evaluación del modelo de ensemble
y_pred_transformed = ensemble_model.predict(X_test_scaled)
y_pred = pt.inverse_transform(y_pred_transformed.reshape(-1, 1)).flatten()
y_test_exp = pt.inverse_transform(y_test.reshape(-1, 1)).flatten()

mae = mean_absolute_error(y_test_exp, y_pred)
mse = mean_squared_error(y_test_exp, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_exp, y_pred)
cv_scores = cross_val_score(ensemble_model, X_train_scaled, y_train, cv=5, scoring='r2')

print(f'\nModel: Ensemble')
print(f'MAE: {mae}, MSE: {mse}, RMSE: {rmse}, R²: {r2}')
print(f'Cross-Validation R²: {cv_scores.mean()}, {cv_scores.std()}')



Model: Ensemble
MAE: 0.8492066710457903, MSE: 1.1694877106425443, RMSE: 1.0814285508726613, R²: 0.7759645485921096
Cross-Validation R²: 0.7737546223243827, 0.014317386024455694


6. Predicción con los Nuevos Datos

In [6]:
# Cargar los nuevos datos
new_data = pd.DataFrame({
    'Year': [2044, 2044, 2044, 2044],
    'Trimester': [1, 2, 3, 4],
    'District': ['Eixample', 'Sant Martí', 'Gràcia', 'Ciutat Vella'],
    'Neighbourhood': ['Dreta de l\'Eixample', 'El Poblenou', 'Vila de Gràcia', 'El Raval']
})

# Preprocesamiento de los nuevos datos
new_data_encoded = pd.get_dummies(new_data, columns=['District', 'Neighbourhood'])
missing_cols = set(X.columns) - set(new_data_encoded.columns)
for c in missing_cols:
    new_data_encoded[c] = 0
new_data_encoded = new_data_encoded[X.columns]
new_data_scaled = scaler.transform(new_data_encoded)

# Realizar la predicción con el modelo de ensemble
predictions_transformed = ensemble_model.predict(new_data_scaled)
predictions = pt.inverse_transform(predictions_transformed.reshape(-1, 1)).flatten()
new_data['Predicted Price (euro/m2)'] = predictions

# Mostrar las predicciones
print(new_data[['Year', 'Trimester', 'District', 'Neighbourhood', 'Predicted Price (euro/m2)']])

# Guardar las predicciones en un archivo CSV
#new_data.to_csv('new_data_with_predictions.csv', index=False)


   Year  Trimester      District        Neighbourhood  \
0  2044          1      Eixample  Dreta de l'Eixample   
1  2044          2    Sant Martí          El Poblenou   
2  2044          3        Gràcia       Vila de Gràcia   
3  2044          4  Ciutat Vella             El Raval   

   Predicted Price (euro/m2)  
0                  10.464235  
1                  10.605186  
2                  10.610548  
3                  11.028605  
