In [1]:
from Format import *
from config import *
import pandas as pd
import os

data = pd.read_csv('../Data/test/all_data.csv')

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

features = ['tipoPropiedad', 'terrenoEdificado', 
            'cantDormitorios', 'cantBanos',
            'barrioID', 'coordX', 'coordY', 
            'transporteCercano', 'saludCercana', 
            'ano', 'mes']
data_analisis = data[features]
target = data["precioUSD"]

X_train, X_test, y_train, y_test = train_test_split(data_analisis, target, train_size=0.8, random_state=33)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [4]:
models = dict()
r2_scores = []
for i in range(1, 20):
    models[f'knn{i}'] = KNeighborsRegressor(n_neighbors=i, weights='distance', algorithm='ball_tree')
    m = KNeighborsRegressor(n_neighbors=i, weights='distance', algorithm='ball_tree')
    m.fit(X_train_scaled, y_train)
    scores_r2 = cross_val_score(m, X_train_scaled, y_train, cv=5, scoring='r2')
    r2_scores.append((f'knn{i}', scores_r2.mean()))

In [5]:
from sklearn.ensemble import RandomForestRegressor

print("Creando Random Forest...")

best_depth = 18

random_forest_model = RandomForestRegressor(n_estimators=200, random_state=42, max_depth=best_depth)
random_forest_model.fit(X_train_scaled, y_train)
best_r2 = cross_val_score(random_forest_model, X_train_scaled, y_train, cv=5, scoring='r2').mean()
    
models['RF'] = RandomForestRegressor(n_estimators=200, random_state=42, max_depth=best_depth)

r2_scores.append(('RF', best_r2))
print(f"Random Forest: r2 = {best_r2}, depth = {best_depth}")

Creando Random Forest...


'r2_scores.append((\'RF\', best_r2))\nprint(f"Random Forest: r2 = {best_r2}, depth = {best_depth}")'

In [6]:
from sklearn.ensemble import GradientBoostingRegressor

print("Creando XGBoost...")

best_depth = 8

gradient_boosting_model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=best_depth, random_state=42)
gradient_boosting_model.fit(X_train_scaled, y_train)
best_r2 = cross_val_score(gradient_boosting_model, X_train_scaled, y_train, cv=5, scoring='r2').mean()

models['GBX'] = RandomForestRegressor(n_estimators=200, random_state=42, max_depth=best_depth)

r2_scores.append(('GBX', best_r2))
print(f"XGBoost: r2 = {best_r2}, depth = {best_depth}")

Creando XGBoost...


'r2_scores.append((\'GBX\', best_r2))\nprint(f"XGBoost: r2 = {best_r2}, depth = {best_depth}")'

In [7]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

print("Creando AdaBoost con Decision Tree...")

best_depth = 14

ada_boost_model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=best_depth), n_estimators=200, random_state=42)
ada_boost_model.fit(X_train_scaled, y_train)
best_r2 = cross_val_score(ada_boost_model, X_train_scaled, y_train, cv=5, scoring='r2').mean()

models['ABX-DT'] = AdaBoostRegressor(DecisionTreeRegressor(max_depth=best_depth), n_estimators=200, random_state=42)

r2_scores.append(('ABX-DT', best_r2))
print(f"AdaBoost DT: r2 = {best_r2}, depth = {best_depth}")

Creando AdaBoost con Decision Tree...


'r2_scores.append((\'ABX-DT\', best_r2))\nprint(f"AdaBoost DT: r2 = {best_r2}, depth = {best_depth}")'

In [8]:
r2_scores = sorted(r2_scores, key=lambda x: x[1], reverse=True)
r2_scores = [m for m in r2_scores if m[1] > 0.73]
r2_scores

In [9]:
top_models = dict()
for m in r2_scores:
    top_models[m[0]] = models[m[0]]
top_models

{'GBX': RandomForestRegressor(max_depth=8, n_estimators=200, random_state=42),
 'ABX-DT': AdaBoostRegressor(estimator=DecisionTreeRegressor(max_depth=14),
                   n_estimators=200, random_state=42),
 'RF': RandomForestRegressor(max_depth=18, n_estimators=200, random_state=42),
 'knn8': KNeighborsRegressor(algorithm='ball_tree', n_neighbors=8, weights='distance'),
 'knn7': KNeighborsRegressor(algorithm='ball_tree', n_neighbors=7, weights='distance'),
 'knn9': KNeighborsRegressor(algorithm='ball_tree', n_neighbors=9, weights='distance'),
 'knn10': KNeighborsRegressor(algorithm='ball_tree', n_neighbors=10, weights='distance'),
 'knn11': KNeighborsRegressor(algorithm='ball_tree', n_neighbors=11, weights='distance'),
 'knn6': KNeighborsRegressor(algorithm='ball_tree', n_neighbors=6, weights='distance'),
 'knn12': KNeighborsRegressor(algorithm='ball_tree', n_neighbors=12, weights='distance')}

In [10]:
from sklearn.ensemble import VotingRegressor
from scipy.optimize import minimize
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

def objective(weights):
    weights_dict = dict(zip(top_models.keys(), weights))
    weighted_ensemble = VotingRegressor(estimators=list(top_models.items()), weights=list(weights_dict.values()))
    weighted_ensemble.fit(X_train, y_train)
    r2 = cross_val_score(weighted_ensemble, X_train, y_train, cv=5, scoring='r2').mean()
    print(f"r2 = {r2}, weights = {weights}")
    return -r2

print("Calculando pesos optimos...")
initial_weights = [-0.23468363,  0.75368092,  0.33406547, -0.15421233,  0.03304426, -0.00184314, 0.09632083, 
                   -0.01377198,  0.24562255, -0.05822294]
constraints = ({'type': 'eq', 'fun': lambda w: 1 - sum(w)})
result = minimize(objective, initial_weights, constraints=constraints)
optimal_weights = result.x

print("Creando Voting...")
voting_model = VotingRegressor(estimators=list(top_models.items()), weights=list(optimal_weights))
voting_model.fit(X_train_scaled, y_train)
r2 = cross_val_score(voting_model, X_train_scaled, y_train, cv=5, scoring='r2').mean()

print(f"Voting: r2 = {r2}")

X_test_scaled = scaler.transform(X_test)
voting_preds = voting_model.predict(X_test_scaled)
r2_ensamble = r2_score(y_test, voting_preds)
rmse_ensamble = np.sqrt(mean_squared_error(y_test, voting_preds))

print(f"Voting: r2 = {r2_ensamble}, rmse = {rmse_ensamble}")

Creando Voting...


'voting_model.fit(X_train_scaled, y_train)\nr2 = cross_val_score(voting_model, X_train_scaled, y_train, cv=5, scoring=\'r2\').mean()\n\nprint(f"Voting: r2 = {r2}")\n\nX_test_scaled = scaler.transform(X_test)\nvoting_preds = voting_model.predict(X_test_scaled)\nr2_ensamble = r2_score(y_test, voting_preds)\nrmse_ensamble = np.sqrt(mean_squared_error(y_test, voting_preds))\n\nprint(f"Voting: r2 = {r2_ensamble}, rmse = {rmse_ensamble}")'

r2 = 0.8013229771027949

weights = [-0.23468363  0.75368092  0.33406547 -0.15421233  0.03304426 -0.00184314
  0.09632083 -0.01377198  0.24562255 -0.05822293]

In [12]:
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

print("Creando Stacking...")
stacking_model = StackingRegressor(estimators=list(top_models.items()), n_jobs=2)
stacking_model.fit(X_train_scaled, y_train)
r2 = cross_val_score(stacking_model, X_train_scaled, y_train, cv=5, scoring='r2').mean()

print(f"Voting: r2 = {r2}")

X_test_scaled = scaler.transform(X_test)
stacking_preds = stacking_model.predict(X_test_scaled)
r2_ensamble = r2_score(y_test, stacking_preds)
rmse_ensamble = np.sqrt(mean_squared_error(y_test, stacking_preds))

print(f"Voting: r2 = {r2_ensamble}, rmse = {rmse_ensamble}")

Creando Stacking...


['../models/unfitted_stacking.joblib']

In [34]:
import joblib

joblib.dump(scaler, '../models/scaler.joblib')
joblib.dump(voting_model, '../models/voting.joblib')
joblib.dump(stacking_model, '../models/stacking.joblib')

['../models/stacking.joblib']