In [28]:
from Format import *
from config import *
import pandas as pd

"""df = pd.read_excel(f"{path_unify}2023-12-01_df.xlsx")

data = DistanceCalculator().calcular_distancias(df)
data = DataFilter().formatDF(data)
data.to_csv("../Data/test/filt_12_01.csv", sep=";", index=False)"""

data = pd.read_csv("../Data/test/filt_12_01.csv", sep=";")
data = data.loc[data["tipoPropiedad"] != 3]

In [38]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

features = ['terrenoEdificado', 'coordX', 'coordY', 'barrioID', 'ano', 'mes', 'dia']
data_for_clustering = data[features]

# Escalar los datos para que todas las características tengan la misma escala
kmeans_scaler = StandardScaler()
scaled_data = kmeans_scaler.fit_transform(data_for_clustering)

# Determinar el número óptimo de clusters utilizando el método del codo
wcss = []
for i in range(1, 15):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=100, random_state=42)
    kmeans.fit(scaled_data)
    wcss.append(kmeans.inertia_)

# Visualizar el método del codo para encontrar el número óptimo de clusters
plt.plot(range(1, 15), wcss)
plt.title('Método del Codo')
plt.xlabel('Número de Clusters')
plt.ylabel('WCSS')  # Within-Cluster Sum of Squares
plt.show()

n_clusters = 8
kmeans = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=500, n_init=200, random_state=42)
clusters = kmeans.fit_predict(scaled_data)

# Agregar la información de los clusters al DataFrame original
data['clusterKM'] = clusters

In [39]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

features = ['tipoPropiedad', 'terrenoEdificado', 
            'cantDormitorios', 'cantBanos',
            'barrioID', 'coordX', 'coordY', 
            'transporteCercano', 'saludCercana', 
            'ano', 'mes', 'clusterKM']
data_analisis = data[features]
target = data["precioUSD"]

X_train, X_test, y_train, y_test = train_test_split(data_analisis, target, train_size=0.8, random_state=33)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [40]:
models = dict()
r2_scores = []
for i in range(1, 20):
    models[f'knn{i}'] = KNeighborsRegressor(n_neighbors=i, weights='distance', algorithm='ball_tree')
    m = KNeighborsRegressor(n_neighbors=i, weights='distance', algorithm='ball_tree')
    m.fit(X_train_scaled, y_train)
    scores_r2 = cross_val_score(m, X_train_scaled, y_train, cv=5, scoring='r2')
    r2_scores.append((f'knn{i}', scores_r2.mean()))
    
r2_scores

[('knn1', 0.5636123951855962),
 ('knn2', 0.6507887855101081),
 ('knn3', 0.6793320196117629),
 ('knn4', 0.6905869836139874),
 ('knn5', 0.7015268563007065),
 ('knn6', 0.7099557890167736),
 ('knn7', 0.7080817660608002),
 ('knn8', 0.7071151414373955),
 ('knn9', 0.7081056487871125),
 ('knn10', 0.709083853761895),
 ('knn11', 0.7093460752637485),
 ('knn12', 0.7089680293989774),
 ('knn13', 0.7100686269297889),
 ('knn14', 0.7082369808935916),
 ('knn15', 0.7084883222859647),
 ('knn16', 0.7077272747431483),
 ('knn17', 0.7064190165517757),
 ('knn18', 0.70586973994986),
 ('knn19', 0.7047095377901087)]

In [42]:
from sklearn.ensemble import RandomForestRegressor

print("Creando Random Forest...")

best_depth = 0
best_r2 = 0

for i in range(1, 20):
    random_forest_model = RandomForestRegressor(n_estimators=200, random_state=42, max_depth=i)
    random_forest_model.fit(X_train_scaled, y_train)
    r2 = cross_val_score(random_forest_model, X_train_scaled, y_train, cv=5, scoring='r2').mean()
    if r2 > best_r2:
        best_r2 = r2
        best_depth = i
    else:
        break
    
models['RF'] = RandomForestRegressor(n_estimators=200, random_state=42, max_depth=best_depth)

r2_scores.append(('RF', best_r2))
print(f"Random Forest: r2 = {best_r2}, depth = {best_depth}")

Creando Random Forest...
Random Forest: r2 = 0.7557865627423689, depth = 17


In [43]:
from sklearn.ensemble import GradientBoostingRegressor

print("Creando XGBoost...")

best_depth = 0
best_r2 = 0

for i in range(1, 20):
    gradient_boosting_model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=i, random_state=42)
    gradient_boosting_model.fit(X_train_scaled, y_train)
    r2 = cross_val_score(gradient_boosting_model, X_train_scaled, y_train, cv=5, scoring='r2').mean()
    if r2 > best_r2:
        best_r2 = r2
        best_depth = i
    else:
        break

models['GBX'] = RandomForestRegressor(n_estimators=200, random_state=42, max_depth=best_depth)

r2_scores.append(('GBX', best_r2))
print(f"XGBoost: r2 = {best_r2}, depth = {best_depth}")

Creando XGBoost...
XGBoost: r2 = 0.767658794421455, depth = 6


In [45]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

print("Creando AdaBoost con Decision Tree...")

best_depth = 0
best_r2 = 0

for i in range(5, 20):
    ada_boost_model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=i), n_estimators=200, random_state=42)
    ada_boost_model.fit(X_train_scaled, y_train)
    r2 = cross_val_score(ada_boost_model, X_train_scaled, y_train, cv=5, scoring='r2').mean()
    if r2 > best_r2:
        best_r2 = r2
        best_depth = i
    else:
        break

models['ABX-DT'] = AdaBoostRegressor(DecisionTreeRegressor(max_depth=best_depth), n_estimators=200, random_state=42)

r2_scores.append(('ABX-DT', best_r2))
print(f"AdaBoost DT: r2 = {best_r2}, depth = {best_depth}")

Creando AdaBoost con Decision Tree...
AdaBoost DT: r2 = 0.7606965359912212, depth = 11


In [10]:
r2_scores = sorted(r2_scores, key=lambda x: x[1], reverse=True)
r2_scores = [m for m in r2_scores if m[1] > 0.75]
r2_scores

[('GBX', 0.7674925196931456),
 ('ABX-DT', 0.7606340042547588),
 ('RF', 0.7576072992256915)]

In [12]:
top_models = dict()
for m in r2_scores:
    top_models[m[0]] = models[m[0]]
top_models

{'GBX': RandomForestRegressor(max_depth=6, n_estimators=200, random_state=42),
 'ABX-DT': AdaBoostRegressor(estimator=DecisionTreeRegressor(max_depth=11),
                   n_estimators=200, random_state=42),
 'RF': RandomForestRegressor(max_depth=16, n_estimators=200, random_state=42)}

In [16]:
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

print("Creando Voting...")
voting_model = VotingRegressor(estimators=list(top_models.items()))
voting_model.fit(X_train_scaled, y_train)
r2 = cross_val_score(voting_model, X_train_scaled, y_train, cv=5, scoring='r2').mean()

print(f"Voting: r2 = {r2}")

X_test_scaled = scaler.transform(X_test)
voting_preds = voting_model.predict(X_test_scaled)
r2_ensamble = r2_score(y_test, voting_preds)
rmse_ensamble = np.sqrt(mean_squared_error(y_test, voting_preds))

print(f"Voting: r2 = {r2_ensamble}, rmse = {rmse_ensamble}")

Creando Voting...
Voting: r2 = 0.7537507250601203
Voting: r2 = 0.7630196386479958, rmse = 39509.93901549374


In [48]:
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

print("Creando Stacking...")
stacking_model = StackingRegressor(estimators=list(top_models.items()), n_jobs=2)
stacking_model.fit(X_train_scaled, y_train)
r2 = cross_val_score(stacking_model, X_train_scaled, y_train, cv=5, scoring='r2').mean()

print(f"Voting: r2 = {r2}")

X_test_scaled = scaler.transform(X_test)
stacking_preds = stacking_model.predict(X_test_scaled)
r2_ensamble = r2_score(y_test, stacking_preds)
rmse_ensamble = np.sqrt(mean_squared_error(y_test, stacking_preds))

print(f"Voting: r2 = {r2_ensamble}, rmse = {rmse_ensamble}")

Creando Stacking...
Voting: r2 = 0.7636095395906397
Voting: r2 = 0.7853625661272382, rmse = 37601.304692903985
