<a href="https://colab.research.google.com/github/juliaronquetti/Modelo_predicao_SP/blob/main/4_imoveis_select_ML_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import time
# Wall-clock start of the notebook run; the final timing cell subtracts this value.
start_time = time.time()

In [None]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 100)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Base URL for the project's data files. The raw-content host is used because the
# regular github.com/<user>/<repo>/blob/... address serves an HTML page, which
# pd.read_csv cannot parse as CSV.
path = 'https://raw.githubusercontent.com/juliaronquetti/Modelo_predicao_SP/main/'


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, cross_val_predict, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso, HuberRegressor, RANSACRegressor, TheilSenRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error, make_scorer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold

from sklearn.metrics import root_mean_squared_error, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, median_absolute_error

import statsmodels.api as sm

In [None]:
def error_metrics(y_true, y_pred):
  """Compute the regression error metrics reported for a set of predictions.

  Parameters
  ----------
  y_true : array-like
      Observed target values.
  y_pred : array-like
      Predicted target values, aligned with ``y_true``.

  Returns
  -------
  dict
      Keys ``'R2'``, ``'RMSE'``, ``'MAE'``, ``'MedAE'`` and ``'MAPE'`` (in that
      order), each value rounded to 2 decimal places.
  """
  metric_functions = {
      'R2': r2_score,
      'RMSE': root_mean_squared_error,
      'MAE': mean_absolute_error,
      'MedAE': median_absolute_error,
      'MAPE': mean_absolute_percentage_error,
  }

  # np.round accepts both NumPy scalars and built-in floats; a `.round(2)` method
  # call would raise AttributeError whenever a metric returns a built-in float.
  return {
      name: np.round(metric(y_true, y_pred), 2)
      for name, metric in metric_functions.items()
  }

In [None]:
def cv_metrics(cv_results,grid_search):
  """Collect the mean cross-validated test scores of the selected candidate.

  Parameters
  ----------
  cv_results : mapping
      Must provide a ``'mean_test_<metric>'`` array for each of R2, RMSE, MAE,
      MedAE and MAPE (the layout of ``GridSearchCV.cv_results_``).
  grid_search : object
      Only its ``best_index_`` attribute is read; it selects the array position.

  Returns
  -------
  dict
      Metric name -> score at ``best_index_``, rounded to 2 decimal places.
  """
  chosen = grid_search.best_index_
  metric_names = ('R2', 'RMSE', 'MAE', 'MedAE', 'MAPE')

  return {
      name: cv_results['mean_test_' + name][chosen].round(2)
      for name in metric_names
  }

In [None]:
from IPython.display import display, Latex, HTML
import warnings

def gera_df_stats_latex(df_stats, titulo):
  """Render a table as a LaTeX ``array`` preceded by a bold title.

  Parameters
  ----------
  df_stats : pandas.DataFrame
      Table to render. Its index is not included in the output.
  titulo : str
      Title text, wrapped in a LaTeX ``textbf`` command and followed by a line break.

  Returns
  -------
  str
      LaTeX source where ``tabular`` is replaced by ``array`` and the
      ``toprule``/``midrule``/``bottomrule`` commands by ``hline``
      (presumably so the notebook's math renderer can display it).
  """
  # Silence warnings only while rendering: catch_warnings restores the caller's
  # warning filters on exit instead of overwriting them for the whole session.
  with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    latex_table = df_stats.to_latex(index=False)

  substitutions = (
      ('tabular', 'array'),
      ('toprule', 'hline'),
      ('midrule', 'hline'),
      ('bottomrule', 'hline'),
  )
  for original, replacement in substitutions:
    latex_table = latex_table.replace(original, replacement)

  return '\\textbf{' + titulo + '} \\\\' + latex_table

In [None]:
df = pd.read_csv(path + 'imoveis_clean_encoded.csv')

# Work on a reproducible random subset so the model comparison runs quickly.
# Rows are drawn by position (which is what .iloc expects) and the sample size is
# capped at the number of available rows, so a smaller file does not raise.
np.random.seed(42)
sample_size = min(1000, len(df))
df = df.iloc[ np.random.choice(len(df), size=sample_size, replace=False) ]

# select a single zone
# df = df[ df.zona_SUL == 1 ].reset_index(drop=True)
# df.drop(columns=['zona_LESTE','zona_NORTE','zona_OESTE','zona_SUL'])

# select a number of bedrooms
# df = df[ df.quartos == 3 ].reset_index(drop=True)
# df.drop(columns=['quartos'])

df.head()

Unnamed: 0,title,price,location,destaque,condominio,area_util,quartos,banheiros,vagas_na_garagem,iptu,academia,elevador,permitido_animais,piscina,portaria,salao_de_festas,condominio_fechado,seguranca_24h,portao_eletronico,area_murada,area_de_servico,armarios_na_cozinha,armarios_no_quarto,churrasqueira,mobiliado,quarto_de_servico,ar_condicionado,porteiro_24h,varanda,img_index,categoria_Casas,tipo_Casa de vila,tipo_Cobertura,tipo_Duplex ou triplex,tipo_Kitnet,tipo_Loft,tipo_Padrão,zona_LESTE,zona_NORTE,zona_OESTE,zona_SUL
17024,"Apartamento à venda em Pinheiros com 69 m², 2 ...",1795000,491,0,1000,69,2,2,1,10,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,20798,0,0,0,0,0,0,1,0,0,1,0
2910,Apartamento à venda Rua Doutor Fabrício Vampré...,2935000,700,0,1420,153,3,3,2,13,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,3454,0,0,0,0,0,0,1,0,0,0,1
344,Apartamento para venda possui 70 metros quadra...,360000,345,0,800,70,2,1,1,150,1,1,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,449,0,0,0,0,0,0,1,0,1,0,0
1034,"APARTAMENTO RESIDENCIAL em São Paulo - SP, Cam...",1090000,25,0,1850,165,3,4,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1271,0,0,0,0,0,0,1,0,0,0,0
5466,Casa Térrea na Casa Verde - Terreno 9x30m²,600000,565,0,0,160,3,2,2,215,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6474,1,0,0,0,0,0,1,0,1,0,0


# Machine Learning Models

In [None]:
%%time
# Some GridSearchCV definitions
# scoring: one scorer per reported metric. Every scorer keeps make_scorer's default
# greater_is_better=True, so cv_results_ stores the plain (positive) metric values.
# root_mean_squared_error replaces mean_squared_error(squared=False): the `squared`
# argument is deprecated and no longer accepted by recent scikit-learn releases.
scoring = {
    'R2': make_scorer(r2_score),
    'RMSE': make_scorer(root_mean_squared_error),
    'MAE': make_scorer(mean_absolute_error),
    'MedAE': make_scorer(median_absolute_error),
    'MAPE': make_scorer(mean_absolute_percentage_error),
}

# CV k folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

results_dict = {} # starts results history empty

CPU times: user 1.25 ms, sys: 36 µs, total: 1.29 ms
Wall time: 1.29 ms


## GridSearchCV definitions and start results history

In [None]:
%%time
# Compare several regressors on the same train/test split.
# For each model: grid-search with k-fold CV on the training set, score the refit
# model on the held-out test set, then refit on all rows and score in-sample.

# Build the modelling frame without mutating `df` (text and image-id columns removed).
df_model = df.drop(columns=['title','img_index'])

# if you need to use a small sample
# np.random.seed(42)
# df_model = df_model.iloc[ np.random.choice(df.index, size=1000, replace=False) ]

X = df_model.drop('price', axis=1)
y = df_model['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# numeric transformer = StandardScaler
numeric_features = X.select_dtypes(include=['number']).columns.tolist()
numeric_transformer = StandardScaler()

# combine transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ])

modelos = {
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=1000, random_state=42),
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=1000, random_state=42),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'HuberRegressor': HuberRegressor(max_iter=1000),
    'RANSACRegressor': RANSACRegressor(),
    'TheilSenRegressor': TheilSenRegressor()
}

# model parameters, written with each estimator's own parameter names
# (the pipeline step prefix is added inside the loop)
param_grid = {
    'Decision Tree': {},
    'Random Forest': {},
    'Linear Regression': {},
    'Ridge Regression': {},
    'Lasso Regression': {},
    'Gradient Boosting': {},
    'K-Nearest Neighbors': {'n_neighbors':range(5,11)},
    'HuberRegressor': {},
    'RANSACRegressor': {},
    'TheilSenRegressor': {}
}


def _lowest_rmse_index(cv_results):
  """Return the index of the candidate with the lowest mean CV RMSE.

  Used as GridSearchCV's `refit` callable: the scorers report plain positive
  errors, so refit='RMSE' would select the candidate with the HIGHEST error.
  """
  return int(np.nanargmin(cv_results['mean_test_RMSE']))


# GridSearchCV
for model_name, estimator in modelos.items():

  # pipeline with preprocessing and model, so the scaler is actually applied
  # (and fitted inside each CV fold only)
  model = make_pipeline(preprocessor, estimator)

  # make_pipeline names steps automatically; prefix grid keys with the final step name
  step_name = model.steps[-1][0]
  pipeline_grid = {
      f'{step_name}__{param}': values
      for param, values in param_grid[model_name].items()
  }

  grid_search = GridSearchCV(
      estimator=model,
      param_grid=pipeline_grid,
      cv=kf,
      scoring=scoring,
      refit=_lowest_rmse_index,
      return_train_score=True,
      n_jobs=-1  # use every processor core
  )

  # train GridSearchCV
  grid_search.fit(X_train, y_train)

  # Print the best parameters found
  # print("Melhores parâmetros:", grid_search.best_params_)
  # print()

  results_dict[model_name + ' (CV)'] = cv_metrics(grid_search.cv_results_,grid_search)

  y_pred = grid_search.predict(X_test)
  results_dict[model_name + ' (test)'] = error_metrics(y_test, y_pred)

  # NOTE: the '(all)' row is an in-sample score: the search is refit on every row
  # and evaluated on those same rows, so it measures fit, not generalisation.
  grid_search.fit(X, y)
  y_pred = grid_search.predict(X)
  results_dict[model_name + ' (all)'] = error_metrics(y, y_pred)

# show results
results_df = pd.DataFrame.from_dict(results_dict, orient='index', columns=['R2', 'RMSE', 'MAE', 'MedAE', 'MAPE'])
display(results_df.reset_index().rename(columns={'index':'Model'}).style.set_properties(**{'text-align': 'left'}).format({
    'R2': "{:.2f}",
    'RMSE': "{:.2f}",
    'MAE': "{:.2f}",
    'MedAE': "{:.2f}",
    'MAPE': "{:.2f}"
}))

print()



  _data = np.array(data, dtype=dtype, copy=copy,
  _data = np.array(data, dtype=dtype, copy=copy,


Unnamed: 0,Model,R2,RMSE,MAE,MedAE,MAPE
0,Decision Tree (CV),0.27,546747.59,358915.55,193695.2,0.41
1,Decision Tree (test),0.42,512713.7,347768.65,195000.5,0.41
2,Decision Tree (all),1.0,8.89,0.6,0.0,0.0
3,Random Forest (CV),0.66,373052.16,255488.21,158293.09,0.3
4,Random Forest (test),0.76,329438.77,222056.43,145815.74,0.27
5,Random Forest (all),0.96,130296.62,88421.16,57367.88,0.1
6,Linear Regression (CV),0.65,380773.76,273151.41,195334.6,0.34
7,Linear Regression (test),0.7,368234.29,267086.12,202154.93,0.36
8,Linear Regression (all),0.7,357826.18,255581.45,186409.49,0.33
9,Ridge Regression (CV),0.65,380096.23,271983.86,193974.46,0.34



CPU times: user 31.1 s, sys: 2.62 s, total: 33.7 s
Wall time: 2min 41s


In [None]:
# Rank every (model, evaluation) row from lowest to highest RMSE.
results_df.sort_values('RMSE')

Unnamed: 0,R2,RMSE,MAE,MedAE,MAPE
Decision Tree (all),1.0,8.89,0.6,0.0,0.0
Gradient Boosting (all),0.99,71283.11,50316.73,35095.12,0.07
Random Forest (all),0.96,130296.62,88421.16,57367.88,0.1
Random Forest (test),0.76,329438.77,222056.43,145815.74,0.27
Gradient Boosting (test),0.75,336267.52,225367.68,146332.97,0.27
Linear Regression (all),0.7,357826.18,255581.45,186409.49,0.33
Lasso Regression (all),0.7,357826.19,255582.16,186382.17,0.33
Ridge Regression (all),0.7,358061.32,255868.6,185571.47,0.33
TheilSenRegressor (test),0.72,360653.48,261509.57,208875.59,0.35
TheilSenRegressor (all),0.69,360679.77,255383.71,182311.63,0.32


In [None]:
# results_df keeps the model names in its index, which gera_df_stats_latex does not
# render; move them into a regular 'Model' column so every table row stays labelled.
latex_table = gera_df_stats_latex(results_df.reset_index().rename(columns={'index':'Model'}),'Results')
display(Latex(latex_table))


<IPython.core.display.Latex object>

In [None]:
# Total wall-clock time elapsed since `start_time` was recorded.
end_time = time.time()
execution_time = end_time - start_time

# Report the duration in minutes ("Tempo de execução" = "Execution time").
print(f"Tempo de execução: {execution_time/60:.2f} min")

Tempo de execução: 3.01 min


  and should_run_async(code)


# Save results

In [None]:
from google.colab import files

# The model names live in the DataFrame index; write them out as a 'Model' column
# so the rows of the exported file remain identifiable.
results_df.to_csv('imoveis_results_ML_selection.csv', index_label='Model')
files.download('imoveis_results_ML_selection.csv')



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>