# Importando bibliotecas

In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyRegressor
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler

# Funções

In [95]:
# Funções   

def train(property_dfs,tipo):

    # particionando dados 

    df = property_dfs[tipo]

    X = df.drop(columns='Preço')
    y = df['Preço']

    # testa Lazy predict armazenando um vetor de todos os valores, a media, e o desvio padrao

    dict_result = {}

    for i in range(33):
            
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

        reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric= mean_absolute_percentage_error)
        models, _ = reg.fit(X_train, X_test, y_train, y_test)

        for model_name, methods in models.iterrows():

            if model_name not in dict_result:
                dict_result[model_name] = {}
                
            for met, val in methods.items():
                if met not in dict_result[model_name]:
                    dict_result[model_name][met] = {
                        "mean": [],
                        "media": 0,
                        "std": 0
                    }

                dict_result[model_name][met]["mean"].append(val)
    
    return dict_result    


# Normalização

In [89]:
def norm_min_max(df):

    columns_to_normalize = [col for col in df.columns if col != "Tipo" and df[col].dtype != object]

    for col in columns_to_normalize :
        min_values = df[columns_to_normalize].min()
        max_values = df[columns_to_normalize].max()        
    # min_values = df[columns_to_normalize].min()
    # max_values = df[columns_to_normalize].max()
        df[columns_to_normalize] = (df[columns_to_normalize] - min_values) / (max_values - min_values)
    
    return df

# Formatação dos dados

In [92]:
#Le e trata a database

tabela = '/home/igor/Documentos/stemis/Api_Vitrine/webscraping/Web Scraping_Setor1_Completo_Lat_Long.xlsx'
df = pd.read_excel(tabela)

df.drop(df.columns[df.columns.str.contains('unnamed', case=False)], axis=1, inplace=True)
df = df.drop(['ID', 'Cidade', 'Bairro'], axis=1)
df['Tipo'] = df['Tipo'].replace({'Ã£': 'ã', 'Ã©': 'é', 'Ã_x0081_': 'Á'}, regex=True)
df = df.drop(df[df['Tipo'].isin(['Salão', 'Terreno', 'Prédio', 'Sobrado', 'Flat', 'Ponto', 'Área', 'Laje'])].index)
df['Tipo'] = df['Tipo'].replace({'Kitnet': 'Studio'}, regex=True)

money_col = df[['Preço', 'Condomínio', 'IPTU']]
for col in money_col:
    df[col] = df[col].astype(str).apply(lambda x: x.replace('.', ''))
    df[col] = df[col].replace({'R\$ ': '', ',': '.'}, regex=True).astype('float64')
int_col = df[['Quartos', 'Banheiros', 'Vagas']]
for col in int_col:
    df[col] = df[col].astype(int)
df = df.reset_index(drop=True)

df = df.drop_duplicates()

property_types = df['Tipo'].unique()

property_dfs = {}

for property_type in property_types:
    property_dfs[property_type] = norm_min_max(df[df['Tipo'] == property_type])

print(property_dfs)


{'Studio':        Tipo   m2  Quartos  Banheiros  Vagas  Preço  Condomínio  IPTU  \
0    Studio 0.06     0.20       0.25   0.00   0.10        0.10  0.00   
47   Studio 0.12     0.40       0.25   0.33   0.07        0.20  0.26   
50   Studio 0.14     0.40       0.25   0.33   0.09        0.21  0.34   
72   Studio 0.14     0.80       0.50   0.00   0.18        0.00  0.37   
85   Studio 0.05     0.20       0.25   0.00   0.00        0.00  0.05   
129  Studio 0.26     1.00       0.75   1.00   0.36        0.00  0.60   
130  Studio 0.12     0.40       0.25   0.33   0.04        0.10  0.32   
168  Studio 0.66     1.00       1.00   1.00   0.48        0.08  0.70   
175  Studio 0.41     0.80       1.00   0.67   0.27        0.52  0.82   
178  Studio 0.07     0.40       0.25   0.00   0.06        0.12  0.19   
180  Studio 0.14     0.40       0.50   0.33   0.12        0.36  0.37   
190  Studio 0.08     0.20       0.25   0.00   0.04        0.23  0.27   
262  Studio 0.10     0.20       0.25   0.00   0.02   

# Apartamento

In [96]:
results_dict = train(property_dfs,"Apartamento")

results_dict_format = {}

#imprime
# for model in results_dict:
#     results_dict_format[model]={}
#     for metric in results_dict[model]:
#         mean = np.mean(results_dict[model][metric]["mean"])
#         std = np.std(results_dict[model][metric]["mean"])
#         results_dict_format[model][metric + "_media"] = mean
#         results_dict_format[model][metric + "_std"] = std

# df = pd.DataFrame(results_dict_format)
# df.transpose().sort_values(by=["mean_absolute_percentage_error_media"])

for model in results_dict:
    results_dict_format[model] = {}
    for metric in results_dict[model]:
        
        mean = np.mean(results_dict[model][metric]["mean"])   
        std = np.std(results_dict[model][metric]["mean"])
        std_str = "{:.2f}".format(std)  # Desvio padrão com 2 casas decimais
        mean_str = "{:.2f}".format(mean)  # Média com 2 casas decimais
        mean_std_str = mean_str + " +/- " + std_str  # Média +/- Desvio padrão
            
        results_dict_format[model][metric + "_media"] = mean_std_str
        
            

df = pd.DataFrame(results_dict_format)
df_transposed = df.transpose().sort_values(by=["mean_absolute_percentage_error_media"])
df_transposed.rename(columns={"mean_absolute_percentage_error_media": "MAPE_media"}, inplace=True)
df_transposed

100%|██████████| 42/42 [00:05<00:00,  7.95it/s]
100%|██████████| 42/42 [00:05<00:00,  7.33it/s]
100%|██████████| 42/42 [00:05<00:00,  7.48it/s]
100%|██████████| 42/42 [00:05<00:00,  7.82it/s]
100%|██████████| 42/42 [00:05<00:00,  7.96it/s]
100%|██████████| 42/42 [00:05<00:00,  7.59it/s]
100%|██████████| 42/42 [00:05<00:00,  7.60it/s]
100%|██████████| 42/42 [00:05<00:00,  7.72it/s]
100%|██████████| 42/42 [00:05<00:00,  7.45it/s]
100%|██████████| 42/42 [00:05<00:00,  7.91it/s]
100%|██████████| 42/42 [00:05<00:00,  8.05it/s]
100%|██████████| 42/42 [00:05<00:00,  7.66it/s]
100%|██████████| 42/42 [00:05<00:00,  7.51it/s]
100%|██████████| 42/42 [00:05<00:00,  7.95it/s]
100%|██████████| 42/42 [00:05<00:00,  7.70it/s]
100%|██████████| 42/42 [00:05<00:00,  7.84it/s]
100%|██████████| 42/42 [00:05<00:00,  7.61it/s]
100%|██████████| 42/42 [00:05<00:00,  7.68it/s]
100%|██████████| 42/42 [00:05<00:00,  8.05it/s]
100%|██████████| 42/42 [00:05<00:00,  7.82it/s]
100%|██████████| 42/42 [00:05<00:00,  7.

Unnamed: 0,Adjusted R-Squared_media,R-Squared_media,RMSE_media,Time Taken_media,MAPE_media
SVR,-0.51 +/- 0.69,-0.42 +/- 0.65,0.08 +/- 0.01,0.01 +/- 0.00,1043414036729.47 +/- 1583347136039.85
QuantileRegressor,-0.18 +/- 0.03,-0.10 +/- 0.03,0.08 +/- 0.02,2.53 +/- 0.16,135176221154.91 +/- 205412993983.52
GaussianProcessRegressor,-580.14 +/- 1576.10,-545.27 +/- 1481.54,0.95 +/- 1.35,0.01 +/- 0.00,14234400075465.40 +/- 34663722326635.12
LassoLarsIC,0.16 +/- 0.76,0.21 +/- 0.71,0.07 +/- 0.05,0.02 +/- 0.01,152659081191.66 +/- 249800076341.50
SGDRegressor,0.26 +/- 0.58,0.31 +/- 0.54,0.06 +/- 0.04,0.01 +/- 0.00,157718041499.48 +/- 263167016663.99
OrthogonalMatchingPursuitCV,-0.01 +/- 0.95,0.05 +/- 0.89,0.07 +/- 0.05,0.01 +/- 0.01,161262505346.04 +/- 287675041997.66
KernelRidge,0.15 +/- 0.77,0.20 +/- 0.72,0.07 +/- 0.05,0.01 +/- 0.00,161461519157.31 +/- 263953581198.28
LinearRegression,0.14 +/- 0.78,0.19 +/- 0.73,0.07 +/- 0.05,0.01 +/- 0.01,162253382638.30 +/- 265171657297.17
TransformedTargetRegressor,0.14 +/- 0.78,0.19 +/- 0.73,0.07 +/- 0.05,0.01 +/- 0.00,162253382638.30 +/- 265171657297.17
Lars,0.14 +/- 0.78,0.19 +/- 0.73,0.07 +/- 0.05,0.01 +/- 0.01,162253382638.30 +/- 265171657297.17


# Studio   

In [49]:
results_dict = train(property_dfs,"Studio")

results_dict_format = {}

#imprime
for model in results_dict:
    results_dict_format[model]={}
    for metric in results_dict[model]:
        mean = np.mean(results_dict[model][metric]["mean"])
        std = np.std(results_dict[model][metric]["mean"])
        results_dict_format[model][metric + "_media"] = mean
        results_dict_format[model][metric + "_std"] = std

df = pd.DataFrame(results_dict_format)
df.transpose().sort_values(by=["mean_absolute_percentage_error_media"])

100%|██████████| 42/42 [00:00<00:00, 45.91it/s]
100%|██████████| 42/42 [00:00<00:00, 46.95it/s]
100%|██████████| 42/42 [00:00<00:00, 47.13it/s]
100%|██████████| 42/42 [00:00<00:00, 48.97it/s]
100%|██████████| 42/42 [00:00<00:00, 46.33it/s]
100%|██████████| 42/42 [00:00<00:00, 43.38it/s]
100%|██████████| 42/42 [00:00<00:00, 43.97it/s]
100%|██████████| 42/42 [00:00<00:00, 43.34it/s]
100%|██████████| 42/42 [00:00<00:00, 47.31it/s]
100%|██████████| 42/42 [00:00<00:00, 54.18it/s]
100%|██████████| 42/42 [00:00<00:00, 46.99it/s]
100%|██████████| 42/42 [00:00<00:00, 51.86it/s]
100%|██████████| 42/42 [00:00<00:00, 53.35it/s]
100%|██████████| 42/42 [00:00<00:00, 50.81it/s]
100%|██████████| 42/42 [00:00<00:00, 51.24it/s]
100%|██████████| 42/42 [00:00<00:00, 48.92it/s]
100%|██████████| 42/42 [00:00<00:00, 49.42it/s]
100%|██████████| 42/42 [00:00<00:00, 46.45it/s]
100%|██████████| 42/42 [00:00<00:00, 55.34it/s]
100%|██████████| 42/42 [00:00<00:00, 42.28it/s]
100%|██████████| 42/42 [00:00<00:00, 44.

Unnamed: 0,Adjusted R-Squared_media,Adjusted R-Squared_std,R-Squared_media,R-Squared_std,RMSE_media,RMSE_std,Time Taken_media,Time Taken_std,mean_absolute_percentage_error_media,mean_absolute_percentage_error_std
KNeighborsRegressor,-1.73,1.44,0.22,0.41,1847.1,989.23,0.01,0.0,0.37,0.1
SVR,-3.31,0.42,-0.23,0.12,2322.84,1032.87,0.01,0.0,0.42,0.08
QuantileRegressor,-3.32,0.42,-0.23,0.12,2324.0,1032.92,0.02,0.01,0.42,0.08
AdaBoostRegressor,-1.82,1.93,0.19,0.55,1687.82,807.63,0.07,0.01,0.46,0.16
XGBRegressor,-1.34,2.07,0.33,0.59,1512.97,791.11,0.16,0.04,0.46,0.14
RandomForestRegressor,-1.6,1.69,0.26,0.48,1643.14,783.72,0.05,0.01,0.49,0.16
BaggingRegressor,-1.56,1.47,0.27,0.42,1639.39,757.92,0.01,0.0,0.5,0.17
GradientBoostingRegressor,-2.13,2.91,0.11,0.83,1583.86,685.31,0.03,0.01,0.52,0.21
ExtraTreesRegressor,-1.88,3.32,0.18,0.95,1570.33,707.8,0.04,0.0,0.52,0.16
PassiveAggressiveRegressor,-2.03,3.66,0.13,1.05,1612.77,767.19,0.01,0.0,0.56,0.14


# Galpão

In [36]:
results_dict = train(property_dfs,"Galpão")

results_dict_format = {}

#imprime
for model in results_dict:
    results_dict_format[model]={}
    for metric in results_dict[model]:
        mean = np.mean(results_dict[model][metric]["mean"])
        std = np.std(results_dict[model][metric]["mean"])
        results_dict_format[model][metric + "_media"] = mean
        results_dict_format[model][metric + "_std"] = std

df = pd.DataFrame(results_dict_format)
df.transpose().sort_values(by=["mean_absolute_percentage_error_media"])

  0%|          | 0/42 [00:00<?, ?it/s]

100%|██████████| 42/42 [00:01<00:00, 41.02it/s]
100%|██████████| 42/42 [00:00<00:00, 45.72it/s]
100%|██████████| 42/42 [00:00<00:00, 44.08it/s]
100%|██████████| 42/42 [00:00<00:00, 45.70it/s]
100%|██████████| 42/42 [00:00<00:00, 54.17it/s]
100%|██████████| 42/42 [00:00<00:00, 64.35it/s]
100%|██████████| 42/42 [00:00<00:00, 64.33it/s]
100%|██████████| 42/42 [00:00<00:00, 60.67it/s]
100%|██████████| 42/42 [00:00<00:00, 54.86it/s]
100%|██████████| 42/42 [00:00<00:00, 51.05it/s]
100%|██████████| 42/42 [00:00<00:00, 42.80it/s]
100%|██████████| 42/42 [00:01<00:00, 41.80it/s]
100%|██████████| 42/42 [00:00<00:00, 48.52it/s]
100%|██████████| 42/42 [00:00<00:00, 53.50it/s]
100%|██████████| 42/42 [00:01<00:00, 40.91it/s]
100%|██████████| 42/42 [00:00<00:00, 52.50it/s]
100%|██████████| 42/42 [00:00<00:00, 42.38it/s]
100%|██████████| 42/42 [00:00<00:00, 44.51it/s]
100%|██████████| 42/42 [00:00<00:00, 45.95it/s]
100%|██████████| 42/42 [00:00<00:00, 60.15it/s]
100%|██████████| 42/42 [00:00<00:00, 52.

Unnamed: 0,Adjusted R-Squared_media,Adjusted R-Squared_std,R-Squared_media,R-Squared_std,RMSE_media,RMSE_std,Time Taken_media,Time Taken_std,mean_absolute_percentage_error_media,mean_absolute_percentage_error_std
SVR,10.91,1.39,-0.24,0.17,13686.04,10366.97,0.01,0.0,0.9,0.56
LinearSVR,14.06,6.37,-0.63,0.8,14149.36,10329.43,0.01,0.0,0.96,0.01
QuantileRegressor,10.99,1.79,-0.25,0.22,13666.09,10362.1,0.02,0.01,0.96,0.55
MLPRegressor,14.18,6.72,-0.65,0.84,14131.74,10282.04,0.03,0.01,0.99,0.0
RANSACRegressor,14.54,25.06,-0.69,3.13,12267.24,9577.72,0.04,0.02,1.25,0.83
PassiveAggressiveRegressor,14.95,36.61,-0.74,4.58,11097.55,8106.82,0.01,0.0,1.28,0.91
NuSVR,14.26,15.83,-0.66,1.98,13561.15,10250.98,0.01,0.0,1.45,0.73
HuberRegressor,22.63,63.26,-1.7,7.91,12960.75,9914.28,0.02,0.01,1.51,1.15
AdaBoostRegressor,44.15,145.78,-4.39,18.22,13474.27,9363.44,0.06,0.02,1.68,2.68
GaussianProcessRegressor,242.47,721.66,-29.18,90.21,16908.76,7940.98,0.01,0.0,2.69,2.43


# Casa

In [38]:
results_dict = train(property_dfs,"Casa")

results_dict_format = {}

#imprime
for model in results_dict:
    results_dict_format[model]={}
    for metric in results_dict[model]:
        mean = np.mean(results_dict[model][metric]["mean"])
        std = np.std(results_dict[model][metric]["mean"])
        results_dict_format[model][metric + "_media"] = mean
        results_dict_format[model][metric + "_std"] = std

df = pd.DataFrame(results_dict_format)
df.transpose().sort_values(by=["mean_absolute_percentage_error_media"])

100%|██████████| 42/42 [00:00<00:00, 42.49it/s]
100%|██████████| 42/42 [00:01<00:00, 37.71it/s]
100%|██████████| 42/42 [00:01<00:00, 37.38it/s]
100%|██████████| 42/42 [00:01<00:00, 39.60it/s]
100%|██████████| 42/42 [00:01<00:00, 38.03it/s]
100%|██████████| 42/42 [00:01<00:00, 41.80it/s]
100%|██████████| 42/42 [00:00<00:00, 43.73it/s]
100%|██████████| 42/42 [00:01<00:00, 36.32it/s]
100%|██████████| 42/42 [00:01<00:00, 39.48it/s]
100%|██████████| 42/42 [00:01<00:00, 40.62it/s]
100%|██████████| 42/42 [00:01<00:00, 41.42it/s]
100%|██████████| 42/42 [00:00<00:00, 42.19it/s]
100%|██████████| 42/42 [00:00<00:00, 43.07it/s]
100%|██████████| 42/42 [00:00<00:00, 43.04it/s]
100%|██████████| 42/42 [00:00<00:00, 49.17it/s]
100%|██████████| 42/42 [00:00<00:00, 43.23it/s]
100%|██████████| 42/42 [00:00<00:00, 47.01it/s]
100%|██████████| 42/42 [00:00<00:00, 42.94it/s]
100%|██████████| 42/42 [00:00<00:00, 43.38it/s]
100%|██████████| 42/42 [00:01<00:00, 41.53it/s]
100%|██████████| 42/42 [00:01<00:00, 41.

Unnamed: 0,Adjusted R-Squared_media,Adjusted R-Squared_std,R-Squared_media,R-Squared_std,RMSE_media,RMSE_std,Time Taken_media,Time Taken_std,mean_absolute_percentage_error_media,mean_absolute_percentage_error_std
ExtraTreesRegressor,0.34,0.62,0.62,0.36,923.8,483.75,0.04,0.0,0.35,0.06
RandomForestRegressor,0.34,0.41,0.62,0.23,942.49,418.87,0.06,0.0,0.36,0.06
BaggingRegressor,0.29,0.37,0.6,0.21,980.03,423.7,0.01,0.0,0.37,0.06
KNeighborsRegressor,-0.43,0.84,0.18,0.48,1406.88,628.54,0.01,0.0,0.38,0.05
GradientBoostingRegressor,0.19,0.56,0.54,0.32,1001.55,353.47,0.04,0.01,0.38,0.06
XGBRegressor,0.3,0.66,0.6,0.38,900.22,291.36,0.17,0.03,0.38,0.07
AdaBoostRegressor,0.26,0.44,0.58,0.25,979.58,380.53,0.07,0.01,0.38,0.05
PassiveAggressiveRegressor,-0.15,0.97,0.34,0.55,1265.58,702.98,0.01,0.0,0.42,0.09
DecisionTreeRegressor,-0.31,1.53,0.25,0.87,1188.12,478.68,0.01,0.0,0.43,0.09
SVR,-1.04,0.19,-0.17,0.11,1767.56,694.69,0.01,0.0,0.44,0.05


# Sala

In [39]:
results_dict = train(property_dfs,"Sala")

results_dict_format = {}

#imprime
for model in results_dict:
    results_dict_format[model]={}
    for metric in results_dict[model]:
        mean = np.mean(results_dict[model][metric]["mean"])
        std = np.std(results_dict[model][metric]["mean"])
        results_dict_format[model][metric + "_media"] = mean
        results_dict_format[model][metric + "_std"] = std

df = pd.DataFrame(results_dict_format)
df.transpose().sort_values(by=["mean_absolute_percentage_error_media"])

100%|██████████| 42/42 [00:01<00:00, 39.33it/s]
100%|██████████| 42/42 [00:00<00:00, 46.21it/s]
100%|██████████| 42/42 [00:01<00:00, 39.77it/s]
100%|██████████| 42/42 [00:00<00:00, 43.45it/s]
100%|██████████| 42/42 [00:00<00:00, 43.25it/s]
100%|██████████| 42/42 [00:01<00:00, 40.66it/s]
100%|██████████| 42/42 [00:00<00:00, 42.78it/s]
100%|██████████| 42/42 [00:01<00:00, 39.64it/s]
100%|██████████| 42/42 [00:01<00:00, 41.12it/s]
100%|██████████| 42/42 [00:01<00:00, 38.70it/s]
100%|██████████| 42/42 [00:01<00:00, 41.45it/s]
100%|██████████| 42/42 [00:01<00:00, 37.64it/s]
100%|██████████| 42/42 [00:00<00:00, 48.89it/s]
100%|██████████| 42/42 [00:01<00:00, 39.38it/s]
100%|██████████| 42/42 [00:01<00:00, 37.22it/s]
100%|██████████| 42/42 [00:01<00:00, 39.37it/s]
100%|██████████| 42/42 [00:01<00:00, 37.35it/s]
100%|██████████| 42/42 [00:01<00:00, 39.12it/s]
100%|██████████| 42/42 [00:00<00:00, 48.41it/s]
100%|██████████| 42/42 [00:00<00:00, 44.30it/s]
100%|██████████| 42/42 [00:01<00:00, 38.

Unnamed: 0,Adjusted R-Squared_media,Adjusted R-Squared_std,R-Squared_media,R-Squared_std,RMSE_media,RMSE_std,Time Taken_media,Time Taken_std,mean_absolute_percentage_error_media,mean_absolute_percentage_error_std
ExtraTreesRegressor,0.19,0.75,0.52,0.45,2575.85,1090.0,0.04,0.01,0.48,0.13
KNeighborsRegressor,-0.01,0.38,0.4,0.22,3157.42,1527.32,0.01,0.01,0.54,0.12
GradientBoostingRegressor,-0.32,2.01,0.22,1.18,2993.19,1015.45,0.03,0.01,0.54,0.21
RandomForestRegressor,0.17,0.67,0.51,0.4,2679.44,1167.31,0.06,0.01,0.55,0.18
BaggingRegressor,0.15,0.69,0.5,0.41,2691.35,1241.18,0.02,0.01,0.56,0.19
XGBRegressor,0.0,0.83,0.41,0.49,2847.73,1050.39,0.18,0.04,0.56,0.21
SVR,-1.01,0.15,-0.18,0.09,4376.95,1578.85,0.01,0.0,0.59,0.11
QuantileRegressor,-1.01,0.15,-0.19,0.09,4378.47,1578.99,0.03,0.01,0.59,0.11
DecisionTreeRegressor,-0.67,3.24,0.01,1.91,3078.09,1255.5,0.01,0.01,0.62,0.24
ExtraTreeRegressor,-1.03,3.22,-0.2,1.91,3360.41,1146.65,0.01,0.0,0.63,0.23


# Cobertura

In [43]:
results_dict = train(property_dfs,"Cobertura")

results_dict_format = {}

#imprime
for model in results_dict:
    results_dict_format[model]={}
    for metric in results_dict[model]:
        mean = np.mean(results_dict[model][metric]["mean"])
        std = np.std(results_dict[model][metric]["mean"])
        results_dict_format[model][metric + "_media"] = mean
        results_dict_format[model][metric + "_std"] = std

df = pd.DataFrame(results_dict_format)
df.transpose().sort_values(by=["mean_absolute_percentage_error_media"])

  0%|          | 0/42 [00:00<?, ?it/s]

100%|██████████| 42/42 [00:00<00:00, 49.27it/s]
100%|██████████| 42/42 [00:00<00:00, 49.00it/s]
100%|██████████| 42/42 [00:00<00:00, 61.74it/s]
100%|██████████| 42/42 [00:00<00:00, 63.28it/s]
100%|██████████| 42/42 [00:00<00:00, 59.02it/s]
100%|██████████| 42/42 [00:00<00:00, 74.81it/s]
100%|██████████| 42/42 [00:00<00:00, 71.13it/s]
100%|██████████| 42/42 [00:00<00:00, 52.61it/s]
100%|██████████| 42/42 [00:00<00:00, 61.71it/s]
100%|██████████| 42/42 [00:00<00:00, 55.44it/s]
100%|██████████| 42/42 [00:00<00:00, 60.76it/s]
100%|██████████| 42/42 [00:00<00:00, 58.10it/s]
100%|██████████| 42/42 [00:00<00:00, 51.74it/s]
100%|██████████| 42/42 [00:00<00:00, 58.27it/s]
100%|██████████| 42/42 [00:00<00:00, 50.36it/s]
100%|██████████| 42/42 [00:00<00:00, 47.10it/s]
100%|██████████| 42/42 [00:00<00:00, 60.30it/s]
100%|██████████| 42/42 [00:00<00:00, 59.33it/s]
100%|██████████| 42/42 [00:00<00:00, 55.88it/s]
100%|██████████| 42/42 [00:00<00:00, 60.21it/s]
100%|██████████| 42/42 [00:00<00:00, 55.

Unnamed: 0,Adjusted R-Squared_media,Adjusted R-Squared_std,R-Squared_media,R-Squared_std,RMSE_media,RMSE_std,Time Taken_media,Time Taken_std,mean_absolute_percentage_error_media,mean_absolute_percentage_error_std
ExtraTreesRegressor,2.13,1.48,-1.25,2.97,1258.7,1040.66,0.04,0.0,0.42,0.15
SVR,2.04,1.24,-1.08,2.49,1418.23,1098.02,0.01,0.0,0.44,0.17
QuantileRegressor,2.07,1.26,-1.14,2.53,1420.95,1091.1,0.01,0.0,0.45,0.17
AdaBoostRegressor,2.47,2.02,-1.95,4.04,1457.66,1077.36,0.06,0.02,0.46,0.19
ExtraTreeRegressor,2.63,2.08,-2.26,4.17,1525.73,1109.2,0.01,0.0,0.47,0.16
KNeighborsRegressor,3.85,6.68,-4.7,13.35,1483.34,1112.98,0.01,0.0,0.55,0.4
NuSVR,2.97,2.81,-2.93,5.62,1511.23,984.0,0.01,0.0,0.57,0.25
RandomForestRegressor,5.25,11.15,-7.49,22.3,1500.07,1007.55,0.06,0.01,0.63,0.42
GammaRegressor,2.98,2.48,-2.96,4.95,1766.48,1600.65,0.01,0.0,0.64,0.43
XGBRegressor,6.27,11.53,-9.54,23.07,1854.57,1100.04,0.08,0.03,0.64,0.29


# Loja

In [57]:
results_dict = train(property_dfs,"Loja")

results_dict_format = {}

#imprime
for model in results_dict:
    results_dict_format[model]={}
    for metric in results_dict[model]:
        mean = np.mean(results_dict[model][metric]["mean"])
        std = np.std(results_dict[model][metric]["mean"])
        results_dict_format[model][metric + "_media"] = mean
        results_dict_format[model][metric + "_std"] = std

df = pd.DataFrame(results_dict_format)
df.transpose().sort_values(by=["mean_absolute_percentage_error_media"])

100%|██████████| 42/42 [00:01<00:00, 24.10it/s]
100%|██████████| 42/42 [00:01<00:00, 25.68it/s]
100%|██████████| 42/42 [00:01<00:00, 32.27it/s]
100%|██████████| 42/42 [00:01<00:00, 25.95it/s]
100%|██████████| 42/42 [00:01<00:00, 25.65it/s]
100%|██████████| 42/42 [00:01<00:00, 24.37it/s]
100%|██████████| 42/42 [00:01<00:00, 29.68it/s]
100%|██████████| 42/42 [00:01<00:00, 33.56it/s]
100%|██████████| 42/42 [00:01<00:00, 28.09it/s]
100%|██████████| 42/42 [00:01<00:00, 25.46it/s]
100%|██████████| 42/42 [00:01<00:00, 26.07it/s]
100%|██████████| 42/42 [00:01<00:00, 29.18it/s]
100%|██████████| 42/42 [00:01<00:00, 27.17it/s]
100%|██████████| 42/42 [00:01<00:00, 26.46it/s]
100%|██████████| 42/42 [00:01<00:00, 25.03it/s]
100%|██████████| 42/42 [00:01<00:00, 24.45it/s]
100%|██████████| 42/42 [00:01<00:00, 27.03it/s]
100%|██████████| 42/42 [00:01<00:00, 23.56it/s]
100%|██████████| 42/42 [00:01<00:00, 28.56it/s]
100%|██████████| 42/42 [00:01<00:00, 31.67it/s]
100%|██████████| 42/42 [00:01<00:00, 39.

Unnamed: 0,Adjusted R-Squared_media,Adjusted R-Squared_std,R-Squared_media,R-Squared_std,RMSE_media,RMSE_std,Time Taken_media,Time Taken_std,mean_absolute_percentage_error_media,mean_absolute_percentage_error_std
SVR,-0.35,0.05,-0.11,0.04,3052.27,867.9,0.01,0.0,0.49,0.06
QuantileRegressor,-0.35,0.05,-0.11,0.04,3054.57,868.27,0.13,0.03,0.49,0.06
RANSACRegressor,-0.06,0.29,0.13,0.24,2648.27,776.89,0.05,0.01,0.53,0.23
NuSVR,-0.32,0.05,-0.08,0.04,3016.59,864.53,0.01,0.0,0.57,0.08
PassiveAggressiveRegressor,-0.03,0.41,0.15,0.33,2598.09,737.36,0.01,0.0,0.57,0.25
ExtraTreesRegressor,0.04,0.59,0.21,0.49,2392.41,606.17,0.05,0.0,0.59,0.26
HuberRegressor,-0.2,0.62,0.01,0.51,2701.59,697.61,0.02,0.01,0.64,0.29
RandomForestRegressor,0.05,0.65,0.21,0.53,2368.28,555.36,0.07,0.01,0.66,0.25
XGBRegressor,-0.41,1.36,-0.16,1.12,2724.31,638.37,0.19,0.05,0.68,0.39
GradientBoostingRegressor,-0.2,1.03,0.01,0.85,2575.93,566.0,0.04,0.01,0.68,0.28


# Novo Database

In [77]:

tabela = '/home/igor/Documentos/stemis/Api_Vitrine/webscraping/dataSetVitrine.xlsx'
df = pd.read_excel(tabela)

df.drop(df.columns[df.columns.str.contains('unnamed', case=False)], axis=1, inplace=True)
df = df.drop(['Bairro'], axis=1)

money_cols = ['Preço']  # Lista das colunas de dinheiro

for col in money_cols:
    df[col] = df[col].astype(str).apply(lambda x: x.replace('.', ''))
    df[col] = df[col].replace({'R\$ ': '', ',': '.'}, regex=True).astype('float64')

int_cols = ['Quartos', 'Banheiros', 'Vagas']  # Lista das colunas de números inteiros

for col in int_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')

df = df.drop_duplicates()

df = df.reset_index(drop=True)

df.to_csv('dataSetVitrine.csv', index=False)

property_types = df['Tipo'].unique()
property_dfs = {}

for property_type in property_types:
    property_dfs[property_type] = df[df['Tipo'] == property_type]



# Apartamento Novo

In [78]:
results_dict = train(property_dfs,"Apartamento")

results_dict_format = {}

#imprime
for model in results_dict:
    results_dict_format[model]={}
    for metric in results_dict[model]:
        mean = np.mean(results_dict[model][metric]["mean"])
        std = np.std(results_dict[model][metric]["mean"])
        results_dict_format[model][metric + "_media"] = mean
        results_dict_format[model][metric + "_std"] = std

df = pd.DataFrame(results_dict_format)
df.transpose().sort_values(by=["mean_absolute_percentage_error_media"])

  0%|          | 0/42 [00:00<?, ?it/s]

100%|██████████| 42/42 [00:00<00:00, 51.14it/s]
100%|██████████| 42/42 [00:00<00:00, 56.79it/s]
100%|██████████| 42/42 [00:00<00:00, 43.83it/s]
100%|██████████| 42/42 [00:00<00:00, 47.44it/s]
100%|██████████| 42/42 [00:01<00:00, 40.80it/s]
 14%|█▍        | 6/42 [00:00<00:00, 50.52it/s]


KeyboardInterrupt: 

# Casa Novo

In [68]:
results_dict = train(property_dfs,"Casa")

results_dict_format = {}

#imprime
for model in results_dict:
    results_dict_format[model]={}
    for metric in results_dict[model]:
        mean = np.mean(results_dict[model][metric]["mean"])
        std = np.std(results_dict[model][metric]["mean"])
        results_dict_format[model][metric + "_media"] = mean
        results_dict_format[model][metric + "_std"] = std

df = pd.DataFrame(results_dict_format)
df.transpose().sort_values(by=["mean_absolute_percentage_error_media"])

100%|██████████| 42/42 [00:01<00:00, 23.87it/s]
100%|██████████| 42/42 [00:01<00:00, 27.96it/s]
100%|██████████| 42/42 [00:01<00:00, 22.85it/s]
100%|██████████| 42/42 [00:01<00:00, 22.92it/s]
100%|██████████| 42/42 [00:01<00:00, 28.05it/s]
100%|██████████| 42/42 [00:01<00:00, 29.25it/s]
100%|██████████| 42/42 [00:01<00:00, 35.61it/s]
100%|██████████| 42/42 [00:01<00:00, 26.01it/s]
100%|██████████| 42/42 [00:01<00:00, 22.64it/s]
100%|██████████| 42/42 [00:01<00:00, 24.15it/s]
100%|██████████| 42/42 [00:02<00:00, 18.94it/s]
100%|██████████| 42/42 [00:01<00:00, 23.79it/s]
100%|██████████| 42/42 [00:01<00:00, 29.55it/s]
100%|██████████| 42/42 [00:01<00:00, 25.66it/s]
100%|██████████| 42/42 [00:01<00:00, 28.19it/s]
100%|██████████| 42/42 [00:01<00:00, 29.96it/s]
100%|██████████| 42/42 [00:01<00:00, 27.53it/s]
100%|██████████| 42/42 [00:01<00:00, 27.95it/s]
100%|██████████| 42/42 [00:01<00:00, 24.57it/s]
100%|██████████| 42/42 [00:01<00:00, 30.91it/s]
100%|██████████| 42/42 [00:01<00:00, 32.

Unnamed: 0,Adjusted R-Squared_media,Adjusted R-Squared_std,R-Squared_media,R-Squared_std,RMSE_media,RMSE_std,Time Taken_media,Time Taken_std,mean_absolute_percentage_error_media,mean_absolute_percentage_error_std
AdaBoostRegressor,1.0,0.0,1.0,0.0,0.0,0.0,0.02,0.01,0.0,0.0
ExtraTreesRegressor,1.0,0.0,1.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0
KNeighborsRegressor,1.0,0.0,1.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0
BaggingRegressor,1.0,0.0,1.0,0.0,0.0,0.0,0.03,0.01,0.0,0.0
ExtraTreeRegressor,1.0,0.0,1.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0
RandomForestRegressor,1.0,0.0,1.0,0.0,0.0,0.0,0.05,0.01,0.0,0.0
DecisionTreeRegressor,1.0,0.0,1.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0
LinearRegression,1.0,0.0,1.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0
TransformedTargetRegressor,1.0,0.0,1.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0
BayesianRidge,1.0,0.0,1.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0
