In [1]:
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder


In [2]:
def _parse_card_details(info):
    """Parse one listing's detail line into ``(area, bedrooms, parking_spots)``.

    The text is split on the bullet character and the unit-label characters
    are stripped from each piece before converting it to ``int``.
    Assumes text shaped like '85 m² • 2 quartos • 2 vagas' (inferred from the
    labels being stripped) — TODO confirm against the live page.

    Raises:
        IndexError: if the text has fewer than three bullet-separated parts.
        ValueError: if a part does not reduce to an integer.
    """
    parts = info.split('•')
    area = int(parts[0].strip('m² '))
    bedrooms = int(parts[1].strip(" quartos "))
    parking_spots = int(parts[2].strip(" vagas "))
    return area, bedrooms, parking_spots


def get_dataset(url, file_name, max_clicks=50):
    """Scrape listing cards from ``url`` and save them as a CSV at ``file_name``.

    Opens the page in Chrome, repeatedly clicks the element with class
    ``sc-ftvSup.xPMpo`` (presumably the "load more" button) and then reads the
    address, region, type/price and detail texts of every card on the page.

    Args:
        url: Listing page to open.
        file_name: Destination path of the CSV file (written without index).
        max_clicks: Maximum number of times the button is clicked. Clicking
            stops earlier if the button does not become visible within 20 s.

    Raises:
        selenium.common.exceptions.TimeoutException: if the button never
            becomes visible after the initial page load.
        ValueError: if the scraped columns do not all have the same length
            (e.g. a card is missing one of the fields).
        IndexError: if a detail line has fewer than three parts.
    """
    # Local import: keeps this cell self-contained without touching the import cell.
    from selenium.common.exceptions import TimeoutException

    load_more_locator = (By.CLASS_NAME, "sc-ftvSup.xPMpo")

    print("Getting page url")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        driver.set_page_load_timeout(60)
        driver.get(url)

        # Fail fast if the page never renders the button at all.
        WebDriverWait(driver, 20).until(EC.visibility_of_element_located(load_more_locator))

        print("Loading more information")
        for i in range(max_clicks):
            try:
                button = WebDriverWait(driver, 20).until(
                    EC.visibility_of_element_located(load_more_locator)
                )
            except TimeoutException:
                # No button any more: keep what is already loaded instead of
                # discarding the whole scrape.
                break
            button.click()
            time.sleep(1)  # give the newly requested cards time to render
            print(i)

        print("Collecting data")
        # Scrape once, after loading: the page appears to append cards, so
        # scraping inside the loop re-reads every earlier card on each pass.
        # Texts are read immediately so no WebElement outlives the DOM it came from.
        enderecos = [
            x.text
            for x in driver.find_elements(By.CSS_SELECTOR, 'span[data-testid="house-card-address"]')
        ]
        regioes = [
            x.text
            for x in driver.find_elements(By.CSS_SELECTOR, 'span[data-testid="house-card-region"]')
        ]
        tipos_valores = [
            x.text
            for x in driver.find_elements(By.CLASS_NAME, 'sc-gsnTZi.iRsaMY.sc-crXcEl.jddosl.CozyTypography')
        ]
        outras_informacoes = [
            x.text
            for x in driver.find_elements(By.CSS_SELECTOR, 'small[data-testid="house-card-area"]')
        ]
    finally:
        # Always release the browser process, even when scraping fails.
        driver.quit()

    # The same selector yields both the property type and the price;
    # prices are the entries that start with "R$".
    tipo = [text for text in tipos_valores if not text.startswith("R$")]
    valores = [
        text.strip("R$ ").replace(".", "")
        for text in tipos_valores
        if text.startswith("R$")
    ]
    # Regions are plain names: trim whitespace only (the price clean-up rule
    # would also eat leading/trailing 'R' characters and periods).
    regioes = [item.strip() for item in regioes]

    details = [_parse_card_details(info) for info in outras_informacoes]
    metragem = [d[0] for d in details]
    n_quartos = [d[1] for d in details]
    n_vagas = [d[2] for d in details]

    print("Building the dataset")

    real_state = {"tipo": tipo, "enderecos": enderecos, "regioes": regioes, "valores": valores, "metragem": metragem, "n_quartos": n_quartos, "n_vagas": n_vagas}
    # Columns are matched by position, so a length mismatch means the rows
    # would be misaligned; report the per-column counts explicitly.
    lengths = {name: len(column) for name, column in real_state.items()}
    if len(set(lengths.values())) > 1:
        raise ValueError(f"Scraped columns have mismatched lengths: {lengths}")

    df = pd.DataFrame(real_state)
    df.to_csv(file_name, index=False)

    print("Done")


In [3]:
# Scraping is slow, so the CSV on disk acts as a cache: the site is only
# scraped when the file does not exist yet.
file_name = "real_state.csv"
dataset_is_cached = os.path.exists(file_name)
if not dataset_is_cached:
    get_dataset(
        'https://www.quintoandar.com.br/comprar/imovel/pinheiros-sao-paulo-sp-brasil',
        file_name,
    )
df = pd.read_csv(file_name)

In [4]:
# Remove fully identical rows, keeping the first occurrence of each.
df = df.drop_duplicates(subset=None, keep="first")

In [5]:
# Number of listings per street name.
df["enderecos"].value_counts()

Rua Fernão Dias              36
Rua Fradique Coutinho        35
Rua Cardeal Arcoverde        30
Rua Teodoro Sampaio          28
Rua Mourato Coelho           28
                             ..
Conselheiro Pereira Pinto     1
Mourato Coelho                1
Rua dos Cariris               1
Rua Marcos Azevedo            1
Antônio da Silveira           1
Name: enderecos, Length: 90, dtype: int64

In [6]:

# Map every distinct street name to an integer code stored in `rua_encoded`.
label_encoder = LabelEncoder()
label_encoder.fit(df["enderecos"])
df["rua_encoded"] = label_encoder.transform(df["enderecos"])

In [7]:
# The street is now represented by `rua_encoded`; the raw text columns are
# removed from the feature table.
raw_text_columns = ["enderecos", "regioes"]
df = df.drop(columns=raw_text_columns)

In [8]:
# One-hot encode the property type: `tipo` is replaced by one 0/1 column per
# category, named after the category itself.
ohe = OneHotEncoder()
transformed = ohe.fit_transform(df.loc[:, ["tipo"]])
tipo_labels = ohe.categories_[0]
df = df.drop(columns="tipo")
df[tipo_labels] = transformed.toarray()

In [9]:
# Display the feature table after encoding (price, size, rooms, parking,
# street code and the one-hot property-type columns).
df

Unnamed: 0,valores,metragem,n_quartos,n_vagas,rua_encoded,Apartamento,Casa,Studio e kitnet
0,1250000,85,2,2,77,1.0,0.0,0.0
1,820000,44,2,1,40,1.0,0.0,0.0
2,1380000,110,3,1,40,1.0,0.0,0.0
3,780000,69,2,2,30,1.0,0.0,0.0
4,850000,65,2,1,56,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...
14020,1200000,97,3,0,22,1.0,0.0,0.0
14021,3000000,230,4,0,24,0.0,1.0,0.0
14022,2100000,343,0,5,20,0.0,1.0,0.0
14023,1600000,139,3,1,1,0.0,1.0,0.0


In [10]:
# Target is the listing price; every remaining column is a feature.
y = df["valores"]
X = df.drop(columns="valores")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [11]:
# Baseline model: ordinary least squares fitted on the training split, then
# used to predict prices for the held-out rows.
linear_regression = LinearRegression().fit(X_train, y_train)
y_pred = linear_regression.predict(X_test)

In [12]:
# Coefficient of determination of the current predictions on the test split.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R-squared Score:", r2)

R-squared Score: 0.41259434528175465


In [16]:
# Random forest with 10 trees and a fixed seed, trained and scored on the
# same split as the linear model. Note: `y_pred` and `r2` are overwritten here.
rf = RandomForestRegressor(random_state=10, n_estimators=10).fit(X_train, y_train)
y_pred = rf.predict(X_test)

r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R-squared Score:", r2)

R-squared Score: 0.6119924725080088
