In [1]:
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sn



In [2]:
def _parse_card_details(info):
    """Parse one listing's details line into (area, bedrooms, parking spots).

    The text is split on '•' and the first three fields are read as
    "<n> m²", "<n> quartos" and "<n> vagas" respectively.

    Raises:
        ValueError: if fewer than three fields are present, or a field is not
            an integer once its unit label has been stripped.
    """
    parts = info.split('•')
    if len(parts) < 3:
        raise ValueError(f"Unexpected listing details (expected 3 fields): {info!r}")
    # str.strip(chars) removes any of the given *characters* from both ends
    # (not a literal suffix); that is enough here because the digits are
    # never part of the stripped character set.
    return (
        int(parts[0].strip('m² ')),
        int(parts[1].strip(" quartos ")),
        int(parts[2].strip(" vagas ")),
    )


def get_dataset(url, file_name, max_loads=20):
    """Scrape the listing cards found at ``url`` and save them to ``file_name`` as CSV.

    The page is opened in Chrome, the cards currently rendered are read, the
    "load more" button is clicked, and the cycle repeats up to ``max_loads``
    times. Every pass re-reads all cards on the page, so the output may
    contain repeated rows.

    Args:
        url: Address of the listing page to scrape.
        file_name: Path of the CSV file to write (without the index column).
        max_loads: Maximum number of collect-then-click passes (default 20,
            the value that used to be hard-coded).

    Raises:
        selenium.common.exceptions.TimeoutException: if the "load more"
            button never becomes visible after the initial page load.
        ValueError: if a details line cannot be parsed or the scraped
            columns end up with different lengths.
    """
    # Local import: selenium is already a dependency of this notebook, but
    # this exception class is not part of the top-level import cell.
    from selenium.common.exceptions import TimeoutException

    # NOTE(review): these look like auto-generated class names and are
    # presumably brittle across site deployments - confirm before re-running.
    load_more_locator = (By.CLASS_NAME, "sc-ftvSup.xPMpo")

    print("Getting page url")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        driver.set_page_load_timeout(60)
        driver.get(url)

        WebDriverWait(driver, 20).until(EC.visibility_of_element_located(load_more_locator))

        print("Loading more information")

        print("Collecting data")
        enderecos = []
        regioes = []
        tipos_valores = []
        outras_informacoes = []
        for i in range(max_loads):
            # Read every text immediately: element handles may go stale once
            # the page changes, and the browser is closed before parsing.
            enderecos.extend(x.text for x in driver.find_elements(By.CSS_SELECTOR, 'span[data-testid="house-card-address"]'))
            regioes.extend(x.text for x in driver.find_elements(By.CSS_SELECTOR, 'span[data-testid="house-card-region"]'))
            tipos_valores.extend(x.text for x in driver.find_elements(By.CLASS_NAME, 'sc-gsnTZi.iRsaMY.sc-crXcEl.jddosl.CozyTypography'))
            outras_informacoes.extend(x.text for x in driver.find_elements(By.CSS_SELECTOR, 'small[data-testid="house-card-area"]'))
            try:
                load_more = WebDriverWait(driver, 20).until(EC.visibility_of_element_located(load_more_locator))
            except TimeoutException:
                # No button to click any more: keep what was collected
                # instead of losing the whole run.
                break
            load_more.click()
            time.sleep(1)
            print(i)
    finally:
        # Always release the browser, even when scraping fails midway.
        driver.quit()

    # The same selector yields both the property type and the price; prices
    # are recognised by their "R$" prefix.
    tipo = [text for text in tipos_valores if not text.startswith("R$")]
    valores = [text.strip("R$ ").replace(".", "") for text in tipos_valores if text.startswith("R$")]
    # Region names only need surrounding whitespace removed; the price
    # cleanup must not be applied to them (it would eat leading/trailing
    # 'R' characters and any dots).
    regioes = [item.strip() for item in regioes]

    metragem = []
    n_quartos = []
    n_vagas = []
    for info in outras_informacoes:
        area, quartos, vagas = _parse_card_details(info)
        metragem.append(area)
        n_quartos.append(quartos)
        n_vagas.append(vagas)

    print("Building the dataset")

    real_state = {"tipo": tipo, "enderecos" : enderecos, "regioes" :  regioes, "valores" : valores, "metragem" : metragem, "n_quartos" : n_quartos, "n_vagas" : n_vagas}
    # Columns come from independent selectors; fail with a readable message
    # if they did not line up one-to-one.
    lengths = {name: len(column) for name, column in real_state.items()}
    if len(set(lengths.values())) > 1:
        raise ValueError(f"Scraped columns have different lengths: {lengths}")
    df = pd.DataFrame(real_state)
    df.to_csv(file_name, index=False)

    print("Done")


In [None]:
# Neighbourhood slugs to scrape. "pinheiros" is listed because the loading
# cell further down reads real_state_pinheiros.csv; without it a fresh
# Restart & Run All fails with FileNotFoundError.
regioes = ["pinheiros", "vila-olimpia", "jardim-paulista", "jardim-europa"]
for regiao in regioes:
    file_name = f"real_state_{regiao}.csv"
    # The CSV doubles as a cache: a neighbourhood is scraped only when its
    # file is not already on disk.
    if not os.path.exists(file_name):
        get_dataset(f'https://www.quintoandar.com.br/comprar/imovel/{regiao}-sao-paulo-sp-brasil', file_name)


In [4]:
# Load the per-neighbourhood CSV files, in order, into one frame each.
df_pinheiros, df_jardim_paulista, df_jardim_europa = map(
    pd.read_csv,
    (
        "real_state_pinheiros.csv",
        "real_state_jardim-paulista.csv",
        "real_state_jardim-europa.csv",
    ),
)

In [6]:
# Stack the three neighbourhood tables on top of each other (row-wise).
df = pd.concat(
    objs=[df_pinheiros, df_jardim_paulista, df_jardim_europa],
    axis="index",
)

In [7]:
# Keep only the first occurrence of each fully identical row
# (presumably the scraper collects the same card more than once).
df = df.drop_duplicates(keep="first")

In [11]:
# The street address column is not used as a model feature; remove it.
df = df.drop(labels=["enderecos"], axis="columns")

In [12]:
# One-hot encode the property type: the text column `tipo` is replaced by one
# indicator column per category, each named after the category itself.
ohe = OneHotEncoder()
transformed = ohe.fit_transform(df.loc[:, ['tipo']])
df = df.drop(labels=['tipo'], axis="columns")
# The encoder output is converted to a dense array before being attached.
df[ohe.categories_[0]] = transformed.toarray()

In [10]:
# One-hot encode the region: the text column `regioes` is replaced by one
# indicator column per category, each named after the category itself.
ohe = OneHotEncoder()
transformed_2 = ohe.fit_transform(df.loc[:, ['regioes']])
df = df.drop(labels=['regioes'], axis="columns")
# The encoder output is converted to a dense array before being attached.
df[ohe.categories_[0]] = transformed_2.toarray()

In [14]:
# Target is the price column; every other remaining column is a feature.
y = df["valores"]
X = df.drop(labels=["valores"], axis="columns")
# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [15]:
# Baseline model: ordinary least squares fitted on the training split,
# then used to predict prices for the held-out rows.
linear_regression = LinearRegression().fit(X_train, y_train)
y_pred = linear_regression.predict(X_test)

In [16]:
# Coefficient of determination of the latest predictions on the test split.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R-squared Score:", r2)

R-squared Score: 0.6724248017032823


In [17]:
# Second model: a 10-tree random forest with a fixed seed, trained on the
# same split and scored with the same metric as the linear baseline.
rf = RandomForestRegressor(random_state=10, n_estimators=10).fit(X_train, y_train)

# Note: this overwrites the y_pred / r2 values produced for the linear model.
y_pred = rf.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R-squared Score:", r2)

R-squared Score: 0.8489202812334167
