### Setup

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Request headers carrying a desktop-Chrome User-Agent string; they are passed
# to every requests.get call in this notebook.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/50.0.2661.102 Safari/537.36'
    ),
}

# First results page (pagina=1) of the João Pessoa "venda" listing search.
base_site = (
    "https://www.zapimoveis.com.br/venda/imoveis/pb+joao-pessoa/"
    "?pagina=1"
    "&onde=,Para%C3%ADba,Jo%C3%A3o%20Pessoa,,,,BR%3EParaiba%3ENULL%3EJoao%20Pessoa,-7.119495,-34.845011"
    "&transacao=Venda"
    "&tipo=Im%C3%B3vel%20usado"
)

In [None]:
# Download the first results page. requests applies no timeout by default, so
# an explicit one keeps the cell from hanging if the server never answers.
response = requests.get(base_site, headers=headers, timeout=30)
# Last expression of the cell: shows the Response object in the notebook output.
response

In [None]:
# Raw body of the HTTP response (bytes); this is what gets parsed below.
html = response.content

### Choosing a parser

In [None]:
# Parse the downloaded page into a BeautifulSoup tree using the 'lxml' parser.
soup = BeautifulSoup(html, 'lxml')

### Obtaining the element containing all the data

In [None]:
# Every <div> carrying the CSS class "card-listing"; the extraction cells below
# read price, address and area from each of these elements.
divs = soup.find_all('div', class_='card-listing')
divs

### Extracting data

In [None]:
def _price_text(card):
    """Return the text of the card's first <strong> tag, trimmed of price decoration.

    Leading/trailing newline, space, "R" and "$" characters are stripped.
    """
    return card.find("strong").string.strip("\n R$")


# One price string per listing card, in page order.
prices = list(map(_price_text, divs))
prices

In [None]:
# Address text of each card, taken from its <p class="simple-card__address">.
# NOTE: BeautifulSoup's `.string` is None when the tag has more than one child,
# so entries here are not guaranteed to be strings.
addresses = [
    card.find("p", class_="simple-card__address").string
    for card in divs
]
addresses

In [None]:
def _area_value(card):
    """Return the first whitespace-separated token of the card's area text.

    The value is read from the second <span> inside the card's
    "feature__item text-small js-areas" list item.
    """
    area_item = card.find("li", class_="feature__item text-small js-areas")
    second_span = area_item.find_all("span")[1]
    return second_span.text.split()[0]


# One area token per listing card, in page order.
areas = [_area_value(card) for card in divs]
areas

In [None]:
import pandas as pd

In [None]:
# Table of the first page's listings: one row per card, with the price in
# "preco" and the address in "endereco" (columns added in that order).
imoveis_info = pd.DataFrame().assign(preco=prices, endereco=addresses)

imoveis_info

In [None]:
# imoveis_info.to_csv("imoveis_info.csv", index = False, header = True)

In [None]:
# imoveis_info.to_excel("imoveis_info.xlsx", index = False, header = True)

#### Search on multiple pages

In [None]:
# URL template for one results page; "{0}" receives the page number.
_PAGE_URL_TEMPLATE = (
    "https://www.zapimoveis.com.br/venda/imoveis/pb+joao-pessoa/"
    "?pagina={0}"
    "&onde=,Para%C3%ADba,Jo%C3%A3o%20Pessoa,,,,BR%3EParaiba%3ENULL%3EJoao%20Pessoa,-7.119495,-34.845011"
    "&transacao=Venda"
    "&tipo=Im%C3%B3vel%20usado"
)

# Build the links for pages 2, 3 and 4 (range's upper bound, 5, is exclusive).
# Page 1 is left out here because it is requested separately via base_site.
url_pages = [_PAGE_URL_TEMPLATE.format(page_number) for page_number in range(2, 5)]

url_pages

In [None]:
# Scrape every additional results page and append its listings to the
# `prices` / `addresses` lists that already hold the first page's data.
# NOTE: the lists are extended in place, so running this cell twice adds the
# same pages twice.
for i, url in enumerate(url_pages, start=1):
    try:
        # Bounded wait: requests has no default timeout, and one unresponsive
        # page should not stall the whole loop.
        page = requests.get(url, headers=headers, timeout=30)
    except requests.exceptions.RequestException as error:
        # Network-level failure: skip the page, same as for a bad status code.
        print('Request failed ({0}): Skipping URL #{1}: {2}'.format(error, i, url))
        continue

    if page.status_code != 200:
        print('Status code {0}: Skipping URL #{1}: {2}'.format(page.status_code, i, url))
        continue

    print("Get data from URL: {0}".format(url))

    page_soup = BeautifulSoup(page.content, 'lxml')
    divs_page = page_soup.find_all('div', {'class': 'card-listing'})

    # Extract both columns before touching the shared lists, so a parsing error
    # on this page cannot leave `prices` and `addresses` with different lengths.
    prices_page = [div_page.find("strong").string.strip("\n R$") for div_page in divs_page]
    addresses_page = [div_page.find("p", {"class": "simple-card__address"}).string for div_page in divs_page]

    prices.extend(prices_page)
    addresses.extend(addresses_page)
    

In [None]:
# Table of all scraped listings: "preco" holds the price strings and "endereco"
# the addresses, one row per listing card.
imoveis_info_all = pd.DataFrame({"preco": prices, "endereco": addresses})

# Show every row when a DataFrame is displayed. The fully qualified key is
# required: the short pattern "max_rows" also matches "styler.render.max_rows"
# on pandas >= 1.4, and set_option rejects patterns that match several options.
pd.set_option("display.max_rows", None)

# Last expression of the cell: renders the combined table in the notebook output.
imoveis_info_all

### Geocoding addresses

In [None]:
from geopy.geocoders import Nominatim

In [None]:
# Nominatim (OpenStreetMap) geocoder client. geopy's default timeout is only
# one second, which the public server often exceeds; give each lookup more time
# so fewer addresses fall into the error path.
geolocator = Nominatim(user_agent="imoveis_zap_beautiful_soup", timeout=10)

In [None]:
from geopy.extra.rate_limiter import RateLimiter

# The public Nominatim service asks for at most one request per second.
# RateLimiter spaces the calls out and retries transient service errors;
# swallow_exceptions=False lets the final failure reach the handler below.
geocode_limited = RateLimiter(
    geolocator.geocode,
    min_delay_seconds=1,
    swallow_exceptions=False,
)

longitude = []
latitude = []

# Geocode every scraped address. Exactly one longitude/latitude pair is
# appended per address, so both lists stay aligned with `addresses`.
for i, address in enumerate(addresses, start=1):
    print(i)

    # (0.0, 0.0) is the placeholder kept when the address cannot be resolved.
    longitude_, latitude_ = 0.0, 0.0

    try:
        address_geo = geocode_limited(address)
    except Exception as error:
        # Best effort: one failed lookup must not abort the run. Report the
        # real error instead of assuming it was a timeout. Exception (rather
        # than a bare except) still lets Ctrl-C interrupt the loop.
        print("Geocode failed for {0}: {1!r}".format(address, error))
    else:
        if address_geo is None:
            print("Endereço: {0} - Não encontrado".format(address))
        else:
            print("Geocoding {0}".format(address))
            longitude_ = float(address_geo.longitude)
            latitude_ = float(address_geo.latitude)

    longitude.append(longitude_)
    latitude.append(latitude_)

In [None]:
# Attach the geocoding results as two new columns. The lists were filled with
# one entry per address, in the same order as the table's rows; addresses that
# could not be geocoded carry the 0.0 / 0.0 placeholder.
imoveis_info_all["longitude"] = longitude
imoveis_info_all["latitude"] = latitude

# Last expression of the cell: renders the table with the new columns.
imoveis_info_all

In [None]:
import folium

In [None]:
# Centre the map on the first listing that was actually geocoded. Rows whose
# lookup failed hold the (0.0, 0.0) placeholder, so blindly using row 0 could
# centre the map in the Atlantic Ocean.
geocoded_rows = imoveis_info_all[
    (imoveis_info_all["latitude"] != 0.0) | (imoveis_info_all["longitude"] != 0.0)
]
if geocoded_rows.empty:
    # Nothing was geocoded: fall back to the João Pessoa coordinates that the
    # search URL itself carries.
    map_center = [-7.119495, -34.845011]
else:
    map_center = [
        geocoded_rows["latitude"].iloc[0],
        geocoded_rows["longitude"].iloc[0],
    ]

# NOTE: the name `map` shadows the builtin; it is kept because the marker cell
# below refers to it.
map = folium.Map(
    location=map_center,
    tiles='cartodbpositron',
    zoom_start=12,
)

In [None]:
# Add one circle marker per geocoded listing. A plain loop is used because the
# work is a side effect (drawing on the map), not a computed column.
for endereco, preco, lat, lon in zip(
    imoveis_info_all["endereco"],
    imoveis_info_all["preco"],
    imoveis_info_all["latitude"],
    imoveis_info_all["longitude"],
):
    # (0.0, 0.0) marks an address that could not be geocoded; drawing it would
    # put a bogus marker off the coast of Africa.
    if lat == 0.0 and lon == 0.0:
        continue

    folium.CircleMarker(
        location=[lat, lon],
        # format() instead of "+" so a missing (None) address or price does not
        # raise TypeError; for strings the resulting popup text is unchanged.
        popup="{0}\n <strong>{1}</strong>".format(endereco, preco),
        fill=True,
    ).add_to(map)

# Last expression of the cell: renders the map in the notebook output.
map