In [None]:
import datetime
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

from sqlalchemy import create_engine

from utils import get_creds

In [None]:
def get_offers_ids(page_source):
    """
    Takes otodom listing page and gets all offer ids from that page.

    Every ``<li data-cy="listing-item">`` element is searched for its
    ``<a data-cy="listing-item-link">`` anchor; the offer id is the last
    path segment of that anchor's ``href``.

    Parameters
    ----------
    page_source : str
        HTML of a listing page (e.g. ``driver.page_source``).

    Returns
    -------
    list of str
        Offer ids in document order. Listing items that have no link
        anchor, no ``href`` or an empty last path segment are skipped
        instead of raising.
    """

    soup = BeautifulSoup(page_source, "html.parser")

    listing_items = soup.find_all("li", {"data-cy": "listing-item"})

    offer_ids = []
    for item in listing_items:
        link = item.find("a", {"data-cy": "listing-item-link"})

        # find() returns None when the item has no matching anchor
        href = link.get("href") if link is not None else None
        if not href:
            continue

        # Drop a trailing slash first so it cannot produce an empty id
        offer_id = href.rstrip("/").rsplit("/", 1)[-1]
        if offer_id:
            offer_ids.append(offer_id)

    return offer_ids

In [None]:
def save_df(df, csv=True, db=True):
    """
    Saves otodom offer ids into CSV or/and DB table

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    csv : bool, default True
        When true, write ``df`` to ``results_<YYYYmmdd_HHMMSS>.csv`` in the
        current working directory.
    db : bool, default True
        When true, append ``df`` to the ``otodom_offers_ids`` table of the
        PostgreSQL database described by ``get_creds()``.
    """
    ## Save to CSV
    if csv:
        csv_path = f"results_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df.to_csv(csv_path, index=False)
        print(f"Results saved into {csv_path}")

    ## Save to DB
    if db:
        from urllib.parse import quote_plus

        # Get database credentials
        # NOTE(review): assumes get_creds() returns a mapping with the keys
        # username / password / host / port / database -- confirm against utils.get_creds
        credentials = get_creds()
        # User name and password are URL-escaped so characters such as "@" or
        # "/" cannot corrupt the connection URL
        username = quote_plus(str(credentials["username"]))
        password = quote_plus(str(credentials["password"]))
        host = credentials["host"]
        port = credentials["port"]
        database = credentials["database"]

        # Saving to PostgreSQL DB
        engine = create_engine(
            f"postgresql://{username}:{password}@{host}:{port}/{database}"
        )

        table_name = "otodom_offers_ids"
        try:
            df.to_sql(table_name, engine, if_exists="append", index=False)
        finally:
            # A new engine is built on every call; release its connection pool
            engine.dispose()
        print(f"Results saved to PostgreSQL DB into table: {database}.{table_name}")

In [None]:
def crawler(driver, actions, url, max_pages=None):
    """
    Crawls otodom listing to do some actions and
    use pagination till its end

    For every visited listing page the offer ids are collected with
    ``get_offers_ids`` and stored through ``save_df`` (DB only).

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session used to open and navigate the listing.
    actions : selenium ActionChains
        Action chain bound to ``driver``; used to move onto the
        next-page button before clicking it.
    url : str
        URL of the first listing page.
    max_pages : int or None, default None
        Upper bound on the number of pages to crawl. ``None`` crawls up
        to the last page number shown in the pagination (earlier
        versions always stopped after 5 pages).
    """
    driver.get(url)  # open URL in Browser
    time.sleep(1)

    try:
        print("Accepting cookies")
        driver.find_element(
            By.ID, "onetrust-accept-btn-handler"
        ).click()  # accept cookies
    except NoSuchElementException:
        print("Cookies already accepted")

    driver.execute_script(
        "window.scrollTo(0, document.body.scrollHeight);"
    )  # scroll down

    soup = BeautifulSoup(driver.page_source, "html.parser")  # get html

    ## Find pages number
    pages = soup.find_all("button", {"data-cy": re.compile("^pagination.go-to-page-")})
    page_labels = (button.get_text().strip() for button in pages)
    pages_numbers = [int(label) for label in page_labels if label.isdigit()]
    # No pagination buttons means the listing fits on a single page
    last_page_number = max(pages_numbers, default=1)

    print(f"Total pages: {last_page_number}")

    if max_pages is None:
        pages_to_crawl = last_page_number
    else:
        pages_to_crawl = min(max_pages, last_page_number)

    for page_index in range(pages_to_crawl):
        print(f"Current URL: {driver.current_url}")
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);"
        )  # scroll down

        ## Creating df with offers ids
        df = pd.DataFrame(get_offers_ids(driver.page_source), columns=["offer_id"])
        df.insert(loc=0, column="create_timestamp", value=datetime.datetime.now())
        df.insert(loc=1, column="listing_url", value=driver.current_url)

        ## Saving to DB
        save_df(df, csv=False, db=True)

        # Nothing left to visit: do not navigate past the last requested page
        if page_index == pages_to_crawl - 1:
            break

        ## Wait and go to the next page
        time.sleep(1)
        try:
            pagination_button = driver.find_element(
                By.XPATH, "//*[@data-cy='pagination.next-page']"
            )
        except NoSuchElementException:
            # No next-page control on this page: treat it as the end
            break
        actions.move_to_element(pagination_button).perform()

        if not pagination_button.is_enabled():
            break
        pagination_button.click()
        # Give the next page time to load before it is scraped
        time.sleep(1)

In [None]:
# Start a Chrome browser session controlled by Selenium
driver = webdriver.Chrome()
# Action-chain helper bound to that session (handed to crawler() together with driver)
actions = ActionChains(driver)

In [None]:
# Entry-point listing page for the crawl
# NOTE(review): the path reads as apartments for sale in Warsaw -- confirm the filter is the intended one
listing_url = "https://www.otodom.pl/pl/oferty/sprzedaz/mieszkanie/warszawa?distanceRadius=0&locations=%5Bcities_6-26%5D&viewType=listing"
# Run the crawl in the open browser session
crawler(driver, actions, listing_url)

In [None]:
# Manual navigation check: click the pagination button for page 2 in the live browser session
driver.find_element(By.XPATH, "//*[@data-cy='pagination.go-to-page-2']").click()

In [None]:
# Manual check: locate the pagination button for page 2 in the live browser session
element = driver.find_element(By.XPATH, "//*[@data-cy='pagination.go-to-page-2']")

# Rebuild the action chain on the current driver and move the pointer onto the button
actions = ActionChains(driver)
actions.move_to_element(element).perform()