In [1]:
import logging
import random
import sqlite3
import time
from typing import Tuple

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.remote.webdriver import WebDriver

from carGPT.scraper.scraper.translations import TRANSLATIONS
from carGPT.database.database import Database

In [2]:
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


In [3]:
class ChromeDriveConnection:
    def __enter__(self):
        self.driver = webdriver.Chrome()
        return self.driver

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.driver.quit()

In [4]:
class DatabaseConnection:
    def __enter__(self):
        self.connection = sqlite3.connect("ads.db")
        return self.connection

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.connection.close()


In [6]:
class NjuskaloScraper:
    page_template = "https://www.njuskalo.hr/auti?page={page_num}"

    def __init__(self):
        self.page_num = 1
        self.db = Database().db

    def get_ads(self, driver: WebDriver):
        ads = (
            driver.find_element(
                By.CSS_SELECTOR, ".EntityList--ListItemRegularAd"
            )
            .find_element(By.CLASS_NAME, "EntityList-items")
            .find_elements(By.CLASS_NAME, "EntityList-item")
        )
        logger.info(f"Found {len(ads)} ads on the page")
        return ads

    @staticmethod
    def get_ad_columns(
        driver: WebDriver,
    ) -> Tuple[list[WebElement], list[WebElement]]:
        ad_info = driver.find_element(
            By.CLASS_NAME, "ClassifiedDetailBasicDetails-list"
        )
        ad_left_column = ad_info.find_elements(
            By.CLASS_NAME, "ClassifiedDetailBasicDetails-listTerm"
        )
        ad_right_column = ad_info.find_elements(
            By.CLASS_NAME, "ClassifiedDetailBasicDetails-listDefinition"
        )
        return ad_left_column, ad_right_column

    @staticmethod
    def get_ad_details(
        left_column: list[WebElement], right_column: list[WebElement]
    ) -> dict[str, str]:
        ad_details = {}
        for prop_name, prop_value in zip(left_column, right_column):
            prop_name = prop_name.find_element(
                By.CLASS_NAME, "ClassifiedDetailBasicDetails-textWrapContainer"
            ).text
            prop_value = prop_value.find_element(
                By.CLASS_NAME, "ClassifiedDetailBasicDetails-textWrapContainer"
            ).text
            try:
                ad_details[TRANSLATIONS[prop_name]] = prop_value
            except KeyError:
                logger.error(f"No key for: {prop_name} - value: {prop_value}")

        return ad_details

    def extract_article_info(self, driver: WebDriver) -> dict[str, str]:
        left_column, right_column = self.get_ad_columns(driver)
        ad_details = self.get_ad_details(left_column, right_column)
        published_elem = driver.find_element(
            By.CLASS_NAME, "ClassifiedDetailSystemDetails-listData"
        )
        print(published_elem.text)
        return ad_details

    @staticmethod
    def get_ad_links(page_ads: list[WebElement]) -> list[str]:
        ad_links = []
        for ad in page_ads:
            try:
                ad_class = ad.get_attribute("class")
                if "EntityList-bannerContainer" in ad_class:
                    logger.info("Skipping something that is not an add")
                    continue
                article = ad.find_element(By.TAG_NAME, "article")
                article_title = article.find_element(
                    By.CLASS_NAME, "entity-title"
                )
                article_link = article_title.find_element(By.TAG_NAME, "a")
                article_link_url = article_link.get_attribute("href")
                ad_links.append(article_link_url)
            except Exception as e:
                logger.error(f"Error happened: {e}")
            break
        return ad_links

    def save_article(self, article_info: dict[str, str]) -> None:
        with DatabaseConnection() as db_conn:
            cursor = db_conn.cursor()
            columns = ", ".join(article_info.keys())
            values = ", ".join([f'"{val}"' for val in article_info.values()])
            insert_str = f"INSERT INTO ads ({columns}) VALUES ({values})"
            logger.info(f"Insert into DB string:\n\t{insert_str}")
            cursor.execute(insert_str)
            db_conn.commit()

    def handle_link(self, link: str, driver: WebDriver) -> None:
        driver.get(link)
        logger.info(f"Went to page {link}")
        article_info = self.extract_article_info(driver)
        self.save_article(article_info)
        sleep_time = random.randint(1, 10)
        logger.info(f"Sleeping for {sleep_time}s")
        time.sleep(sleep_time)

    def handle_page(self, driver: WebDriver) -> None:
        page_ads = self.get_ads(driver)
        ad_links = self.get_ad_links(page_ads)
        for link in ad_links:
            self.handle_link(link, driver)

    def start(self) -> None:
        with ChromeDriveConnection() as driver:
            driver.get(self.page_template.format(page_num=self.page_num))
            self.handle_page(driver)

In [7]:
njws = NjuskaloScraper()
njws.start()

2024-12-09 21:29:51,349 - INFO - Found 33 ads on the page
2024-12-09 21:29:51,942 - INFO - Went to page https://www.njuskalo.hr/auti/seat-ibiza-1.6-tdi-oglas-45576612
2024-12-09 21:29:52,197 - INFO - Insert into DB string:
	INSERT INTO ads (location, make, model, type, manufacture_year, registered_until, mileage, engine, power, displacement, transmission, condition) VALUES ("Osječko-baranjska, Osijek, Sjenjak", "Seat", "Ibiza", "1,6 TDI", "2013. godište", "10 / 2025", "170000 km", "Diesel", "66 kW", "1.598 cm3", "Mehanički mjenjač", "rabljeno")
2024-12-09 21:29:52,209 - INFO - Sleeping for 7s


09.12.2024. u 21:29
