In [3]:
import os
import re
from io import StringIO
from pathlib import Path
from urllib.parse import urlencode

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [4]:
# The project root comes from the PROJECT_ROOT environment variable. Fail with
# an explicit message when it is missing, instead of the opaque TypeError that
# Path(None) would raise.
_project_root_env = os.getenv("PROJECT_ROOT")
if _project_root_env is None:
    raise RuntimeError(
        "The PROJECT_ROOT environment variable is not set; "
        "point it at the project directory before running this notebook."
    )
PROJECT_ROOT = Path(_project_root_env)

# Base directory under which the scraped CSV files are stored.
DIR_BASE = PROJECT_ROOT / "data" / "input" / "incidents"

In [None]:
# Years to scrape: 2018 through 2022 inclusive (the range end is exclusive).
years = [*range(2018, 2023)]

# The 16 voivodeship names in lowercase. process_case uses them verbatim as
# output directory names; format_url upper-cases them for the search filter.
voivodeships = [
    "dolnośląskie",
    "kujawsko-pomorskie",
    "lubelskie",
    "lubuskie",
    "mazowieckie",
    "małopolskie",
    "opolskie",
    "podkarpackie",
    "podlaskie",
    "pomorskie",
    "warmińsko-mazurskie",
    "wielkopolskie",
    "zachodniopomorskie",
    "łódzkie",
    "śląskie",
    "świętokrzyskie",
]


In [None]:
def format_url(voivodeship, year):
    """Build the sewik.pl search URL for one voivodeship and one calendar year.

    Args:
        voivodeship: Voivodeship name, e.g. "mazowieckie". It is upper-cased
            and prefixed with "WOJ. " to form the voivodeship filter value.
        year: Calendar year; the date filter spans January 1 to December 31
            of that year.

    Returns:
        The search URL with a fully percent-encoded (ASCII-only) query string.
    """
    base_url = "https://sewik.pl/search"
    # Keys and values are kept in their readable, unencoded form; urlencode
    # performs the escaping (space -> "+", brackets and non-ASCII letters ->
    # UTF-8 percent escapes). This also covers the Polish diacritics in the
    # voivodeship name, which hand-built encoding left raw in the URL.
    params = {
        "filter_form[voivodeship]": f"WOJ. {voivodeship.upper()}",
        "filter_form[fromDate]": f"{year}-01-01",
        "filter_form[toDate]": f"{year}-12-31",
        "filter_form[categories]": "Czas zdarzeń",
    }
    return f"{base_url}?{urlencode(params)}"

In [None]:
def scrape_page(url, button_timeout=5, table_timeout=70):
    """Load ``url`` in headless Chrome and return the page HTML once a table appears.

    The function opens the page, clicks the element with id
    ``filter_form_reports`` (presumably the control that requests the
    reports; confirm against the site) and waits for a ``<table>`` element
    to be present in the DOM.

    Args:
        url: Page address to open.
        button_timeout: Seconds to wait for the ``filter_form_reports``
            element to become clickable.
        table_timeout: Seconds to wait for the first ``<table>`` to appear
            after the click.

    Returns:
        The page source as a string, or None when either wait times out or
        no table is found. Callers treat a falsy result as a failed scrape.
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url)

        button = WebDriverWait(driver, button_timeout).until(
            EC.element_to_be_clickable((By.ID, "filter_form_reports"))
        )
        button.click()

        WebDriverWait(driver, table_timeout).until(
            EC.presence_of_element_located((By.TAG_NAME, 'table'))
        )

        if driver.find_elements(By.TAG_NAME, 'table'):
            return driver.page_source
        return None

    except TimeoutException:
        # WebDriverWait.until raises on timeout. Report it and signal failure
        # through the return value so one slow page does not abort a whole
        # batch of scrapes.
        print(f"Timed out waiting for page elements at {url}")
        return None

    finally:
        # Always release the browser process, whatever the outcome.
        driver.quit()


In [None]:
def convert_tables_to_csv(html_content, output_dir='csv_output'):
    """Write every ``<table>`` found in ``html_content`` to its own CSV file.

    Each file is named after the closest preceding ``<h3>`` heading, reduced
    to lowercase word characters joined by hyphens. A table with no such
    heading, or whose heading reduces to an empty string, is named
    ``table_<position>``. When several tables resolve to the same name, the
    later ones get a numeric suffix so no file is overwritten.

    Args:
        html_content: HTML document as a string.
        output_dir: Directory that receives the CSV files. It is created only
            when at least one table is present.

    Returns:
        None. Progress is reported through printed messages.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    tables = soup.find_all('table')
    if not tables:
        print("No tables found in the HTML content.")
        return

    # Create the directory only now that there is something to write, so an
    # empty result never leaves behind a directory that looks like finished
    # output.
    os.makedirs(output_dir, exist_ok=True)

    used_names = set()
    for i, table in enumerate(tables, start=1):
        filename = ''
        h3 = table.find_previous('h3')
        if h3:
            title = h3.text.strip()
            filename = re.sub(r'[^\w\s-]', '', title).strip().lower()
            filename = re.sub(r'[-\s]+', '-', filename)
        if not filename:
            filename = f'table_{i}'

        # Several tables can follow the same heading; keep each one by
        # appending a counter instead of silently overwriting the earlier file.
        base_name = filename
        suffix = 2
        while filename in used_names:
            filename = f'{base_name}_{suffix}'
            suffix += 1
        used_names.add(filename)

        df = pd.read_html(StringIO(str(table)))[0]

        csv_filename = f'{filename}.csv'
        csv_path = os.path.join(output_dir, csv_filename)

        df.to_csv(csv_path, index=False, encoding='utf-8-sig')
        print(f"Saved table '{filename}' to {csv_path}")

    print(f"Converted {len(tables)} tables to CSV files in {output_dir}")


In [None]:
def process_case(year, voivodeship):
    """Scrape one (year, voivodeship) case and store its tables as CSV files.

    Output goes to ``DIR_BASE/<year>/<voivodeship>``. A case whose directory
    already contains at least one CSV file is treated as done and skipped, so
    re-running the notebook only fetches what is still missing.

    Args:
        year: Calendar year to query.
        voivodeship: Voivodeship name; also used as the directory name.
    """
    output_dir = DIR_BASE / str(year) / voivodeship
    # Skip only when output is actually present. A directory without CSV
    # files means an earlier attempt produced nothing, and it must not block
    # a retry.
    if output_dir.is_dir() and any(output_dir.glob('*.csv')):
        return

    url = format_url(voivodeship, year)
    html = scrape_page(url)
    if not html:
        print(f"Failed to scrape {year}, {voivodeship}")
        return

    convert_tables_to_csv(html, output_dir)

In [None]:
# Visit every (year, voivodeship) pair, years outermost, in the same order as
# the two lists. process_case itself skips the cases it considers done.
for year, voivodeship in ((y, v) for y in years for v in voivodeships):
    process_case(year, voivodeship)