In [61]:
import time
import os
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
import pandas as pd
import wget
from extract_articles_pisrs import process_pdf

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 2000)

FILE_EXTENSIONS = ["docx", "doc", "pdf", "zip", "xlsx"]
ROOT_URL = "https://www.fu.gov.si"
MAIN_URL=ROOT_URL+"/podrocja"
METADATA_DIR = "/Users/juankostelec/Google_drive/Projects/tax_backend/data"
RAW_DATA_DIR = "/Users/juankostelec/Google_drive/Projects/tax_backend/data/raw_files"
PROCESSED_DATA_DIR = "/Users/juankostelec/Google_drive/Projects/tax_backend/data/processed_files"

In [3]:
driver = webdriver.Chrome()
driver.get(MAIN_URL)
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
div_content = soup.find("div", id="content")
podrocja_elements = div_content.find_all("div", class_="card dark")

data_links = []
for group_idx, podrocje in enumerate(podrocja_elements):
    
    anchors = podrocje.find_all("a")
    for podrocje_idx, anchor in enumerate(anchors):
        podrocje_name = anchor.text
        
        # Get the text inside the <em> element (if it exists)
        em_text = None
        em_element = anchor.find("em")
        if em_element:
            em_text = em_element.text.strip()

        # Get the entire text of the anchor
        full_text = anchor.text

        # Subtract the <em> text from the full text to get the remaining text
        if em_text:
            remaining_text = full_text.replace(em_text, "").strip()
        else:
            remaining_text = full_text.strip()

        podrocje_name = remaining_text
        podrocje_description = em_text

        href_value = anchor.get("href")
        data_links.append([group_idx, podrocje_idx, podrocje_name, podrocje_description, href_value])

# Now we have all the links to the data we need to scrape
df = pd.DataFrame(data_links, columns=["group_idx", "podrocje_idx", "podrocje_name", "podrocje_description", "href"])

main_table = df.query("href == '#'")\
    .rename(columns={"podrocje_name":"group",
                     "podrocje_description": "group_description"})\
    .drop(columns=["href","podrocje_idx"])  # This gives the Podrocje id, podrocje name
df = pd.merge(df, main_table, on="group_idx", how="inner").query("href != '#'").drop(columns=["podrocje_description"])

# Convert HREFs to full URLs, add a flag if the URL links to final source, or a new webpage
df["is_final_source"] = df["href"].apply(lambda x: x.startswith("http"))
df["url"] = df["href"].apply(lambda x: ROOT_URL+x if not x.startswith("http") else x)
df["href_is_file"] = df["url"].apply(lambda x: x.endswith(".pdf") or x.endswith(".docx") or x.endswith(".doc") or x.endswith(".xlsx") or x.endswith(".xls") or x.endswith(".zip"))
df["href_type"] = df[["is_final_source", "href_is_file"]].apply(lambda x: "file" if x[0] and x[1] else "website_source" if x[0] else "website_details", axis=1)

df.to_csv(os.path.join(METADATA_DIR, "data_links.csv"), index=False)
driver.close()


# We need to consider 3 cases:
# 1. The URL links to a final source, and the source is a file (e.g. a PDF file)
# 2. The URL links to a final source, and the source is a webpage
# 3. The URL links to a new webpage


In [26]:
# Loop through each unique website and extract the HTML using Selenium and BeautifulSoup
website_data = []
href_data = []
new_websites = df.query("is_final_source == False and href_is_file == False")["url"].values.tolist()
cleaned_websites = ["/".join(x.split("/")[:-1]) if x.split("/")[-1].startswith("#") else "/".join(x.split("/")) for x in new_websites]
for idx, (url, raw_url) in enumerate(zip(cleaned_websites, new_websites)):
    if idx%10 == 0:
        print("Started scraping {}/{}: {}".format(idx, len(new_websites), raw_url))
    
    # Open the website using Selenium
    driver = webdriver.Chrome()
    driver.get(url)

    # Extract the HTML using BeautifulSoup
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")

    # Close the Selenium driver
    driver.close()

    # Get the text in the <h1> element  --> this gives the title of the page
    h1_text = None
    h1_element = soup.find("h1")
    if h1_element:
        h1_text = h1_element.text.strip()

    opis_text = None
    zakonodaja_type = None
    for h2 in soup.find_all('h2'):
        if h2.find("a") is None:
            continue
        a_text = h2.find('a').text.strip()
        
        # Exclude the text from the <i> elements to get only "Zakonodaja"
        for i in h2.find_all('i'):
            a_text = a_text.replace(i.text, '').strip()
        if "Opis" in a_text:
            opis_parent_element = h2.parent
            if opis_parent_element:
                opis_p_elements = opis_parent_element.find_all("p")
                opis_text = " ".join([x.text.strip() for x in opis_p_elements])
        elif "Zakonodaja" in a_text:
            zakonodaja_sister_elements = h2.find_next_siblings()
            for element in zakonodaja_sister_elements:
                a_elements = element.find_all("a", href=True)                
                for a in a_elements:
                    # This is an anchor element, so extract the text and the href and save them as separate values
                    zakonodaja_text = a.text.strip()
                    zakonodaja_href = a.get("href")
                    href_data.append([raw_url, zakonodaja_text, zakonodaja_href])

        elif "Podrobnejši opisi" in a_text:
            podrobnejsi_opisi_sister_elements = h2.find_next_siblings()
            for element in podrobnejsi_opisi_sister_elements:
                a_elements = element.find_all("a", href=True)                
                for a in a_elements:
                    # This is an anchor element, so extract the text and the href and save them as separate values
                    podrobnejsi_opisi_text = a.text.strip()
                    podrobnejsi_opisi_href = a.get("href")
                    href_data.append([raw_url, podrobnejsi_opisi_text, podrobnejsi_opisi_href])
    website_data.append([raw_url, h1_text, opis_text])


# Save the data to a CSV file
df_website_data = pd.DataFrame(website_data, columns=["url", "title", "opis"])
df_website_data.to_csv(os.path.join(METADATA_DIR, "website_data.csv"), index=False)

href_data = pd.DataFrame(href_data, columns=["url", "source_desc", "source_href"])
href_data.to_csv(os.path.join(METADATA_DIR, "href_data.csv"), index=False)





Started scraping 0/136: https://www.fu.gov.si/davki_in_druge_dajatve/poslovanje_z_nami/vpis_v_davcni_register_in_davcna_stevilka/
Started scraping 1/136: https://www.fu.gov.si/davki_in_druge_dajatve/poslovanje_z_nami/davcni_postopek/
Started scraping 2/136: https://www.fu.gov.si/davki_in_druge_dajatve/poslovanje_z_nami/podelitev_statusa_zaradi_spodbujanja_prostovoljnega_izpolnjevanja_obveznosti/
Started scraping 3/136: https://www.fu.gov.si/davki_in_druge_dajatve/poslovanje_z_nami/apa_sporazum/
Started scraping 4/136: https://www.fu.gov.si/davki_in_druge_dajatve/poslovanje_z_nami/e_davki/
Started scraping 5/136: https://www.fu.gov.si/davki_in_druge_dajatve/poslovanje_z_nami/vrocanje/
Started scraping 6/136: https://www.fu.gov.si/navodila_pojasnila_in_smernice/
Started scraping 7/136: https://www.fu.gov.si/davki_in_druge_dajatve/podrocja/dohodnina/letna_odmera_dohodnine/
Started scraping 8/136: https://www.fu.gov.si/davki_in_druge_dajatve/podrocja/dohodnina/dohodnina_dohodek_iz_zaposlit

In [31]:
# Combine all the tables into a single one
df = pd.read_csv(os.path.join(METADATA_DIR, "data_links.csv"))
df_hrefs = pd.read_csv(os.path.join(METADATA_DIR, "href_data.csv"))
df_podrocja = pd.read_csv(os.path.join(METADATA_DIR, "website_data.csv"))


df_sources = df.query("href_type != 'website_details'").rename(
    columns={
        "group":"group_name",
        "group_description":"group_desc",
        "podrocje_name":"podrocje_name",
        "url": "file_url"}
)
df_sources = df_sources[["group_name", "group_desc", "podrocje_name", "file_url"]].drop_duplicates()

df_links = df.query("href_type == 'website_details'").rename(
    columns={
        "group":"group_name",
        "group_description":"group_desc",
        "podrocje_name":"podrocje_name",
        "url":"podrocje_url"})
df_links = df_links[["group_name", "group_desc", "podrocje_name", "podrocje_url"]].drop_duplicates()

df_hrefs = df_hrefs.rename(
    columns={
        "url":"podrocje_url",
        "source_desc":"file_desc",
        "source_href":"file_url"})
df_hrefs = df_hrefs[["podrocje_url", "file_desc", "file_url"]].drop_duplicates()

df_podrocja = df_podrocja.rename(
    columns={
        "url":"podrocje_url",
        "title":"podrocje_title",
        "opis":"podrocje_opis"})
df_podrocja = df_podrocja[["podrocje_url", "podrocje_title", "podrocje_opis"]].drop_duplicates()

df_enriched_links = pd.merge(df_links, df_hrefs, on="podrocje_url", how="left")
# Check which podrocje has no files:
df_enriched_links = pd.merge(df_enriched_links, df_podrocja, on="podrocje_url", how="left")
df_sources["podrocje_url"] = [None] * len(df_sources)
df_sources["file_desc"] = [None] * len(df_sources)
df_sources["podrocje_opis"] = [None] * len(df_sources)
df_sources = df_sources[['group_name', 'group_desc', 'podrocje_name', 'podrocje_url',
       'file_desc', 'file_url', 'podrocje_opis']]
df_enriched_links = df_enriched_links[['group_name', 'group_desc', 'podrocje_name', 'podrocje_url',
       'file_desc', 'file_url', 'podrocje_opis']]



def get_file_type(href):

    for file_extension in FILE_EXTENSIONS:
        if str(href).endswith(file_extension):
            return file_extension
    if str(href).startswith("/"):
        # relative path to a website
        return "website"
    elif str(href).startswith("http"):
        return "website"
    else:
        return None

# https://www.fu.gov.si/carina/poslovanje_z_nami/carinski_predpisi/#c1496
df_combined = pd.concat([df_enriched_links, df_sources], axis=0)
df_combined["file_type"] = df_combined["file_url"].apply(lambda x: get_file_type(x))
df_combined = df_combined.drop_duplicates()
updated_urls = []
for url in df_combined["file_url"]:
    if str(url).startswith("/"):
        updated_urls.append(os.path.join("https://www.fu.gov.si/", url))
    else:
        updated_urls.append(url)
df_combined["file_url"] = updated_urls
df_combined.to_csv(os.path.join(METADATA_DIR, "furs_data.csv"), index=False)

# Extract and encode data from PISRS.SI domain

In [46]:
df_pisrs = df_combined[df_combined["file_url"].str.startswith("http://www.pisrs.si") == True ].copy()
df_furs = df_combined[df_combined["file_url"].str.startswith("https://www.fu.gov.si") == True].copy()
df_eurlax = df_combined[df_combined["file_url"].str.startswith("http://eur-lex.europa.eu") == True].copy()   
df_other = df_combined[~df_combined.index.isin(df_pisrs.index.tolist() + df_furs.index.tolist() + df_eurlax.index.tolist())].copy()

1564 521 578 139 326


In [58]:
for idx, row in df_pisrs[["file_url"]].iterrows():
    file_url = row["file_url"]
    # Open the website using Selenium
    try:
        driver = webdriver.Chrome()
        driver.get(file_url)

        # Extract the HTML using BeautifulSoup
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")

        # Extract the law title if available, this is the text in the <h1> element
        h1_text = None
        h1_element = soup.find("h1")
        if h1_element:
            h1_text = h1_element.text.strip()[:-1].strip()

        # Extract the URL from the website containing the link to the actual pdf/doc file        
        div_element = soup.find("div", id="fileBtns")
        if div_element:        
            a_elements = div_element.find_all("a", href=True)
            for a in a_elements:
                href_value = a.get("href")
                complete_url_path = os.path.join(file_url.rsplit("/", maxsplit=1)[0], href_value)
                if href_value.endswith("pdf"):
                    # Use the actual name of the law for the file name, with the correct extension
                    if h1_text:
                        source_name = h1_text + ".pdf"
                    else:
                        source_name = href_value.split("/")[-1]

                    # Download file if it does not already exist  
                    if not os.path.exists(os.path.join(RAW_DATA_DIR, source_name)):
                        print(f"Downloading pdf/doc file of law: {source_name} from website {complete_url_path}" )
                        wget.download(complete_url_path, os.path.join(RAW_DATA_DIR, source_name)) 

                        # Process file and extract the articles from the law
                        process_pdf(os.path.join(RAW_DATA_DIR, source_name), PROCESSED_)))          
    except Exception as e:
        print("Tried to get the website html, but did not work. Exception: ", e)

# Close the Selenium driver
driver.close()


# Now loop over the files and encode them and add to a vector database
for 



http://www.pisrs.si/Pis.web/npbDocPdf?idPredpisa=ZAKO8647&idPredpisaChng=ZAKO6792&type=pdf
H1 text:  Zakon o finančni upravi (ZFU)
Source name:  Zakon o finančni upravi (ZFU).pdf
http://www.pisrs.si/Pis.web/npbDocPdf?idPredpisa=ZAKO8346&idPredpisaChng=ZAKO1603&type=pdf
H1 text:  Zakon o splošnem upravnem postopku (ZUP)
Source name:  Zakon o splošnem upravnem postopku (ZUP).pdf
Downloading pdf/doc file of law: Zakon o splošnem upravnem postopku (ZUP).pdf from website http://www.pisrs.si/Pis.web/npbDocPdf?idPredpisa=ZAKO8346&idPredpisaChng=ZAKO1603&type=pdf
http://www.pisrs.si/Pis.web/npbDocPdf?idPredpisa=PRAV12962&type=pdf
H1 text:  Pravilnik o vsebini, obliki in načinu dostave podatkov o plačilu dohodka osebi, ki se po drugem odstavku 58. člena Zakona o davčnem postopku šteje za plačnika davk
Source name:  Pravilnik o vsebini, obliki in načinu dostave podatkov o plačilu dohodka osebi, ki se po drugem odstavku 58. člena Zakona o davčnem postopku šteje za plačnika davk.pdf
Downloading pd

KeyboardInterrupt: 

# Extract and encode data from fu.gov.si domain

In [None]:
# Basically I have data coming from 3 domains:
# fu.gov.si (mainzl doc files, and ad hoc created things it seem)
# pisrs.si (mainly pdf files, nicely structured, can also access the html of the website to parse the law instead of the PDF)
# eur-lex.europa.eu (don't always have the pdf available,  nicely structured, can also access the html of the website to parse the law instead of the PDF)