## Scraping of Retsinformation.dk

In [1]:
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import re

In [2]:
import os
from dotenv import load_dotenv

load_dotenv(override=True)

HF_TOKEN = os.environ.get("HF_TOKEN")
from huggingface_hub import HfApi, HfFolder

HfFolder.save_token(HF_TOKEN)

## Step 1: Scrape all URLS of vejledninger

In [3]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

def extract_urls_from_page(url):
    """
    Extract all unique URLs from a single search page.

    Args:
    url (str): URL of the page to scrape.

    Returns:
    list: A list of extracted URLs.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    with webdriver.Chrome(options=options) as driver:
        driver.get(url)
        
        # Wait for the specific element to be loaded
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "document-entry"))
        )
        
        soup = BeautifulSoup(driver.page_source, "html.parser")
        search_results = soup.find("div", class_="search-result-list")

        urls = []
        for div in search_results.find_all('div', class_='document-entry'):
            url = div.get('about')
            if url:
                urls.append(url)

    return urls

def make_url_list(n=5, url_prefix='https://www.retsinformation.dk'):
    """
    Function to loop over N pages of search results and extract all URLs.

    Args:
    n (int): Number of pages to scrape.
    url_prefix (str): Prefix to complete the URLs.

    Returns:
    list: A list of all extracted and complete URLs.
    """
    base_url = f'{url_prefix}/documents?dt=180&h=false&page={{page}}&ps=100&r=30'
    all_urls = []

    for page_no in range(0, n):
        full_url = base_url.format(page=page_no)
        page_urls = extract_urls_from_page(full_url)
        complete_urls = [url_prefix + url for url in page_urls]
        all_urls.extend(complete_urls)

    return all_urls

In [4]:
#Took 10 seconds to run
url_list = make_url_list()

In [6]:
#Return first 5 URLs of the list
url_list[:5]

['https://www.retsinformation.dk/eli/retsinfo/2024/9001',
 'https://www.retsinformation.dk/eli/retsinfo/2024/9000',
 'https://www.retsinformation.dk/eli/retsinfo/2023/10095',
 'https://www.retsinformation.dk/eli/retsinfo/2023/10093',
 'https://www.retsinformation.dk/eli/retsinfo/2023/10092']

## Step 2: Scrape HTML content of vejledninger (bs object)

**OBS**
Nogle vejledninger har tilsynedeladende ikke title i "Titel2" format, men derimod bare "Titel' feks tilfældet for pulverlakering vejledning, andre har class TITLE, og andre igen har ingen men blot font size = 5.... Og et par har slet ingen formatering af overskriften i HTML. 

In [8]:
def extract_title(content_div):
    """
    Extracts the title from a given BeautifulSoup div element.

    This function searches for the title of the content in the provided div element. It checks for various HTML tags and classes to find the title. If none of the specified tags and classes are found, it returns None.

    Args:
        content_div (BeautifulSoup element): The BeautifulSoup element representing a div from which the title is to be extracted.

    Returns:
        str or None: The extracted title as a string, or None if no title is found.
    """
    
    title_elements = [
        {"tag": "p", "class_": "Titel2"},
        {"tag": "p", "class_": "Titel"},
        {"tag": "h1", "class_": "TITLE"},
        {"tag": "font", "attrs": {"size": "5"}}
    ]

    for elem in title_elements:
        if "attrs" in elem:
            title = content_div.find(elem["tag"], **elem["attrs"])
        else:
            title = content_div.find(elem["tag"], class_=elem["class_"])
        
        if title:
            extracted_title = title.get_text(strip=True).replace("\n", "")
            print(f"Title extracted: {extracted_title}")  # Debugging statement
            return extracted_title

    print("No title extracted; returning None.")  # Debugging statement
    return None



def scrape_content(urls):
    """
    Scrapes and collects HTML content from a given list of URLs.

    This function navigates to each URL, waits for the page's content to load, and then extracts  HTML content within the "document-content" div using BS. 
    It attempts to identify and use the title of the content key in the resulting dictionary.

    Args:
        urls (list of str): A list of URLs to be scraped.

    Returns:
        dict: A dictionary with titles as keys and the corresponding HTML content as values. If the title can't be determined, the URL is used as the key.
    """
    
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    
    driver = webdriver.Chrome(options=options)
    result_dict = {}

    for url in urls:
        try:
            driver.get(url)

            # Wait for the specific element to be loaded
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "document-content"))
            )

            # Get the page source and parse it with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, "html.parser")
            content_div = soup.find("div", class_="document-content")

            if content_div:
                title = extract_title(content_div) or str(url)
                if title in result_dict:
                    print(f"Duplicate key detected: {title}. URL: {url}")
                else:
                    result_dict[title] = content_div
                    print(f"Content added for {title}")  # Confirm successful addition
            else:
                print("Content not found for URL:", url)

        except Exception as e:
            print(f"An error occurred while processing {url}: {e}")

    # Close the WebDriver
    driver.quit()

    return result_dict

In [9]:
#Took 6 minutes to run
vejledninger_raw2 = scrape_content(url_list)

Title extracted: Vejledning om regulering af satser fra 1. januar 2024 efter lov om arbejdsskadesikring, lov om sikring mod følger af arbejdsskade, lov om arbejdsskadeforsikring og lov om forsikring mod følger af ulykkestilfælde
Content added for Vejledning om regulering af satser fra 1. januar 2024 efter lov om arbejdsskadesikring, lov om sikring mod følger af arbejdsskade, lov om arbejdsskadeforsikring og lov om forsikring mod følger af ulykkestilfælde
Title extracted: Vejledning om satser i 2024 for betaling af udgifter til transport m.v. i forbindelse med lægebehandling, der er begæret af Arbejdsmarkedets Erhvervssikring eller Ankestyrelsen
Content added for Vejledning om satser i 2024 for betaling af udgifter til transport m.v. i forbindelse med lægebehandling, der er begæret af Arbejdsmarkedets Erhvervssikring eller Ankestyrelsen
Title extracted: Vejledning om obligatorisk selvbooking af jobsamtaler for forskellige målgrupper
Content added for Vejledning om obligatorisk selvbooki

In [10]:
len(vejledninger_raw2)

433

In [57]:
len(url_list)
#Remove any empty values in the url list
url_list_clean = [x for x in url_list if x != None]
len(url_list_clean)

435

In [7]:
len(url_list)

433

In [31]:
def extract_title(content_div):
    """
    Extracts the title from a given BeautifulSoup div element.

    This function searches for the title of the content in the provided div element. It checks for various HTML tags and classes to find the title. If none of the specified tags and classes are found, it returns None.

    Args:
        content_div (BeautifulSoup element): The BeautifulSoup element representing a div from which the title is to be extracted.

    Returns:
        str or None: The extracted title as a string, or None if no title is found.
    """
    
    title_elements = [
        {"tag": "p", "class_": "Titel2"},
        {"tag": "p", "class_": "Titel"},
        {"tag": "h1", "class_": "TITLE"},
        {"tag": "font", "attrs": {"size": "5"}}
    ]

    for elem in title_elements:
        if "attrs" in elem:
            title = content_div.find(elem["tag"], **elem["attrs"])
        else:
            title = content_div.find(elem["tag"], class_=elem["class_"])
        
        if title:
            return title.get_text(strip=True).replace("\n", "")

    return None


def scrape_content(urls):
    """
    Scrapes and collects HTML content from a given list of URLs.

    This function navigates to each URL, waits for the page's content to load, and then extracts  HTML content within the "document-content" div using BS. 
    It attempts to identify and use the title of the content key in the resulting dictionary.

    Args:
        urls (list of str): A list of URLs to be scraped.

    Returns:
        dict: A dictionary with titles as keys and the corresponding HTML content as values. If the title can't be determined, the URL is used as the key.
    """
    
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    
    driver = webdriver.Chrome(options=options)
    result_dict = {}

    for url in urls:
        try:
            driver.get(url)

            # Wait for the specific element to be loaded
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "document-content"))
            )

            # Get the page source and parse it with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, "html.parser")
            content_div = soup.find("div", class_="document-content")

            if content_div:
                title = extract_title(content_div) or str(url)
                result_dict[title] = content_div
            else:
                print("Content not found for URL:", url)

        except Exception as e:
            print(f"An error occurred while processing {url}: {e}")

    # Close the WebDriver
    driver.quit()

    return result_dict

In [60]:
def extract_title(content_div):
    """
    Extracts the title from a given BeautifulSoup div element.

    This function searches for the title of the content in the provided div element. It checks for various HTML tags and classes to find the title. If none of the specified tags and classes are found, it returns None.

    Args:
        content_div (BeautifulSoup element): The BeautifulSoup element representing a div from which the title is to be extracted.

    Returns:
        str or None: The extracted title as a string, or None if no title is found.
    """
    
    title_elements = [
        {"tag": "p", "class_": "Titel2"},
        {"tag": "p", "class_": "Titel"},
        {"tag": "h1", "class_": "TITLE"},
        {"tag": "font", "attrs": {"size": "5"}}
    ]

    for elem in title_elements:
        if "attrs" in elem:
            title = content_div.find(elem["tag"], **elem["attrs"])
        else:
            title = content_div.find(elem["tag"], class_=elem["class_"])
        
        if title:
            return title.get_text(strip=True).replace("\n", "")

    return None


def scrape_content(urls):
    """
    Scrapes and collects HTML content from a given list of URLs.

    This function navigates to each URL, waits for the page's content to load, and then extracts  HTML content within the "document-content" div using BS. 
    It attempts to identify and use the title of the content key in the resulting dictionary.

    Args:
        urls (list of str): A list of URLs to be scraped.

    Returns:
        list of dicts: A list of dicts where each dicts contains the keys title, html_content and url (if title is not identified, url link i used)
        
    """
    
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    
    driver = webdriver.Chrome(options=options)
    vejledning_list = []


    for url in urls:
        vejledning_dict = {}
        vejledning_dict['url'] = url
        try:
            driver.get(url)

            # Wait for the specific element to be loaded
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "document-content"))
            )

            # Get the page source and parse it with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, "html.parser")
            content_div = soup.find("div", class_="document-content")

            if content_div:
                title = extract_title(content_div)              
                vejledning_dict["title"] = title
                vejledning_dict["html_content"] = content_div            
            else:
                print("Content not found for URL:", url)
                vejledning_dict["html_content"] = None
                vejledning_dict["title"] = url
                
            vejledning_list.append(vejledning_dict)
            
        except Exception as e:
            print(f"An error occurred while processing {url}: {e}")

    # Close the WebDriver
    driver.quit()

    return vejledning_list

In [64]:
#Took 6 minutes to run
vejledninger_raw_html_list = scrape_content(url_list)

In [65]:
#Inspecting the list of dicts
print(f'Number of vejledninger: {len(vejledninger_raw_html_list)}')
print(f'Each dict containing the following keys: {vejledninger_raw_html_list[0].keys()}')
#Each vejledning contains a dict with the following keys
print(f'The first item has the following title: {vejledninger_raw_html_list[0]["title"]}')
print(f'and has the following url: {vejledninger_raw_html_list[0]["url"]}')
print('and contains a html object of type: ', type(vejledninger_raw_html_list[0]["html_content"]))


Number of vejledninger: 433
Each dict containing the following keys: dict_keys(['url', 'title', 'html_content'])
The first item has the following title: Vejledning om regulering af satser fra 1. januar 2024 efter lov om arbejdsskadesikring, lov om sikring mod følger af arbejdsskade, lov om arbejdsskadeforsikring og lov om forsikring mod følger af ulykkestilfælde
and has the following url: https://www.retsinformation.dk/eli/retsinfo/2024/9001
and contains a html object of type:  <class 'bs4.element.Tag'>


## Step 3: Extract and clean text

In [57]:
def extract_text(bs_obj):
    """
    Extracts and concatenates text content from all <p> tags in the provided BeautifulSoup object.

    Args:
        bs_obj (BeautifulSoup): A BeautifulSoup object representing parsed HTML content.

    Returns:
        str: A string containing all text from <p> tags, separated by line breaks.
    """
    paragraphs = bs_obj.find_all("p")
    text = "\n".join(p.get_text(strip=True) for p in paragraphs)
    return text


def clean_text(text):
    """
    Cleans the given text by replacing sequences of two or more line breaks with a double line break.
    Function intended for adding further text cleaning steps if needed.

    Args:
        text (str): The text to be cleaned.

    Returns:
        str: The cleaned text with standardized line breaks.
    """
    import re
    cleaned_text = re.sub(r'\n{2,}', '\n\n', text)
    return cleaned_text


def create_clean_vejl_list(vejledninger_raw_list):
    """
    Creates a dictionary of cleaned text for each item in the vejledninger data.

    This function iterates over vejledninger_raw_dict, extracts and cleans the text for each item, and stores it in a new key-value pair called cleaned_content under each url.

    Args:
        vejledninger_raw (list): A list of dictionaries containing the keys title, url and html_content 

    Returns:
        A list of dicts: A copy of the input list with the addition of the key-value pair cleaned_content
    """
    vejledninger_list_cleaned = []
    for vejledning in vejledninger_raw_list:
        vejledning["text_content"] = clean_text(extract_text(vejledning["html_content"]))
        vejledninger_list_cleaned.append(vejledning)
    return vejledninger_list_cleaned

In [73]:
vejledninger_text_list = create_clean_vejl_list(vejledninger_raw_html_list)

In [79]:
print(f'Example of first 1000 chars of extracted and cleaned text: \n\n  {vejledninger_text_list[0]["text_content"][:1000]} ...')

Example of first 1000 chars of extracted and cleaned text: 

  Vejledning om regulering af satser fra 1. januar 2024 efter lov om arbejdsskadesikring, lov om sikring mod følger af arbejdsskade, lov om arbejdsskadeforsikring og lov om forsikring mod følger af ulykkestilfælde
Indledning
Efter lov om arbejdsskadesikring, jf. lovbekendtgørelse nr. 1186 af 19. august 2022 med de ændringer, der følger af lov nr. 1541 af 12. december 2023, og lov om sikring mod følger af arbejdsskade, jf. lovbekendtgørelse nr. 943 af 16. oktober 2000, skal der med virkning fra 1. januar 2024 efter indstilling fra bestyrelsen for Arbejdsmarkedets Erhvervssikring ske regulering af lovens årslønsbeløb, godtgørelsesbeløb, overgangsbeløb samt løbende erstatninger.
Reguleringen af satserne fastsættes af Arbejdstilsynets direktør efter bemyndigelse fra beskæftigelsesministeren.
Satser efter loven reguleres med 2 procent tillagt tilpasningsprocenten for finansåret 2024 (jf. lov om en satsreguleringsprocent). Tilpasni

In [91]:
str_html = str(vejledninger_text_list[0]['html_content'])

In [None]:
# Reconverting string to bs4.element.Tag
reconverted_tag = BeautifulSoup(str_html, 'html.parser')


# Pushing to HF

In [96]:
#Convert the bs4 tag object into a str before creating the dataset for all objects in the list
for vejledning in vejledninger_text_list:
    vejledning['html_content'] = str(vejledning['html_content'])


In [99]:
#Create hugging face dataset

from datasets import Dataset
ds_vejledning = Dataset.from_list(vejledninger_text_list)

# add readme
ds_vejledning.info.description = """# Vejledninger fra Retsinformation.dk
Datasættet indeholder alle vejledninger scrapet fra Retsinformation.dk (pr. januar 2024).
Datasættet indeholder 3 kolonner: 
- title: Navnet på den givne vejledning (såfremt det fremgik af html'en, ellers fremgår URL) (str)
- text_content: Det fulde tekstindhold af vejledningen (str)
- url: Link til den givne vejledning på Retsinformation.dk (str)
- html_content: Den rå html af vejledningen (str(bs4.element.Tag)) (str)

Teksten er renset og formateret således at der er 1 linjeskift (\n) mellem hver sektion ( <p> tag), 
medmindre der er indsat en eller flere tomme sektioner i træk hvormed der i stedet er indsat 2 linjeskift (\n\n)
"""

ds_vejledning.info.dataset_name = "Vejledninger fra Retsinformation.dk"
#ds_vejledning.info.config_name = "retsinformation"

ds_vejledning.info

DatasetInfo(description="# Vejledninger fra Retsinformation.dk\nDatasættet indeholder alle vejledninger scrapet fra Retsinformation.dk (pr. januar 2024).\nDatasættet indeholder 3 kolonner: \n- title: Navnet på den givne vejledning (såfremt det fremgik af html'en, ellers fremgår URL) (str)\n- text_content: Det fulde tekstindhold af vejledningen (str)\n- url: Link til den givne vejledning på Retsinformation.dk (str)\n- html_content: Den rå html af vejledningen (str(bs4.element.Tag)) (str)\n\nTeksten er renset og formateret således at der er 1 linjeskift (\n) mellem hver sektion ( <p> tag), \nmedmindre der er indsat en eller flere tomme sektioner i træk hvormed der i stedet er indsat 2 linjeskift (\n\n)\n", citation='', homepage='', license='', features={'url': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'html_content': Value(dtype='string', id=None), 'text_content': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, task_templates=Non

In [9]:
# upload dataset
#Transform to DatasetDict
from datasets import DatasetDict
ds_dict = DatasetDict({"train": ds_vejledning})
ds_dict.push_to_hub(repo_id='dk_retrieval_benchmark', config_name='retsinformation')
#ds_vejledning.push_to_hub(repo_id="dk_retrieval_benchmark", config_name="retsinformation")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/2.07k [00:00<?, ?B/s]