## Scraping of Retsinformation.dk

In [1]:
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import re

In [2]:
# load the cvr_list.cvr.txt file
# remove newline characters
with open("cvr_list.cvr.txt", "r") as f:
    cvr_list = f.readlines()
    cvr_list = [x.strip() for x in cvr_list]

In [7]:
# Remove empty strings
cvr_list = list(filter(None, cvr_list))

In [9]:
len(cvr_list)

2865

In [15]:
# Remove duplicates
cvr_list = list(set(cvr_list))

In [16]:
len(cvr_list)

1140

In [11]:
cvr_list[:5]

[13081387, 11517498, 11517498, 11517498, 11517498]

In [17]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import random


def get_company_info(cvr_number):
    try:
        url_to_scrape = f"https://datacvr.virk.dk/enhed/virksomhed/{cvr_number}?fritekst={cvr_number}&sideIndex=0&size=10"
        options = webdriver.ChromeOptions()
        # options.add_argument("--headless")  # Uncomment to run browser in headless mode
        with webdriver.Chrome(options=options) as driver:
            driver.get(url_to_scrape)

            # Wait for the specific element to be loaded
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.ID, "accordion-antal-ansatte-content")
                )
            )

            # Get the page source and create a BeautifulSoup object
            html_content = driver.page_source
            soup = BeautifulSoup(html_content, "html.parser")

            # Extract the company name
            company_name_tag = soup.find("h1", class_="h2 mt-0 mb-lg-9")
            if not company_name_tag:
                return {
                    "virk_navn": "Html info not found",
                    "ansatte": "Html info not found",
                }

            company_name = company_name_tag.get_text(strip=True)

            # Extract the number of employees using BeautifulSoup
            table_body = soup.find("div", id="accordion-antal-ansatte-content").find(
                "tbody"
            )
            if not table_body:
                return {"virk_navn": company_name, "ansatte": "Html info not found"}

            rows = table_body.find_all("tr")
            first_row = rows[0]
            employees_td = first_row.find("td", {"data-title": "Ansatte"})
            if not employees_td:
                return {"virk_navn": company_name, "ansatte": "Html info not found"}

            number_of_employees = employees_td.find("span").text
            number_of_employees = int(number_of_employees)

            # Create a dictionary with the company name and number of employees
            company_info = {"virk_navn": company_name, "ansatte": number_of_employees}
            return company_info

    except Exception as e:
        return {"virk_navn": "Server error", "ansatte": "Server error"}


def scrape_multiple_companies(cvr_numbers):
    all_company_info = {}
    for cvr_number in cvr_numbers:
        try:
            company_info = get_company_info(cvr_number)
            # Assuming the company_info dictionary has only one key-value pair
            company_name, number_of_employees = next(iter(company_info.items()))
            all_company_info[cvr_number] = {
                "virk_navn": company_name,
                "ansatte": number_of_employees,
            }
        except Exception as e:
            print(f"Failed to scrape CVR number {cvr_number}: {e}")
            all_company_info[cvr_number] = {"virk_navn": None, "ansatte": None}

        time_to_sleep = random.uniform(1, 5)  # Random delay between 1 and 5 seconds
        time.sleep(time_to_sleep)
    return all_company_info


# Example usage:
cvr_numbers = cvr_list[:200]  # Replace with your list of CVR numbers
all_company_info = scrape_multiple_companies(cvr_numbers)
print(
    all_company_info
)  # This should print out a list of dictionaries with company names and number of employees

{30580737: {'virk_navn': 'virk_navn', 'ansatte': 'KORAL GULD & SØLV ApS'}, 18581507: {'virk_navn': 'virk_navn', 'ansatte': 'TREND BAZAAR A/S'}, 17932292: {'virk_navn': 'virk_navn', 'ansatte': 'REMA 1000, 730 ApS'}, 24217604: {'virk_navn': 'virk_navn', 'ansatte': 'Johnny Olsen, 446 Kolding ApS'}, 30005252: {'virk_navn': 'virk_navn', 'ansatte': 'TORBEN KLINKBY, 854 SNEJBJERG ApS'}, 26728460: {'virk_navn': 'virk_navn', 'ansatte': 'JAN MØRUP ÅRHUS ApS'}, 34304017: {'virk_navn': 'virk_navn', 'ansatte': 'Expert K.P. RADIO ApS'}, 41639954: {'virk_navn': 'virk_navn', 'ansatte': 'Sport 2021 ApS'}, 26869781: {'virk_navn': 'virk_navn', 'ansatte': 'Morten Dalgas, 738 Randers ApS'}, 40075291: {'virk_navn': 'virk_navn', 'ansatte': 'Nuuday A/S'}, 35954716: {'virk_navn': 'virk_navn', 'ansatte': 'Salling Group A/S'}, 10268702: {'virk_navn': 'virk_navn', 'ansatte': 'MØRUP DETAIL ApS'}, 30531620: {'virk_navn': 'virk_navn', 'ansatte': 'KJÆR & SOMMERFELDT A/S'}, 30525477: {'virk_navn': 'virk_navn', 'ansatt

In [20]:
# Export all_company_info to acsv
# The current structure is a list of nested dicts
# {30580737: {'virk_navn': 'virk_navn', 'ansatte': 'KORAL GULD & SØLV ApS'},
import csv

with open("all_company_info.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerow(["cvr", "virk_navn", "ansatte"])
    for key in all_company_info.keys():
        writer.writerow(
            [key, all_company_info[key]["virk_navn"], all_company_info[key]["ansatte"]]
        )

In [12]:
all_company_info

{13081387: {'virk_navn': 'HUNTERS HOUSE. JAGT & FISKERI A/S', 'ansatte': 27},
 11517498: {'virk_navn': '365discount A/S', 'ansatte': 9624}}

In [13]:
cvr_list[:5]

[13081387, 11517498, 11517498, 11517498, 11517498]

**More efficient version not opening and closing multiple browser windows**

In [31]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import random


def get_company_info(driver, cvr_number):
    try:
        url_to_scrape = f"https://datacvr.virk.dk/enhed/virksomhed/{cvr_number}?fritekst={cvr_number}&sideIndex=0&size=10"
        driver.get(url_to_scrape)

        # Wait for the specific element to be loaded
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "accordion-antal-ansatte-content"))
        )

        # Get the page source and create a BeautifulSoup object
        html_content = driver.page_source
        soup = BeautifulSoup(html_content, "html.parser")

        # Extract the company name
        company_name_tag = soup.find("h1", class_="h2 mt-0 mb-lg-9")
        if not company_name_tag:
            return {
                "virk_navn": "Html info not found",
                "ansatte": "Html info not found",
            }

        company_name = company_name_tag.get_text(strip=True)

        # Extract the number of employees using BeautifulSoup
        table_body = soup.find("div", id="accordion-antal-ansatte-content").find(
            "tbody"
        )
        if not table_body:
            return {"virk_navn": company_name, "ansatte": "Html info not found"}

        rows = table_body.find_all("tr")
        first_row = rows[0]
        employees_td = first_row.find("td", {"data-title": "Ansatte"})
        aarsvaerk_td = first_row.find("td", {"data-title": "Årsværk"})
        if not employees_td:
            return {"virk_navn": company_name, "ansatte": "Html info not found"}
        if not aarsvaerk_td:
            return {"virk_navn": company_name, "årsværk": "Html info not found"}

        number_of_employees = employees_td.find("span").text
        number_of_employees = int(number_of_employees)

        # Create a dictionary with the company name and number of employees
        company_info = {
            "virk_navn": company_name,
            "ansatte": number_of_employees,
            "årsværk": aarsvaerk_td,
        }
        return company_info

    except Exception as e:
        return {"virk_navn": "Server error", "ansatte": "Server error"}


def scrape_multiple_companies(cvr_numbers):
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")  # Uncomment to run browser in headless mode
    with webdriver.Chrome(options=options) as driver:
        all_company_info = {}
        unique_cvr_numbers = set(
            cvr_numbers
        )  # Remove duplicates by converting the list to a set
        for cvr_number in unique_cvr_numbers:
            company_info = get_company_info(driver, cvr_number)
            all_company_info[cvr_number] = company_info

            time_to_sleep = random.uniform(1, 5)  # Random delay between 1 and 5 seconds
            time.sleep(time_to_sleep)
        return all_company_info


# Example usage:
cvr_numbers = cvr_list[:3]  # Replace with your list of CVR numbers
company_info_sample = scrape_multiple_companies(cvr_numbers)
print(
    company_info_sample
)  # This should print out a list of dictionaries with company names and number of employees

SyntaxError: expression expected after dictionary key and ':' (3842269867.py, line 54)

In [26]:
# Export all_company_info to acsv
# The current structure is a list of nested dicts
# {30580737: {'virk_navn': 'virk_navn', 'ansatte': 'KORAL GULD & SØLV ApS'},
import csv

# using utf 8 encoding to maintain special characters like æ ø å


with open("company_info_sample.csv", "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["cvr", "virk_navn", "ansatte"])
    for key in company_info_sample.keys():
        writer.writerow(
            [
                key,
                company_info_sample[key]["virk_navn"],
                company_info_sample[key]["ansatte"],
            ]
        )

# Old


In [28]:
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def get_html_as_bs4_object(url):
    """
    Get the entire HTML of the page as a BeautifulSoup object.

    Args:
    url (str): URL of the page to scrape.

    Returns:
    BeautifulSoup: BeautifulSoup object containing the page's HTML.
    """
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")  # Uncomment to run browser in headless mode
    with webdriver.Chrome(options=options) as driver:
        driver.get(url)

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "accordion-antal-ansatte-button"))
        )
        # Get the page source and convert it to a BeautifulSoup object
        soup = BeautifulSoup(driver.page_source, "html.parser")

    return soup


# Example usage:
url_to_scrape = "https://datacvr.virk.dk/enhed/virksomhed/30208102?fritekst=30208102&sideIndex=0&size=10"  # Replace with the URL you want to scrape
bs4_object = get_html_as_bs4_object(url_to_scrape)

In [11]:
element = bs4_object.find(id="accordion-antal-ansatte-content")

# Check if the element was found
if element:
    # The `element` variable now contains the specific element with the given ID
    # You can now access its content and any nested content within it
    content = (
        element.get_text()
    )  # This will give you all the text within the element, without any HTML tags

    # If you want to keep the HTML structure within the element, you can convert it back to a string
    html_content = str(element)

    # Do something with the content or html_content
    print(content)  # Print text content
    print(html_content)  # Print HTML content
else:
    print('Element with ID "accordion-antal-ansatte-content" not found.')

Antal ansatte pr månedPeriodeAnsatteÅrsværkDecember 20231715November 20231714Oktober 20231915September 20232017August 20232017Vis alleAntal ansatte pr kvartalPeriodeAnsatteÅrsværk3. kvt 201910-19 medarbejdere10-19 årsværk2. kvt 201910-19 medarbejdere10-19 årsværk1. kvt 201910-19 medarbejdere10-19 årsværk4. kvt 201810-19 medarbejdere10-19 årsværk3. kvt 201810-19 medarbejdere10-19 årsværkVis alle
<div aria-hidden="true" class="accordion-content" id="accordion-antal-ansatte-content"><div class="antalAnsatte"><div class="accordion-content-inner" data-v-509209b4=""><span class="bold" data-v-509209b4="">Antal ansatte pr måned</span><div class="table--responsive-scroll" data-v-509209b4=""><table aria-label="tabel-aria_label" class="table table--borderless table-md-responsive-headers bg-normal" data-v-509209b4=""><thead data-v-509209b4=""><tr data-v-509209b4=""><th data-v-509209b4="" scope="col">Periode</th><th data-v-509209b4="" scope="col">Ansatte</th><th data-v-509209b4="" scope="col">Årsvæ

In [29]:
element = bs4_object.find(id="accordion-antal-ansatte-content")

In [10]:
company_name_tag = bs4_object.find("h1", class_="h2 mt-0 mb-lg-9")
company_name = company_name_tag.get_text(strip=True)
print(company_name)  # This should print out "3D-EMPIRE A/S"

In [30]:
table_body = bs4_object.find("div", id="accordion-antal-ansatte-content").find("tbody")

# Find all the rows in the table body
rows = table_body.find_all("tr")

# Assuming you want the number 17 from the first row, which is the number of employees in December 2023
first_row = rows[0]

# Find the 'td' element with the data-title attribute 'Ansatte' which contains the number of employees
employees_td = first_row.find("td", {"data-title": "Årsværk"})

# Extract the text (number of employees) from the 'span' tag within this 'td'
number_of_employees = employees_td.find("span").text

# Now you have the number as a string, you can convert it to an integer if necessary
number_of_employees = int(number_of_employees)

print(number_of_employees)  # This should print out 17

15


In [7]:
bs4_object

<html lang="da"><head><meta charset="utf-8"/><meta content="IE=edge" http-equiv="X-UA-Compatible"/><meta content="width=device-width,initial-scale=1" name="viewport"/><link href="/favicon.ico" rel="icon"/><title>Enhedsvisning</title><link href="/css/view-Abonnementer.072d2b1c.css" rel="prefetch"/><link href="/css/view-Artikel.6a7c66f7.css" rel="prefetch"/><link href="/css/view-BestilDokumenter.630078a7.css" rel="prefetch"/><link href="/css/view-EidLogin.630078a7.css" rel="prefetch"/><link href="/css/view-Enhedsvisning.2c6cbe08.css" rel="prefetch"/><link href="/css/view-Kurv.39597aee.css" rel="prefetch"/><link href="/css/view-Oekonomioverblik.68a08215.css" rel="prefetch"/><link href="/css/view-RegistreringstidendeDetaljer.72cc5d1c.css" rel="prefetch"/><link href="/css/view-RegistreringstidendeResultater.67a20ed0.css" rel="prefetch"/><link href="/css/view-Soegeresultater.e35f8ca2.css" rel="prefetch"/><link href="/js/view-Abonnementer.736aa6c1.js" rel="prefetch"/><link href="/js/view-Arti

In [14]:
# Beautify the element
formatted_element = BeautifulSoup(str(element), "html.parser")
print(formatted_element.prettify())  # Print the formatted HTML

<div aria-hidden="true" class="accordion-content" id="accordion-antal-ansatte-content">
 <div class="antalAnsatte">
  <div class="accordion-content-inner" data-v-509209b4="">
   <span class="bold" data-v-509209b4="">
    Antal ansatte pr måned
   </span>
   <div class="table--responsive-scroll" data-v-509209b4="">
    <table aria-label="tabel-aria_label" class="table table--borderless table-md-responsive-headers bg-normal" data-v-509209b4="">
     <thead data-v-509209b4="">
      <tr data-v-509209b4="">
       <th data-v-509209b4="" scope="col">
        Periode
       </th>
       <th data-v-509209b4="" scope="col">
        Ansatte
       </th>
       <th data-v-509209b4="" scope="col">
        Årsværk
       </th>
      </tr>
     </thead>
     <tbody data-v-509209b4="">
      <tr data-v-509209b4="">
       <td data-title="Periode" data-v-509209b4="">
        <span data-v-509209b4="">
         December 2023
        </span>
       </td>
       <td data-title="Ansatte" data-v-509209b4="