In [31]:
import requests
from bs4 import BeautifulSoup
import os
import time

In [32]:
# load files from pages folder
pages = os.listdir('pages')

In [33]:
def soup_from_file(name):
    with open('pages/' + name, 'r') as f:
        return BeautifulSoup(f.read(), 'html.parser')
    

In [34]:
# Translate dict keys from danish to english
def translate_dict(info_dict):
    # Kursuskode -> Course code
    replace_table = {
        "Kursuskode": "Course code",
        "Sprog": "Language",
        "Point": "Credit",
        "Placering": "Placement",
        "Skemagruppe": "Schedule",
        "Sommer": "Summer",
        "Vinter": "Winter",
        "Forår": "Spring",
        "Efterår": "Autumn",
        "Kursusansvarlige": "Course Coordinators",
    }

    new_dict = {}
    for key, value in info_dict.items():
        new_key = replace_table.get(key, key)
        new_dict[new_key] = replace_table.get(value, value)
    return new_dict



In [35]:
def extract_info(name):
    soup = soup_from_file(name)
    info_dict = {}
    # Title
    info_dict["Title"] = soup.find("title").text.strip()
    # Grab all elements in panel-body and dt
    panel_bodies = soup.find_all("div", class_="panel-body")
    # find the panel body that has the dt and dd
    panel_body = [body for body in panel_bodies if body.find_all("dt")][0]
    

    # Print number of panel-bodies
    panel_keys = panel_body.find_all("dt")
    panel_vals = panel_body.find_all("dd")
    
    for panel_key, panel_val in zip(panel_keys, panel_vals):
        info_dict[panel_key.text] = panel_val.text

    info_dict = translate_dict(info_dict)

    # If there is no schedule
    if "Schedule" not in info_dict.keys():
        if info_dict["Placement"] in ["Spring", "Summer", "Autumn", "Winter"]:
            info_dict["Schedule"] = info_dict["Placement"]
        #else:
        #    info_dict["Schedule"] = None

    h5s = soup.find_all("h5")
    for h5 in h5s:
        if "Course Coordinators" in h5.text or "Kursusansvarlige" in h5.text:
            lis = h5.find_next_sibling("ul").find_all("li")
            # Remove the span tags
            for li in lis:
                for span in li.find_all("span"):
                    span.decompose()

            
            # Split by <li>[0]
            info_dict["Course Coordinators"] = [li.text.strip() for li in h5.find_next_sibling("ul").find_all("li")]
            

    # Assert that dict contains "course code" 
    assert_keys = ["Course code", "Title", "Language", "Schedule", "Placement", "Credit", "Course Coordinators", "Lecturers"
    for key in assert_keys:
        assert key in info_dict.keys(), f"Key {key} not in dict"

    return info_dict

print(extract_info(pages[1900]))

{'Title': 'Fødevarepolitik', 'Language': 'Dansk', 'Course code': 'NIFB21002U', 'Credit': '7,5 ECTS', 'Niveau': 'Bachelor', 'Varighed': '1 blok', 'Placement': 'Blok 2', 'Schedule': 'A (tirs 8-12 + tors 8-17)', 'Kursuskapacitet': 'Ingen begrænsning\n\nDer kan være færre pladser i eftertilmeldingsperioden', 'Course Coordinators': ['Carsten Daugbjerg']}


In [36]:
succesfully_parsed = 0

parsed_pages = []
for (i, page) in enumerate(pages):
    try:
        parsed_pages.append(extract_info(page))
        succesfully_parsed += 1
    except Exception as e:
        print('Error parsing ' + page + ': ' + str(e))
    print(f"Parsed {i} out of {str(len(pages))}", end='\r')
print('Succesfully parsed ' + str(succesfully_parsed) + ' out of ' + str(len(pages)))

Error parsing APSB21015U.html: Key Schedule not in dict
Error parsing ASDK20005U.html: Key Schedule not in dict
Error parsing AØKK08431U.html: Key Schedule not in dict
Error parsing AØKK08434U.html: Key Schedule not in dict
Error parsing AØKK08436U.html: Key Schedule not in dict
Error parsing HMØB0105FU.html: Key Credit not in dict
Error parsing JJUA14124U.html: Key Credit not in dict
Error parsing JJUA54019U.html: Key Credit not in dict
Error parsing JJUA54029U.html: Key Credit not in dict
Error parsing LSLS10132U.html: Key Schedule not in dict
Error parsing SGBB20008E.html: Key Schedule not in dict
Error parsing SGBK20013E.html: Key Schedule not in dict
Error parsing SGBK20014E.html: Key Schedule not in dict
Error parsing SGBK20015E.html: Key Schedule not in dict
Error parsing SMEA15022E.html: Key Schedule not in dict
Error parsing SMEA15025E.html: Key Schedule not in dict
Error parsing SMEA15031E.html: Key Schedule not in dict
Error parsing SMEA15033E.html: Key Schedule not in dict


In [37]:
# Print all unique keys
all_keys = []
for page in parsed_pages:
    for key in page.keys():
        if key not in all_keys:
            all_keys.append(key)
            print(key)

Title
Language
Course code
Credit
Level
Duration
Placement
Schedule
Course Coordinators
Niveau
Varighed
Kursuskapacitet
Course capacity


In [38]:
for elem in parsed_pages[2900].keys():
    print(elem, parsed_pages[2900][elem])

Title Neuroplasticitet. Hjernens foranderlighed - fra teori til praksis
Language Dansk
Course code SNRM20003U
Credit 7,5 ECTS
Niveau Master
Varighed 1 semester
Placement Spring
Schedule Detaljeret program findes på e-læringsportalen
Absalon
Kursuskapacitet 30
Course Coordinators ['Jens Bo Nielsen', 'Rasmus Feld Frisk']
