In [66]:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
import sys
import re
from os.path import exists
import pdfplumber
import os
import xmltodict

# Root of the university website; scrapePrograms prepends it to the hrefs it collects.
base_url = 'https://www.th-luebeck.de'

In [67]:
class Program:
    """A study programme together with the page it was scraped from."""

    def __init__(self, name, identifier, qualification, parent, url = ''):
        self.name, self.identifier = name, identifier
        self.qualification, self.parent = qualification, parent
        self.url = url

    def to_dict(self):
        """Return the export row for this programme (``url`` is not exported)."""
        columns = ('Studiengang', 'Name', 'Identifier', 'Bildungsstufe', 'Organisationseinheit')
        # Bildungsstufe is one of UNKNOWN, BACHELOR, MASTER, PH_D
        values = ('', self.name, self.identifier, self.qualification, self.parent)
        return dict(zip(columns, values))
        

class Faculty:
    """A faculty (organisational unit) and the URL stored alongside it."""

    def __init__(self, name, identifier, abbreviation, parent = '', url = ''):
        self.name, self.identifier = name, identifier
        self.abbreviation = abbreviation
        self.parent, self.url = parent, url

    def to_dict(self):
        """Return the export row for this faculty (``url`` is not exported)."""
        row = {'OE': ''}
        row['Name'] = self.name
        row['Identifier'] = self.identifier
        row['abbreviation'] = self.abbreviation
        row['parent'] = self.parent
        return row
        

class Stupo:
    """A study and examination regulation (SPO) belonging to one programme.

    ``parent`` is the identifier of the owning programme, ``url`` the link to
    the regulation document and ``duration`` a free-text duration string.
    """

    def __init__(self, name, identifier, parent, desc = '', url = '', duration = ''):
        self.name = name
        self.identifier = identifier
        self.desc = desc
        self.url = url  # was assigned twice in a row; once is enough
        self.parent = parent
        self.duration = duration

    def to_dict(self):
        """Return the export row for this regulation."""
        return {
            'Stupo': '',
            'Name': self.name,
            'Identifier': self.identifier,
            'Beschreibungstext': self.desc,
            'Beschreibungslink': self.url,
            'parent': self.parent,
            'Dauer': self.duration
        }
        

class Programarea:
    """A named area of a regulation that modules are attached to."""

    def __init__(self, name, identifier, parent, freeChoice = False):
        self.name, self.identifier = name, identifier
        self.parent, self.freeChoice = parent, freeChoice

    def to_dict(self):
        """Return the export row for this programme area."""
        pairs = [
            ('Studiengangsbereich', ''),
            ('Name', self.name),
            ('Identifier', self.identifier),
            ('freie Wahl', self.freeChoice),
            ('parent', self.parent),
        ]
        return dict(pairs)
        

class Modul:
    """A module (or one of its course parts) attached to a parent identifier.

    ``parent`` is the identifier of the owning programme area or module,
    ``url`` the link stored as description link in the export.
    """

    def __init__(self, name, identifier, parent, desc = '', url = '', duration = '', credits = ''):
        self.name = name
        self.identifier = identifier
        self.desc = desc
        self.url = url  # was assigned twice in a row; once is enough
        self.parent = parent
        self.duration = duration
        self.credits = credits

    def to_dict(self):
        """Return the export row for this module."""
        return {
            'Modul': '',
            'Name': self.name,
            'identifier': self.identifier,
            'Beschreibungstext': self.desc,
            'Beschreibungslink': self.url,
            'parent': self.parent,
            'Dauer': self.duration,
            'credits': self.credits
        }

def parseQualification(qualification):
    """Map a free-text degree label to ``'BACHELOR'`` or ``'MASTER'``.

    The match is a case-insensitive substring test. Returns ``None`` when the
    label names neither degree, and also when no label is given at all
    (``None`` or an empty string) instead of raising on ``None.lower()``.
    """
    if not qualification:
        return None

    qualification = qualification.lower()
    if 'bachelor' in qualification:
        return 'BACHELOR'
    if 'master' in qualification:
        return 'MASTER'
    return None


class Modulhandbuch:
    """In-memory collection of faculties, programmes, regulations, areas and modules.

    The ``add*`` methods create an entry, append it to the matching list and
    return it; the ``get*`` methods return the first matching entry or ``None``.
    """

    def __init__(self):
        self.faculties = []
        self.programs = []
        self.stupos = []
        self.programareas = []
        self.modules = []

    def addFaculty(self, name, identifier, abbreviation, parent = '', url = ''):
        """Create a Faculty, register it and return it."""
        faculty = Faculty(name, identifier, abbreviation, parent, url)
        self.faculties.append(faculty)
        return faculty

    def getFaculty(self, identifier):
        """Return the first faculty with this identifier, or None."""
        if identifier == '':
            return None
        return next((f for f in self.faculties if f.identifier == identifier), None)

    def addProgram(self, name, identifier, qualification, parent, url = ''):
        """Create a Program, register it and return it."""
        program = Program(name, identifier, qualification, parent, url)
        self.programs.append(program)
        return program

    def getProgram(self, name, qualification):
        """Return the first programme matching both name and qualification, or None."""
        if name == '' and qualification == '':
            return None
        return next(
            (p for p in self.programs if p.name == name and p.qualification == qualification),
            None,
        )

    def addStupo(self, name, identifier, parent, desc = '', url = '', duration = ''):
        """Create a Stupo, register it and return it."""
        stupo = Stupo(name, identifier, parent, desc, url, duration)
        self.stupos.append(stupo)
        return stupo

    def getStupo(self, parent=''):
        """Return the first regulation whose parent equals ``parent``, or None."""
        if parent == '':
            return None
        return next((s for s in self.stupos if s.parent == parent), None)

    def addProgramarea(self, name, identifier, parent, freeChoice = False):
        """Create a Programarea, register it and return it."""
        programarea = Programarea(name, identifier, parent, freeChoice)
        self.programareas.append(programarea)
        return programarea

    def getProgramarea(self, identifier):
        """Return the first programme area with this identifier, or None."""
        if identifier == '':
            return None
        return next((a for a in self.programareas if a.identifier == identifier), None)

    def addModul(self, name, identifier, parent, desc = '', url = '', duration = '', credits=''):
        """Create a Modul, register it and return it."""
        modul = Modul(name, identifier, parent, desc, url, duration, credits)
        self.modules.append(modul)
        return modul

    def updateModul(self, index, name, identifier, parent, desc = '', url = '', duration = '', credits=''):
        """Replace the module at ``index`` with a new one and return it.

        Returns None without changing anything when ``index`` is not a valid
        non-negative position. The ``in range`` test is kept on purpose: it
        also rejects a ``None`` index, which is what getModulIndex returns on
        a miss.
        """
        if index not in range(len(self.modules)):
            return None

        modul = Modul(name, identifier, parent, desc, url, duration, credits)
        self.modules[index] = modul
        return modul

    def _findModulIndex(self, identifier, name, childof):
        """Return the position of the first matching module, or None.

        A module matches on ``identifier`` when one is given, otherwise on
        ``name``. A non-empty ``childof`` must additionally be contained in
        the module's identifier.
        """
        for i, modul in enumerate(self.modules):
            if identifier:
                if modul.identifier != identifier:
                    continue
            elif modul.name != name:
                continue

            if childof and childof not in modul.identifier:
                continue

            return i
        return None

    def getModul(self, identifier='', name='', childof=''):
        """Return the first matching module, or None.

        Uses the same matching rule as getModulIndex. Previously a call
        without ``childof`` skipped every module and always returned None.
        """
        if name == '' and identifier == '':
            return None

        index = self._findModulIndex(identifier, name, childof)
        return None if index is None else self.modules[index]

    def getModulIndex(self, identifier='', name='', childof=''):
        """Return the list position of the first matching module, or None.

        On a miss the search arguments are printed as a diagnostic.
        """
        if name == '' and identifier == '':
            return None

        index = self._findModulIndex(identifier, name, childof)
        if index is None:
            print(identifier, name, childof)
        return index

    def getModulChildren(self, parentid):
        """Return all modules whose identifier contains ``parentid`` but differs from it."""
        return [
            modul for modul in self.modules
            if parentid in modul.identifier and modul.identifier != parentid
        ]

    def toCsv(self, directory='output/csv'):
        """Write one CSV file per collection into ``directory``.

        The directory is created when missing. File names and column order
        are unchanged: faculties, programs, stupos, programareas, modules.
        """
        os.makedirs(directory, exist_ok=True)

        exports = (
            ('faculties.csv', self.faculties),
            ('programs.csv', self.programs),
            ('stupos.csv', self.stupos),
            ('programareas.csv', self.programareas),
            ('modules.csv', self.modules),
        )
        for filename, entries in exports:
            rows = [entry.to_dict() for entry in entries]
            df = pd.DataFrame.from_dict(rows)
            df.to_csv(os.path.join(directory, filename), index = False, header=True)

In [68]:
# Shared registry that the scrape* functions below read from and add to.
modulhandbuch = Modulhandbuch()

In [69]:
# Create the faculties. The list is short and doesn't change often, so it is hardcoded.
modulhandbuch.addFaculty('Angewandte Naturwissenschaften', '11', 'AN', url='https://www.th-luebeck.de/hochschule/fachbereich-angewandte-naturwissenschaften/studiengaenge/')
modulhandbuch.addFaculty('Bauwesen', '12', 'BAU', url='https://www.th-luebeck.de/hochschule/fachbereich-bauwesen/studiengaenge/')
modulhandbuch.addFaculty('Elektrotechnik und Informatik', '13', 'EI', url='https://www.th-luebeck.de/hochschule/fachbereich-elektrotechnik-und-informatik/studiengaenge/')
modulhandbuch.addFaculty('Maschinenbau und Wirtschaft', '14', 'MW', url='https://www.th-luebeck.de/hochschule/fachbereich-maschinenbau-und-wirtschaft/studiengaenge/')

<__main__.Faculty at 0x2cf9fae1cf0>

In [70]:
abbreviations = []

# Creates an abbreviation based on a given string for use as an identifier.
def getAbbreviation(name, qualification = '', check_duplicate=True, word_length = 3, max_words = 5):
    abbr = ''
    split = re.split(' |-', name)
    counter = 1
    while True:
        for i in range(min(len(split), max_words)):
            word = split[i]
            word = word.upper()
            # word = re.sub(r'\s', '', word)
            if len(word) <= 1 or (len(word) <= 3 and word != split[i]) or split[i][0] == '(':
                continue
            
        
            if i != 0:
                abbr +='-'

            abbr += word[:word_length]

        if qualification == 'BACHELOR':
            abbr += '-BA'
        elif qualification == 'MASTER':
            abbr += '-MA'
        
        if not check_duplicate:
            break

        if counter > 1:
            abbr += '-' + str(counter)

        if abbr not in abbreviations:
            abbreviations.append(abbr)
            break
        if word_length > 3:
            counter += 1
        else:
            word_length += 1
        
        print('Duplicate abbreviation: ' + abbr)
    
    return abbr

In [71]:
# Programme names that are dropped while scraping.
program_blacklist = ['Wirtschaftsingenieur Lebensmittelindustrie']


def scrapePrograms():
    """Scrape the programme table of every faculty page into ``modulhandbuch``.

    For each faculty the page at ``faculty.url`` is fetched and every link
    with an ``href`` inside the first ``<table>`` becomes one programme. Link
    text of the form ``Name (ID)`` supplies name and identifier; an
    identifier that is missing or not fully upper-case is replaced by a
    generated abbreviation. The stored identifier is prefixed with the
    faculty abbreviation.

    Pages that cannot be fetched or contain no table are reported and
    skipped. Blacklisted names are dropped before an abbreviation is
    generated, so they no longer reserve one.
    """
    for faculty in modulhandbuch.faculties:
        r = requests.get(faculty.url)

        # Check status code for response received
        if r.status_code not in range(200, 300):
            print('Status ' + str(r.status_code) + ' at  ' + faculty.url)
            continue

        # Parsing the HTML
        soup = bs(r.content, 'html.parser')

        # Finding table of study programmes
        table = soup.find('table')
        if table is None:
            print('No table found at  ' + faculty.url)
            continue

        # Only anchors that actually carry an href can become a programme URL.
        for link in table.find_all('a', href=True):
            text = link.text
            match = re.search(r'(.+) \((.+)\)', text)
            name = match.group(1) if match else text

            if name in program_blacklist:
                continue

            # The qualification is read from the <abbr title> in the third cell
            # of the element three levels above the link.
            qualification = parseQualification(link.parent.parent.parent.find_all('td')[2].find('abbr').get('title'))

            identifier = match.group(2) if match else ''
            # Scraped identifiers are only trusted when they are fully upper-case.
            if not match or identifier.upper() != identifier:
                identifier = getAbbreviation(name, qualification)

            parent = faculty.identifier
            url = base_url + link.get('href')

            modulhandbuch.addProgram(name, faculty.abbreviation + '_' + identifier, qualification, parent, url)


# Run the programme scrape for the faculties registered above.
scrapePrograms()

Duplicate abbreviation: WIR-LEB-BA


In [72]:
def scrapeDuration(pdf):
    """Return the standard period of study found in ``pdf`` as ``'<n> Monate'``.

    Every line of every page is checked for the sentence
    "Die Regelstudienzeit beträgt <count> ... Semester." or
    "... <count> ... jahr" (case-insensitive). A semester counts as 6 months,
    a year as 12. ``<count>`` may be a digit string or a German number word
    from "ein" to "zehn".

    Returns ``''`` when no such sentence with a recognisable count is found.
    Previously an unrecognised count raised UnboundLocalError; such a line
    is now skipped and the search continues.
    """
    number_words = {
        'ein': 1, 'zwei': 2, 'drei': 3, 'vier': 4, 'fünf': 5,
        'sechs': 6, 'sieben': 7, 'acht': 8, 'neun': 9, 'zehn': 10,
    }
    # (pattern, months per counted unit); the semester wording is tried first.
    patterns = (
        (re.compile(r'.*Die Regelstudienzeit beträgt (.+) .*Semester.', re.IGNORECASE), 6),
        (re.compile(r'.*Die Regelstudienzeit beträgt (.+) .*jahr', re.IGNORECASE), 12),
    )

    for page in pdf.pages:
        # A page without extractable text is treated as empty.
        text = page.extract_text() or ''
        for line in text.split('\n'):
            for pattern, months_per_unit in patterns:
                match = pattern.match(line)
                if not match:
                    continue

                dur = match.group(1).strip().lower()
                count = number_words.get(dur)
                if count is None and dur.isdecimal():
                    count = int(dur)
                if count:
                    return str(count * months_per_unit) + ' Monate'

    return ''

In [73]:
def scrapeStupos():
    """Scrape the study regulation (SPO) of each programme into ``modulhandbuch``.

    Only programmes of the faculties with identifier '12' and '13' are
    processed. For each of them the programme page is fetched, the first
    link that looks like an SPO PDF is downloaded into
    ``output/pdf/<faculty>/stupos/<programme>/`` (an existing file is reused)
    and the PDF is read for:

    * the regulation year, taken from the file name or else from a
      "Vom <day>. <month> <year>" line on the first page,
    * the description, i.e. the first-page text between the "§2" heading
      (followed by a line containing "Studiengang") and the "§3" heading,
    * the duration, via scrapeDuration.

    One Stupo and its three programme areas (_PF, _WAPF, _WA) are added per
    programme even when nothing could be read; missing pieces are printed.
    Stops entirely when a programme references an unknown faculty.
    """
    for program in modulhandbuch.programs:
        if program.parent != '13' and program.parent != '12':
            continue

        faculty = modulhandbuch.getFaculty(identifier=program.parent)
        if faculty is None:
            print('Couldnt find faculy for: ' + program.parent)
            return

        dirname = 'output/pdf/' + faculty.abbreviation + '/stupos/' + program.identifier
        os.makedirs(dirname, exist_ok=True)

        r = requests.get(program.url)

        # Check status code for response received
        if r.status_code not in range(200, 300):
            print('Status ' + str(r.status_code) + ' at  ' + program.url)
            continue

        # Parsing the HTML
        soup = bs(r.content, 'html.parser')

        stupo_links = [
            link['href'] for link in soup.find_all('a', href=True) if re.match(r'.*Satzungen%20zu%20Studium%20und%20Prfungen.*_(S)?PO(_\d+)?\.pdf$', link['href'])
        ]

        # Reset for every programme. `date` used to be set only when a link was
        # found, so a programme without one crashed or inherited the previous date.
        url = ''
        desc = ''
        duration = ''
        date = ''

        # TODO: when several SPO PDFs are linked, choose the latest one instead of the first.
        if stupo_links:
            url = stupo_links[0]
            match = re.match(r'.*Satzungen%20zu%20Studium%20und%20Prfungen.*_(S)?PO_(\d+)\.pdf$', url)
            if match:
                date = match.group(2)

            filename = dirname + '/SPO_' + program.identifier + date + '.pdf'
            if not exists(filename):
                # Download stupo pdf. A failed download is not written to disk,
                # otherwise the error page would be cached and parsed as a PDF.
                pdf_response = requests.get(url)
                if pdf_response.status_code in range(200, 300):
                    with open(filename, 'wb') as f:
                        f.write(pdf_response.content)
                else:
                    print('Status ' + str(pdf_response.status_code) + ' at  ' + url)

            if exists(filename):
                with pdfplumber.open(filename) as pdf:
                    first_page_text = pdf.pages[0].extract_text() or ''
                    lines = first_page_text.split('\n')

                    desc_index = None
                    for i, line in enumerate(lines):
                        if date == '':
                            match = re.match(r'Vom \d+\. \w+ (\d{4})', line)
                            if match:
                                date = match.group(1)

                        # The heading needs a following line to be checked.
                        has_next_line = i + 1 < len(lines)
                        if has_next_line and re.sub(r'\s', '', line) == '§2' and 'Studiengang' in lines[i+1]:
                            desc_index = i + 2
                            break

                    if desc_index:
                        for line in lines[desc_index:]:
                            if re.sub(r'\s', '', line) == '§3':
                                desc = desc.strip()
                                break

                            desc += line + '\n'

                    duration = scrapeDuration(pdf)

        not_found = ''
        if not url:
            not_found += 'URL '
        if not desc:
            not_found += 'Description '
        if not duration:
            not_found += 'Duration '

        if not_found != '':
            print(not_found + 'not found for: ' + program.identifier)

        stupo_identifier = program.identifier + date
        stupo_name = program.name
        if program.qualification == 'BACHELOR':
            stupo_name += ' Bachelor'
        elif program.qualification == 'MASTER':
            stupo_name += ' Master'

        modulhandbuch.addStupo(stupo_name + ' SPO ' + date, stupo_identifier, program.identifier, url=url, desc=desc, duration=duration)
        modulhandbuch.addProgramarea(stupo_name + ' Pflichtmodule', stupo_identifier + '_PF', stupo_identifier, freeChoice=False)
        modulhandbuch.addProgramarea(stupo_name + ' Wahlpflichtmodule', stupo_identifier + '_WAPF', stupo_identifier, freeChoice=True)
        modulhandbuch.addProgramarea(stupo_name + ' Wahlmodule', stupo_identifier + '_WA', stupo_identifier, freeChoice=True)


# Run the regulation scrape for the programmes collected so far.
scrapeStupos()

In [74]:
def scrapeModulhandbuchPDF_v1(pdf, url, stupo, en=False):
    """Read a module handbook PDF page by page and add its modules to ``modulhandbuch``.

    Each page is handled as one module description whose fields are read from
    fixed line positions of the extracted page text. Modules of type
    'Pflichtmodul'/'Compulsory' are attached to the ``<stupo.identifier>_PF``
    programme area, 'Vertiefungsmodul'/'Elective' to ``_WAPF``; pages with any
    other type are reported and skipped.

    Parameters:
        pdf:   object exposing ``pages``; each page must offer
               ``extract_text(layout=...)`` (presumably an opened pdfplumber
               PDF - confirm against the caller).
        url:   stored as the description link of every module added.
        stupo: regulation whose ``identifier`` and ``name`` are used.
        en:    try the English field labels first; the German patterns are
               still used as a fallback by most helpers.

    Returns None. Stops early (with a printed message) when one of the three
    programme areas is missing, or when the name or module type of a page
    cannot be read.
    """
    def scrapeName(split, en=False):
        # The module name is expected on line index 4, after the leading "1." label.
        if en:
            match = re.search(r'1. *(.*) *', split[4])
            if match:
                return match.group(1).strip()
        match = re.search(r'1. *(.*) {2,}.*$', split[4])
        if match:
            return match.group(1).strip()
        
        return ''

    def scrapeModulart(split, en=False):
        # The module type sits on line index 5, between its label and the self-study label.
        if en:
            match = re.search(r'Module type *(.*) *Self-study hours', split[5])
            if match:
                return match.group(1).strip()

        match = re.search(r'Modulart *(\w*) *Eigenstudiumsstunden', split[5])
        if match:
            return match.group(1).strip()
        
        return ''

    def scrapeAbbreviation(split, en=False):
        # The module abbreviation shares line index 15 with the "Seminar" entry.
        if en:
            match = re.search(r'Module abbreviation *(.*) *Seminar', split[15])
            if match:
                return match.group(1).strip()
        
        match = re.search(r'Modulkürzel *(.*) *Seminar', split[15])
        if match:
            return match.group(1).strip()
        
        return ''

    def scrapeCreditsAndDuration(split, en=False):
        # Returns (credits, '<hours> Stunden') read from line index 3, or ('', '').
        if en:
            match = re.search(r'2. ECTS *(\d+([\.,]\d?)?) *CP 3. Workload (\d+) h', split[3])
            if match:
                return match.group(1).strip(), match.group(3).strip() + ' Stunden'
        match = re.search(r'2. ECTS-Leistungspunkte *(\d+([\.,]\d?)?) *LP 3. Arbeitsaufwand (\d+) h', split[3])
        if match:
            return match.group(1).strip(), match.group(3).strip() + ' Stunden'
        
        return '', ''

    def scrapeDescription(split, en=False):
        # Collects the text of sections 8, 9 and 10 and joins them under
        # English or German headings depending on `en`.
        desc = ''
        kenntnisse_start = False
        kenntnisse = ''
        fertigkeiten_start = False
        fertigkeiten = ''
        kompetenzen_start = False
        kompetenzen = ''

        for line in split:
            # A section heading switches the collecting flag; the heading text
            # itself is removed from the line before it is collected.
            if line.startswith('8. Kenntnisse') or line.startswith('8. Knowledge'):
                kenntnisse_start = True
                line = re.sub(r'8\. (Kenntnisse|Knowledge) *', '', line)
            elif line.startswith('9. Fertigkeiten') or line.startswith('9. Skills'):
                kenntnisse_start = False
                fertigkeiten_start = True
                line = re.sub(r'9\. (Fertigkeiten|Skills) *', '', line)
            elif line.startswith('10. Kompetenzen') or line.startswith('10. Learning outcomes'):
                fertigkeiten_start = False
                kompetenzen_start = True
                line = re.sub(r'10\. (Kompetenzen|Learning outcomes) *', '', line)
        
            # Section 10 is never switched off, so it runs to the end of the page.
            if kenntnisse_start:
                kenntnisse += '\n' + line
            elif fertigkeiten_start:
                fertigkeiten += '\n' + line
            elif kompetenzen_start:
                kompetenzen += '\n' + line

        kenntnisse = cleanText(kenntnisse)
        fertigkeiten = cleanText(fertigkeiten)
        kompetenzen = cleanText(kompetenzen)
        
        if en:
            if kenntnisse:
                desc += 'Knowledge\n' + kenntnisse + '\n\n'
            if fertigkeiten:
                desc += 'Skills\n' + fertigkeiten + '\n\n'
            if kompetenzen:
                desc += 'Learning outcomes\n' + kompetenzen
        else:
            if kenntnisse:
                desc += 'Kenntnisse\n' + kenntnisse + '\n\n'
            if fertigkeiten:
                desc += 'Fertigkeiten\n' + fertigkeiten + '\n\n'
            if kompetenzen:
                desc += 'Kompetenzen\n' + kompetenzen

        return desc
    
    def scrapeLehrveranstaltungen(split, parent, en=False):
        # Adds one child module per course type that is marked with "x" on
        # line indices 11-16; the child identifier is the parent identifier
        # plus a type suffix (_V, _Ü, _Pr, _Pj, _S, _E).
        if en:
            if 'Lecture x' in split[11]:
                modulhandbuch.addModul(parent.name + ' (Lecture)', parent.identifier + '_V', parent.identifier, '', parent.url, '', '')
            if 'Excercise x' in split[12]:
                modulhandbuch.addModul(parent.name + ' (Excercise)', parent.identifier + '_Ü', parent.identifier, '', parent.url, '', '')
            if 'Practical training x' in split[13]:
                modulhandbuch.addModul(parent.name + ' (Practical training)', parent.identifier + '_Pr', parent.identifier, '', parent.url, '', '')
            if 'Project work x' in split[14]:
                modulhandbuch.addModul(parent.name + ' (Project work)', parent.identifier + '_Pj', parent.identifier, '', parent.url, '', '')
            if 'Seminar x' in split[15]:
                modulhandbuch.addModul(parent.name + ' (Seminar)', parent.identifier + '_S', parent.identifier, '', parent.url, '', '')
            if 'Excursion x' in split[16]:
                modulhandbuch.addModul(parent.name + ' (Excursion)', parent.identifier + '_E', parent.identifier, '', parent.url, '', '')
        else:
            if 'Vorlesung (V) x' in split[11]:
                modulhandbuch.addModul(parent.name + ' (Vorlesung)', parent.identifier + '_V', parent.identifier, '', parent.url, '', '')
            if 'Übung (Ü) x' in split[12]:
                modulhandbuch.addModul(parent.name + ' (Übung)', parent.identifier + '_Ü', parent.identifier, '', parent.url, '', '')
            if 'Praktikum (PR) x' in split[13]:
                modulhandbuch.addModul(parent.name + ' (Praktikum)', parent.identifier + '_Pr', parent.identifier, '', parent.url, '', '')
            if 'Projekt (Pj) x' in split[14]:
                modulhandbuch.addModul(parent.name + ' (Projekt)', parent.identifier + '_Pj', parent.identifier, '', parent.url, '', '')
            if 'Seminar (S) x' in split[15]:
                modulhandbuch.addModul(parent.name + ' (Seminar)', parent.identifier + '_S', parent.identifier, '', parent.url, '', '')
            if 'Exkursion (E) x' in split[16]:
                modulhandbuch.addModul(parent.name + ' (Exkursion)', parent.identifier + '_E', parent.identifier, '', parent.url, '', '')


    # All three programme areas of the regulation must exist before any page is read.
    programarea_pf = modulhandbuch.getProgramarea(identifier=stupo.identifier + '_PF')
    if not programarea_pf:
        print('Couldnt get programe area with identifier: ' + stupo.identifier + '_PF')
        return
    programarea_wapf = modulhandbuch.getProgramarea(identifier=stupo.identifier + '_WAPF')
    if not programarea_wapf:
        print('Couldnt get programe area with identifier: ' + stupo.identifier + '_WAPF')
        return
    # NOTE(review): the _WA area is only checked for existence; no module is attached to it here.
    programarea_wa = modulhandbuch.getProgramarea(identifier=stupo.identifier + '_WA')
    if not programarea_wa:
        print('Couldnt get programe area with identifier: ' + stupo.identifier + '_WA')
        return

    pages_count = len(pdf.pages)
    
    for i in range(pages_count):
        page = pdf.pages[i]
        # The layout-preserving text is used for the name only; every other
        # field is read from the plain text.
        layout_text = page.extract_text(layout=True)
        layout_split = layout_text.split('\n')
        text = page.extract_text(layout=False)
        split = text.split('\n')
        # first_5_lines = ' '.join(split[:5])
        # print(split)

        name = scrapeName(layout_split, en)
        if not name:
            # Aborts the whole PDF, not just this page.
            print('Couldnt get name of module in ' + stupo.name)
            return
        modulart = scrapeModulart(split, en)
        if not modulart:
            print('Couldnt get Verpflichtungsgrad of ' + name)
            return
        if modulart == 'Pflichtmodul' or modulart == 'Compulsory':
            parent = programarea_pf
        elif modulart == 'Vertiefungsmodul' or modulart == 'Elective':
            parent = programarea_wapf
        else:
            print('Unkwown verpflichtungsgrad "' + modulart + '" of ' + name)
            continue

        abbreviation = scrapeAbbreviation(split, en)
        if not abbreviation:
            # Fall back to a generated two-word abbreviation without duplicate check.
            abbreviation = getAbbreviation(name, check_duplicate=False, word_length=2, max_words=2)
            
        identifier = parent.identifier + '_' + abbreviation

        desc = scrapeDescription(split, en)
        credits, duration = scrapeCreditsAndDuration(split, en)

        # print('\nname: ')
        # print(name)
        # print('\nkürzel: ')
        # print(abbreviation)
        # print('\nidentifier: ')
        # print(identifier)
        # print('\ndesc: ')
        # print(desc)
        # print('\ncredits: ')
        # print(credits)
        # print('\nduration: ')
        # print(duration)
        # NOTE(review): `duration` is scraped above but '' is stored instead - confirm this is intended.
        modul = modulhandbuch.addModul(name, identifier, parent.identifier, desc, url, '', credits)
        scrapeLehrveranstaltungen(split, modul, en)

In [75]:
def cleanText(text):
    """Normalise text extracted from a PDF into compact, list-friendly form.

    Lines starting with a number keep their line break, every other line
    break becomes a space, ``*`` and ``•`` bullets become ``- `` items on
    their own line, and runs of spaces collapse to one. Falsy input gives ''.
    """
    if not text:
        return ''

    # Applied strictly in this order; '<br>' is a temporary marker that
    # protects the numbered-line breaks from the newline collapse.
    substitutions = (
        (r' *\n *(\d+(\.(\d+)?)*) +(.+)', r'<br>\1 \4'),
        (r'\n+', ' '),
        (r'\* +', '\n- '),
        (r'• +', '\n- '),
        (r' +', ' '),
        (r'<br>', '\n'),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()


def scrapeModulhandbuchPDF_v2(pdf, url, stupo, en=False, version=1):
    """Scrape modules and their courses from a page-per-entry module handbook.

    Every page whose first five lines carry the module marker is treated as the
    start of a module or of a Lehrveranstaltung (course). If the following page
    does not start a new entry it is appended as a continuation. Each entry is
    registered through ``modulhandbuch.addModul``: modules below the programme
    area that matches their Verpflichtungsgrad, courses below their parent
    module.

    Args:
        pdf: Opened PDF whose ``pages`` offer ``extract_text(layout=False)``.
        url: Source URL that is stored with every added entry.
        stupo: Study regulation; its ``identifier`` and ``name`` are read.
        en: Try the English field labels before the German ones.
        version: ``2`` selects the ``Modulname:`` / ``Lehrveranstaltungsname``
            field labels; any other value uses the default labels.

    Returns:
        None. Problems are reported with ``print``.
    """
    def firstGroup(text, *patterns):
        # Stripped first capture group of the first pattern that matches, else ''.
        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                return match.group(1).strip()
        return ''

    def scrapeName(text, en = False, version=1):
        # Patterns are tried in order: English (if requested), version 2, default.
        patterns = []
        if en:
            patterns += [r'Module: *(.*) *Level',
                         r'Module Course: *(.*) *\(of Module:']
        if version == 2:
            patterns += [r'Modulname: *(.*) *Modulnummer',
                         r'Lehrveranstaltungsname.*: *(.*) *Lehrveranstaltungsnr.']
        patterns += [r'Modul: *(.*) *Niveau',
                     r'Lehrveranstaltung: *(.*) *\(zu Modul:']
        return firstGroup(text, *patterns)

    def scrapeParentName(text, en = False):
        patterns = [r'\(zu Modul: *(.*) *\) *Lehrveranstaltungsart']
        if en:
            patterns.insert(0, r'\(of Module: *(.*) *\) *Course Type')
        return firstGroup(text, *patterns)

    def scrapeLehrveranstaltungsart(text, en = False):
        patterns = [r'Lehrveranstaltungsart *(.*) *Lernform']
        if en:
            patterns.insert(0, r'Course Type *(.*) *Form of Learning')
        return firstGroup(text, *patterns)

    def scrapeAbbreviation(text, en = False):
        patterns = [r'ürzel *(.*) *Modulname englisch']
        if en:
            patterns.insert(0, r'Short Name *(.*) *Responsible Lecturers')
        # Abbreviations end up in identifiers, so inner blanks become dashes.
        return firstGroup(text, *patterns).replace(' ', '-')

    def scrapeVerpflichtungsgrad(text, en = False):
        patterns = [r'Verpflichtungsgrad *(.*) *ECTS']
        if en:
            patterns.insert(0, r'Compulsory/elective *(.*) *ECTS')
        return firstGroup(text, *patterns)

    def scrapeDuration(text, en = False):
        # Currently not called (see the commented-out line near the end).
        patterns = [r'Arbeitsaufwand in Stunden *(\d+) *(Angebotshäufigkeit|Lehrsprache)']
        if en:
            patterns.insert(0, r'Workload \(hours\) *(\d+) *(Frequency|Teaching Language)')
        hours = firstGroup(text, *patterns)
        return hours + ' Stunden' if hours else ''

    def scrapeCredits(text, en = False):
        patterns = [r'ECTS-Leistungspunkte *(\d+([\.,]\d?)?) *(Fachsemester|Teilnahmebeschränkung)']
        if en:
            patterns.insert(0, r'ECTS Credit Points *(\d+([\.,]\d?)?) *(Semester of Studies|Participation Limit)')
        return firstGroup(text, *patterns)

    def scrapeSection(split, start_markers, stop_markers, title):
        # Collect the lines from a line starting with one of start_markers up
        # to (excluding) a line starting with one of stop_markers. A stop line
        # ends the scan even if no start line has been seen yet.
        marker_pattern = '|'.join(re.escape(marker) for marker in start_markers)
        collecting = False
        collected = ''
        for line in split:
            if line.startswith(start_markers):
                collecting = True
                line = re.sub(marker_pattern, '', line)
            elif line.startswith(stop_markers):
                break

            if collecting:
                collected += '\n' + line

        body = cleanText(collected)
        return title + '\n' + body if body else ''

    def scrapeLernergebnisse(split, en = False):
        return scrapeSection(
            split,
            ('Lernergebnisse', 'Learning Outcomes'),
            ('Teilnahmevoraussetzungen', 'Participation Prerequisites'),
            'Learning Outcomes' if en else 'Lernergebnisse',
        )

    def scrapeLehrinhalte(split, en = False):
        return scrapeSection(
            split,
            ('Lehrinhalte', 'Contents'),
            ('Literatur',),
            'Contents' if en else 'Lehrinhalte',
        )

    def scrapeDescription(split, en = False):
        # Learning outcomes first, then contents, separated by a blank line.
        sections = (scrapeLernergebnisse(split, en), scrapeLehrinhalte(split, en))
        return '\n\n'.join(section for section in sections if section)

    def isModul(text):
        if version == 2:
            return 'Modulname:' in text
        if en:
            return 'Module:' in text
        return 'Modul:' in text or 'Module:' in text

    def isLehrveranstaltung(text):
        if version == 2:
            return 'Lehrveranstaltung(en) des Moduls' in text
        if en:
            return text.startswith('Module Course:')
        return text.startswith('Lehrveranstaltung:')

    # Identifier suffix per course type (German and English labels).
    lv_suffixes = {
        'Vorlesung': 'V', 'Lecture': 'V',
        'Projekt': 'Pj', 'Project Work': 'Pj',
        'Praktikum': 'Pr', 'Practical Training': 'Pr',
        'Übung': 'Ü', 'Exercise': 'Ü',
        'Seminar': 'S',
        'Exkursion': 'Ex', 'Excursion': 'Ex',
        'Online-': 'O',
    }

    programarea_pf = modulhandbuch.getProgramarea(identifier=stupo.identifier + '_PF')
    if not programarea_pf:
        print('Couldnt get programe area with identifier: ' + stupo.identifier + '_PF')
        return
    programarea_wapf = modulhandbuch.getProgramarea(identifier=stupo.identifier + '_WAPF')
    if not programarea_wapf:
        print('Couldnt get programe area with identifier: ' + stupo.identifier + '_WAPF')
        return
    programarea_wa = modulhandbuch.getProgramarea(identifier=stupo.identifier + '_WA')
    if not programarea_wa:
        print('Couldnt get programe area with identifier: ' + stupo.identifier + '_WA')
        return

    # Extract every page once; a page without text is treated as empty so that
    # it is skipped instead of crashing on ``None.split``.
    page_texts = [page.extract_text(layout=False) or '' for page in pdf.pages]
    pages_count = len(page_texts)

    last_modul_name = ''

    for i, text in enumerate(page_texts):
        split = text.split('\n')
        first_5_lines = ' '.join(split[:5])

        if not isModul(first_5_lines):
            continue

        first_20_lines = ' '.join(split[:20])
        is_lehrveranstaltung = isLehrveranstaltung(first_5_lines)

        # A following page that does not start a new entry continues this one.
        if i + 1 < pages_count:
            next_text = page_texts[i + 1]
            next_split = next_text.split('\n')

            if not isModul(' '.join(next_split[:5])):
                text += '\n' + next_text
                # Drop the trailing lines of the first page before joining
                # (presumably the page footer: 3 lines in version 2, else 1).
                trailing_lines = 3 if version == 2 else 1
                split = split[:max(len(split) - trailing_lines, 0)] + next_split

        name = scrapeName(first_5_lines, en, version)
        if not name:
            print('Couldnt get name of module in programarea ' + stupo.name)
            print(first_5_lines)
            # NOTE(review): this aborts the whole handbook, whereas the other
            # failures below only skip the page - confirm this is intended.
            return

        if is_lehrveranstaltung:
            # Version 2 pages do not name their module; use the last one seen.
            if version == 2:
                parent_name = last_modul_name
            else:
                parent_name = scrapeParentName(first_5_lines, en)

            if not parent_name:
                print('Couldnt find parent name of: ' + name)
                continue

            parent = modulhandbuch.getModul(name=parent_name, childof=stupo.identifier)
            if not parent:
                print('Couldnt find parent modul with name: ' + parent_name)
                continue

            lehrveranstaltungsart = scrapeLehrveranstaltungsart(first_5_lines, en)
            if lehrveranstaltungsart == '':
                suffix = getAbbreviation(name, check_duplicate=False, word_length=2, max_words=2)
            elif lehrveranstaltungsart in lv_suffixes:
                suffix = lv_suffixes[lehrveranstaltungsart]
            else:
                print('Unkwon Lehrveranstaltungsart "' + lehrveranstaltungsart + '" of ' + name)
                print(text)
                continue
            identifier = parent.identifier + '_' + suffix

            if lehrveranstaltungsart == 'Online-':
                lehrveranstaltungsart = 'Online'

            # Disambiguate a course that carries the same name as its module.
            if name == parent.name:
                name += ' (' + lehrveranstaltungsart + ')'
        else:
            last_modul_name = name
            verpflichtungsgrad = scrapeVerpflichtungsgrad(first_20_lines, en)
            if not verpflichtungsgrad:
                print('Couldnt get Verpflichtungsgrad of ' + name)
                continue
            if verpflichtungsgrad in ('Pflicht', 'Compulsory'):
                parent = programarea_pf
            elif verpflichtungsgrad in ('Wahlpflicht', 'Compulsory elective'):
                parent = programarea_wapf
            elif verpflichtungsgrad in ('Wahl', 'Elective'):
                parent = programarea_wa
            elif verpflichtungsgrad == '(Nicht festgelegt)':
                if '. Fachsemester' in first_5_lines:
                    parent = programarea_pf
                else:
                    parent = programarea_wa
            else:
                print('Unkwown verpflichtungsgrad "' + verpflichtungsgrad + '" of ' + name)
                continue

            abbreviation = scrapeAbbreviation(first_5_lines, en)
            if not abbreviation:
                abbreviation = getAbbreviation(name, check_duplicate=False, word_length=2, max_words=2)

            identifier = parent.identifier + '_' + abbreviation

        desc = scrapeDescription(split, en)
        # duration = scrapeDuration(first_20_lines, en)
        credit_points = scrapeCredits(first_20_lines, en)

        modulhandbuch.addModul(name, identifier, parent.identifier, desc, url, '', credit_points)

In [76]:
def scrapeModulhandbuchPDF_v3(pdf, url, stupo, en=False):
    """Scrape a PDF that describes exactly one module.

    All pages are read as a single module description; the first and the last
    line of every page are dropped before the remaining lines are joined. The
    module is registered through ``modulhandbuch.addModul`` below the programme
    area that matches its Verpflichtungsgrad.

    Args:
        pdf: Opened PDF whose ``pages`` offer ``extract_text(layout=False)``.
        url: Source URL that is stored with the module.
        stupo: Study regulation; its ``identifier`` and ``name`` are read.
        en: Only switches the section titles of the description to English;
            all field labels that are searched for are German.

    Returns:
        None. Problems are reported with ``print`` and abort the scrape.
    """
    def scrapeName(split, en = False):
        # The name is the first remaining line without its leading number.
        if not split:
            return ''
        match = re.search(r'\d* *(.*)', split[0])
        if match:
            return match.group(1).strip()

        return ''

    def scrapeVerpflichtungsgrad(text, en = False):
        # '.' never crosses a line break, so the rest of the line is captured
        # even when the field sits on the last line of the excerpt.
        match = re.search(r'Pflicht/ Wahlpflicht *(.*)', text)
        if match:
            return match.group(1).strip()

        return ''

    def scrapeDuration(text, en = False):
        # Currently not called (see the commented-out line near the end).
        # Sums the hour figures that are present; '' when none is found.
        patterns = (
            r'(?:Arbeitsaufwand )?Selbststudium: *(?:ca\. *)?(\d+(?:[,.]\d+)?) *h',
            r'Betreutes Lernen: *(\d+) *h',
            r'Vorbereitung PVL: *(\d+) *h',
        )
        hours = 0.0
        found = False
        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                found = True
                hours += float(match.group(1).replace(',', '.'))

        if not found:
            return ''
        # Whole hours are printed without decimals, others with a decimal comma.
        if hours.is_integer():
            formatted = str(int(hours))
        else:
            formatted = str(hours).replace('.', ',')
        return formatted + ' Stunden'

    def scrapeCredits(text, en = False):
        match = re.search(r'Credit Points *(.*)', text)
        if match:
            return match.group(1).strip()

        return ''

    def scrapeLernergebnisse(split, en = False):
        # Lines from 'Lernergebnisse' up to (excluding) 'Prüfungsvorleistung'.
        # A stop line ends the scan even if no start line has been seen yet.
        collecting = False
        lernergebnisse = ''

        for line in split:
            if line.startswith('Lernergebnisse'):
                collecting = True
                line = line.replace('Lernergebnisse', '')
            elif line.startswith('Prüfungsvorleistung'):
                break

            if collecting:
                lernergebnisse += '\n' + line

        lernergebnisse = cleanText(lernergebnisse)
        if lernergebnisse:
            title = 'Learning Outcomes' if en else 'Lernergebnisse'
            lernergebnisse = title + '\n' + lernergebnisse

        return lernergebnisse

    def scrapeLehrinhalte(split, en = False):
        # Lines from 'Studieninhalte' to the end of the document.
        collecting = False
        lehrinhalte = ''

        for line in split:
            if line.startswith('Studieninhalte'):
                collecting = True
                line = line.replace('Studieninhalte', '')

            if collecting:
                lehrinhalte += '\n' + line

        lehrinhalte = cleanText(lehrinhalte)
        if lehrinhalte:
            title = 'Contents' if en else 'Studieninhalte'
            lehrinhalte = title + '\n' + lehrinhalte
        return lehrinhalte

    def scrapeDescription(split, en = False):
        # Learning outcomes first, then contents, separated by a blank line.
        sections = (scrapeLernergebnisse(split, en), scrapeLehrinhalte(split, en))
        return '\n\n'.join(section for section in sections if section)

    programarea_pf = modulhandbuch.getProgramarea(identifier=stupo.identifier + '_PF')
    if not programarea_pf:
        print('Couldnt get programe area with identifier: ' + stupo.identifier + '_PF')
        return
    programarea_wapf = modulhandbuch.getProgramarea(identifier=stupo.identifier + '_WAPF')
    if not programarea_wapf:
        print('Couldnt get programe area with identifier: ' + stupo.identifier + '_WAPF')
        return
    programarea_wa = modulhandbuch.getProgramarea(identifier=stupo.identifier + '_WA')
    if not programarea_wa:
        print('Couldnt get programe area with identifier: ' + stupo.identifier + '_WA')
        return

    text = ''
    split = []

    for page in pdf.pages:
        # A page without text is treated as empty instead of crashing.
        thistext = page.extract_text(layout=False) or ''
        text += '\n' + thistext
        thissplit = thistext.split('\n')
        # Skip the first and last line of each page (presumably header/footer).
        split += thissplit[1:len(thissplit)-1]

    first_8_lines = '\n'.join(split[:8])
    first_20_lines = '\n'.join(split[:20])

    name = scrapeName(split, en)
    if not name:
        print('Couldnt get name of module in programarea ' + stupo.name)
        print(split)
        return

    verpflichtungsgrad = scrapeVerpflichtungsgrad(first_8_lines, en)
    if not verpflichtungsgrad:
        print('Couldnt get Verpflichtungsgrad of ' + name)
        print(first_8_lines)
        return

    if verpflichtungsgrad in ('Pflicht', 'Compulsory'):
        parent = programarea_pf
    elif verpflichtungsgrad in ('Wahlpflicht', 'Compulsory elective'):
        parent = programarea_wapf
    elif verpflichtungsgrad in ('Wahl', 'Elective'):
        parent = programarea_wa
    elif verpflichtungsgrad == '(Nicht festgelegt)':
        if '. Fachsemester' in first_8_lines:
            parent = programarea_pf
        else:
            parent = programarea_wa
    else:
        print('Unkwown verpflichtungsgrad "' + verpflichtungsgrad + '" of ' + name)
        return

    abbreviation = getAbbreviation(name, check_duplicate=False, word_length=2, max_words=3)

    identifier = parent.identifier + '_' + abbreviation

    desc = scrapeDescription(split, en)
    # duration = scrapeDuration(text, en)
    credit_points = scrapeCredits(first_20_lines, en)

    modulhandbuch.addModul(name, identifier, parent.identifier, desc, url, '', credit_points)

In [77]:
# Start from an empty module list before scraping
# (presumably so that re-running this cell does not add duplicates - confirm).
modulhandbuch.modules = []

def scrapeModules():
    """Scrape the module handbooks of all programs of the faculties '12' and '13'.

    For every such program the handbook URLs are looked up on its web page and
    each of them is handed to ``scrapeModulhandbuchPDF``. Programs without any
    handbook URL are reported with ``print``.
    """
    faculty_identifiers = ('13', '12')
    for program in modulhandbuch.programs:
        if program.parent not in faculty_identifiers:
            continue

        modulhandbuch_urls = getModulhandbuchURLs(program.url)
        if not modulhandbuch_urls:
            print('Couldnt find any modulhandbücher online for ' + program.identifier)
            continue

        for url in modulhandbuch_urls:
            scrapeModulhandbuchPDF(url, program)
        

def getModulhandbuchURLs(url):
    """Return the handbook PDF links of a program page, or None if there are none.

    The 'weiteres/' sub page is consulted first; the 'studienverlauf/' sub page
    is only requested when the first lookup yields nothing.
    """
    return getWeiteres(url) or getStudienverlauf(url) or None


def getStudienverlauf(url):
    """Collect all PDF links from the 'studienverlauf/' sub page of a program.

    Args:
        url: Base URL of the program page; expected to end with '/'.

    Returns:
        A non-empty list of matching ``href`` values, or None when the page
        cannot be fetched, answers with a non-2xx status or offers no match.
    """
    page_url = url + 'studienverlauf/'
    try:
        # Without a timeout a stalled server would block the scrape forever.
        r = requests.get(page_url, timeout=30)
    except requests.RequestException as e:
        print('Request failed for ' + page_url + ': ' + str(e))
        return None

    # Check status code for response received
    if r.status_code not in range(200, 300):
        print('Status ' + str(r.status_code) + ' at  ' + page_url)
        return None

    # Parsing the HTML
    soup = bs(r.content, 'html.parser')

    pdf_pattern = re.compile(r'^https:\/\/.*Plne%20und%20bersichten\/(.*)\.pdf$')
    modul_links = [
        link['href'] for link in soup.find_all('a', href=True) if pdf_pattern.match(link['href'])
    ]

    if not modul_links:
        print('Kein Modulhanbuch gefunden auf ' + page_url)
        return None

    return modul_links


def getWeiteres(url):
    """Find the module handbook PDF on the 'weiteres/' sub page of a program.

    Args:
        url: Base URL of the program page; expected to end with '/'.

    Returns:
        A one-element list holding the first link whose file name contains
        'Modulhandbuch', or None when the page cannot be fetched, answers with
        a non-2xx status or offers no such link.
    """
    weiteres_url = url + 'weiteres/'
    try:
        # Without a timeout a stalled server would block the scrape forever.
        r = requests.get(weiteres_url, timeout=30)
    except requests.RequestException as e:
        print('Request failed for ' + weiteres_url + ': ' + str(e))
        return None

    # Check status code for response received
    if r.status_code not in range(200, 300):
        print('Status ' + str(r.status_code) + ' at  ' + weiteres_url)
        return None

    # Parsing the HTML
    soup = bs(r.content, 'html.parser')

    handbook_pattern = re.compile(
        r'^https:\/\/.*Plne%20und%20bersichten\/(.*_)?Modulhandbuch(_.*)?\.pdf$')
    modul_links = [
        link['href'] for link in soup.find_all('a', href=True) if handbook_pattern.match(link['href'])
    ]

    if not modul_links:
        print('Kein Modulhanbuch gefunden auf ' + weiteres_url)
        return None

    # Only the first matching handbook is used.
    return [modul_links[0]]

# File names (without '.pdf') that are linked on program pages but are no module handbooks.
pdf_blacklist = ['THL_Campusplan']

def scrapeModulhandbuchPDF(url, program):
    """Download (if not cached) and scrape one handbook PDF of a program.

    The PDF is stored below ``output/pdf/<faculty>/modulhandbuecher/<program>``
    and only downloaded when that file does not exist yet. The text of the
    first page decides which of the ``scrapeModulhandbuchPDF_v*`` scrapers is
    used. If opening or scraping raises, the error is printed and the cached
    file is removed.

    Args:
        url: URL of the PDF.
        program: Program the PDF belongs to; ``parent`` (faculty identifier)
            and ``identifier`` are read.

    Returns:
        None. Problems are reported with ``print``.
    """
    faculty = modulhandbuch.getFaculty(identifier=program.parent)
    if faculty is None:
        print('Couldnt find faculy for: ' + program.parent)
        return
    stupo = modulhandbuch.getStupo(parent = program.identifier)
    if stupo is None:
        print('Couldnt find stupo for: ' + program.identifier)
        return

    dirname = 'output/pdf/' + faculty.abbreviation + '/modulhandbuecher/' + program.identifier
    os.makedirs(dirname, exist_ok=True)

    if 'Modulhandbuch' in url:
        filename = dirname + '/' + program.identifier + '_Modulhandbuch.pdf'
    else:
        match = re.search(r'^https:\/\/.*Plne%20und%20bersichten\/(.*)\.pdf$', url)
        if not match:
            # Previously this fell through and failed on the unset file name.
            print('Couldnt derive a file name from url: ' + url)
            return
        if match.group(1) in pdf_blacklist:
            print('Skipped blacklisted pdf with name: ' + match.group(1))
            return
        filename = dirname + '/' + match.group(1) + '.pdf'

    if not exists(filename):
        # Download modulhandbuch pdf.
        try:
            pdf_response = requests.get(url, timeout=60)
        except requests.RequestException as e:
            print('Request failed for ' + url + ': ' + str(e))
            return
        # Do not cache an error page under a '.pdf' name.
        if pdf_response.status_code not in range(200, 300):
            print('Status ' + str(pdf_response.status_code) + ' at  ' + url)
            return
        with open(filename, 'wb') as f:
            f.write(pdf_response.content)

    try:
        with pdfplumber.open(filename) as pdf:
            # A first page without text is treated as empty text.
            text = pdf.pages[0].extract_text() or ''
            if text.startswith('Fachbereich'):
                scrapeModulhandbuchPDF_v1(pdf, url, stupo, en='hours per week' in text)
            elif text.startswith('Modulhandbuch') or 'Modul: ' in text or 'Module: ' in text:
                scrapeModulhandbuchPDF_v2(pdf, url, stupo, en='Module: ' in text)
            elif text.startswith('Formular für Modulbeschreibungen'):
                scrapeModulhandbuchPDF_v2(pdf, url, stupo, version=2)
            elif 'Pflicht/ Wahlpflicht' in text:
                scrapeModulhandbuchPDF_v3(pdf, url, stupo)
            else:
                print('Unkown pdf format for ' + stupo.name + ':')
                print(text[:400])
    except Exception as e:
        # Deliberately broad: any failure drops the cached file
        # (presumably so that the next run downloads it again - confirm).
        print('Error trying to read pdf: ', e)
        print('Couldnt open file with name: ' + filename)
        if exists(filename):
            os.remove(filename)


# Run the scrape; the called functions report skipped or failed items with print.
scrapeModules()

Couldnt get name of module in Bauingenieurwesen Master SPO 2017
Status 404 at  https://www.th-luebeck.de/hochschule/fachbereich-elektrotechnik-und-informatik/studiengaenge/elektrotechnik-energiesysteme-und-automation-bsc/weiteres/
Skipped blacklisted pdf with name: THL_Campusplan
Status 404 at  https://www.th-luebeck.de/hochschule/fachbereich-elektrotechnik-und-informatik/studiengaenge/elektrotechnik-kommunikationssysteme-bsc/weiteres/
Skipped blacklisted pdf with name: THL_Campusplan
Status 404 at  https://www.th-luebeck.de/hochschule/fachbereich-elektrotechnik-und-informatik/studiengaenge/informatik-softwaretechnik-bsc/weiteres/
Skipped blacklisted pdf with name: THL_Campusplan
Status 404 at  https://www.th-luebeck.de/hochschule/fachbereich-elektrotechnik-und-informatik/studiengaenge/informationstechnologie-und-design-bsc/weiteres/
Skipped blacklisted pdf with name: THL_Campusplan
Status 404 at  https://www.th-luebeck.de/hochschule/fachbereich-elektrotechnik-und-informatik/studiengae

In [78]:
# Program names as written in the XML export -> names used in modulhandbuch.
program_name_matching = {
    'Energie- und Gebäudeingenieurwesen': 'Nachhaltige Gebäudetechnik',
    'Informatik/Softwaretechnik': 'Informatik/ Softwaretechnik',
    'Bachelor Stadtplanung': 'Stadtplanung',
    'Electrical Engineering - Communication Systems': 'Elektrotechnik - Kommunikationssysteme',
    'Applied Information Technology': 'Angewandte Informationstechnik',
    'Electrical Engineering - Energy Systems and Automation Engineering': 'Elektrotechnik - Energiesysteme und Automation',
    'Computer Science/Software Engineering for Distributed Systems': 'Informatik/Softwaretechnik für verteilte Systeme',
}

def parseXML(xml):
    """Merge the module descriptions of one parsed XML export into modulhandbuch.

    Only exports whose first description belongs to the faculty 'Bauwesen' or
    'Elektrotechnik und Informatik' are processed. Every described module is
    either added or, when a module of that name already exists below its
    program, updated; afterwards its Lehrveranstaltungen are added as child
    modules.

    Args:
        xml: Mapping with the structure
            ``xml['Modulbeschreibungen']['Modulbeschreibung'][i]['Modul']``
            (presumably the result of ``xmltodict.parse`` - confirm).

    Returns:
        None. Problems are reported with ``print``.
    """
    modulbeschreibungen = xml['Modulbeschreibungen']['Modulbeschreibung']
    # An element that occurs only once is delivered as a single mapping
    # instead of a list by xmltodict; treat both shapes alike.
    if not isinstance(modulbeschreibungen, list):
        modulbeschreibungen = [modulbeschreibungen]

    modul_xml = modulbeschreibungen[0]['Modul']

    facultyname = modul_xml['M_Fachbereich']
    if facultyname not in ('Bauwesen', 'Elektrotechnik und Informatik'):
        return

    # The first description supplies the defaults for descriptions that lack
    # their own program or qualification.
    m_studiengang = modul_xml['M_Studiengang']
    if not m_studiengang:
        print('Couldnt find m_studiengang')
        return
    base_program_name = re.sub(r'(, Bachelor)|(, Master)', '', m_studiengang)
    base_program_name = program_name_matching.get(base_program_name, base_program_name)

    base_qualification = modul_xml['M_Niveau']
    if not base_qualification:
        print('No qualification found for ' + base_program_name)
        return
    base_qualification = parseQualification(base_qualification)

    base_program = modulhandbuch.getProgram(base_program_name, base_qualification)

    for modulbeschreibung_xml in modulbeschreibungen:
        modul_xml = modulbeschreibung_xml['Modul']
        # An empty <Lehrveranstaltungen/> element is parsed to None.
        lehrveranstaltungen_xml = modulbeschreibung_xml['Lehrveranstaltungen'] or {}

        modul_name = modul_xml['M_Modulname']
        if not modul_name:
            print('No modul name found')
            continue

        qualification = modul_xml['M_Niveau']
        if qualification:
            qualification = parseQualification(qualification)
        else:
            qualification = base_qualification

        m_studiengang = modul_xml['M_Studiengang']
        if not m_studiengang:
            program = base_program
            if not program:
                print('Couldnt find program for name: ' + base_program_name)
                continue
        else:
            program_name = re.sub(r'(, Bachelor)|(, Master)', '', m_studiengang)
            program_name = program_name_matching.get(program_name, program_name)

            program = modulhandbuch.getProgram(program_name, qualification)
            if not program:
                print('Couldnt find program for name: ' + program_name)
                return

        modul_index = modulhandbuch.getModulIndex(name=modul_name, childof=program.identifier)
        # The index is used as a list position, so 0 is a valid hit although it
        # is falsy; only non-integer results (None/False/'') mean "not found".
        modul_found = isinstance(modul_index, int) and not isinstance(modul_index, bool)
        if not modul_found:
            print('Modul not yet in modulhandbuch: ' + modul_name)
            modul = addModulFromXML(modul_xml, modul_name, program)
        else:
            print('Update existing modul: ' + modul_name)
            modul = updateModulWithXML(modul_xml, modul_index)

        if modul:
            # Lehrveranstaltungen are numbered from 1 in their field names.
            childModules = []
            for position, key in enumerate(lehrveranstaltungen_xml, start=1):
                child = addLehrveranstaltungFromXML(lehrveranstaltungen_xml[key], modul, str(position))
                # Skipped Lehrveranstaltungen return None and are not counted.
                if child is not None:
                    childModules.append(child)

            print('Added ' + str(len(childModules)) + ' child modules for: ' + modul_name)

def addModulFromXML(modul_xml, name, program):
    """Add a module described by an XML ``Modul`` mapping to modulhandbuch.

    Args:
        modul_xml: Mapping with the ``M_*`` fields of the module.
        name: Module name (also used in the printed messages).
        program: Program the module belongs to; its ``identifier`` is read.

    Returns:
        The result of ``modulhandbuch.addModul``, or None when the study
        regulation, one of the three programme areas or a known
        Verpflichtungsgrad cannot be determined.
    """
    stupo = modulhandbuch.getStupo(parent=program.identifier)
    if not stupo:
        print('Couldnt find parent of modul_xml: ', name)
        return

    # All three programme areas must exist, whichever one is needed later.
    programareas = {}
    for suffix in ('_PF', '_WAPF', '_WA'):
        programarea = modulhandbuch.getProgramarea(identifier=stupo.identifier + suffix)
        if not programarea:
            print('Couldnt get programe area with identifier: ' + stupo.identifier + suffix)
            return
        programareas[suffix] = programarea

    area_by_verpflichtungsgrad = {
        'Pflicht': '_PF', 'Compulsory': '_PF',
        'Wahlpflicht': '_WAPF', 'Compulsory elective': '_WAPF',
        'Wahl': '_WA', 'Elective': '_WA',
    }
    verpflichtungsgrad = modul_xml['M_Verpflichtungsgrad']
    if not verpflichtungsgrad:
        # Without a Verpflichtungsgrad the module is filed as elective.
        parent = programareas['_WA']
    elif verpflichtungsgrad in area_by_verpflichtungsgrad:
        parent = programareas[area_by_verpflichtungsgrad[verpflichtungsgrad]]
    else:
        print('Unkwown verpflichtungsgrad "' + verpflichtungsgrad + '" of ' + name)
        return

    if modul_xml['M_Stundenplankuerzel']:
        identifier = parent.identifier + '_' + modul_xml['M_Stundenplankuerzel']
    else:
        identifier = parent.identifier + '_' + getAbbreviation(name, check_duplicate=False, word_length=2, max_words=3)

    if modul_xml['M_Lernergebnisse']:
        desc = unpackDict(modul_xml['M_Lernergebnisse']['html']['body'])
    else:
        desc = ''

    # An empty or non-numeric field counts as "no duration" instead of raising.
    try:
        semesters = int(modul_xml['M_Dauer_in_Semestern'])
    except (TypeError, ValueError):
        semesters = 0
    if semesters:
        # One semester is exported as six months.
        duration = str(semesters * 6) + ' Monate'
    else:
        print('Couldnt find duration for modul_xml: ', name)
        duration = ''

    credit_points = modul_xml['M_ECTS_Leistungspunkte']
    return modulhandbuch.addModul(name, identifier, parent.identifier, desc, '', duration, credit_points)
    
def addLehrveranstaltungFromXML(lv_xml, parent, index):
    """Add one Lehrveranstaltung (course) of a module as a child module.

    Args:
        lv_xml: Mapping with the ``L<index>_*`` fields of the course.
        parent: Module the course belongs to; its ``identifier`` is read.
        index: Number of the course within its module, as a string.

    Returns:
        The result of ``modulhandbuch.addModul``, or None when the course has
        no name or an unknown Lehrveranstaltungsart.
    """
    prefix = 'L' + index + '_'

    # The guard used to test the Lehrveranstaltungsart although the name is
    # what is read here, which made the index-based fallback below unreachable.
    name = lv_xml[prefix + 'Lehrveranstaltungsname']
    if not name:
        return

    # Identifier suffix per course type (German and English labels).
    suffix_by_art = {
        'Vorlesung': '_V', 'Lecture': '_V',
        'Projekt': '_Pj', 'Project Work': '_Pj',
        'Praktikum': '_Pr', 'Practical Training': '_Pr',
        'Übung': '_Ü', 'Exercise': '_Ü',
        'Seminar': '_S',
        'Exkursion': '_Ex', 'Excursion': '_Ex',
        'Online-': '_O',
    }
    lehrveranstaltungsart = lv_xml[prefix + 'Lehrveranstaltungsart']
    if not lehrveranstaltungsart:
        # Without a course type the position within the module is used.
        identifier = parent.identifier + '_' + index
    elif lehrveranstaltungsart in suffix_by_art:
        identifier = parent.identifier + suffix_by_art[lehrveranstaltungsart]
    else:
        print('Unkwon Lehrveranstaltungsart of ' + name, lehrveranstaltungsart)
        return

    if lv_xml[prefix + 'Lehrinhalte']:
        desc = unpackDict(lv_xml[prefix + 'Lehrinhalte']['html']['body'])
    else:
        desc = ''

    return modulhandbuch.addModul(name, identifier, parent.identifier, desc, '', '', '')

def updateModulWithXML(modul_xml, index):
    """Refresh an already registered module with data from its XML description.

    The module at ``modulhandbuch.modules[index]`` keeps its name and parent.
    Identifier and description are replaced only when the XML provides a
    value. All current child modules of the module are removed from
    ``modulhandbuch.modules`` and the number removed is printed.

    Args:
        modul_xml: Mapping with the keys ``M_Stundenplankuerzel``,
            ``M_Lernergebnisse``, ``M_Dauer_in_Semestern`` and
            ``M_ECTS_Leistungspunkte``.
        index: Position of the module in ``modulhandbuch.modules``.

    Returns:
        Whatever ``modulhandbuch.updateModul`` returns.

    Raises:
        ValueError: If ``M_Dauer_in_Semestern`` is set but not an integer.
    """
    modul = modulhandbuch.modules[index]

    kuerzel = modul_xml['M_Stundenplankuerzel']
    identifier = modul.parent + '_' + kuerzel if kuerzel else modul.identifier

    lernergebnisse = modul_xml['M_Lernergebnisse']
    if lernergebnisse:
        desc = unpackDict(lernergebnisse['html']['body'])
    else:
        desc = modul.desc

    # An empty duration element must not crash int(); it means "no duration",
    # exactly like a duration of 0 semesters.
    semesters = modul_xml['M_Dauer_in_Semestern']
    months = int(semesters) * 6 if semesters else 0
    duration = str(months) + ' Monate' if months else ''

    ects = modul_xml['M_ECTS_Leistungspunkte']

    children = modulhandbuch.getModulChildren(modul.identifier)
    for child in children:
        modulhandbuch.modules.remove(child)

    print('Dleteted ' + str(len(children)) + ' child modules for: ' + modul.name)

    # Removing children shifts every later entry, so the index passed in may
    # no longer point at this module. Look the position up again; if the
    # module is not found, the caller's index is kept.
    for position, candidate in enumerate(modulhandbuch.modules):
        if candidate is modul:
            index = position
            break

    return modulhandbuch.updateModul(index, modul.name, identifier, modul.parent, desc, '', duration, ects)


def cleanHtml(text):
    """Strip simple HTML markup from ``text`` and return the plain text.

    Falsy input yields an empty string. Opening ``<li>`` tags become
    ``'- '``, runs of spaces collapse to one space, and attribute-less tags
    such as ``<p>`` or ``</ul>`` are removed. Tags with attributes are not
    matched by the pattern and stay in the text.

    A truthy non-string value is printed; the ``re.sub`` call that follows
    then raises ``TypeError``.
    """
    if not text:
        return ''

    if not isinstance(text, str):
        print(text)

    # Applied strictly in this order: the bullet replacement has to run
    # before the generic tag removal, which would otherwise delete <li>.
    substitutions = (
        (r'<li>', '- '),
        (r' +', ' '),
        (r'</?\w+>', ''),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def unpackDict(d, key=''):
    """Return only the text inside a nested structure, formatted by tag name.

    ``d`` may be a dict, a list or a string; any other value (for example
    ``None``) contributes an empty string. ``key`` is the tag name under
    which ``d`` was found in its parent.

    Formatting rules:
      * Strings are returned stripped of surrounding whitespace.
      * While ``key`` is ``'li'``, every dict value and every list item is
        prefixed with ``'\\t- '``.
      * After a dict entry named ``'li'`` or ``'p'``, and after every list
        item, a newline is appended unless the text collected so far
        already ends with one.

    NOTE: a single string stored directly under ``'li'`` gets no bullet,
    because strings never receive the prefix themselves.
    """
    if isinstance(d, str):
        return d.strip()

    bullet = '\t- ' if key == 'li' else ''

    if isinstance(d, dict):
        text = ''
        for tag, value in d.items():
            text += bullet + unpackDict(value, tag)
            if tag in ('li', 'p') and not text.endswith('\n'):
                text += '\n'
        return text

    if isinstance(d, list):
        text = ''
        for item in d:
            text += bullet + unpackDict(item)
            if not text.endswith('\n'):
                text += '\n'
        return text

    return ''

def scanModulhandbuchXML():
    """Parse every file located one directory level below ``./input``.

    Each regular file inside a direct subdirectory of ``./input`` is read as
    UTF-8, converted to a dictionary with ``xmltodict.parse`` and handed to
    ``parseXML``. Files directly inside ``./input`` and deeper nesting levels
    are ignored.
    """
    path = r'./input'

    with os.scandir(path) as top_level:
        for sub_entry in top_level:
            if not os.path.isdir(sub_entry):
                continue
            with os.scandir(sub_entry) as contents:
                for file_entry in contents:
                    if not os.path.isfile(file_entry):
                        continue
                    # Read inside a context manager so the handle is closed
                    # before the next file is opened.
                    with open(file_entry, 'r', encoding='utf-8') as handle:
                        xml_in = xmltodict.parse(handle.read())

                    parseXML(xml_in)


# Run the XML import: every file in the subdirectories of ./input is parsed
# and passed to parseXML.
scanModulhandbuchXML()

Update existing modul: Entwerfen und Konstruieren I (inkl. Kompaktwochen)
Dleteted 1 child modules for: Entwerfen und Konstruieren I (inkl. Kompaktwochen)
Added 2 child modules for: Entwerfen und Konstruieren I (inkl. Kompaktwochen)
Update existing modul: Grundlagen Digitale Methoden
Dleteted 1 child modules for: Grundlagen Digitale Methoden
Added 2 child modules for: Grundlagen Digitale Methoden
Update existing modul: Bau- und Stadtbaugeschichte
Dleteted 2 child modules for: Bau- und Stadtbaugeschichte
Added 2 child modules for: Bau- und Stadtbaugeschichte
Update existing modul: Gestalten und Darstellen I
Dleteted 2 child modules for: Gestalten und Darstellen I
Added 2 child modules for: Gestalten und Darstellen I
Update existing modul: Baustoffe I
Dleteted 1 child modules for: Baustoffe I
Added 2 child modules for: Baustoffe I
Update existing modul: Tragwerkslehre I und Bauphysik I
Dleteted 1 child modules for: Tragwerkslehre I und Bauphysik I
Added 2 child modules for: Tragwerkslehr

In [79]:
# Export the collected data. toCsv() is defined outside this section;
# presumably it writes the CSV output files -- TODO confirm.
modulhandbuch.toCsv()