# Kursdatenalyse

In [2]:
datapath = "crawler/database/semester_20251"

# get a list of all json files in the directory  
import os
import json

def get_course_objects(datapath):
    json_files = []
    for file in os.listdir(datapath):
        if file.endswith(".json"):
            course = {}
            with open(os.path.join(datapath, file), 'r') as f:
                course = json.load(f)
                course["html"] = file.replace(".json", ".html")
            json_files.append(course)
    return json_files

# get the course objects
courses = get_course_objects(datapath)
# filter for courses with SWS defined
courses = [course for course in courses if "Grunddaten zur Veranstaltung" in course and course["Grunddaten zur Veranstaltung"]["SWS"] != "missing" and "Veranstaltungstitel" in course]

print("Loaded " + str(len(courses)) + " courses")

Loaded 632 courses


In [38]:
from pprint import pprint
pprint(courses[3])

{'Bisonlink': 'https://bison-connector.bauhaus.uni-weimar.de/qisserver/rds?state=verpublish&status=init&vmfile=no&publishid=65070&moduleCall=webInfo&publishConfFile=webInfo&publishSubDir=veranstaltung',
 'Fakultät': 'Fakultät Medien',
 'Grunddaten zur Veranstaltung': {'Erwartete Teilnehmer/-innen': 'missing',
                                  'Hyperlink': 'http://www.uni-weimar.de/vsp',
                                  'Max. Teilnehmer/-innen': 'missing',
                                  'Rhythmus': 'jedes 2. Semester',
                                  'SWS': '2',
                                  'Semester': 'SoSe 2025',
                                  'Sprache': 'englisch',
                                  'Veranstaltungsart': 'Übung',
                                  'Veranstaltungsnummer': '2909035/02',
                                  'Zugeordnetes Modul': 'missing'},
 'Personen': [{'faculty': 'Fakultät Bau- und Umweltingenieurwissenschaften',
               'regular_name'

In [3]:
descriptions = [
    course["Veranstaltungstitel"] + " " + course["Weitere Angaben zur Veranstaltung"]["Beschreibung"]
    for course in courses
    if "Weitere Angaben zur Veranstaltung" in course and "Beschreibung" in course["Weitere Angaben zur Veranstaltung"]
]


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
import nltk

# Download German stopwords if not already done
nltk.download('stopwords')
from nltk.corpus import stopwords

# Combine English and German stopwords
stop_words = list(text.ENGLISH_STOP_WORDS) + list(stopwords.words('german'))

vectorizer_model = CountVectorizer(stop_words=stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/llorenz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [46]:
!pip install nltk

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nltk
Successfully installed nltk-3.9.1


In [5]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN

umap_model = UMAP(n_neighbors=2, n_components=5, min_dist=0.4, metric='cosine')
hdbscan_model = HDBSCAN(min_cluster_size=2)

topic_model = BERTopic(
    language="multilingual",
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    verbose=True
)
topics, probs = topic_model.fit_transform(descriptions)

# Show topic info
topic_info = topic_model.get_topic_info()
print(topic_info)

  from .autonotebook import tqdm as notebook_tqdm
2025-06-24 15:48:37,657 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 18/18 [01:13<00:00,  4.06s/it]
2025-06-24 15:49:57,789 - BERTopic - Embedding - Completed ✓
2025-06-24 15:49:57,791 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-24 15:50:15,366 - BERTopic - Dimensionality - Completed ✓
2025-06-24 15:50:15,377 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-24 15:50:15,516 - BERTopic - Cluster - Completed ✓
2025-06-24 15:50:15,545 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-24 15:50:15,984 - BERTopic - Representation - Completed ✓


     Topic  Count                                               Name  \
0       -1     44               -1_garage_neudeli_bauhaus_kolloquium   
1        0     17                         0_vob_werfen_tool_elements   
2        1     15                     1_moderne_texte_events_seminar   
3        2     14                   2_höhlen_forest_grundlagen_reset   
4        3     14             3_usability_projektmodul_spot_bachelor   
..     ...    ...                                                ...   
96      95      3      95_bestimmte_interface_praktiken_menschlichen   
97      96      3              96_structural_sensor_steel_structures   
98      97      3                       97_virtual_vr_reality_agents   
99      98      3  98_metallkorrosion_korrosionsschutz_aktiver_ge...   
100     99      2         99_sustainability_diversity_urban_language   

                                        Representation  \
0    [garage, neudeli, bauhaus, kolloquium, archite...   
1    [vob, werfen, 

In [6]:
topic_model.generate_topic_labels()
labels = topic_model.topic_labels_
print(labels)

{-1: '-1_garage_neudeli_bauhaus_kolloquium', 0: '0_vob_werfen_tool_elements', 1: '1_moderne_texte_events_seminar', 2: '2_höhlen_forest_grundlagen_reset', 3: '3_usability_projektmodul_spot_bachelor', 4: '4_chemnitz_europäischen_stadt_städtebaulichen', 5: '5_fulldome_filme_imagination_imaginieren', 6: '6_urban_cities_german_stadtsoziologie', 7: '7_animation_klang_non_sehens', 8: '8_software_model_argumentation_engineering', 9: '9_musik_musikpsychologie_hfm_csound', 10: '10_örr_regulierung_informations_medienökonomik', 11: '11_film_language_processing_bergbau', 12: '12_forschungskolloquium_studien_markenführung_forschungs', 13: '13_bounds_gamesfabrik_technologie_bauhaus', 14: '14_concrete_building_materials_damage', 15: '15_haus_pappeln_van_hohe', 16: '16_sprachumschaltflagge_englischsprachigen_ku_installation', 17: '17_class_00_join_pcbs', 18: '18_bauens_architektur_sustainability_schwelle', 19: '19_stochastic_reliability_simulation_power', 20: '20_städtebau_ost_thüringen_knotenpunktsyst

In [8]:
# show all links of courses assigned to label
label = 33

# Get indices of courses with this topic
course_indices = [i for i, topic in enumerate(topics) if topic == label]

# Print course titles and links for this topic
print(f"\nCourses for topic {label} ({labels[label]}):\n")
for idx in course_indices:
    print(f"Title: {courses[idx]['Veranstaltungstitel']}")
    print(f"Link: {courses[idx]['Bisonlink']}\n")




Courses for topic 33 (33_coudray_hfm_konzert_chor):

Title: 5. Kernmodul: Versuchsgut Dornburg – Experimente zu einer nachhaltigen Tektonik
Link: https://bison-connector.bauhaus.uni-weimar.de/qisserver/rds?state=verpublish&status=init&vmfile=no&publishid=66767&moduleCall=webInfo&publishConfFile=webInfo&publishSubDir=veranstaltung

Title: Der imaginierte Alltag - Genremalerei in den Niederlanden (auch Prüfungsmodul Lehramt)
Link: https://bison-connector.bauhaus.uni-weimar.de/qisserver/rds?state=verpublish&status=init&vmfile=no&publishid=66030&moduleCall=webInfo&publishConfFile=webInfo&publishSubDir=veranstaltung

Title: 5. Kernmodul: Maison du Peuple
Link: https://bison-connector.bauhaus.uni-weimar.de/qisserver/rds?state=verpublish&status=init&vmfile=no&publishid=66016&moduleCall=webInfo&publishConfFile=webInfo&publishSubDir=veranstaltung

Title: Digested Specimen - Turning the Music into design
Link: https://bison-connector.bauhaus.uni-weimar.de/qisserver/rds?state=verpublish&status=i

In [65]:
from transformers import pipeline

# Use a multilingual model for German/English
classifier = pipeline("zero-shot-classification", model="joeddav/xlm-roberta-large-xnli")

# Example candidate labels (expand as needed, can be hundreds!)
candidate_labels = [
    "Writing", "Fonts", "Sustainability", "Biology", "Music", "Film", "Urbanism", "Architecture", "Programming", "History", "Sociology", "Psychology", "Art", "Design", "Engineering", "Mathematics", "Physics", "Chemistry", "Philosophy", "Education"
]

# For each course description, get the top N labels
for i, desc in enumerate(descriptions[:5]):  # Try on first 5 for speed
    result = classifier(desc, candidate_labels, multi_label=True)
    print(f"Course: {courses[i]['Veranstaltungstitel']}")
    for label, score in zip(result['labels'], result['scores']):
        if score > 0.3:  # threshold for relevance
            print(f"  {label}: {score:.2f}")
    print()

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ValueError: Converting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: ['AlbertTokenizer', 'BartTokenizer', 'BarthezTokenizer', 'BertTokenizer', 'BigBirdTokenizer', 'BlenderbotTokenizer', 'CamembertTokenizer', 'CLIPTokenizer', 'CodeGenTokenizer', 'ConvBertTokenizer', 'DebertaTokenizer', 'DebertaV2Tokenizer', 'DistilBertTokenizer', 'DPRReaderTokenizer', 'DPRQuestionEncoderTokenizer', 'DPRContextEncoderTokenizer', 'ElectraTokenizer', 'FNetTokenizer', 'FunnelTokenizer', 'GPT2Tokenizer', 'HerbertTokenizer', 'LayoutLMTokenizer', 'LayoutLMv2Tokenizer', 'LayoutLMv3Tokenizer', 'LayoutXLMTokenizer', 'LongformerTokenizer', 'LEDTokenizer', 'LxmertTokenizer', 'MarkupLMTokenizer', 'MBartTokenizer', 'MBart50Tokenizer', 'MPNetTokenizer', 'MobileBertTokenizer', 'MvpTokenizer', 'NllbTokenizer', 'OpenAIGPTTokenizer', 'PegasusTokenizer', 'Qwen2Tokenizer', 'RealmTokenizer', 'ReformerTokenizer', 'RemBertTokenizer', 'RetriBertTokenizer', 'RobertaTokenizer', 'RoFormerTokenizer', 'SeamlessM4TTokenizer', 'SqueezeBertTokenizer', 'T5Tokenizer', 'UdopTokenizer', 'WhisperTokenizer', 'XLMRobertaTokenizer', 'XLNetTokenizer', 'SplinterTokenizer', 'XGLMTokenizer', 'LlamaTokenizer', 'CodeLlamaTokenizer', 'GemmaTokenizer', 'Phi3Tokenizer']

In [64]:
#!pip install tiktoken
!pip install protobuf


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting protobuf
  Downloading protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Downloading protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl (321 kB)
Installing collected packages: protobuf
Successfully installed protobuf-6.31.1
