In [4]:
import os
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Load Dataset
# Every regular file in ../data/md is read as one document, so that
# doc_names[i] is the file name belonging to documents_list[i].
md_dir = os.path.join("..", "data", "md")
# os.listdir also returns sub-directories (e.g. ".ipynb_checkpoints");
# skip them, because open() would fail on a directory.
doc_names = [
    name for name in os.listdir(md_dir)
    if os.path.isfile(os.path.join(md_dir, name))
]
documents_list = []
for doc in doc_names:
    # Decode explicitly instead of relying on the platform's locale default.
    # NOTE(review): assumes the files are UTF-8 encoded -- confirm.
    with open(os.path.join(md_dir, doc), encoding="utf-8") as f:
        documents_list.append(f.read())


# Initialize regex tokenizer: a token is a maximal run of word characters
tokenizer = RegexpTokenizer(r'\w+')

# Vectorize documents using TF-IDF (unigrams only, lower-cased)
# NOTE(review): the stop-word list is English; if the documents are German
# (the file names suggest so) it will remove few function words -- confirm.
tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 1),
    tokenizer=tokenizer.tokenize,
    # A custom tokenizer overrides token_pattern; setting it to None states
    # that explicitly and avoids scikit-learn's "token_pattern will not be
    # used" warning.
    token_pattern=None,
)

# Fit and Transform the documents
train_data = tfidf.fit_transform(documents_list)

# Define the number of topics or components
num_components = 6

# Create LDA object
# A fixed random_state makes the fitted topics reproducible between runs;
# without it the topic numbering and shares change on every execution.
# NOTE(review): train_data holds TF-IDF weights, while scikit-learn's LDA
# examples fit on raw term counts -- check whether counts give less
# degenerate topic distributions.
model = LatentDirichletAllocation(n_components=num_components, random_state=0)

# Fit the LDA model and get the per-document topic distribution
# (one row per document, one column per topic)
lda_matrix = model.fit_transform(train_data)

# Get Components (one row per topic, one column per vocabulary term)
lda_components = model.components_


# Print the topics with their terms
# terms = tfidf.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2
# for index, component in enumerate(lda_components):
    # zipped = zip(terms, component)
    # top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:5]
    # top_terms_list=list(dict(top_terms_key).keys())
    # print("Topic "+str(index)+": ",top_terms_list)

# Print, for every document, the share of each topic in percent.
# Documents and matrix rows are walked together, which avoids an index
# variable named `id` that would shadow the builtin for the rest of the
# session.
for doc, topic_shares in zip(doc_names, lda_matrix):
    print(f"\n--- {doc} ---")
    for i in range(num_components):
        print(f"Topic {i}: {topic_shares[i]*100:.2f} %")




--- uni_ulm_fspo_Informatikstudiengaenge_bachelor_master_2022.md ---
Topic 0: 1.31 %
Topic 1: 1.31 %
Topic 2: 1.31 %
Topic 3: 1.31 %
Topic 4: 93.45 %
Topic 5: 1.31 %

--- uni_ulm_aspo_2017.md ---
Topic 0: 1.11 %
Topic 1: 1.11 %
Topic 2: 1.11 %
Topic 3: 1.11 %
Topic 4: 94.47 %
Topic 5: 1.11 %

--- uni_ulm_aspo_2022.md ---
Topic 0: 1.07 %
Topic 1: 1.07 %
Topic 2: 1.07 %
Topic 3: 1.07 %
Topic 4: 94.66 %
Topic 5: 1.07 %

--- hsa_fspo_informatik_bachelor_2019.md ---
Topic 0: 1.05 %
Topic 1: 1.05 %
Topic 2: 1.05 %
Topic 3: 1.05 %
Topic 4: 94.74 %
Topic 5: 1.05 %

--- uni_ulm_FSPO_Biologie_bachelor_master_2022.md ---
Topic 0: 1.21 %
Topic 1: 1.21 %
Topic 2: 1.21 %
Topic 3: 1.21 %
Topic 4: 93.97 %
Topic 5: 1.21 %

--- uni_ulm_fspo_informatikstudiengaenge_bachelor_master_2021.md ---
Topic 0: 1.24 %
Topic 1: 1.24 %
Topic 2: 1.24 %
Topic 3: 1.24 %
Topic 4: 93.81 %
Topic 5: 1.24 %

--- uni_ulm_fspo_innovations_wissenschaftsmanagement_ma_2017.md ---
Topic 0: 1.16 %
Topic 1: 1.16 %
Topic 2: 1.16 %
