In [1]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import os
from nltk.corpus import stopwords
import nltk
import numpy as np
import itertools

# The German stop word list ships with NLTK's "stopwords" corpus, which has to
# be downloaded once per environment. Fetch it on demand (only when the lookup
# fails) instead of crashing with a LookupError on a fresh machine.
try:
    stop_words = stopwords.words('german')
except LookupError:
    nltk.download("stopwords")
    stop_words = stopwords.words('german')

# Folders holding the markdown documents, relative to the working directory.
train_dir = os.path.join("..", "data", "md", "train")
test_dir = os.path.join("..", "data", "md", "test")

def document_generator(folder_path=train_dir, print_names=False):
    """Yield the content of each regular file in the specified folder.

    Entries are visited in the order returned by ``os.listdir`` (which is
    not sorted); sub-directories and other non-file entries are skipped.
    Every file is read completely as UTF-8 text.

    Args:
        folder_path: Directory to read from. Defaults to ``train_dir``.
        print_names: If true, print ``"<n>. <filename>"`` for every file
            right before its content is yielded, where ``n`` counts the
            yielded files starting at 1.

    Yields:
        str: The full text of one file.
    """
    file_number = 0
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue
        file_number += 1
        if print_names:
            print(f"{file_number}. {filename}")
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        # Yield only after the file is closed so that no handle stays open
        # while the consumer processes the text (or abandons the generator).
        yield content

# Upper bound on the vocabulary size kept by the tf-idf vectorizer.
no_features = 1000

# Each corpus is a one-shot generator, so the combined corpus is built from
# fresh generators instead of re-using train_corpus / test_corpus.
train_corpus = document_generator(folder_path=train_dir)
test_corpus = document_generator(folder_path=test_dir)
all_corpus = itertools.chain(
    document_generator(folder_path=train_dir),
    document_generator(folder_path=test_dir),
)


# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words=stop_words)
# The vocabulary is learned from the training documents only; the test and
# combined corpora are projected onto that vocabulary.
tfidf_train = tfidf_vectorizer.fit_transform(train_corpus)
tfidf_test = tfidf_vectorizer.transform(test_corpus)
tfidf_all = tfidf_vectorizer.transform(all_corpus)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

no_topics = 5

# Run NMF
# init="nndsvd" can be used for better sparseness
nmf = NMF(n_components=no_topics, init="nndsvd").fit(tfidf_train)

# One row of topic weights per document (train documents first, then test).
transformed_docs = nmf.transform(tfidf_all)

# NOTE: the row labels come from a second directory listing; this relies on
# os.listdir returning the files in the same order as when the corpora were
# read above.
doc_names = [name for name in os.listdir(train_dir) if os.path.isfile(os.path.join(train_dir, name))]
doc_names += [name for name in os.listdir(test_dir) if os.path.isfile(os.path.join(test_dir, name))]

# The labels do not depend on the document, so build them once.
topic_prefixes = [f"Topic {i}:" for i in range(no_topics)]

for doc_idx, doc_name in enumerate(doc_names):
    print(f"--- {doc_name} ---")
    weights = transformed_docs[doc_idx]
    total = np.sum(weights)
    if total > 0:
        # Normalise the weights so they read as shares of the document.
        shares = [weight / total for weight in weights]
    else:
        # A document with no weight on any topic would otherwise divide by
        # zero and print "nan" for every topic; report 0% instead.
        shares = [0.0 for _ in weights]
    # Strongest topic first; ties keep their topic-index order.
    ranked = sorted(zip(topic_prefixes, shares), key=lambda pair: pair[1], reverse=True)
    for topic, share in ranked:
        print(topic, "{:.2f}".format(share * 100), end="%, ")
    print("\n")


def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every component of a model.

    For each row of ``model.components_`` a ``Topic <index>:`` header line
    is printed, followed by one line with the ``no_top_words`` terms that
    carry the largest weights, strongest first, separated by spaces.

    Args:
        model: Object exposing a ``components_`` sequence of weight rows.
        feature_names: Terms indexed by the column positions of a row.
        no_top_words: Number of terms to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        # argsort is ascending: reverse it, then keep the leading entries.
        strongest_first = weights.argsort()[::-1]
        top_terms = (feature_names[idx] for idx in strongest_first[:no_top_words])
        print(" ".join(top_terms))
        

# Show the ten highest-weighted terms of each NMF topic.
display_topics(model=nmf, feature_names=tfidf_feature_names, no_top_words=10)


--- uni_ulm_aspo_2017.md ---
Topic 0: 72.58%, Topic 3: 25.90%, Topic 4: 1.52%, Topic 1: 0.00%, Topic 2: 0.00%, 

--- uni_ulm_aspo_2022.md ---
Topic 0: 84.31%, Topic 2: 9.29%, Topic 1: 6.40%, Topic 3: 0.00%, Topic 4: 0.00%, 

--- hsa_fspo_informatik_bachelor_2019.md ---
Topic 2: 100.00%, Topic 0: 0.00%, Topic 1: 0.00%, Topic 3: 0.00%, Topic 4: 0.00%, 

--- uni_ulm_FSPO_Biologie_bachelor_master_2022.md ---
Topic 1: 100.00%, Topic 0: 0.00%, Topic 2: 0.00%, Topic 3: 0.00%, Topic 4: 0.00%, 

--- uni_ulm_fspo_informatikstudiengaenge_bachelor_master_2021.md ---
Topic 4: 99.93%, Topic 1: 0.07%, Topic 0: 0.00%, Topic 2: 0.00%, Topic 3: 0.00%, 

--- uni_ulm_fspo_innovations_wissenschaftsmanagement_ma_2017.md ---
Topic 3: 99.96%, Topic 1: 0.04%, Topic 0: 0.00%, Topic 2: 0.00%, Topic 4: 0.00%, 

--- uni_ulm_fspo_Informatikstudiengaenge_bachelor_master_2022.md ---
Topic 4: 72.41%, Topic 1: 18.23%, Topic 2: 9.36%, Topic 0: 0.00%, Topic 3: 0.00%, 

Topic 0:
fspo universität ulm jeweiligen fachspezifi

#### Results for the following setup:
1. Fit the NMF model on the six training documents (excluding the new Informatik FSPO from 2022).
2. Transform all documents (the training documents plus the new Informatik FSPO).

#### Results:
--- uni_ulm_aspo_2017.md ---

Topic 0: 81.73%, Topic 1: 17.37%, Topic 3: 0.89%, Topic 2: 0.00%, Topic 4: 0.00%, 


--- uni_ulm_aspo_2022.md ---

Topic 0: 90.23%, Topic 2: 5.43%, Topic 4: 4.34%, Topic 1: 0.00%, Topic 3: 0.00%, 


--- hsa_fspo_informatik_bachelor_2019.md ---

Topic 2: 100.00%, Topic 0: 0.00%, Topic 1: 0.00%, Topic 3: 0.00%, Topic 4: 0.00%, 


--- uni_ulm_FSPO_Biologie_bachelor_master_2022.md ---

Topic 4: 100.00%, Topic 0: 0.00%, Topic 1: 0.00%, Topic 2: 0.00%, Topic 3: 0.00%, 


--- uni_ulm_fspo_informatikstudiengaenge_bachelor_master_2021.md ---

Topic 3: 99.90%, Topic 4: 0.10%, Topic 0: 0.00%, Topic 1: 0.00%, Topic 2: 0.00%, 


--- uni_ulm_fspo_innovations_wissenschaftsmanagement_ma_2017.md ---

Topic 1: 99.96%, Topic 4: 0.04%, Topic 0: 0.00%, Topic 2: 0.00%, Topic 3: 0.00%, 


--- ['uni_ulm_fspo_Informatikstudiengaenge_bachelor_master_2022.md'] ---

Topic 3: 69.47%, Topic 4: 21.17%, Topic 2: 9.36%, Topic 0: 0.00%, Topic 1: 0.00%, 

