In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
import os
import itertools

# nltk.download("stopwords")
# German stop words; handed to the CountVectorizer below so they are excluded
# from the vocabulary.
stop_words = stopwords.words("german")

# Corpus folders, given relative to the current working directory.
train_dir = os.path.join("..", "data", "md", "train")
test_dir = os.path.join("..", "data", "md", "test")

# Names of all documents, train folder first, then test folder. They are used
# as labels for the rows of the classification result, so the list must match
# what document_generator yields: that generator skips everything that is not
# a regular file, hence the same filter is applied here. Without it a single
# sub-directory would shift every following label by one.
# NOTE(review): this listing runs before the os.chdir call further down, so it
# relies on the working directory already being the notebook folder - confirm.
doc_names = [
    doc for doc in os.listdir(train_dir)
    if os.path.isfile(os.path.join(train_dir, doc))
]
doc_names += [
    doc for doc in os.listdir(test_dir)
    if os.path.isfile(os.path.join(test_dir, doc))
]

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic of a fitted model.

    One line is printed per topic in the form
    ``Topic <index>: <term> <term> ...`` with the terms ordered from the
    highest to the lowest weight.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays; each array must support ``argsort()``.
        feature_names: Sequence that maps a feature index to its term.
        no_top_words: Number of terms to print per topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort() sorts ascending, so the slice walks it backwards and
        # stops after the `no_top_words` largest weights.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        top_terms = " ".join(feature_names[i] for i in top_indices)
        # Space after the colon keeps the header from running into the first term.
        print(f"Topic {topic_idx}: {top_terms}")

def document_generator(folder_path, print_names=False):
    """Yield the text content of every regular file in a folder.

    Entries that are not regular files (for example sub-directories) are
    skipped. Files are decoded as UTF-8 and yielded in the order in which
    ``os.listdir`` returns them.

    Args:
        folder_path: Directory whose files are read.
        print_names: If true, print ``"<n>. <filename>"`` for each file before
            its content is yielded, where ``n`` counts the yielded files
            starting at 1. Skipped entries are neither printed nor counted.

    Yields:
        The complete content of one file as a string.
    """
    file_count = 0
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue
        file_count += 1
        if print_names:
            print(f"{file_count}. {filename}")
        # Read and close the file before yielding so no handle stays open
        # while the consumer is processing the document.
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        yield content

# Change the working directory to the folder the notebook lives in, so that
# the relative corpus paths resolve. `_dh` is IPython's directory history and
# only exists inside IPython/Jupyter; when it is missing (plain script run)
# the working directory is left unchanged instead of failing with a KeyError.
notebook_dirs = globals().get('_dh')
if notebook_dirs:
    os.chdir(notebook_dirs[0])

# Lazy, single-use document streams: one over the training files only and one
# over train followed by test (the same folder order used for doc_names).
train_documents = document_generator(train_dir)
all_documents = itertools.chain(document_generator(train_dir), document_generator(test_dir))

no_features = 10000

# LDA is a probabilistic graphical model over word counts, so it is fed raw
# term counts (CountVectorizer) rather than weighted scores.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words=stop_words)
tf_train = tf_vectorizer.fit_transform(train_documents)
tf_feature_names = tf_vectorizer.get_feature_names_out()

no_topics = 5

# Run LDA on the training documents only.
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=20,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
).fit(tf_train)

no_top_words = 10
# display_topics(lda, tf_feature_names, no_top_words)

# One row per document (train + test) holding its topic distribution, computed
# with the vocabulary and the model fitted on the training documents.
classification = lda.transform(tf_vectorizer.transform(all_documents), normalize=True)

# Report the topic shares per document; the strongest topic(s) get an "x " prefix.
for doc_name, topic_shares in zip(doc_names, classification):
    print(f"\n--- {doc_name} ---")
    # Determine the maximum once per document instead of once per topic.
    best_share = topic_shares.max()
    for topic_idx, share in enumerate(topic_shares):
        if share == best_share:
            print("x ", end="")
        print(f"Topic {topic_idx}: {share*100:.2f} %")
    




--- uni_ulm_aspo_2017.md ---
Topic 0: 0.01 %
x Topic 1: 99.97 %
Topic 2: 0.01 %
Topic 3: 0.01 %
Topic 4: 0.01 %

--- uni_ulm_aspo_2022.md ---
Topic 0: 0.01 %
x Topic 1: 99.97 %
Topic 2: 0.01 %
Topic 3: 0.01 %
Topic 4: 0.01 %

--- hsa_fspo_informatik_bachelor_2019.md ---
x Topic 0: 99.90 %
Topic 1: 0.02 %
Topic 2: 0.02 %
Topic 3: 0.02 %
Topic 4: 0.02 %

--- uni_ulm_FSPO_Biologie_bachelor_master_2022.md ---
Topic 0: 0.03 %
x Topic 1: 99.90 %
Topic 2: 0.03 %
Topic 3: 0.03 %
Topic 4: 0.03 %

--- uni_ulm_fspo_informatikstudiengaenge_bachelor_master_2021.md ---
Topic 0: 0.01 %
x Topic 1: 99.94 %
Topic 2: 0.01 %
Topic 3: 0.01 %
Topic 4: 0.01 %

--- uni_ulm_fspo_innovations_wissenschaftsmanagement_ma_2017.md ---
Topic 0: 0.03 %
x Topic 1: 99.87 %
Topic 2: 0.03 %
Topic 3: 0.03 %
Topic 4: 0.03 %

--- uni_ulm_fspo_Informatikstudiengaenge_bachelor_master_2022.md ---
Topic 0: 0.01 %
x Topic 1: 99.95 %
Topic 2: 0.01 %
Topic 3: 0.01 %
Topic 4: 0.01 %
