In [8]:
def load_documents_from_folder(folder_path, file_ext=".txt"):
    documents = []
    filenames = []

    for filename in os.listdir(folder_path):
        if filename.endswith(file_ext):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

                # Force content to string in case something weird happens
                if isinstance(content, list):
                    content = " ".join(str(x) for x in content)
                elif not isinstance(content, str):
                    content = str(content)

                documents.append(content)
                filenames.append(filename)

    return documents, filenames


In [9]:
documents = load_documents_from_folder('Final_scraped_txts')

In [11]:
# Ensure all documents are strings (flattening any lists accidentally included)
documents = [" ".join(d) if isinstance(d, list) else str(d) for d in documents]


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Preprocessing
vectorizer = CountVectorizer(stop_words='english', max_df=500, min_df=2)
doc_term_matrix = vectorizer.fit_transform(documents)

# Fit LDA
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(doc_term_matrix)

# Show top words for each topic
words = vectorizer.get_feature_names_out()
for idx, topic in enumerate(lda.components_):
    top_words = [words[i] for i in topic.argsort()[-10:]]
    print(f"Topic {idx+1}: {' | '.join(top_words)}")


Topic 1: cs | datascience | informatics | ois | fortune | luddy | iu | html | indiana | txt
Topic 2: school | luddy | students | degree | data | information | university | program | iu | science
Topic 3: blackculture | 1604350011 | fin | thats | digitalcollections | lacasa | ssw | gtidea | trustedci | jewishculture
Topic 4: school | luddy | student | information | university | program | data | students | iu | science
Topic 5: blackculture | 1604350011 | fin | thats | digitalcollections | lacasa | ssw | gtidea | trustedci | jewishculture


In [15]:
from bertopic import BERTopic

# Assuming `documents` is a list of 600 strings
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(documents)

topic_model.get_topic_info()
topic_model.visualize_topics()


  from .autonotebook import tqdm as notebook_tqdm


ValueError: zero-size array to reduction operation maximum which has no identity

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Vectorize using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_df=500, min_df=2)
tfidf_matrix = vectorizer.fit_transform(documents)

# Fit LDA
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(tfidf_matrix)

# Show top words per topic
terms = vectorizer.get_feature_names_out()
for idx, topic in enumerate(lda.components_):
    print(f"Topic #{idx + 1}:")
    print(" ".join([terms[i] for i in topic.argsort()[-10:][::-1]]))
    print("-" * 40)


Topic #1:
informatics fortune ois luddy bloomington data events computer graduate news
----------------------------------------
Topic #2:
txt indiana html iu luddy fortune ois informatics cs datascience
----------------------------------------
Topic #3:
informatics fortune ois luddy bloomington data events computer graduate news
----------------------------------------
Topic #4:
science iu students data program university information luddy degree student
----------------------------------------
Topic #5:
informatics fortune ois luddy bloomington data events computer graduate news
----------------------------------------
