In [3]:
# LSA model with stop-word removal; prints the top 20 words per topic

import docx
import spacy
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

# Load the English spaCy model used below for tokenization and lemmatization
nlp = spacy.load("en_core_web_sm")

# Specify the path to the folder containing all Word documents
folder_path = "/Users/mariewosny/Desktop/HSG/10_Conferences/2024_MIE/Dataset_flu"

# Collect every Word document in the folder.
# - sorted(): os.listdir() returns entries in arbitrary order, so sorting keeps
#   the document order (and therefore the row order of the TF-IDF matrix)
#   the same from run to run.
# - "~$" prefix: Word creates hidden lock files named "~$<name>.docx" while a
#   document is open; they are not valid .docx archives and cannot be parsed.
# - lower(): also accept files whose extension is written as ".DOCX".
file_paths = [
    os.path.join(folder_path, file)
    for file in sorted(os.listdir(folder_path))
    if file.lower().endswith(".docx") and not file.startswith("~$")
]

# Preprocessing: build one cleaned, lemmatized string per document
processed_docs_all = []

# Filler words to drop in addition to spaCy's own stop-word flag.
# "okay" is filtered here, at token level, rather than by comparing
# whitespace-separated words: a raw-text comparison misses occurrences that
# carry punctuation ("Okay," / "okay."), which would then reach the corpus
# once spaCy splits the punctuation off.
custom_stopwords = {'oh', 'ah', 'okay'}

for file_path in file_paths:
    doc = docx.Document(file_path)
    # Only body paragraphs are read; each document becomes one long string
    text = ' '.join(paragraph.text for paragraph in doc.paragraphs)

    kept_lemmas = []
    for token in nlp(text):
        # Drop spaCy stop words and anything that is not purely alphabetic
        if token.is_stop or not token.is_alpha:
            continue
        # Compare case-insensitively on both the surface form and the lemma so
        # capitalised fillers ("Oh", "Okay") are removed as well
        if token.lower_ in custom_stopwords or token.lemma_.lower() in custom_stopwords:
            continue
        kept_lemmas.append(token.lemma_)

    processed_doc = ' '.join(kept_lemmas)
    processed_docs_all.append(processed_doc)

# Turn the preprocessed documents into a TF-IDF document-term matrix.
# Terms appearing in more than 95% of the documents or in fewer than two
# documents are excluded, as are scikit-learn's built-in English stop words.
vectorizer = TfidfVectorizer(stop_words='english', min_df=2, max_df=0.95)
tfidf_data = vectorizer.fit_transform(processed_docs_all)

# Number of latent topics, i.e. SVD components kept by the LSA model
n_topics = 10

# LSA is a truncated SVD of the TF-IDF matrix; the fixed seed keeps the
# decomposition repeatable across runs
lsa = TruncatedSVD(random_state=42, n_components=n_topics)

# Fit the decomposition, then project every document onto the fitted components
lsa_topic_matrix = lsa.fit(tfidf_data).transform(tfidf_data)

# How many of the highest-weighted terms to list for each topic
num_top_words = 20

# Vocabulary terms, indexed the same way as the columns of lsa.components_
feature_names = np.array(vectorizer.get_feature_names_out())

# For every component, rank the terms by their (signed) weight and print the
# leading ones, numbering topics from 1
for topic_idx, topic in enumerate(lsa.components_):
    descending_order = np.argsort(topic)[::-1]
    top_words_idx = descending_order[:num_top_words]
    top_words = feature_names[top_words_idx]
    print(f"Topic {topic_idx + 1}: {', '.join(top_words)}\n")



Topic 1: feel, come, obviously, younger, school, guess, nhs, health, kid, interesting, young, speak, happen, ill, effect, letter, appointment, remember, cold, definitely

Topic 2: wife, straightforward, pretty, faith, obviously, seek, letter, beneficial, medical, uk, website, alright, important, garden, world, scientist, certainly, science, personally, text

Topic 3: son, team, specialist, blood, health, daughter, problem, visitor, today, advice, unwell, help, june, form, allergic, particularly, eat, play, laughter, necessarily

Topic 4: daughter, feel, season, jab, mind, guess, issue, definitely, book, covid, friend, come, protect, vulnerable, mainly, strain, immunization, outbreak, importance, public

Topic 5: son, specialist, team, unwell, phone, guess, faith, available, come, organize, clinic, arrange, word, medical, food, bear, horrible, reception, allergic, trust

Topic 6: practice, guess, concerned, invite, try, rest, cost, fear, speak, provide, important, correct, ring, occur, 