In [None]:
#LDA Model, incl. stopwords and 20 words per topic

import docx
import spacy
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

# Load spaCy model for English
nlp = spacy.load("en_core_web_sm")

# Specify the path to the folder containing all Word documents
folder_path = "/Users/mariewosny/HSG Python Projects/Input for Topic Modelling/English"

# List all files in the folder
file_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith(".docx")]

# Preprocessing and stopwords for the English language
processed_docs_all = []

custom_stopwords = set(['probably', 'simply', 'exactly', 'bit', 'tell', 'okay', 'datum', 'stadt'])

for file_path in file_paths:
    doc = docx.Document(file_path)
    text = ' '.join([paragraph.text for paragraph in doc.paragraphs])

# Remove occurrences of the word "okay"
    text_without_okay = ' '.join([word for word in text.split() if word.lower() != 'okay'])

    processed_doc = ' '.join([token.lemma_ for token in nlp(text_without_okay) if not token.is_stop
                              and token.is_alpha and token.lemma_ not in custom_stopwords])
    processed_docs_all.append(processed_doc)

# Vectorize the text data
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')

data = vectorizer.fit_transform(processed_docs_all)

# Define the number of topics

n_topics = 10
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=500, random_state=42)

# Fit the model to the data
lda.fit(data)

# Transform the data using the fitted model
transformed = lda.transform(data)

# Number of top words per topic
num_top_words = 20
 
# Print the top 20 words for each topic
feature_names = np.array(vectorizer.get_feature_names_out())
for topic_idx, topic in enumerate(lda.components_):
    top_words_idx = topic.argsort()[:-num_top_words-1:-1]
    top_words = feature_names[top_words_idx]
    print(f"Topic {topic_idx + 1}: {', '.join(top_words)}\n")




In [37]:
#LSA Model, incl. stopwords and 20 words per topic

import docx
import spacy
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

# Load spaCy model for English
nlp = spacy.load("en_core_web_sm")

# Specify the path to the folder containing all Word documents
folder_path = "/Users/mariewosny/HSG Python Projects/Input for Topic Modelling/English"

# List all files in the folder
file_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith(".docx")]

# Preprocessing and stopwords for the English language
processed_docs_all = []

custom_stopwords = set(['probably', 'simply', 'exactly', 'bit', 'tell', 'okay', 'datum', 'stadt'])

for file_path in file_paths:
    doc = docx.Document(file_path)
    text = ' '.join([paragraph.text for paragraph in doc.paragraphs])

    # Remove occurrences of the word "okay"
    text_without_okay = ' '.join([word for word in text.split() if word.lower() != 'okay'])

    processed_doc = ' '.join([token.lemma_ for token in nlp(text_without_okay) if not token.is_stop
                              and token.is_alpha and token.lemma_ not in custom_stopwords])
    processed_docs_all.append(processed_doc)

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

tfidf_data = vectorizer.fit_transform(processed_docs_all)

# Define the number of topics (or components in LSA)
n_topics = 10

# Create a Truncated SVD (LSA) model
lsa = TruncatedSVD(n_components=n_topics, random_state=42)

# Fit the model to the TF-IDF data
lsa.fit(tfidf_data)

# Transform the TF-IDF data using the fitted LSA model
lsa_topic_matrix = lsa.transform(tfidf_data)

# Number of top words per topic
num_top_words = 20
    
# Print the top 20 words for each topic
feature_names = np.array(vectorizer.get_feature_names_out())
for topic_idx, topic in enumerate(lsa.components_):
    top_words_idx = topic.argsort()[:-num_top_words-1:-1]
    top_words = feature_names[top_words_idx]
    print(f"Topic {topic_idx + 1}: {', '.join(top_words)}\n")



Topic 1: course, program, nursing, new, change, clinic, colleague, long, little, pms, people, important, doctor, write, physician, situation, problem, life, mean, app

Topic 2: nursing, pms, project, employee, digitization, electronic, new, team, ward, chemotherapy, expert, electronically, important, involve, ai, process, staff, problem, care, management

Topic 3: pms, emergency, anesthesia, ward, program, write, icu, nursing, intensive, click, ips, documentation, care, lab, document, page, medication, curve, transfer, triage

Topic 4: pms, app, tumor, phone, surgery, video, emergency, cell, conference, board, home, test, cool, triage, electronically, personally, pc, write, cumbersome, recognition

Topic 5: app, nursing, stadt, course, situation, kisim, ward, pdms, nurse, test, easy, document, technology, instrument, documentation, innovation, implementation, care, expert, chemotherapy

Topic 6: radio, oncology, software, digitization, program, anesthesia, okay, transfer, paper, staff,

In [36]:
# NMF Model, added stopwords and 20 words per topic

import docx
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import numpy as np
import spacy

# Load spaCy model for English
nlp = spacy.load("en_core_web_sm")

# Specify the path to the folder containing all Word documents
folder_path = "/Users/mariewosny/HSG Python Projects/Input for Topic Modelling/English"

# List all files in the folder
file_paths = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith(".docx")]

# Preprocessing and stopwords for the English language
processed_docs_all = []

custom_stopwords = set(['probably', 'simply', 'exactly', 'bit', 'tell', 'okay', 'datum', 'stadt'])

for file_path in file_paths:
    doc = docx.Document(file_path)
    text = ' '.join([paragraph.text for paragraph in doc.paragraphs])

    # Remove occurrences of the word "okay"
    text_without_okay = ' '.join([word for word in text.split() if word.lower() != 'okay'])

    processed_doc = ' '.join([token.lemma_ for token in nlp(text_without_okay) if not token.is_stop
                              and token.is_alpha and token.lemma_ not in custom_stopwords])
    processed_docs_all.append(processed_doc)

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf_data = vectorizer.fit_transform(processed_docs_all)

# Define the number of topics (components in NMF)
n_topics = 10  # Adjust as needed
# Create an NMF model
nmf = NMF(n_components=n_topics, random_state=42)
# Fit the model to the TF-IDF data
nmf.fit(tfidf_data)

# Number of top words per topic
num_top_words = 20

# Print the top 20 words for each topic
feature_names = np.array(vectorizer.get_feature_names_out())
for topic_idx, topic in enumerate(nmf.components_):
    top_words_idx = topic.argsort()[:-num_top_words-1:-1]  # Adjust the number of top words as needed
    top_words = feature_names[top_words_idx]
    print(f"Topic {topic_idx + 1}: {', '.join(top_words)}\n")


Topic 1: course, physician, people, long, laboratory, guideline, process, mean, change, certain, quality, management, electronic, today, canton, develop, problem, lab, medical, doctor

Topic 2: pms, nursing, ward, emergency, new, project, employee, course, electronically, problem, documentation, electronic, document, important, little, staff, write, triage, performance, change

Topic 3: program, anesthesia, intensive, clinical, icu, ward, care, medication, emergency, doctor, ecg, page, transfer, write, unit, blood, error, situation, read, heart

Topic 4: tumor, program, phone, board, video, pms, conference, mail, colleague, podcast, cell, recognition, computer, dictate, report, list, surgery, send, cumbersome, speech

Topic 5: app, phone, test, course, stadt, cell, surgery, colleague, speak, instrument, situation, translate, operation, technology, op, easy, room, prove, photo, short

Topic 6: crash, lab, allergy, clinic, request, function, paper, okay, write, uster, wait, update, duty,