In [3]:
# LSA model with stopword removal; prints the top 20 words per topic

# Question 1 - positive experience 

import docx
import spacy
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

# Load spaCy model for English
nlp = spacy.load("en_core_web_sm")

# Specify the path to the folder containing all Word documents
folder_path = "/Users/mariewosny/Desktop/HSG/10_Conferences/2024_MIE/MIE2024docx/Quest1"

# List all Word documents in the folder.
# - sorted(): os.listdir() returns entries in arbitrary order; sorting fixes the
#   document order so that reruns build the TF-IDF matrix with the same row order.
# - "~$" prefix: Word keeps a "~$<name>.docx" lock file next to a document that is
#   currently open; it is not a readable document, so it is skipped.
file_paths = [
    os.path.join(folder_path, file)
    for file in sorted(os.listdir(folder_path))
    if file.endswith(".docx") and not file.startswith("~$")
]

# Fail early with a clear message instead of an unrelated error further down
# when the folder contains no documents.
if not file_paths:
    raise FileNotFoundError(f"No .docx files found in {folder_path}")

# Preprocessing and stopwords for the English language
processed_docs_all = []

# Filler words to drop in addition to spaCy's built-in stop word list
custom_stopwords = set(['oh', 'ah', 'okay'])

for file_path in file_paths:
    doc = docx.Document(file_path)
    # Join all paragraphs of the document into a single string
    text = ' '.join([paragraph.text for paragraph in doc.paragraphs])

    # Remove occurrences of the word "okay" (whitespace-delimited matches only;
    # forms with attached punctuation are handled by the lemma filter below)
    text_without_okay = ' '.join([word for word in text.split() if word.lower() != 'okay'])

    # Keep the lemma of every alphabetic, non-stop-word token. The custom stopword
    # comparison is done on the lowercased lemma so that capitalised fillers
    # (e.g. a sentence-initial "Okay" or "Oh") are removed as well.
    processed_doc = ' '.join([
        token.lemma_ for token in nlp(text_without_okay)
        if not token.is_stop
        and token.is_alpha
        and token.lemma_.lower() not in custom_stopwords
    ])
    processed_docs_all.append(processed_doc)

# Number of latent topics, i.e. components kept by the truncated SVD
n_topics = 10

# Build the TF-IDF document-term matrix from the preprocessed documents
vectorizer = TfidfVectorizer(stop_words='english', min_df=2, max_df=0.95)
tfidf_data = vectorizer.fit_transform(processed_docs_all)

# LSA is a truncated SVD of the TF-IDF matrix; fit() returns the fitted estimator
lsa = TruncatedSVD(random_state=42, n_components=n_topics).fit(tfidf_data)

# Project every document onto the fitted topic axes
lsa_topic_matrix = lsa.transform(tfidf_data)

# How many of the highest-weighted terms to list for every topic
num_top_words = 20

# Vocabulary terms, indexed by the column positions of lsa.components_
feature_names = np.array(vectorizer.get_feature_names_out())

# Print the top 20 words for each topic
for topic_idx, topic in enumerate(lsa.components_):
    # Term indices ordered by descending component weight, cut to the top entries
    top_words_idx = np.argsort(topic)[::-1][:num_top_words]
    top_words = feature_names[top_words_idx]
    print(f"Topic {topic_idx + 1}: {', '.join(top_words)}\n")



Topic 1: work, patient, think, positive, tool, look, time, course, know, thing, example, good, come, use, like, need, information, lot, quickly, cool

Topic 2: work, tool, relatively, annoy, difficult, open, expect, trigger, electronically, care, transfer, actually, realize, little, medical, update, technology, grateful, long, anymore

Topic 3: new, lot, datum, monitor, transfer, need, good, far, big, look, medical, like, nursing, write, gain, efficient, department, relatively, set, certainly

Topic 4: software, tool, time, save, hour, life, module, text, app, everyday, program, question, actually, ecg, use, answer, gain, expectation, good, finding

Topic 5: know, look, answer, question, positive, ecg, try, function, effect, exist, search, breath, depend, course, say, beginning, test, ray, quick, write

Topic 6: patient, ecg, come, positive, doctor, able, effect, point, available, immediately, far, surgery, tool, nice, write, round, nursing, care, medication, experience

Topic 7: trans

In [4]:
# LSA model with stopword removal; prints the top 20 words per topic

# Question 2 - negative experience 

import docx
import spacy
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

# Load spaCy model for English
nlp = spacy.load("en_core_web_sm")

# Specify the path to the folder containing all Word documents
folder_path = "/Users/mariewosny/Desktop/HSG/10_Conferences/2024_MIE/MIE2024docx/Quest2"

# List all Word documents in the folder.
# - sorted(): os.listdir() returns entries in arbitrary order; sorting fixes the
#   document order so that reruns build the TF-IDF matrix with the same row order.
# - "~$" prefix: Word keeps a "~$<name>.docx" lock file next to a document that is
#   currently open; it is not a readable document, so it is skipped.
file_paths = [
    os.path.join(folder_path, file)
    for file in sorted(os.listdir(folder_path))
    if file.endswith(".docx") and not file.startswith("~$")
]

# Fail early with a clear message instead of an unrelated error further down
# when the folder contains no documents.
if not file_paths:
    raise FileNotFoundError(f"No .docx files found in {folder_path}")

# Preprocessing and stopwords for the English language
processed_docs_all = []

# Filler words to drop in addition to spaCy's built-in stop word list
custom_stopwords = set(['oh', 'ah', 'okay'])

for file_path in file_paths:
    doc = docx.Document(file_path)
    # Join all paragraphs of the document into a single string
    text = ' '.join([paragraph.text for paragraph in doc.paragraphs])

    # Remove occurrences of the word "okay" (whitespace-delimited matches only;
    # forms with attached punctuation are handled by the lemma filter below)
    text_without_okay = ' '.join([word for word in text.split() if word.lower() != 'okay'])

    # Keep the lemma of every alphabetic, non-stop-word token. The custom stopword
    # comparison is done on the lowercased lemma so that capitalised fillers
    # (e.g. a sentence-initial "Okay" or "Oh") are removed as well.
    processed_doc = ' '.join([
        token.lemma_ for token in nlp(text_without_okay)
        if not token.is_stop
        and token.is_alpha
        and token.lemma_.lower() not in custom_stopwords
    ])
    processed_docs_all.append(processed_doc)

# Number of latent topics, i.e. components kept by the truncated SVD
n_topics = 10

# Build the TF-IDF document-term matrix from the preprocessed documents
vectorizer = TfidfVectorizer(stop_words='english', min_df=2, max_df=0.95)
tfidf_data = vectorizer.fit_transform(processed_docs_all)

# LSA is a truncated SVD of the TF-IDF matrix; fit() returns the fitted estimator
lsa = TruncatedSVD(random_state=42, n_components=n_topics).fit(tfidf_data)

# Project every document onto the fitted topic axes
lsa_topic_matrix = lsa.transform(tfidf_data)

# How many of the highest-weighted terms to list for every topic
num_top_words = 20

# Vocabulary terms, indexed by the column positions of lsa.components_
feature_names = np.array(vectorizer.get_feature_names_out())

# Print the top 20 words for each topic
for topic_idx, topic in enumerate(lsa.components_):
    # Term indices ordered by descending component weight, cut to the top entries
    top_words_idx = np.argsort(topic)[::-1][:num_top_words]
    top_words = feature_names[top_words_idx]
    print(f"Topic {topic_idx + 1}: {', '.join(top_words)}\n")



Topic 1: work, patient, time, tool, think, program, example, thing, yes, like, problem, need, know, course, day, good, say, information, come, lot

Topic 2: program, patient, information, lot, correct, look, medication, unnecessary, error, example, frustration, clinical, inefficient, interface, different, open, time, new, wrong, certain

Topic 3: program, time, work, day, think, frustrating, correct, ticket, yes, pc, duty, long, new, night, frustration, cost, inefficient, lot, morning, application

Topic 4: patient, paper, wait, switch, computer, insanely, ago, annoying, week, room, happen, lab, manage, disaster, large, crash, program, work, affect, resident

Topic 5: write, time, change, page, day, medication, button, anesthesia, super, example, ticket, week, paper, space, negative, different, outage, prescription, long, enter

Topic 6: patient, time, digital, open, actually, datum, solve, problem, different, house, source, place, ward, forget, file, insanely, able, app, start, relati

In [6]:
# LSA model with stopword removal; prints the top 20 words per topic

# Question 3 - professional image 

import docx
import spacy
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

# Load spaCy model for English
nlp = spacy.load("en_core_web_sm")

# Specify the path to the folder containing all Word documents
folder_path = "/Users/mariewosny/Desktop/HSG/10_Conferences/2024_MIE/MIE2024docx/Quest3"

# List all Word documents in the folder.
# - sorted(): os.listdir() returns entries in arbitrary order; sorting fixes the
#   document order so that reruns build the TF-IDF matrix with the same row order.
# - "~$" prefix: Word keeps a "~$<name>.docx" lock file next to a document that is
#   currently open; it is not a readable document, so it is skipped.
file_paths = [
    os.path.join(folder_path, file)
    for file in sorted(os.listdir(folder_path))
    if file.endswith(".docx") and not file.startswith("~$")
]

# Fail early with a clear message instead of an unrelated error further down
# when the folder contains no documents.
if not file_paths:
    raise FileNotFoundError(f"No .docx files found in {folder_path}")

# Preprocessing and stopwords for the English language
processed_docs_all = []

# Filler words to drop in addition to spaCy's built-in stop word list
custom_stopwords = set(['oh', 'ah', 'okay'])

for file_path in file_paths:
    doc = docx.Document(file_path)
    # Join all paragraphs of the document into a single string
    text = ' '.join([paragraph.text for paragraph in doc.paragraphs])

    # Remove occurrences of the word "okay" (whitespace-delimited matches only;
    # forms with attached punctuation are handled by the lemma filter below)
    text_without_okay = ' '.join([word for word in text.split() if word.lower() != 'okay'])

    # Keep the lemma of every alphabetic, non-stop-word token. The custom stopword
    # comparison is done on the lowercased lemma so that capitalised fillers
    # (e.g. a sentence-initial "Okay" or "Oh") are removed as well.
    processed_doc = ' '.join([
        token.lemma_ for token in nlp(text_without_okay)
        if not token.is_stop
        and token.is_alpha
        and token.lemma_.lower() not in custom_stopwords
    ])
    processed_docs_all.append(processed_doc)

# Number of latent topics, i.e. components kept by the truncated SVD
n_topics = 10

# Build the TF-IDF document-term matrix from the preprocessed documents
vectorizer = TfidfVectorizer(stop_words='english', min_df=2, max_df=0.95)
tfidf_data = vectorizer.fit_transform(processed_docs_all)

# LSA is a truncated SVD of the TF-IDF matrix; fit() returns the fitted estimator
lsa = TruncatedSVD(random_state=42, n_components=n_topics).fit(tfidf_data)

# Project every document onto the fitted topic axes
lsa_topic_matrix = lsa.transform(tfidf_data)

# How many of the highest-weighted terms to list for every topic
num_top_words = 20

# Vocabulary terms, indexed by the column positions of lsa.components_
feature_names = np.array(vectorizer.get_feature_names_out())

# Print the top 20 words for each topic
for topic_idx, topic in enumerate(lsa.components_):
    # Term indices ordered by descending component weight, cut to the top entries
    top_words_idx = np.argsort(topic)[::-1][:num_top_words]
    top_words = feature_names[top_words_idx]
    print(f"Topic {topic_idx + 1}: {', '.join(top_words)}\n")



Topic 1: patient, think, work, thing, change, doctor, actually, time, tool, lot, look, like, know, need, datum, simply, information, knowledge, use, course

Topic 2: change, knowledge, know, tool, doctor, physician, today, person, long, increase, information, senior, past, medical, paper, able, maybe, situation, experience, mean

Topic 3: actually, information, simply, like, use, datum, thing, help, offer, influence, quality, enjoy, new, hospital, machine, tool, depend, somewhat, exciting, example

Topic 4: doctor, work, grow, change, pc, write, difference, job, ai, enjoy, thing, spend, problem, program, resident, professional, bit, important, open, time

Topic 5: datum, think, simply, hand, profession, important, affect, development, assessment, digitization, colleague, improve, great, deal, accordingly, general, ai, offer, modern, damage

Topic 6: like, document, feel, know, anesthesia, discipline, work, office, doctor, knowledge, think, medication, operate, program, play, computer, 