In [None]:
# basic imports
import pandas as pd

# preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# vectorization
from sklearn.feature_extraction.text import CountVectorizer

# topic model
from bertopic import BERTopic

# evaluation metrics
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary

  from .autonotebook import tqdm as notebook_tqdm


# Data Loading

In [2]:
df = pd.read_csv("Dataset/oncology.csv",
                 sep = ';') # separator
df.head()

Unnamed: 0,cancer,Comment,tokens,stopwordremove_tokens,lemmatized_text,none
0,breast cancer,tiredness got better edema ankle several year ...,"['tiredness', 'that', 'got', 'better', 'edema'...","['tiredness', 'got', 'better', 'edema', 'ankle...","['tiredness', 'got', 'better', 'edema', 'ankle...",
1,none,one people could take attack body along cancer...,"['one', 'people', 'could', 'take', 'attack', '...","['one', 'people', 'could', 'take', 'attack', '...","['one', 'people', 'could', 'take', 'attack', '...",
2,none,take month week mg lymphedema arm get infect a...,"['take', 'month', 'week', 'mg', 'lymphedema', ...","['take', 'month', 'week', 'mg', 'lymphedema', ...","['take', 'month', 'week', 'mg', 'lymphedema', ...",
3,none,stage lymph bone bone metastasis never infusio...,"['stage', 'lymph', 'bone', 'bone', 'metastasis...","['stage', 'lymph', 'bone', 'bone', 'metastasis...","['stage', 'lymph', 'bone', 'bone', 'metastasis...",
4,none,although medication effectively treat cancer s...,"['although', 'medication', 'effectively', 'tre...","['although', 'medication', 'effectively', 'tre...","['although', 'medication', 'effectively', 'tre...",


In [3]:
# dimensions
df.shape

(14419, 6)

In [4]:
# only extract patient messages
msgs = df['Comment'].tolist()
msgs[0]

'tiredness got better edema ankle several year hair thinningjoint pain year afinitor doctor said consider going drug cause high blood sugar high cholesterol neither opted go month monthly hormone injection cancer still stable'

# Preprocessing

- Note: If using BERTopic, preprocessing is automatically done by the model (skip tokenization)

In [5]:
# create vectorizer model
vectorizer = CountVectorizer(stop_words = 'english') # count how many times each token appear

# Topic Modeling

Chosen Topic Model: BERTopic

In [6]:
# fit model
topic_model = BERTopic(vectorizer_model = vectorizer,
                       language = 'english')
topics, probs = topic_model.fit_transform(msgs)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [7]:
# inspect topics

topic_model.get_topic_info() # -1 is outlier should typically be ignored

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,6808,-1_healthcare_people_like_treatment,"[healthcare, people, like, treatment, pay, day...",[sorry go kind would add last sentence even ti...
1,0,576,0_tamoxifen_hormone_post_therapy,"[tamoxifen, hormone, post, therapy, hello, tam...",[pseudopseudo letrozole post sunnylife letrozo...
2,1,474,1_neuropathy_cold_foot_ice,"[neuropathy, cold, foot, ice, sensitivity, han...",[ice help session folfox summer start first in...
3,2,379,2_estrogen_testosterone_level_aromasin,"[estrogen, testosterone, level, aromasin, ai, ...",[heyyes blood result finethey showestradiol ce...
4,3,347,3_drug_price_research_company,"[drug, price, research, company, profit, cost,...",[unfortunately rd pretty huge factor price man...
...,...,...,...,...,...
147,146,10,146_ai_nebido_controllower_trt,"[ai, nebido, controllower, trt, lipid, aromasi...",[best ai use trt controllower using letro mont...
148,147,10,147_cover_dental_health_private,"[cover, dental, health, private, middle, tooth...",[read post universal health insurance affords ...
149,148,10,148_vial_reality_die_people,"[vial, reality, die, people, save, dying, cani...",[million people take care ofi know perfect fai...
150,149,10,149_tuition_quebec_uni_school,"[tuition, quebec, uni, school, semester, cuz, ...",[pay finish uni year take much parent could af...


In [8]:
# see the probabilities of each word within each topic

#example topic 1
topic_model.get_topic(0)

[('tamoxifen', 0.017519498053472304),
 ('hormone', 0.01731677619895688),
 ('post', 0.01706787041193619),
 ('therapy', 0.011171119209985939),
 ('hello', 0.010061006842220352),
 ('tamoxifene', 0.009636624005581412),
 ('tamo', 0.009597321759026884),
 ('ovary', 0.009525418989274519),
 ('breast', 0.009297620107102497),
 ('aromasine', 0.008720244253040032)]

In [9]:
# inspect each document and their assigned topic

topic_model.get_document_info(msgs)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,tiredness got better edema ankle several year ...,-1,-1_healthcare_people_like_treatment,"[healthcare, people, like, treatment, pay, day...",[sorry go kind would add last sentence even ti...,healthcare - people - like - treatment - pay -...,0.000000,False
1,one people could take attack body along cancer...,-1,-1_healthcare_people_like_treatment,"[healthcare, people, like, treatment, pay, day...",[sorry go kind would add last sentence even ti...,healthcare - people - like - treatment - pay -...,0.000000,False
2,take month week mg lymphedema arm get infect a...,-1,-1_healthcare_people_like_treatment,"[healthcare, people, like, treatment, pay, day...",[sorry go kind would add last sentence even ti...,healthcare - people - like - treatment - pay -...,0.000000,False
3,stage lymph bone bone metastasis never infusio...,120,120_lung_nodule_sutent_slight,"[lung, nodule, sutent, slight, ulcer, cough, p...",[first try stress lung nodule lot people benig...,lung - nodule - sutent - slight - ulcer - coug...,0.895874,False
4,although medication effectively treat cancer s...,56,56_rash_hive_lesion_lotion,"[rash, hive, lesion, lotion, wrist, itchy, pic...",[first time post pic tongue lesion show progre...,rash - hive - lesion - lotion - wrist - itchy ...,0.186250,False
...,...,...,...,...,...,...,...,...
14414,one course folfox oxaliplatin straight away bl...,1,1_neuropathy_cold_foot_ice,"[neuropathy, cold, foot, ice, sensitivity, han...",[ice help session folfox summer start first in...,neuropathy - cold - foot - ice - sensitivity -...,0.985446,False
14415,mild nausea day chemo sessiontingling fingerti...,-1,-1_healthcare_people_like_treatment,"[healthcare, people, like, treatment, pay, day...",[sorry go kind would add last sentence even ti...,healthcare - people - like - treatment - pay -...,0.000000,False
14416,following treatment since october course octob...,1,1_neuropathy_cold_foot_ice,"[neuropathy, cold, foot, ice, sensitivity, han...",[ice help session folfox summer start first in...,neuropathy - cold - foot - ice - sensitivity -...,1.000000,False
14417,forgot tell since operation still urinary leak...,-1,-1_healthcare_people_like_treatment,"[healthcare, people, like, treatment, pay, day...",[sorry go kind would add last sentence even ti...,healthcare - people - like - treatment - pay -...,0.000000,False


# Visualization

In [10]:
topic_model.visualize_topics()

### Coherence

In [22]:
# coherence
topics = topic_model.get_topics()
topic_words = [[word for word, _ in topic_model.get_topic(topic)]
                for topic in range(len(topics) - 1)]

# prepare for gensim
texts = [msg.lower().split() for msg in msgs]
dictionary = Dictionary(texts)

# compute coherence
coherence_model = CoherenceModel(
    topics = topic_words,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v'
)

coherence_score = coherence_model.get_coherence()
print("Coherence Score: ", coherence_score)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Coherence Score:  0.495218175630432


coherence score > 0.4 -> better topic quality

### Topic Diversity

In [24]:
def topic_diversity(top_n_words=10):
    topic_words = topic_model.get_topics()
    all_words = [word for topic in topic_words.values() for word, _ in topic[:top_n_words]]
    unique_words = len(set(all_words))
    total_words = len(all_words)
    return unique_words/total_words

print("Topic diversity: ", topic_diversity())

Topic diversity:  0.7743421052631579


### Topic Quality = Coherence * Diversity

In [26]:
topic_diversity = topic_diversity()
print("Topic quality: ", coherence_score * topic_diversity)

Topic quality:  0.383468284682249


### Hierarchy

In [27]:
topic_model.visualize_hierarchy()

### Barchart

In [29]:
topic_model.visualize_barchart()