In [122]:
import pandas as pd
import re
import emoji
import spacy
# Spacy's english model
nlp = spacy.load('en_core_web_sm')

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# used to fix contractions such as I'll to I will
import contractions

from bertopic.representation import KeyBERTInspired, PartOfSpeech, MaximalMarginalRelevance
from bertopic import BERTopic

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

In [123]:
# Filepaths
tweet_data_fp = 'twitter_data/custom_data/pruned_medicine_and_research_users.csv'

In [124]:
tweet_data_df = pd.read_csv(tweet_data_fp)
tweet_data_df = tweet_data_df[tweet_data_df['Lang_code'] == 'en']

## Preprocessing

In [125]:
# Gathering documents
docs = tweet_data_df['Tweet'].tolist()
pruned_docs = docs

In [126]:
len(pruned_docs)

27952

In [127]:
# Best to err on the side of caution when pre-processing:
# https://maartengr.github.io/BERTopic/faq.html#should-i-preprocess-the-data

# Removal of links
def remove_urls(doc):
    return re.sub(r'http\S+', '', doc)

pruned_docs = [remove_urls(doc) for doc in pruned_docs]

# Converting emojis into words
# Emoji's are removed as part of the tokenizer
def convert_emojis(doc):
    # delimiters are what is used around the emoji description, in this case spaces are used
    return emoji.demojize(doc, delimiters=(" ", " "))

#pruned_docs = [convert_emojis(doc) for doc in pruned_docs]

# Converting hashtags into words
# Or removing the hashtag and word entirely
# Hashtags are converted to standard words as part of the tokenizer
def remove_hashtags(doc):
    return doc.replace('#', '')
    #return re.sub(r'#\w+', '', doc)

pruned_docs = [remove_hashtags(doc) for doc in pruned_docs]

# Converting mentions into words
def remove_numbers(doc):
    return re.sub(r'\d+', '', doc)

pruned_docs = [remove_numbers(doc) for doc in pruned_docs]

# Function to remove user mentions
def remove_user_mentions(doc):
    return re.sub(r'@\w+', '', doc)

pruned_docs = [remove_user_mentions(doc) for doc in pruned_docs]

def fix_contractions(doc):
    return contractions.fix(doc)

pruned_docs = [fix_contractions(doc) for doc in pruned_docs]

# Remove punctuation 
# Punctuation is removed as part of the tokenizer
def remove_punctuation(doc):
    return re.sub(r'[^\w\s]', '', doc)

pruned_docs = [remove_punctuation(doc) for doc in pruned_docs]

# Function to remove stopwords
def remove_stopwords(doc):
    return ' '.join([word for word in doc.split() if word.lower() not in stop_words])

pruned_docs = [remove_stopwords(doc) for doc in pruned_docs]

# Lemmatize the text
def lemmatize_text(doc):
    doc = nlp(doc)
    return ' '.join([token.lemma_ for token in doc if token.lemma_ not in stop_words and not token.is_punct and not token.is_space])

pruned_docs = [lemmatize_text(doc) for doc in pruned_docs]

# Remove the 'amp' word
def remove_amp(doc):
    return re.sub(r'\bamp\b', '', doc).strip() # strip removes the surrounding white space

pruned_docs = [remove_amp(doc) for doc in pruned_docs]

In [128]:
tokenizer = lambda s: re.findall( '\w+', s.lower() )

pruned_docs_split = [ tokenizer(doc) for doc in  pruned_docs ]

# Combine the inner lists into sentences
pruned_docs_tokenized = [' '.join(words) for words in pruned_docs_split]

In [129]:
pruned_docs_tokenized

['speak kids infection',
 'practice announce price tag notably deep global recession since second world war nature fail enact diversity inclusion pledge nailed dr pai',
 'awesome',
 'good',
 'whimper',
 'steady tuesday twitterverse',
 'one recent study suggest families first coronavirus response act ffcra grant worker pay sick leave expand family medical leave due covid help reduce spread virus learn',
 'anyone look poster idea encourage vaccination hospital clinic',
 'tomorrow december brace year review article come',
 'neighbor need key pickup seat drive supermarket',
 'akk sorry folk live week tomorrow erroneously tweet tomorrow apologies still two great speaker moderator lose track time',
 'american kestrel yolobypass afternoon take phone call traildesk',
 'new york state doh initiate emergency hospital measure today amid steady increase covid case preparation surgemanage capacity hospitals direct identify retire staff make plan add bed capacity confirm day ppe stockpile',
 'update

## Performing BERTopic

In [130]:
# Representation model is KeyBERTInspired
# This should reduce the prevalence of stopwords in the output model
# None of these give strong results
#representation_model = KeyBERTInspired() 
#representation_model = PartOfSpeech("en_core_web_sm") 
#representation_model = MaximalMarginalRelevance(diversity=0.8) 

# Topic modelling

# Initialize UMAP and HDBSCAN with random seeds
from umap import UMAP
from hdbscan import HDBSCAN

#random_seed = 50
#umap_model = UMAP()
#hdbscan_model = HDBSCAN(prediction_data=True, min_samples=10, gen_min_span_tree=True)

bertopic_model = BERTopic()
topics, probs = bertopic_model.fit_transform(pruned_docs_tokenized)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [131]:
pruned_topic_df = bertopic_model.get_document_info(pruned_docs)

In [132]:
pruned_topic_df['Topic'].nunique()

309

In [133]:
pruned_topic_df

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,speak Kids infection,22,22_child_kid_pediatric_parent,"[child, kid, pediatric, parent, children, youn...","[also datum child covid death see, covid pande...",child - kid - pediatric - parent - children - ...,0.628504,False
1,practice announce price tag notably deep globa...,-1,-1_health_covid_vaccine_people,"[health, covid, vaccine, people, need, pandemi...",[indeed stupid approach testing state goal lim...,health - covid - vaccine - people - need - pan...,0.000000,False
2,awesome,160,160_awesome_cool_alright_indeed,"[awesome, cool, alright, indeed, woot, haha, l...","[awesome, awesome, awesome]",awesome - cool - alright - indeed - woot - hah...,0.813322,True
3,good,204,204_good_ok_nice_one,"[good, ok, nice, one, thank, , , , , ]","[good, good, good]",good - ok - nice - one - thank - - - - -,0.892945,True
4,whimper,259,259_tragic_tragedy_thrown_causative,"[tragic, tragedy, thrown, causative, siri, hor...","[tragic x gen x, sure bad stuff except thrown ...",tragic - tragedy - thrown - causative - siri -...,0.679765,False
...,...,...,...,...,...,...,...,...
27947,receive question Michael thought utility rapid...,9,9_test_rapid_testing_athome,"[test, rapid, testing, athome, tool, free, fre...",[thread cost athome rapid tests us government ...,test - rapid - testing - athome - tool - free ...,0.766718,False
27948,create training hub boost pharmaceutical produ...,-1,-1_health_covid_vaccine_people,"[health, covid, vaccine, people, need, pandemi...",[indeed stupid approach testing state goal lim...,health - covid - vaccine - people - need - pan...,0.000000,False
27949,year ago today test US already fail diagnose p...,-1,-1_health_covid_vaccine_people,"[health, covid, vaccine, people, need, pandemi...",[indeed stupid approach testing state goal lim...,health - covid - vaccine - people - need - pan...,0.000000,False
27950,OMG laugh hard Elmo replace Timothee Chalamet ...,-1,-1_health_covid_vaccine_people,"[health, covid, vaccine, people, need, pandemi...",[indeed stupid approach testing state goal lim...,health - covid - vaccine - people - need - pan...,0.000000,False


## Calculating Topic Coherence

In [134]:
topic_info = bertopic_model.get_topic_info()
topics = topic_info['Representation'].tolist()

word2id = Dictionary(pruned_docs_split)

In [135]:
cm = CoherenceModel(topics=topics, 
                    texts=pruned_docs_split,
                    coherence='c_v',  # c_npmi was used in the bertopic serbian research paper
                    dictionary=word2id)

In [136]:
coherence_per_topic = cm.get_coherence_per_topic()
macro_topic_coherence = sum(coherence_per_topic) / len(coherence_per_topic)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [137]:
# This returns 0.02 when set to npmi, the serbian research return -0.042 as their best score
# This returns 0.511 when set to c_v, this is a good coherence score
# When removing non-english words and links this drops to 0.44
# When adding lemmatization this goes up to 0.46
# Removing numbers drop to 0.4567
macro_topic_coherence 

0.4177785447439156

## Calculating Topic Diversity

In [138]:
#Found here: https://github.com/silviatti/topic-model-diversity/blob/master/diversity_metrics.py
def proportion_unique_words(topics, topk=10):
    """
    compute the proportion of unique words

    Parameters
    ----------
    topics: a list of lists of words
    topk: top k words on which the topic diversity will be computed (There are 10 words in each topic)
    """
    if topk > len(topics[0]):
        raise Exception('Words in topics are less than '+str(topk))
    else:
        unique_words = set()
        for topic in topics:
            unique_words = unique_words.union(set(topic[:topk]))
        puw = len(unique_words) / (topk * len(topics))
        return puw


In [139]:
from itertools import combinations
def pairwise_jaccard_diversity(topics, topk=10):
    '''
    compute the average pairwise jaccard distance between the topics 
  
    Parameters
    ----------
    topics: a list of lists of words
    topk: top k words on which the topic diversity
          will be computed (There are 10 words in each topic)
    
    Returns
    -------
    pjd: average pairwise jaccard distance
    '''
    dist = 0
    count = 0
    for list1, list2 in combinations(topics, 2):
        js = 1 - len(set(list1).intersection(set(list2)))/len(set(list1).union(set(list2)))
        dist = dist + js
        count = count + 1
    return dist/count

In [140]:
# The first inputs the topics, the second inputs the number of words in each topic
proportion_unique_words(topics, 10)

0.7919093851132686

In [141]:
pairwise_jaccard_diversity(topics, 10)

0.9986612417386271

In [154]:
print(topic_info['Representation'][50])

['employee', 'worker', 'mandate', 'employer', 'osha', 'business', 'paidtimeoff', 'pto', 'requirement', 'vaccinate']
