In [61]:
import pandas as pd
import re
import emoji
import spacy
# Spacy's english model
nlp = spacy.load('en_core_web_sm')

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# used to fix contractions such as I'll to I will
import contractions

from bertopic.representation import KeyBERTInspired, PartOfSpeech, MaximalMarginalRelevance
from bertopic import BERTopic

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

In [62]:
# Filepaths
tweet_data_fp = 'twitter_data/custom_data/pruned_media_users.csv'

In [63]:
tweet_data_df = pd.read_csv(tweet_data_fp)
tweet_data_df = tweet_data_df[tweet_data_df['Lang_code'] == 'en']

## Preprocessing

In [64]:
# Gathering documents
docs = tweet_data_df['Tweet'].tolist()
pruned_docs = docs

In [65]:
len(pruned_docs)

1831

In [66]:
# Best to err on the side of caution when pre-processing:
# https://maartengr.github.io/BERTopic/faq.html#should-i-preprocess-the-data

# Removal of links
def remove_urls(doc):
    return re.sub(r'http\S+', '', doc)

pruned_docs = [remove_urls(doc) for doc in pruned_docs]

# Converting emojis into words
# Emoji's are removed as part of the tokenizer
def convert_emojis(doc):
    # delimiters are what is used around the emoji description, in this case spaces are used
    return emoji.demojize(doc, delimiters=(" ", " "))

#pruned_docs = [convert_emojis(doc) for doc in pruned_docs]

# Converting hashtags into words
# Or removing the hashtag and word entirely
# Hashtags are converted to standard words as part of the tokenizer
def remove_hashtags(doc):
    return doc.replace('#', '')
    #return re.sub(r'#\w+', '', doc)

pruned_docs = [remove_hashtags(doc) for doc in pruned_docs]

# Converting mentions into words
def remove_numbers(doc):
    return re.sub(r'\d+', '', doc)

pruned_docs = [remove_numbers(doc) for doc in pruned_docs]

# Function to remove user mentions
def remove_user_mentions(doc):
    return re.sub(r'@\w+', '', doc)

pruned_docs = [remove_user_mentions(doc) for doc in pruned_docs]

def fix_contractions(doc):
    return contractions.fix(doc)

pruned_docs = [fix_contractions(doc) for doc in pruned_docs]

# Remove punctuation 
# Punctuation is removed as part of the tokenizer
def remove_punctuation(doc):
    return re.sub(r'[^\w\s]', '', doc)

pruned_docs = [remove_punctuation(doc) for doc in pruned_docs]

# Function to remove stopwords
def remove_stopwords(doc):
    return ' '.join([word for word in doc.split() if word.lower() not in stop_words])

pruned_docs = [remove_stopwords(doc) for doc in pruned_docs]

# Lemmatize the text
def lemmatize_text(doc):
    doc = nlp(doc)
    return ' '.join([token.lemma_ for token in doc if token.lemma_ not in stop_words and not token.is_punct and not token.is_space])

pruned_docs = [lemmatize_text(doc) for doc in pruned_docs]

# Remove the 'amp' word
def remove_amp(doc):
    return re.sub(r'\bamp\b', '', doc).strip() # strip removes the surrounding white space

pruned_docs = [remove_amp(doc) for doc in pruned_docs]

In [67]:
tokenizer = lambda s: re.findall( '\w+', s.lower() )

pruned_docs_split = [ tokenizer(doc) for doc in  pruned_docs ]

# Combine the inner lists into sentences
pruned_docs_tokenized = [' '.join(words) for words in pruned_docs_split]

In [68]:
pruned_docs_tokenized

['campaign want challenge misinformation dispel fear support health worker forefront fight virus ram devineni indianamerican creator priya comic series say',
 'interesting news story leak document reveal chinas mishandle early stage covid page leak document hubei provincial center disease control prevention share verify cnn',
 'finding suggest sarscov may introduce united states prior january serologic testing yous blood donation identify sarscovreactive antibodie dec jan clinical infectious diseases oxford',
 'interesting perspective respected vaccine expert dr paul offit',
 'gov hogan bad part entire crisis still ahead maryland maryland hospital week get plan ready loom surge',
 'fbi warn new coronavirus email autoforwarde scam',
 'austin mayor stress resident need stay home vacation cabo time smartnews',
 'abrupt halt world trade tourism impact lockdown international migration remittance deal ruinous blow',
 'coronavirus hackers target covid vaccine supply cold chain phishing email 

## Performing BERTopic

In [98]:
# Representation model is KeyBERTInspired
# This should reduce the prevalence of stopwords in the output model
# None of these give strong results
#representation_model = KeyBERTInspired() 
#representation_model = PartOfSpeech("en_core_web_sm") 
#representation_model = MaximalMarginalRelevance(diversity=0.8) 

# Topic modelling

# Initialize UMAP and HDBSCAN with random seeds
from umap import UMAP
from hdbscan import HDBSCAN

#random_seed = 50
#umap_model = UMAP()
#hdbscan_model = HDBSCAN(prediction_data=True, min_samples=10, gen_min_span_tree=True)

bertopic_model = BERTopic()
topics, probs = bertopic_model.fit_transform(pruned_docs_tokenized)

In [99]:
pruned_topic_df = bertopic_model.get_document_info(pruned_docs)

In [100]:
pruned_topic_df['Topic'].nunique()

27

In [101]:
pruned_topic_df

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,campaign want challenge misinformation dispel ...,-1,-1_covid_vaccine_health_say,"[covid, vaccine, health, say, coronavirus, pan...",[control covid americas take year slow pace va...,covid - vaccine - health - say - coronavirus -...,0.000000,False
1,interesting news story leak document reveal Ch...,2,2_china_origin_wuhan_virus,"[china, origin, wuhan, virus, lab, mission, ch...",[press briefing hypothesis remain open origin ...,china - origin - wuhan - virus - lab - mission...,1.000000,False
2,finding suggest SARSCoV may introduce United S...,12,12_sarscov_variant_virus_infectious,"[sarscov, variant, virus, infectious, mutation...",[fastspreade delta variant coronavirus erode p...,sarscov - variant - virus - infectious - mutat...,1.000000,False
3,interesting perspective respected vaccine expe...,-1,-1_covid_vaccine_health_say,"[covid, vaccine, health, say, coronavirus, pan...",[control covid americas take year slow pace va...,covid - vaccine - health - say - coronavirus -...,0.000000,False
4,Gov Hogan bad part entire crisis still ahead M...,3,3_maryland_smartnews_county_montgomery,"[maryland, smartnews, county, montgomery, hoga...",[maryland gov larry hogan say hospitalization ...,maryland - smartnews - county - montgomery - h...,1.000000,False
...,...,...,...,...,...,...,...,...
1826,recent survey result present CDC meeting vacci...,1,1_vaccine_country_latin_america,"[vaccine, country, latin, america, vaccination...",[pan american health organization strike deal ...,vaccine - country - latin - america - vaccinat...,0.943293,False
1827,good thread mask care,0,0_mask_transmission_indoor_air,"[mask, transmission, indoor, air, wear, ventil...",[important note apply fully vaccinated people ...,mask - transmission - indoor - air - wear - ve...,1.000000,False
1828,interesting,11,11_interesting_jody_see_probably,"[interesting, jody, see, probably, yes, fascin...","[interesting, interesting, interesting]",interesting - jody - see - probably - yes - fa...,0.700704,True
1829,impact worsen covid much transmissible also si...,-1,-1_covid_vaccine_health_say,"[covid, vaccine, health, say, coronavirus, pan...",[control covid americas take year slow pace va...,covid - vaccine - health - say - coronavirus -...,0.000000,False


## Calculating Topic Coherence

In [102]:
topic_info = bertopic_model.get_topic_info()
topics = topic_info['Representation'].tolist()

word2id = Dictionary(pruned_docs_split)

In [103]:
cm = CoherenceModel(topics=topics, 
                    texts=pruned_docs_split,
                    coherence='c_v',  # c_npmi was used in the bertopic serbian research paper
                    dictionary=word2id)

In [104]:
coherence_per_topic = cm.get_coherence_per_topic()
macro_topic_coherence = sum(coherence_per_topic) / len(coherence_per_topic)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [105]:
# This returns 0.02 when set to npmi, the serbian research return -0.042 as their best score
# This returns 0.511 when set to c_v, this is a good coherence score
# When removing non-english words and links this drops to 0.44
# When adding lemmatization this goes up to 0.46
# Removing numbers drop to 0.4567
macro_topic_coherence 

0.41045269088659486

## Calculating Topic Diversity

In [106]:
#Found here: https://github.com/silviatti/topic-model-diversity/blob/master/diversity_metrics.py
def proportion_unique_words(topics, topk=10):
    """
    compute the proportion of unique words

    Parameters
    ----------
    topics: a list of lists of words
    topk: top k words on which the topic diversity will be computed (There are 10 words in each topic)
    """
    if topk > len(topics[0]):
        raise Exception('Words in topics are less than '+str(topk))
    else:
        unique_words = set()
        for topic in topics:
            unique_words = unique_words.union(set(topic[:topk]))
        puw = len(unique_words) / (topk * len(topics))
        return puw


In [107]:
from itertools import combinations
def pairwise_jaccard_diversity(topics, topk=10):
    '''
    compute the average pairwise jaccard distance between the topics 
  
    Parameters
    ----------
    topics: a list of lists of words
    topk: top k words on which the topic diversity
          will be computed (There are 10 words in each topic)
    
    Returns
    -------
    pjd: average pairwise jaccard distance
    '''
    dist = 0
    count = 0
    for list1, list2 in combinations(topics, 2):
        js = 1 - len(set(list1).intersection(set(list2)))/len(set(list1).union(set(list2)))
        dist = dist + js
        count = count + 1
    return dist/count

In [108]:
# The first inputs the topics, the second inputs the number of words in each topic
proportion_unique_words(topics, 10)

0.8

In [109]:
pairwise_jaccard_diversity(topics, 10)

0.9837772465911446

In [121]:
print(topic_info['Representation'][6])

['vaccine', 'covid', 'thread', 'today', 'good', 'fda', 'woman', 'story', 'dose', 'covax']
