In [43]:
import pandas as pd
import re
import emoji

from bertopic import BERTopic

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

In [44]:
# Filepaths
tweet_data_fp = 'twitter_data/custom_data/pruned_media_users.csv'

In [45]:
tweet_data_df = pd.read_csv(tweet_data_fp)

## Preprocessing

In [46]:
# Gathering documents
docs = tweet_data_df['Tweet'].tolist()
pruned_docs = docs

In [47]:
# Removal of links
def remove_urls(doc):
    return re.sub(r'http\S+', '', doc)

pruned_docs = [remove_urls(doc) for doc in pruned_docs]

# Converting emojis into words
def convert_emojis(doc):
    # delimiters are what is used around the emoji description, in this case spaces are used
    return emoji.demojize(doc, delimiters=(" ", " "))

pruned_docs = [convert_emojis(doc) for doc in pruned_docs]

# Converting hashtags into words
def remove_hashtags(doc):
    return doc.replace('#', '')

pruned_docs = [remove_hashtags(doc) for doc in pruned_docs]

# Converting mentions into words
def remove_numbers(doc):
    return re.sub(r'\d+', '', doc)

pruned_docs = [remove_numbers(doc) for doc in pruned_docs]

# Remove punctuation 
def remove_punctuation(doc):
    return re.sub(r'[^\w\s]', '', doc)

pruned_docs = [remove_punctuation(doc) for doc in pruned_docs]

In [48]:
pruned_docs

['Through this campaign we wanted to challenge misinformation dispel fear and support the health workers and others at the forefront of the fight against the virus Ram Devineni the IndianAmerican creator of the Priya comic series said ',
 'Interesting news story Leaked documents reveal Chinas mishandling of the early stages of Covid\n pages of leaked documents from the Hubei Provincial Center for Disease Control and Prevention shared with and verified by CNN ',
 'These findings suggest that SARSCoV may have been introduced into the United States prior to Jan   Serologic testing of US blood donations to identify SARSCoVreactive antibodies Dec Jan   Clinical Infectious Diseases  Oxford ',
 'Interesting perspective from a respected vaccine expert Dr Paul Offit ',
 'Gov Hogan Worst part of this entire crisis is still ahead of us in Maryland  Maryland hospitals now have a week to get a plan ready for the looming surge ',
 'FBI warns of new coronavirus email autoforwarding scam\n\n',
 'Austi

In [49]:
tokenizer = lambda s: re.findall( '\w+', s.lower() )

pruned_docs_split = [ tokenizer(doc) for doc in  pruned_docs ]

# Combine the inner lists into sentences
pruned_docs_tokenized = [' '.join(words) for words in pruned_docs_split]

## Performing BERTopic

In [50]:
# Topic modelling
bertopic_model = BERTopic()
topics, probs = bertopic_model.fit_transform(pruned_docs_tokenized)

In [51]:
pruned_topic_df = bertopic_model.get_document_info(pruned_docs)

In [52]:
pruned_topic_df['Topic'].nunique()

37

## Calculating Topic Coherence

In [35]:
topic_info = bertopic_model.get_topic_info()
topics = topic_info['Representation'].tolist()

word2id = Dictionary(pruned_docs_split)

In [36]:
cm = CoherenceModel(topics=topics, 
                    texts=pruned_docs_split,
                    coherence='c_v',  # c_npmi was used in the bertopic serbian research paper
                    dictionary=word2id)

In [37]:
coherence_per_topic = cm.get_coherence_per_topic()
macro_topic_coherence = sum(coherence_per_topic) / len(coherence_per_topic)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [38]:
# This returns 0.02 when set to npmi, the serbian research return -0.042 as their best score
# This returns 0.511 when set to c_v, this is a good coherence score
macro_topic_coherence 

0.4777628268022707

## Calculating Topic Diversity

In [39]:
#Found here: https://github.com/silviatti/topic-model-diversity/blob/master/diversity_metrics.py
def proportion_unique_words(topics, topk=10):
    """
    compute the proportion of unique words

    Parameters
    ----------
    topics: a list of lists of words
    topk: top k words on which the topic diversity will be computed (There are 10 words in each topic)
    """
    if topk > len(topics[0]):
        raise Exception('Words in topics are less than '+str(topk))
    else:
        unique_words = set()
        for topic in topics:
            unique_words = unique_words.union(set(topic[:topk]))
        puw = len(unique_words) / (topk * len(topics))
        return puw


In [40]:
from itertools import combinations
def pairwise_jaccard_diversity(topics, topk=10):
    '''
    compute the average pairwise jaccard distance between the topics 
  
    Parameters
    ----------
    topics: a list of lists of words
    topk: top k words on which the topic diversity
          will be computed (There are 10 words in each topic)
    
    Returns
    -------
    pjd: average pairwise jaccard distance
    '''
    dist = 0
    count = 0
    for list1, list2 in combinations(topics, 2):
        js = 1 - len(set(list1).intersection(set(list2)))/len(set(list1).union(set(list2)))
        dist = dist + js
        count = count + 1
    return dist/count

In [41]:
# The first inputs the topics, the second inputs the number of words in each topic
proportion_unique_words(topics, 10)

0.7753462603878116

In [42]:
pairwise_jaccard_diversity(topics, 10)

0.9984637897625313