# Topic Modeling on Short Text with BERTopic and BERTweet

### Team members: Emily Altland, Terryl Dodson, Maheep Mahat, Daniel Manesh, Kiet Nguyen, Tianjiao Yu

Semester: Spring 2022

Instructor: Dr. Dawei Zhou

First, we import the necessary packages to run our code:

In [3]:
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import argparse
import torch

# BERTopic (our modification of the source code)
from bertopic._bertopic import BERTopic

# Dimension reduction
from umap import UMAP
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Embeddings
from flair.embeddings import TransformerDocumentEmbeddings
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer
from sklearn.feature_extraction.text import CountVectorizer

# Evaluation
import gensim.corpora as corpora
from gensim.models import CoherenceModel

ModuleNotFoundError: No module named 'yaml'

Here is our function to train our BERTopic model. We pass it our data in the form of a pandas Series, an embedding model (in our case, either a Sentence Transformers model or BERTweet), and a dimension reduction model (UMAP, PCA, or t-SNE). It returns the fitted topic model together with the topics returned by fitting it on the data.

In [None]:
def train_bertopic(data, embedding_model, dimension_reduction_model):
    """Fit a BERTopic model on ``data`` and return it with its topic assignments.

    Args:
        data: Documents to model. Must provide ``tolist()`` (e.g. a pandas
            Series of strings); the resulting list is passed to BERTopic.
        embedding_model: Embedding backend handed to BERTopic as
            ``embedding_model``.
        dimension_reduction_model: Reducer handed to BERTopic as
            ``umap_model`` (UMAP, PCA, or t-SNE in this project).

    Returns:
        A ``(topic_model, topics)`` tuple: the fitted ``BERTopic`` instance
        and the first value returned by ``fit_transform``.
    """
    vectorizer_model = CountVectorizer(ngram_range=(1, 1), min_df=1)

    # The t-SNE special case must look at the *dimension reduction* model.
    # The previous check inspected ``embedding_model``, which is never a TSNE
    # instance, so the fixed topic count was never applied.
    nr_topics = 5 if isinstance(dimension_reduction_model, TSNE) else "auto"

    topic_model = BERTopic(
        embedding_model=embedding_model,
        nr_topics=nr_topics,
        top_n_words=20,
        min_topic_size=30,
        verbose=True,
        low_memory=True,
        vectorizer_model=vectorizer_model,
        umap_model=dimension_reduction_model,
    )
    # The second return value of fit_transform is not needed here.
    topics, _ = topic_model.fit_transform(data.tolist())
    return topic_model, topics

In [None]:
def get_coherence_score(data, topic_model, topics, coherence):
    """Compute a gensim topic-coherence score for a fitted BERTopic model.

    Args:
        data: Iterable of the documents the model was fitted on.
        topic_model: Fitted BERTopic model; its ``vectorizer_model`` and
            ``get_topic`` are used.
        topics: Topic id per document, as returned by ``fit_transform``.
        coherence: Name of the coherence measure forwarded to gensim's
            ``CoherenceModel`` (e.g. ``"u_mass"``).

    Returns:
        The value of ``CoherenceModel.get_coherence()``.
    """
    # Use the vectorizer's full analyzer (preprocessing + tokenization) rather
    # than the bare tokenizer, so document tokens are normalized the same way
    # as the topic words that the vectorizer produced (e.g. lowercasing).
    vectorizer = topic_model.vectorizer_model
    analyzer = vectorizer.build_analyzer()
    tokens = [analyzer(doc) for doc in data]

    # Build the gensim dictionary / bag-of-words corpus from those tokens.
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

    # Score every real topic that was actually assigned, skipping only the
    # outlier topic (-1). Counting ``len(set(topics)) - 1`` instead would drop
    # the last topic whenever no document was assigned to -1.
    topic_ids = sorted(topic_id for topic_id in set(topics) if topic_id != -1)
    topic_words = [
        [word for word, _ in topic_model.get_topic(topic_id) if word != ""]
        for topic_id in topic_ids
    ]

    # Evaluate
    coherence_model = CoherenceModel(
        topics=topic_words,
        texts=tokens,
        corpus=corpus,
        dictionary=dictionary,
        coherence=coherence,
    )
    return coherence_model.get_coherence()

In [None]:
def main():
    """Train BERTopic for every embedding / reducer pair and print UMass scores.

    Reads ``data/preprocessed_tweets/all_tweets.csv`` (relative to this file,
    or to the current working directory when ``__file__`` is unavailable),
    keeps the first 20,000 non-null entries of its ``text`` column, and for
    each combination of embedding model and dimension reduction model trains
    a topic model and prints its ``u_mass`` coherence score.
    """
    # ``__file__`` is not defined when this code runs as a notebook cell;
    # fall back to the working directory instead of raising NameError.
    try:
        base_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        base_dir = os.getcwd()

    data_path = os.path.join(
        base_dir,
        "data",
        "preprocessed_tweets",
        "all_tweets.csv",
    )
    frame = pd.read_csv(data_path, header=0)
    tweets = frame["text"].dropna()[:20000]  # Train on sample of 20,000 tweets

    # Embedding models to compare. The printed labels are kept as-is.
    # NOTE(review): the "BERT base" label is attached to the
    # "all-MiniLM-L6-v2" sentence-transformers checkpoint — confirm the label.
    bertweet = TransformerDocumentEmbeddings("vinai/bertweet-base")
    sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

    embedding_models = [bertweet, sentence_model]
    embedding_model_names = ["BERTweet", "BERT base"]

    # Dimension reduction models, all projecting to 5 components with a fixed
    # random seed. t-SNE uses method="exact"; per the scikit-learn docs the
    # default Barnes-Hut method only supports fewer than 4 output dimensions.
    tsne = TSNE(n_components=5, init="pca", method="exact", random_state=2022)
    pca = PCA(n_components=5, random_state=2022)
    umap = UMAP(n_neighbors=35, n_components=5, min_dist=0.0, metric="euclidean", random_state=2022)

    dimension_reduction_models = [tsne, pca, umap]
    dimension_reduction_model_names = ["t-SNE", "PCA", "UMAP"]

    for embedding_model, ename in zip(embedding_models, embedding_model_names):
        for dimension_reduction_model, dname in zip(dimension_reduction_models, dimension_reduction_model_names):
            print(f"{datetime.now().strftime('%H:%M:%S')}: Computing coherence score: {ename} with {dname}.")
            topic_model, topics = train_bertopic(tweets, embedding_model, dimension_reduction_model)
            score = get_coherence_score(tweets, topic_model, topics, "u_mass")
            print(
                f"{datetime.now().strftime('%H:%M:%S')}: {ename} with {dname} UMass score: {score}."
            )