# Purpose: 

Experiment with and investigate BERTopic.


In [1]:
import pandas as pd 
import re 
import json 
import os 
import datetime
import matplotlib.pyplot as plt
import mplcursors
import gensim, spacy, logging, warnings
from gensim import corpora
import pyLDAvis
from collections import Counter
%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
from typing import Dict, Any, Union
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sklearn.metrics import silhouette_score
from sentence_transformers import SentenceTransformer
import numpy as np


In [7]:
# Load spaCy's small English pipeline ("en_core_web_sm") with the 'parser' component disabled.
nlp = spacy.load("en_core_web_sm", disable=['parser']) 
# Added these extra words to the stopword list: after the first round of pre-processing, I checked the frequency of words in our corpus.
# Domain-specific words such as "train", "customer" and "service", which occurred among the top 200 most frequent words, can be removed because they do not add any additional
# value to the extraction of topics.

In [20]:
%%time
# Run the named-entity helper (defined outside this excerpt) over `all_sentences` and unpack its three results.
# NOTE(review): judging by the names, these are an entity -> count mapping plus lists of DATE and GPE entities — confirm against the helper.
ner_dict,date_list, gpe_list = get_named_entity_word_counts(all_sentences)

CPU times: user 5.1 s, sys: 182 ms, total: 5.28 s
Wall time: 5.28 s


In [86]:
# Embed the cleaned reviews, fit BERTopic on those embeddings, and score the resulting
# (non-outlier) clusters with the silhouette coefficient.

# Hand the libraries a plain list of strings instead of a pandas Series: both
# `SentenceTransformer.encode` and `BERTopic.fit_transform` are documented to take a list of
# strings, and `encode` indexes its input by position, which on a Series is a label lookup that
# fails (or picks the wrong rows) whenever the frame's index is not 0..n-1, e.g. after filtering.
docs = train_reviews_df['final_text'].tolist()

sentence_model = SentenceTransformer("all-MiniLM-L6-v2")  # sentence transformer that converts text into a dense vector representation
embeddings = sentence_model.encode(docs, show_progress_bar=True)  # generate one sentence embedding per review

# Train our topic model using our pre-trained sentence-transformers embeddings
# (random_state is fixed so the dimensionality reduction is repeatable between runs)
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
# Initiate BERTopic
topic_model = BERTopic(umap_model=umap_model, embedding_model=sentence_model,
                       language="english",
                       calculate_probabilities=True,
                       nr_topics="auto",
                       top_n_words=30,
                       n_gram_range=(1, 2)
                       )
topics, probs = topic_model.fit_transform(docs, embeddings)

# Generate `X` and `labels` only for non-outlier topics (as they are technically not clusters).
# Topic -1 is BERTopic's outlier bucket; `flatnonzero` yields an integer index array, which stays
# a valid index even when no document was assigned to a real topic.
umap_embeddings = topic_model.umap_model.transform(embeddings)
indices = np.flatnonzero(np.asarray(topics) != -1)
X = umap_embeddings[indices]
labels = [topics[index] for index in indices]

# Calculate silhouette score (last expression, so the notebook displays it)
silhouette_score(X, labels)

Batches:   0%|          | 0/52 [00:00<?, ?it/s]

0.4833881

In [1]:
# Visualize intertopic distance for whichever `topic_model` is currently bound in the kernel;
# a fitting cell must have been run first so that `topic_model` exists.
topic_model.visualize_topics()

In [22]:
def keep_adj(text):
    """
    Reduce a piece of text to just its adjectives.

    Args:
    text (str): The text whose adjectives should be kept.

    Returns:
    str: The adjectives found in the text, in their original order,
         separated by single spaces (empty string if there are none).
    """
    adjectives = []

    # Run the text through the SpaCy pipeline and walk over its tokens
    for token in nlp(text):
        # Only tokens tagged with the coarse part-of-speech 'ADJ' are retained
        if token.pos_ == 'ADJ':
            adjectives.append(token.text)

    # Collapse the collected adjectives into one space-separated string
    return ' '.join(adjectives)
# Keep only the adjectives of each review: this strips noise from the reviews that needed further
# investigation, and adjectives give good insight into the customers' emotions.
train_reviews_2['adj'] = train_reviews_2['final_text'].apply(keep_adj)

In [23]:
# Embed the adjective-only reviews, fit BERTopic on those embeddings, and score the resulting
# (non-outlier) clusters with the silhouette coefficient.

# Hand the libraries a plain list of strings instead of a pandas Series: both
# `SentenceTransformer.encode` and `BERTopic.fit_transform` are documented to take a list of
# strings, and `encode` indexes its input by position, which on a Series is a label lookup that
# fails (or picks the wrong rows) whenever the frame's index is not 0..n-1, e.g. after filtering.
# NOTE(review): `keep_adj` returns '' for a review with no adjectives; presumably such rows get
# identical embeddings — confirm they are not inflating the silhouette score.
adj_docs = train_reviews_2['adj'].tolist()

sentence_model = SentenceTransformer("all-MiniLM-L6-v2")  # sentence transformer that converts text into a dense vector representation
embeddings = sentence_model.encode(adj_docs, show_progress_bar=True)  # generate one sentence embedding per review

# Train our topic model using our pre-trained sentence-transformers embeddings
# (random_state is fixed so the dimensionality reduction is repeatable between runs)
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
# Initiate BERTopic
topic_model = BERTopic(umap_model=umap_model, embedding_model=sentence_model,
                       language="english",
                       calculate_probabilities=True,
                       nr_topics="auto",
                       top_n_words=30,
                       n_gram_range=(1, 2)
                       )
topics, probs = topic_model.fit_transform(adj_docs, embeddings)

# Generate `X` and `labels` only for non-outlier topics (as they are technically not clusters).
# Topic -1 is BERTopic's outlier bucket; `flatnonzero` yields an integer index array, which stays
# a valid index even when no document was assigned to a real topic.
umap_embeddings = topic_model.umap_model.transform(embeddings)
indices = np.flatnonzero(np.asarray(topics) != -1)
X = umap_embeddings[indices]
labels = [topics[index] for index in indices]

# Calculate silhouette score (last expression, so the notebook displays it)
silhouette_score(X, labels)

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

0.69791675

In [120]:
# Final cell: prints a marker so it is visible that execution reached the end of the notebook.
print('End of Notebook')

End of Notebook
