In [33]:
# Standard library
import os

# Third-party: text preprocessing, storage, topic modelling, configuration,
# dimension reduction
import nltk
import pymongo
from bertopic import BERTopic
from dotenv import load_dotenv
from umap import UMAP

# Text preprocessing: fetch the NLTK resources this notebook relies on
# (nltk.download reports packages that are already present as up-to-date).
for resource_name in ('stopwords', 'omw-1.4', 'wordnet'):
    nltk.download(resource_name)
wn = nltk.WordNetLemmatizer()

# Load settings via python-dotenv (the MongoDB URI is read from the
# environment further down). Kept as the final expression so the cell
# displays the call's return value.
load_dotenv()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/leowalker/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/leowalker/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/leowalker/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
# Connect to MongoDB using the URI stored in the `mongodb_uri` environment
# variable. Fail fast when it is missing: os.getenv() would return None and
# MongoClient(None) would silently target the default localhost instance
# instead of the intended deployment.
mongodb_uri = os.getenv('mongodb_uri')
if not mongodb_uri:
    raise RuntimeError(
        "Environment variable 'mongodb_uri' is not set; "
        "define it in .env or export it before running this cell."
    )
client = pymongo.MongoClient(mongodb_uri)

# Database / collection holding the transcript answers analysed below.
DB_NAME = "hibt_transcripts"
COLLECTION_NAME = "hibt_answer_collection"
# NOTE(review): not referenced by any of the visible cells — confirm it is still needed.
ATLAS_VECTOR_SEARCH_INDEX_NAME = "hibt_test_index"
MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]

In [18]:
# Query the collection for every document whose "answer" is anything other
# than the "Question/Answer not found." placeholder.
not_found_placeholder = "Question/Answer not found."
answer_filter = {"answer": {"$ne": not_found_placeholder}}

answers = MONGODB_COLLECTION.find(answer_filter)

In [19]:
# Materialise the query result into a list so it can be iterated more than once.
documents = [document for document in answers]

In [21]:
# Keep only documents that actually carry a text answer. The `$ne` filter used
# for the query also matches documents with no "answer" field at all, and the
# preprocessing that follows calls str.split() on every entry, so missing or
# non-string answers are skipped here instead of raising KeyError /
# AttributeError later.
answer_list = [
    doc["answer"]
    for doc in documents
    if isinstance(doc.get("answer"), str)
]

In [34]:
# Load NLTK's default English stopword list and print it for reference
# (the actual filtering happens in the next cell).
stopword_corpus = nltk.corpus.stopwords
stopwords = stopword_corpus.words('english')
print(f'There are {len(stopwords)} default stopwords. They are {stopwords}')

There are 179 default stopwords. They are ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'no

In [35]:
# Stopword removal followed by lemmatisation.
# Membership is tested against a frozenset (constant-time lookup) rather than
# the original list, and each answer is tokenised once; both outputs are the
# same strings the two-pass split/join version produced.
# NOTE(review): tokens come from whitespace splitting, so punctuation stays
# attached ("I," is not matched against the stopword "i") — confirm whether
# that is intended.
stopword_set = frozenset(stopwords)

# Whitespace-separated tokens of every answer with stopwords (case-insensitive) dropped.
filtered_tokens = [
    [w for w in answer.split() if w.lower() not in stopword_set]
    for answer in answer_list
]

answer_list_without_stopwords = [' '.join(tokens) for tokens in filtered_tokens]

# Lemmatise each remaining token with the WordNet lemmatizer.
lemmatized_answers = [
    ' '.join(wn.lemmatize(w) for w in tokens)
    for tokens in filtered_tokens
]


In [37]:
# Initiate UMAP.
# random_state pins the seed for the reduction; with a seed set UMAP forces
# single-threaded execution ("n_jobs value -1 overridden to 1 by setting
# random_state"), so n_jobs=1 is passed explicitly to make the configuration
# match what actually runs and to avoid that warning.
umap_model = UMAP(n_neighbors=15,
                  n_components=5,
                  min_dist=0.0,
                  metric='cosine',
                  random_state=100,
                  n_jobs=1)
# Initiate BERTopic with the custom UMAP model
topic_model = BERTopic(umap_model=umap_model, language="english", calculate_probabilities=True)
# Run BERTopic model on the stopword-filtered, lemmatised answers; returns the
# topic assignments and the probabilities for the fitted documents
topics, probabilities = topic_model.fit_transform(lemmatized_answers)


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [38]:
# Summary table of the fitted topics (one row per topic: Topic, Count, Name,
# Representation, Representative_Docs); left as the last expression so the
# notebook renders it.
topic_info = topic_model.get_topic_info()
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,72,-1_know_think_luck_like,"[know, think, luck, like, really, im, work, lu...","[know, tough one hard tell, right? I'm huge be..."
1,0,19,0_know_like_think_people,"[know, like, think, people, lot, work, really,...","[mean, would say lot hard work, lot hard work...."
2,1,14,1_luck_know_think_lucky,"[luck, know, think, lucky, like, thats, theres...",[think luck come people every single day use l...
3,2,12,2_like_think_know_luck,"[like, think, know, luck, people, life, hard, ...","[guess I, attribute luck, call luck. know, thi..."
