In [None]:
# Install bertopic if you don't already have it
pip install bertopic

In [None]:
# Install nltk if you don't already have it
pip install nltk

In [None]:
# Import libraries
from bertopic import BERTopic
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer



In [None]:
# NLTK set-up: import the package, download the resources this notebook asks
# for, then import the tokenizer and the stopword corpus reader.
import nltk

# 'stopwords' backs the stopword lists built later in the notebook;
# 'punkt_tab' is presumably the tokenizer data for word_tokenize -- TODO confirm.
for nltk_resource in ('stopwords', 'punkt_tab'):
    nltk.download(nltk_resource)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
# Load the bookshelf CSV, keep only the rows that have a Description, and
# preview the first few rows.
# `[your_filepath]` looks like a placeholder for the folder holding the CSV --
# replace it before running.
docs = (
    pd.read_csv(
        r"[your_filepath]\Bookshelf-2025-05-08.csv",
        dtype={'Description': str},  # read Description as text
        engine='python',
    )
    .dropna(subset=['Description'])  # discard rows with a missing Description
)
docs.head()

In [None]:
# Build one stopword set per language from NLTK's stopword corpus
# (English, Greek, German -- evaluated in that order).
# NOTE(review): despite its `_he` suffix, `stop_words_he` holds the *Greek*
# list; the name is kept because the stopword-removal cell refers to it.
stop_words_en, stop_words_he, stop_words_de = (
    set(stopwords.words(language))
    for language in ('english', 'greek', 'german')
)


In [None]:
# Remove English, Greek and German stopwords from every description.
# Each word is lower-cased before the lookup; without that, capitalised
# stopwords would not match the lists and would stay in the text.

# Work on an explicit copy so that renaming the columns and adding a column
# below cannot affect `docs`.
docs_df = docs.copy()
# Assumes the loaded CSV has exactly these three columns, in this order.
docs_df.columns = ['Title', 'Categories', 'Description']

# Merge the three language sets once, instead of probing each set per word.
all_stop_words = stop_words_en | stop_words_de | stop_words_he


def remove_stopwords(text, stop_words):
    """Return `text` without its stopwords.

    `text` is split on whitespace; every word whose lower-cased form is in
    `stop_words` is dropped, and the remaining words (original casing kept)
    are re-joined with single spaces.
    """
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)


docs_df['desc_without_stopwords'] = docs_df['Description'].apply(
    remove_stopwords, args=(all_stop_words,)
)
# Show the cleaned descriptions as the cell's output.
docs_df['desc_without_stopwords']

In [None]:
# Vectorizer that is handed to BERTopic further down: it counts both single
# words and two-word phrases (n-grams of length 1 to 2).
ngram_bounds = (1, 2)
vectorizer_model = CountVectorizer(ngram_range=ngram_bounds)

In [None]:
# BERTopic is given the documents as a plain Python list: take the
# stopword-free descriptions, drop any missing entries, and convert to a list.
docs_list = docs_df['desc_without_stopwords'].dropna().to_list()

In [None]:
# Create the BERTopic model. It is set to "multilingual" so that non-English
# book descriptions can be categorised, uses the n-gram vectorizer defined
# above, and is asked to calculate topic probabilities.
# NOTE(review): the original note also asked for a small number of words per
# topic / small categories, but no such argument (e.g. top_n_words or
# min_topic_size) is passed here, so the library defaults apply -- confirm.
topic_model = BERTopic(
    language="multilingual",
    calculate_probabilities=True,
    verbose=True,
    vectorizer_model=vectorizer_model,
)

In [None]:
# Fit the model on the prepared descriptions and unpack the two results:
# `topics` (presumably one topic id per document) and `probs` (the
# probabilities requested via calculate_probabilities=True).
fit_output = topic_model.fit_transform(docs_list)
topics, probs = fit_output

In [None]:
# Overview table of the topics found by the fitted model.
topic_overview = topic_model.get_topic_info()
topic_overview

In [None]:
# Inspect a single topic by its id (topic 3 here; change the id to look at
# another topic).
topic_id = 3
topic_model.get_topic(topic_id)

In [None]:
# Draw the hierarchical clustering of the topics (a dendrogram).
# Author's note: in the author's environment this cell reports an error but
# the dendrogram is still produced. The cause is not visible in this notebook.
topic_model.visualize_hierarchy()

In [None]:
# Draw a heatmap of topic similarities.
# Author's note: this cell reports an error in the author's environment, as
# the hierarchical clustering cell does. The author finds this diagram less
# useful than the dendrogram.
topic_model.visualize_heatmap()

In [None]:
# Per-document view: check the topic against each individual document in the
# list (the descriptions without stopwords).
document_info = topic_model.get_document_info(docs_list)
document_info