In [10]:
# !pip install flair
import pandas as pd
from bertopic import BERTopic
from nltk.corpus import stopwords
import regex as re

from flair.embeddings import TransformerDocumentEmbeddings
from sentence_transformers import SentenceTransformer

In [11]:
stop = stopwords.words('english')
def preprocess(text):
    # split into words
    text = text.split()

    sentence = ' '.join([word for word in text if word.lower() not in stop])
    # remove punctuation
    sentence = re.sub(r'[^\w\s]','',sentence)
    # remove numbers
    sentence = re.sub(r'\d+','',sentence)
    # remove whitespaces
    sentence = re.sub(r'\s+',' ',sentence)
    # remove single characters
    sentence = re.sub(r'\s+[a-zA-Z]\s+',' ',sentence)
    
    return sentence

In [12]:
posts = pd.read_csv('../data/pyt_posts.csv')

ids = posts["Id"].tolist()
texts = posts["Body"].tolist()
doc = []
to_remove = []
print(len(ids), len(texts))

for i in range(len(texts)):
    clean = preprocess(texts[i])
    if clean != "":
        doc.append(clean)
    else:
        to_remove.append(i)

ids = [i for j, i in enumerate(ids) if j not in to_remove]
print(len(ids), len(doc))

24140 24140
24140 24140


In [17]:
roberta = TransformerDocumentEmbeddings('roberta-base')
sentence_model = SentenceTransformer('sentence-transformers/paraphrase-albert-base-v2')
topic_model = BERTopic(embedding_model=roberta, language="english", calculate_probabilities=True, n_gram_range=(1,2), min_topic_size=30)
topics, probabilities = topic_model.fit_transform(doc)

In [18]:
topic_model.visualize_barchart(top_n_topics=20, title="Roberta, 1-2 n-grams, min_topic_size=30")

In [6]:
topic_model.save("albert_model")

In [7]:
bertModel = BERTopic.load("albert_model")

In [8]:
vis = bertModel.visualize_barchart(top_n_topics=8, title="Topics Word Scores")

#save the visualization as pdf using matplotlib
import matplotlib.pyplot as plt
plt.savefig("bertopic.pdf")


<Figure size 640x480 with 0 Axes>

In [9]:
# Get the topic labels for each topic
topic_labels = model.get_topic_labels()

# Create a dictionary to store the categories
categories = {}
for i, label in enumerate(topic_labels):
    category = categories.get(label, {'count': 0, 'topics': []})
    category['count'] += 1
    category['topics'].append(topics[i])
    categories[label] = category

# Save each category to a new file
for label, category in categories.items():
    file_name = f'category_{label}.csv'
    category_data = data.loc[topic_labels == label]
    category_data.to_csv(file_name, index=False)

    print(f'Category {label} - {category_data.shape[0]} posts')
    print('Topics:')
    for topic in category['topics']:
        print('- ' + topic)
    print()

[1,
 -1,
 -1,
 0,
 0,
 0,
 30,
 -1,
 0,
 1,
 25,
 0,
 13,
 14,
 -1,
 0,
 22,
 1,
 1,
 1,
 -1,
 1,
 -1,
 0,
 -1,
 0,
 11,
 11,
 -1,
 2,
 0,
 -1,
 1,
 0,
 -1,
 11,
 -1,
 8,
 -1,
 -1,
 3,
 0,
 -1,
 0,
 0,
 0,
 0,
 0,
 0,
 -1,
 1,
 -1,
 -1,
 0,
 18,
 -1,
 -1,
 0,
 0,
 -1,
 18,
 -1,
 16,
 0,
 0,
 -1,
 1,
 1,
 3,
 2,
 -1,
 -1,
 0,
 14,
 -1,
 8,
 8,
 -1,
 -1,
 10,
 1,
 12,
 -1,
 -1,
 -1,
 0,
 0,
 0,
 -1,
 -1,
 18,
 1,
 37,
 -1,
 1,
 -1,
 0,
 1,
 0,
 1,
 0,
 -1,
 10,
 0,
 11,
 -1,
 -1,
 1,
 3,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 -1,
 3,
 -1,
 -1,
 2,
 0,
 -1,
 -1,
 -1,
 8,
 -1,
 5,
 -1,
 -1,
 1,
 -1,
 -1,
 -1,
 8,
 2,
 -1,
 -1,
 0,
 -1,
 -1,
 1,
 -1,
 1,
 1,
 -1,
 -1,
 1,
 17,
 1,
 -1,
 -1,
 8,
 -1,
 1,
 -1,
 0,
 -1,
 -1,
 -1,
 -1,
 1,
 -1,
 3,
 3,
 -1,
 -1,
 -1,
 16,
 0,
 -1,
 1,
 8,
 1,
 6,
 1,
 -1,
 -1,
 -1,
 12,
 1,
 -1,
 -1,
 1,
 0,
 0,
 1,
 -1,
 2,
 -1,
 -1,
 2,
 -1,
 1,
 0,
 -1,
 -1,
 -1,
 1,
 -1,
 11,
 -1,
 -1,
 -1,
 -1,
 2,
 -1,
 0,
 18,
 2,
 -1,
 -1,
 8,
 -1,
 -1,
 0,
 5,
 1,