In [1]:
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Example loading from a CSV file
antiwork = pd.read_excel("../data/1000_per_subreddit/long_query subreddit_antiwork sort_new t_all n_1000.xlsx")


In [3]:
texts = antiwork['Body'].astype(str).tolist()

In [4]:
# Initialize BERTopic model
topic_model = BERTopic(calculate_probabilities=True)

# Fit the model to your texts
print("Training model...")
topics, probs = topic_model.fit_transform(texts)


In [5]:
# Get the topics with their respective words
topics_overview = topic_model.get_topic_info()

# Print topics overview
print(topics_overview)

# Access words of a specific topic (e.g., topic 0)
print(topic_model.get_topic(0))

    Topic  Count                                  Name  \
0      -1     32                      -1_im_my_and_job   
1       0    190              0_nan_lord_chode_decreed   
2       1    150                      1_the_of_to_that   
3       2     70                    2_people_to_the_of   
4       3     60            3_we_project_they_projects   
5       4     60                       4_the_and_to_of   
6       5     50                        5_was_my_to_me   
7       6     40         6_onboarding_tracking_ai_they   
8       7     37               7_ai_workers_is_hearing   
9       8     30                        8_my_am_to_and   
10      9     30           9_tech_layoffs_companies_of   
11     10     30                  10_gt_she_minutes_me   
12     11     29                     11_it_and_the_was   
13     12     25              12_fucking_for_dont_have   
14     13     21          13_idea_replace_release_ceos   
15     14     20              14_change_system_push_or   
16     15     

In [8]:
# Plot topics
topic_model.visualize_topics()


In [None]:
# Plot topic probability distribution
topic_model.visualize_distribution(probs[0])

In [10]:
# Save the model
topic_model.save("bertopic_model")

# Load the model
loaded_model = BERTopic.load("bertopic_model")



In [26]:
topic_names = {}
for i in range(len(topics_overview)):
    topic_names[topics_overview['Topic'][i]] = topics_overview['Name'][i]

{-1: '-1_im_my_and_job',
 0: '0_nan_lord_chode_decreed',
 1: '1_the_of_to_that',
 2: '2_people_to_the_of',
 3: '3_we_project_they_projects',
 4: '4_the_and_to_of',
 5: '5_was_my_to_me',
 6: '6_onboarding_tracking_ai_they',
 7: '7_ai_workers_is_hearing',
 8: '8_my_am_to_and',
 9: '9_tech_layoffs_companies_of',
 10: '10_gt_she_minutes_me',
 11: '11_it_and_the_was',
 12: '12_fucking_for_dont_have',
 13: '13_idea_replace_release_ceos',
 14: '14_change_system_push_or',
 15: '15_checklist_experience_late_message',
 16: '16_amazon_dsp_the_they',
 17: '17_you_we_your_well',
 18: '18_direct_crosschq_past_references',
 19: '19_100x_engineer_engineers_great',
 20: '20_job_my_bad_to',
 21: '21_he_department_him_told',
 22: '22_ever_merit_lego_with'}

In [28]:
antiwork['Topic'] = topics
antiwork['Topic_name'] = antiwork['Topic'].map(topic_names)
antiwork.to_excel("../data/1000_per_subreddit/antiwork_topics.xlsx", index=False)