In [1]:
import os
import re

import nltk
import numpy as np
import pandas as pd
from bertopic import BERTopic
from nltk.corpus import stopwords
# Seed NumPy's global random number generator.
# NOTE(review): this only seeds NumPy; confirm whether it is enough to make the
# BERTopic runs further down repeatable.
np.random.seed(2018)

# English stop words from NLTK. On a machine where the corpus has never been
# downloaded the lookup raises LookupError, so fetch it once and retry.
try:
    stop = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stop = stopwords.words('english')

# Extra terms filtered out together with the stop words during text cleaning.
stop.extend(['election','vote','midterms','midterm','elect','elections', 'senate', 'house',"us","democracy", "amp", "midtermelections"])

In [2]:
# Read the cleaned tweet CSV into a DataFrame.
clean_tweets_path = "./cleaned_data/2022-11-07_clean.csv"
tweets = pd.read_csv(clean_tweets_path)

In [3]:
# Keep only the rows whose tweet_type is 'original' or 'reply'.
keep_types = ['original','reply']
tweets = tweets[tweets['tweet_type'].isin(keep_types)]

In [4]:
# Clean the text: record the mentioned users, strip everything that is not a
# letter (numbers, punctuation), lowercase, and drop stop words.
stop_words = set(stop)  # set lookup instead of scanning the list for every word
raw_text = tweets['clean_text']

# Mentions are extracted before punctuation is stripped. The handle must have at
# least one character (`+`); with `*` a lone "@" was recorded as a mention.
tweets['mentions'] = raw_text.apply(lambda text: re.findall(r"@[a-zA-Z0-9_]+", text))
tweets['clean_text'] = (
    raw_text
    .apply(lambda text: re.sub(r"[^A-Za-z]+", " ", text))
    .str.lower()
    .apply(lambda text: ' '.join(word for word in text.split() if word not in stop_words))
)

# Keep tweets with at least 10 words left after cleaning. reset_index() renumbers
# the rows and keeps the previous index as a column.
tweets = tweets.loc[tweets['clean_text'].str.split().str.len() >= 10].reset_index()

In [20]:
# Train the model, this was done using a subset of ~200000 tweets to save time.
# fit_transform returns a (topics, probabilities) pair, so unpack it instead of
# binding the whole tuple to `topics`.
topic_model = BERTopic(calculate_probabilities = False)
topics, topic_probs = topic_model.fit_transform(tweets['clean_text'])

In [21]:
# Breakdown of each topic: one row per topic with its id (Topic), the number of
# tweets assigned to it (Count) and its generated name (Name).
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,34246,-1_people_ballot_get_win
1,0,1034,0_twitter_elonmusk_elon_musk
2,1,932,1_abortion_abortionrights_forcedbirth_reproductive
3,2,737,2_florida_desantis_charliecrist_vbm
4,3,649,3_inflation_economy_reduction_recession
...,...,...,...
999,998,10,998_everyones_finances_worlds_hopeless
1000,999,10,999_archer_bonuses_crossbones_ps
1001,1000,10,1000_mailbox_stitt_flag_tulsa
1002,1001,10,1001_fraud_fraudulent_legit_nuisance


In [22]:
# We can reduce the number of topics so that they are more concise.
# The model is updated by this call; the reduced assignments are read back from
# topic_model.topics_ in the next cell.
topic_model.reduce_topics(tweets['clean_text'], nr_topics=75)

<bertopic._bertopic.BERTopic at 0x22d38e9b310>

In [23]:
# Read the per-document topic ids from the reduced model and store them as a
# new 'topic' column on the tweets frame.
topics = topic_model.topics_
tweets['topic'] = topics

In [24]:
# First ten rows of the topic table after the reduction step.
topic_model.get_topic_info().head(10)

Unnamed: 0,Topic,Count,Name
0,-1,46838,-1_ballot_gop_trump_republicans
1,0,1418,0_abortion_abortionrights_roe_women
2,1,1268,1_twitter_elonmusk_elon_musk
3,2,1101,2_ohio_ryan_vance_tim
4,3,1017,3_early_voting_earlyvoting_find
5,4,943,4_florida_desantis_rubio_charliecrist
6,5,923,5_inflation_economy_womansrights_england
7,6,812,6_ballots_count_mail_votes
8,7,705,7_biden_joe_democrats_president
9,8,698,8_georgia_turnout_gapol_staceyabrams


In [25]:
# Widen the column display so the full cleaned text is shown, then preview the
# first four tweets next to their assigned topic.
pd.set_option("display.max_colwidth", 1000)
tweets.loc[:, ['clean_text','topic']].head(4)

Unnamed: 0,clean_text,topic
0,mid term upon public advocate surveyed candidates key races across country florida race val demings democratic pro family marco rubio republican pro family floridaelections,4
1,aftunion latest episode uniontalk podcast live listen important roundtable featuring rweingarten nikki congress rosadelauro repjahanahayes zimmermanforny,37
2,pennsylvania four living former governors monday sent letter main party candidates vying position nov urging respect accept results regardless outcome mail invoting,6
3,melissajpeltier documentary worth time game disillusioned trump voters tell stories watch retweet votebluefordemocracy,-1


In [26]:
# Tweets that ended up in topic -1, renumbered from zero for the second pass.
remaining = tweets[tweets['topic'].eq(-1)].reset_index()

In [27]:
# From here on can be ignored. I attempted to rerun the docs in the general
# category (topic = -1) to see if I could extract additional meaning. It looked
# like it might have worked, but I ran out of time to include this.

# fit_transform returns a (topics, probabilities) pair, so unpack it instead of
# binding the whole tuple to `topics`.
topic_model_2 = BERTopic(calculate_probabilities = False)
topics, topic_probs_2 = topic_model_2.fit_transform(remaining['clean_text'])

In [28]:
# Topic table for the second model, fitted only on the topic -1 tweets.
topic_model_2.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,22994,-1_voting_democrats_republicans_ballot
1,0,1573,0_women_abortion_rights_lgbtq
2,1,1063,1_fetterman_oz_pennsylvania_johnfetterman
3,2,743,2_georgia_walker_herschel_herschelwalker
4,3,718,3_russia_ukraine_russian_putin
...,...,...,...
473,472,10,472_bail_prosecute_lawbreakers_vtgop
474,473,10,473_minnesota_pollwatchmn_mprnews_requested
475,474,10,474_confront_extremist_extremists_stephenm
476,475,10,475_govotenow_joycewhitevance_felt_earlyvote


In [29]:
# Reduce the second model to 30 topics, then overwrite the 'topic' column of
# `remaining` with the second model's assignments. Order matters: topics_ must
# be read after reduce_topics has run.
topic_model_2.reduce_topics(remaining['clean_text'], nr_topics=30)
topics = topic_model_2.topics_
remaining['topic'] = topics

In [30]:
# Topic table for the second model after the reduction step.
topic_model_2.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,33679,-1_democrats_ballot_gop_voting
1,0,1812,0_women_abortion_rights_children
2,1,1342,1_fetterman_oz_pennsylvania_droz
3,2,767,2_georgia_walker_herschel_herschelwalker
4,3,756,3_russia_ukraine_russian_putin
5,4,681,4_money_dark_billion_spending
6,5,514,5_gas_prices_oil_opec
7,6,500,6_button_check_mark_liunavotes
8,7,428,7_michigan_tudordixon_whitmer_gretchenwhitmer
9,8,402,8_crossbones_skull_ps_harder


In [31]:
# Persist both fitted models. Create the target folder first so a missing
# ./models directory does not make the save fail after the long training runs.
os.makedirs("./models", exist_ok=True)
topic_model.save("./models/model_bert")
topic_model_2.save("./models/model_bert2")

  self._set_arrayXarray(i, j, x)
  self._set_arrayXarray(i, j, x)
