In [None]:
import os
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
from bertopic.representation import MaximalMarginalRelevance
from sklearn.feature_extraction.text import CountVectorizer
import copy

In [None]:
# load positives df, see how many are scored between .5 and .7
os.chdir('/data/kayla_data')
df = pd.read_csv('classified_positives_missingtweetsincluded_oct2424.csv', low_memory=False)
df = df[['tweet_id', 'text','predicted_label', 'score','predicted_label_numeric','user_id']]
print(df['score'].between(.5,.7).sum()/len(df)) 
print(df['score'].between(.9,1).sum()/len(df)) 

0.08014234093126142
0.7905302978259575


In [7]:
df.head()

Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0,tweet_id,text,predicted_label,score,Unnamed: 0.1,user_id,created_at_day,created_at_time,user_screen_name,favorite_count,retweet_count,predicted_label_numeric
0,0,0,0,954588503677001729,".@RepRichmond: ""…[Republicans] are unable to g...",takes a position,0.999771,,,,,,,,1
1,1,1,1,954550807025897473,Are you fired up by the #TrumpShutdown? Are yo...,takes a position,0.99971,,,,,,,,1
2,2,2,2,954549999290978304,Democrats are holding strong to say: #GOP cont...,takes a position,0.999847,,,,,,,,1
3,3,3,3,953700349885509633,"Today, three quarters of all immigrants to the...",takes a position,0.999444,,,,,,,,1
4,4,4,4,951144066426777600,"Starting today, I'm asking nominees to our cou...",takes a position,0.991061,,,,,,,,1


# Topic modeling with bertopic

In [3]:
# preprocess - remove RT @s at the start of tweets so it doesn't become a topic, remove urls
df['text'] = df['text'].str.replace(r'RT @', '', regex=True)
df['text'] = df['text'].str.replace(r'http\S+|www.\S+', '', regex=True)

get embeddings first so can load them (instead of rerunning them) in case of error. 

In [5]:
docs = df['text'].tolist()
print(torch.backends.mps.is_available())
sentence_model = SentenceTransformer("all-MiniLM-L6-v2", device = 'mps')
#embeddings = sentence_model.encode(docs, show_progress_bar=True)
#np.save('bertopic_new_oct2424/embeddings_oct2424_noRTnoHTTP.npy', embeddings)

True


Batches:   0%|          | 0/13647 [00:00<?, ?it/s]

In [6]:
# import the embeddings if needed
embeddings = np.load('bertopic_new_oct2424/embeddings_oct2424_noRTnoHTTP.npy')

Initial training of the model

In [7]:
# mmr at higher levels will put more diverse words together in a topic (range 0-1)
representation_model = MaximalMarginalRelevance(diversity=0.5)
vectorizer_model = CountVectorizer(stop_words="english")
topic_model = BERTopic(embedding_model=sentence_model, 
                       representation_model=representation_model, 
                       vectorizer_model = vectorizer_model,
                       n_gram_range=(1,3),
                       min_topic_size=300)
# if issue with frequent words that aren't stop words: ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

initial application of the model

In [8]:
topics, probs = topic_model.fit_transform(docs, embeddings)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [10]:

topic_model.save("bertopic_new_oct2424/topic models_soct2424", serialization="safetensors", save_ctfidf=True)

explore initial topics before doing any sort of reduction

In [9]:
print(topic_model.get_topic_info())
print(topic_model.get_topic(-1))

     Topic   Count                                               Name  \
0       -1  160557                 -1_workers_vote_healthcare_twitter   
1        0   15877                       0_prolife_abortions_born_roe   
2        1   13394                   1_educators_public_redfored_debt   
3        2   12652                      2_lgbtq_hrc_trans_equalityact   
4        3   10546                3_climatechange_epa_emissions_green   
..     ...     ...                                                ...   
160    159     317                       159_culinary226_union_las_nv   
161    160     314                160_phit_thesfia_fitness_inactivity   
162    161     309  161_nasw_socialwork_socialworkmonth_socialwork...   
163    162     304  162_geriatrics_amergeriatrics_seniors_olderadults   
164    163     303             163_licenses_ohio_freedomtowork_reform   

                                        Representation  \
0    [workers, vote, healthcare, twitter, tax, cong...   
1    [p

## reduce number of topics 

In [22]:
# reduce topics to 50
topic_model_50 = copy.deepcopy(topic_model)
topic_model_50.reduce_topics(docs, nr_topics=50)
topic_model_50.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,160557,-1_workers_twitter_trump_support,"[workers, twitter, trump, support, congress, h...",[The opportunity to join a union—no matter whe...
1,0,43573,0_medicareforall_opioid_nurses_prices,"[medicareforall, opioid, nurses, prices, obama...","[Thanks to the ACA's #Medicaid expansion, thou..."
2,1,25590,1_climate_infrastructure_spending_change,"[climate, infrastructure, spending, change, ca...",[If @JoeBiden's $4 trillion infrastructure pac...
3,2,23660,2_abortion_pro_hyde_roe,"[abortion, pro, hyde, roe, amendment, right, s...",[For those who claim Planned Parenthood isn't ...
4,3,19424,3_covid19_pandemic_coronavirus_hiv,"[covid19, pandemic, coronavirus, hiv, vaccines...",[COVID-19 has disproportionately impacted comm...
5,4,15747,4_lgbtq_hrc_discrimination_equalityact,"[lgbtq, hrc, discrimination, equalityact, penc...","[AlphonsoDavid: For far too long, anti-LGBTQ p..."
6,5,13394,5_educators_public_rweingarten_loan,"[educators, public, rweingarten, loan, redfore...","[AFSCME: We stand in solidarity with over 30,0..."
7,6,11632,6_veterans_thank_roseanndemoro_dcstatehood,"[veterans, thank, roseanndemoro, dcstatehood, ...","[As veterans, we dedicated our lives to this c..."
8,7,11335,7_kavanaugh_judges_barrett_nominees,"[kavanaugh, judges, barrett, nominees, courtsm...",[An increasing number of Americans don't want ...
9,8,9240,8_immigration_children_2020census_tps,"[immigration, children, 2020census, tps, asylu...",[Immigration has been a major topic of debate ...


In [23]:
# reduce topics to 30
topic_model_30 = copy.deepcopy(topic_model)
topic_model_30.reduce_topics(docs, nr_topics=30)
topic_model_30.get_topic_info()


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,160557,-1_workers_twitter_need_congress,"[workers, twitter, need, congress, support, ta...",[The opportunity to join a union—no matter whe...
1,0,66552,0_medicaid_care_covid19_aca,"[medicaid, care, covid19, aca, congress, cover...",[#Medicaid is proven to lift millions of peopl...
2,1,30320,1_climate_infrastructure_budget_energy,"[climate, infrastructure, budget, energy, cong...",[#InfrastructureIsCalling. Working people are ...
3,2,28179,2_daca_immigrants_educators_rweingarten,"[daca, immigrants, educators, rweingarten, dre...",[Tomorrow at 6PM -> Rights of Immigrant Studen...
4,3,25047,3_abortion_prolife_parenthood_women,"[abortion, prolife, parenthood, women, hyde, r...","[Abortion is violence., BREAKING: House passes..."
5,4,23084,4_lgbtq_hrc_trans_women,"[lgbtq, hrc, trans, women, union, discriminati...",[LGBTQ+ rights are human rights. The House jus...
6,5,19171,5_vote_georgia_veterans_thank,"[vote, georgia, veterans, thank, today, rosean...",[Be an early voter in Georgia! Take part in th...
7,6,12323,6_kavanaugh_supreme_judges_barrett,"[kavanaugh, supreme, judges, barrett, nominees...",[The stakes with this nomination are too high....
8,7,12094,7_naacp_doj_gunviolence_vanitaguptacr,"[naacp, doj, gunviolence, vanitaguptacr, floyd...",[Watch Kristen Clarke explain the moment she k...
9,8,11543,8_taxes_cuts_device_reform,"[taxes, cuts, device, reform, gop, repealdevic...",[@VernBuchanan A tax on medical devices is a t...


In [24]:
# reduce topics to 20
topic_model_20 = copy.deepcopy(topic_model)
topic_model_20.reduce_topics(docs, nr_topics=20)
topic_model_20.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,160557,-1_workers_twitter_trump_congress,"[workers, twitter, trump, congress, support, t...",[Are you ready to stand against rich and power...
1,0,92533,0_abortion_care_covid19_aca,"[abortion, care, covid19, aca, congress, twitt...",[Congress can't let the president undermine th...
2,1,65588,1_congress_schools_climate_families,"[congress, schools, climate, families, daca, i...","[""I am very concerned about what is going to b..."
3,2,28436,2_lgbtq_hrc_workers_trans,"[lgbtq, hrc, workers, trans, union, discrimina...",[The #EqualityAct would protect: 🗂4 million+ L...
4,3,19171,3_vote_georgia_thank_veterans,"[vote, georgia, thank, veterans, today, congre...","[.@POTUS is right: For ""We the People,"" for ou..."
5,4,17415,4_civil_justice_amacforamerica_naacp,"[civil, justice, amacforamerica, naacp, doj, n...",[Together with President Biden and Attorney Ge...
6,5,12323,5_kavanaugh_supreme_judges_barrett,"[kavanaugh, supreme, judges, barrett, senate, ...",[With Trump in the White House and Brett Kavan...
7,6,11543,6_taxes_cuts_device_reform,"[taxes, cuts, device, reform, gop, repealdevic...",[@RepStephenLynch A tax on medical devices is ...
8,7,9934,7_socialsecurity_benefits_medicare_retirees,"[socialsecurity, benefits, medicare, retirees,...",[Our Social Security system is vital in ensuri...
9,8,6186,8_cigarettes_products_youth_vaping,"[cigarettes, products, youth, vaping, menthol,...",[#Minnesota cities are acting to reduce tobacc...


## now reduce outliers

In [19]:
topic_model_50.vectorizer_model

In [25]:
# chaining some outlier reduction methods. with only distributions, 387 are left as outliers
new_topics_50 = topic_model_50.reduce_outliers(documents=docs, topics=topic_model_50.topics_, strategy="distributions")
new_topics_50 = topic_model_50.reduce_outliers(documents=docs, topics=new_topics_50, strategy="embeddings", embeddings=embeddings, threshold=.1)

#update topics. ensure vectorizor model is included to stop stopwords from reappearing
topic_model_50.update_topics(docs, topics=new_topics_50, vectorizer_model=topic_model_50.vectorizer_model)
# update frequencies
documents_outlierreduction_50 = pd.DataFrame({"Document": docs, "Topic": new_topics_50})
topic_model_50._update_topic_size(documents_outlierreduction_50)
# view
topic_model_50.get_topic_info()



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,67674,0_health_care_medicaid_aca,"[health, care, medicaid, aca, coverage, health...","[Thanks to the ACA's #Medicaid expansion, thou..."
1,1,34230,1_climate_infrastructure_budget_change,"[climate, infrastructure, budget, change, gove...",[If @JoeBiden's $4 trillion infrastructure pac...
2,2,29341,2_abortion_life_prolife_pro,"[abortion, life, prolife, pro, planned, parent...",[For those who claim Planned Parenthood isn't ...
3,3,25259,3_covid19_covid_19_pandemic,"[covid19, covid, 19, pandemic, vaccine, corona...",[COVID-19 has disproportionately impacted comm...
4,4,19787,4_lgbtq_hrc_equality_trans,"[lgbtq, hrc, equality, trans, transgender, wom...","[AlphonsoDavid: For far too long, anti-LGBTQ p..."
5,5,15855,5_schools_students_education_school,"[schools, students, education, school, teacher...","[AFSCME: We stand in solidarity with over 30,0..."
6,6,24928,6_thank_support_congress_senators,"[thank, support, congress, senators, members, ...","[As veterans, we dedicated our lives to this c..."
7,7,15879,7_kavanaugh_court_supreme_brett,"[kavanaugh, court, supreme, brett, judge, stop...",[An increasing number of Americans don't want ...
8,8,12695,8_immigration_immigrants_border_immigrant,"[immigration, immigrants, border, immigrant, c...",[Immigration has been a major topic of debate ...
9,9,11204,9_security_social_socialsecurity_medicare,"[security, social, socialsecurity, medicare, b...",[Social Security is more important than ever. ...


In [26]:
# reduce outliers
new_topics_30 = topic_model_30.reduce_outliers(documents=docs, topics=topic_model_30.topics_, strategy="distributions")
new_topics_30 = topic_model_30.reduce_outliers(documents=docs, topics=new_topics_30, strategy="embeddings", embeddings=embeddings, threshold=.1)

#update topics. ensure vectorizor model is included to stop stopwords from reappearing
topic_model_30.update_topics(docs, topics=new_topics_30, vectorizer_model=topic_model_30.vectorizer_model)

# update topic frequencies
documents_outlierreduction_30 = pd.DataFrame({"Document": docs, "Topic": new_topics_30})
topic_model_30._update_topic_size(documents_outlierreduction_30)
# view 
topic_model_30.get_topic_info()



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,102082,0_health_care_medicaid_patients,"[health, care, medicaid, patients, healthcare,...",[#Medicaid is proven to lift millions of peopl...
1,1,43444,1_climate_congress_infrastructure_budget,"[climate, congress, infrastructure, budget, go...",[#InfrastructureIsCalling. Working people are ...
2,2,33948,2_students_schools_education_immigration,"[students, schools, education, immigration, sc...",[Tomorrow at 6PM -> Rights of Immigrant Studen...
3,3,31774,3_abortion_life_prolife_pro,"[abortion, life, prolife, pro, women, planned,...","[Abortion is violence., BREAKING: House passes..."
4,4,34833,4_lgbtq_union_hrc_equality,"[lgbtq, union, hrc, equality, people, workers,...",[LGBTQ+ rights are human rights. The House jus...
5,5,43434,5_vote_voting_election_make,"[vote, voting, election, make, voter, today, t...",[Be an early voter in Georgia! Take part in th...
6,6,18095,6_kavanaugh_court_supreme_brett,"[kavanaugh, court, supreme, brett, judge, trum...",[The stakes with this nomination are too high....
7,7,19155,7_justice_violence_civil_gun,"[justice, violence, civil, gun, rights, racism...",[Watch Kristen Clarke explain the moment she k...
8,8,21317,8_tax_taxreform_taxes_cuts,"[tax, taxreform, taxes, cuts, repeal, reform, ...",[@VernBuchanan A tax on medical devices is a t...
9,9,12734,9_security_social_socialsecurity_medicare,"[security, social, socialsecurity, medicare, b...","[Donald Trump has promised to permanently ""ter..."


In [27]:
# reduce outliers
new_topics_20 = topic_model_20.reduce_outliers(documents=docs, topics=topic_model_20.topics_, strategy="distributions")
new_topics_20 = topic_model_20.reduce_outliers(documents=docs, topics=new_topics_20, strategy="embeddings", embeddings=embeddings, threshold=.1)

#update topics. ensure vectorizor model is included to stop stopwords from reappearing
topic_model_20.update_topics(docs, topics=new_topics_20, vectorizer_model=topic_model_20.vectorizer_model)

# update topic frequencies
documents_outlierreduction_20 = pd.DataFrame({"Document": docs, "Topic": new_topics_20})
topic_model_20._update_topic_size(documents_outlierreduction_20)
# view 
topic_model_20.get_topic_info()



Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,133404,0_health_care_abortion_medicaid,"[health, care, abortion, medicaid, access, pat...",[Congress can't let the president undermine th...
1,1,94641,1_congress_families_need_students,"[congress, families, need, students, schools, ...","[""I am very concerned about what is going to b..."
2,2,46560,2_workers_lgbtq_union_people,"[workers, lgbtq, union, people, hrc, equality,...",[The #EqualityAct would protect: 🗂4 million+ L...
3,3,45047,3_vote_voting_election_make,"[vote, voting, election, make, today, voter, t...","[.@POTUS is right: For ""We the People,"" for ou..."
4,4,28829,4_justice_violence_rights_civil,"[justice, violence, rights, civil, gun, racism...",[Together with President Biden and Attorney Ge...
5,5,18872,5_kavanaugh_court_supreme_trump,"[kavanaugh, court, supreme, trump, brett, judg...",[With Trump in the White House and Brett Kavan...
6,6,21846,6_tax_taxreform_taxes_cuts,"[tax, taxreform, taxes, cuts, repeal, reform, ...",[@RepStephenLynch A tax on medical devices is ...
7,7,13139,7_security_social_socialsecurity_medicare,"[security, social, socialsecurity, medicare, b...",[Our Social Security system is vital in ensuri...
8,8,6915,8_tobacco_cigarettes_products_smoking,"[tobacco, cigarettes, products, smoking, flavo...",[#Minnesota cities are acting to reduce tobacc...
9,9,5026,9_war_diplomacy_peace_yemen,"[war, diplomacy, peace, yemen, iran, congress,...",[Congress has to support #Diplomacy with #Nort...


Save topics as excel file w/ a sheet for each

In [28]:
with pd.ExcelWriter("bertopic_new_oct2424/topics_oct2424.xlsx", engine="openpyxl", mode="w") as writer:
    topic_model_50.get_topic_info()[["Topic", "Count", "Representation", "Representative_Docs"]].to_excel(writer, sheet_name="topics_50")
    topic_model_30.get_topic_info()[["Topic", "Count", "Representation", "Representative_Docs"]].to_excel(writer, sheet_name="topics_30")
    topic_model_20.get_topic_info()[["Topic", "Count", "Representation", "Representative_Docs"]].to_excel(writer, sheet_name="topics_20")

### visualization of the topics

In [None]:
topic_model_30.visualize_heatmap(n_clusters=5)

In [None]:
topic_model_50.visualize_barchart(topics=list(range(0,49)))