In [139]:
import numpy as np
from bertopic import BERTopic
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
import plotly.express as px
from ast import literal_eval
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(2018)


In [54]:
# Load the three CSV shards of tweets (each already carries a topic assignment).
tweets1, tweets2, tweets3 = map(
    pd.read_csv,
    (
        "./cleaned_data/final_w_topics_1.csv",
        "./cleaned_data/final_w_topics_2.csv",
        "./cleaned_data/final_w_topics_3.csv",
    ),
)

In [55]:
# Stack the three shards vertically into one frame (each shard keeps its own row labels).
shards = [tweets1, tweets2, tweets3]
tweets = pd.concat(shards)

In [56]:
# Load the saved BERTopic model and pull its per-topic summary table
# (the output below shows Topic, Count and Name columns).
topic_model = BERTopic.load("./models/model_bert")
topic_map = topic_model.get_topic_info()
topic_map.head()

Unnamed: 0,Topic,Count,Name
0,-1,46838,-1_ballot_gop_trump_republicans
1,0,1418,0_abortion_abortionrights_roe_women
2,1,1268,1_twitter_elonmusk_elon_musk
3,2,1101,2_ohio_ryan_vance_tim
4,3,1017,3_early_voting_earlyvoting_find


In [57]:
# Map the keywords/name of each topic onto every tweet via its topic ID.
tweets = tweets.merge(topic_map, how='left', left_on='topic', right_on='Topic')
# Strip the leading "<id>_" prefix from the topic name. The vectorised .str accessor
# propagates NaN for tweets whose topic has no match in topic_map (left join),
# whereas a Python-level x.split(...) raises AttributeError on a float NaN.
tweets['Name'] = tweets['Name'].str.split('_', n=1).str[1]

In [58]:
# Row count of the tweets frame after the topic-name merge.
len(tweets)

819128

In [59]:
# Parse the raw creation timestamps into a datetime column named 'date'.
created_at = tweets['parsed_created_at']
tweets['date'] = pd.to_datetime(created_at)

In [60]:
# Count tweets per topic within weekly bins anchored on Mondays.
# The count column comes out of .size().reset_index() with the integer label 0.
weekly_bins = pd.Grouper(key='date', freq='W-MON')
weekly_breakdown = (
    tweets
    .groupby([weekly_bins, 'topic'])
    .size()
    .reset_index()
)
weekly_breakdown.head()

Unnamed: 0,date,topic,0
0,2022-09-26 00:00:00+00:00,-1,26435
1,2022-09-26 00:00:00+00:00,0,572
2,2022-09-26 00:00:00+00:00,1,28
3,2022-09-26 00:00:00+00:00,2,1527
4,2022-09-26 00:00:00+00:00,3,142


In [61]:
# Weekly tweet counts per topic as a bar chart (known to be ugly / hard to read).
fig = px.bar(
    data_frame=weekly_breakdown,
    x='date',
    y=0,  # unnamed count column produced by .size().reset_index()
    color='topic',
)
fig.show()

In [62]:
# The most popular topics from 7 Nov 2022 onward. Topic == -1 is for generic tweets
# (generally about the election but no specific topic).
# The cutoff is written in ISO 8601 form: the previous '11-07-2022' literal is only
# read as 7 November because pandas defaults to month-first parsing.
recent_tweets = tweets.loc[tweets['date'] >= '2022-11-07']
time_period_t = recent_tweets.groupby(['topic']).size().reset_index()
time_period_t.head()

Unnamed: 0,topic,0
0,-1,273968
1,0,9278
2,1,4805
3,2,3554
4,3,3907


In [63]:
# Most popular specific topics (9 rows). The generic bucket (topic == -1) is excluded
# explicitly instead of dropping whichever row happens to sort first, so the result
# stays correct even if -1 is not the largest group in the selected period.
chart_sample = (
    time_period_t
    .loc[time_period_t['topic'] != -1]
    .sort_values(0, ascending=False)
    .head(9)
)

In [64]:
# topic_model and topic_map were already loaded from "./models/model_bert" in an
# earlier cell and have not been modified since, so the model is not reloaded here;
# just preview the topic table again.
topic_map.head()

Unnamed: 0,Topic,Count,Name
0,-1,46838,-1_ballot_gop_trump_republicans
1,0,1418,0_abortion_abortionrights_roe_women
2,1,1268,1_twitter_elonmusk_elon_musk
3,2,1101,2_ohio_ryan_vance_tim
4,3,1017,3_early_voting_earlyvoting_find


In [65]:
# Add the names for each topic by Topic ID.
chart_sample = chart_sample.merge(topic_map, how='left', left_on='topic', right_on='Topic')
# Drop the leading "<id>_" prefix from the name. The .str accessor yields NaN for
# topics missing from topic_map (left join) instead of raising AttributeError the
# way a Python-level x.split(...) does on NaN.
chart_sample['Name'] = chart_sample['Name'].str.split('_', n=1).str[1]
chart_sample.head()

Unnamed: 0,topic,0,Topic,Count,Name
0,0,9278,0,1418,abortion_abortionrights_roe_women
1,54,9188,54,334,karilake_lake_kari_arizona
2,6,8811,6,812,ballots_count_mail_votes
3,14,8053,14,578,red_wave_redwave_tsunami
4,4,7007,4,943,florida_desantis_rubio_charliecrist


In [66]:
# Horizontal bar chart of the most popular topics, not including the general one.
fig = px.bar(
    data_frame=chart_sample,
    x=0,  # tweet-count column (integer label 0 from .size().reset_index())
    y='Name',
    orientation='h',
)
fig.show()

# BERTopic viz

In [67]:
# Two-dimensional map of the topics produced by BERTopic.
# NOTE(review): originally described as "principal components"; BERTopic's docs
# describe a different 2-D reduction for this plot — confirm before quoting it as PCA.
# This suggests we could've gotten away with using even fewer topics.
topic_model.visualize_topics()

# Time series

In [68]:
# Tweet counts per topic, largest first (count column has the integer label 0).
top_topics = (
    tweets
    .groupby(['topic'])
    .size()
    .reset_index()
    .sort_values(0, ascending=False)
)
# Six most popular specific topics. The generic bucket (topic == -1) is filtered out
# explicitly rather than by skipping the first sorted row, which silently assumed
# that -1 is always the largest group.
top_topics_list = top_topics.loc[top_topics['topic'] != -1, 'topic'].head(6)

In [69]:
# Tweets belonging to the selected top topics. An explicit copy is taken because a
# later cell adds/overwrites a column on this frame; assigning into a slice of
# `tweets` triggers SettingWithCopyWarning and is not guaranteed to stick.
top_t_tweets = tweets.loc[tweets['topic'].isin(top_topics_list)].copy()
len(top_t_tweets)

89127

In [70]:
# The most popular topics grouped by individual day.
# .assign builds a new frame instead of writing a column into what may be a slice of
# `tweets`, which avoids SettingWithCopyWarning.
top_t_tweets = top_t_tweets.assign(
    date=pd.to_datetime(top_t_tweets['parsed_created_at']).dt.date
)
grouped_tweets = (
    top_t_tweets
    .groupby(['topic', 'Name', 'date'])
    .size()
    .to_frame('size')
    .reset_index()
)
grouped_tweets

Unnamed: 0,topic,Name,date,size
0,0,abortion_abortionrights_roe_women,2022-09-21,1
1,0,abortion_abortionrights_roe_women,2022-09-22,97
2,0,abortion_abortionrights_roe_women,2022-09-23,148
3,0,abortion_abortionrights_roe_women,2022-09-24,157
4,0,abortion_abortionrights_roe_women,2022-09-25,118
...,...,...,...,...
297,71,disclose_money_block_act,2022-11-06,1
298,71,disclose_money_block_act,2022-11-07,10
299,71,disclose_money_block_act,2022-11-08,12
300,71,disclose_money_block_act,2022-11-09,13


In [71]:
# Exclude topic 71 from the daily counts.
# NOTE(review): presumably dropped because it is bursty rather than continuous — confirm.
# Copy so that columns can be added to the result without SettingWithCopyWarning.
simple_grouped_tweets = grouped_tweets.loc[grouped_tweets['topic'] != 71].copy()

In [72]:
# Natural log of the daily counts. .assign returns a new frame, so nothing is written
# into a slice of grouped_tweets (avoids SettingWithCopyWarning).
simple_grouped_tweets = simple_grouped_tweets.assign(
    size_log=np.log(simple_grouped_tweets['size'])
)

In [73]:
# Plot for continuous topics, i.e. topics that were generally popular throughout
# the collection frame; one line per topic name, log-scaled daily counts.
fig = px.line(
    data_frame=simple_grouped_tweets,
    x='date',
    y='size_log',
    color='Name',
)
fig.show()

In [91]:
# Looking for topics with high variance in daily tweet volume.
# NOTE: this overwrites tweets['date'] with plain calendar dates; the later cells that
# group by 'date' rely on this daily granularity.
tweets['date'] = pd.to_datetime(tweets['parsed_created_at']).dt.date
daily_counts = tweets.groupby(['topic', 'date']).size().to_frame('size').reset_index()
# Aggregate only the 'size' column. Aggregating the whole frame also applies var/median
# to the object-typed 'date' column, which raises TypeError on pandas >= 2.0 (older
# versions silently dropped it). Selecting the column also yields flat column names,
# so no MultiIndex flattening/renaming is needed.
top_variance = (
    daily_counts
    .groupby('topic')['size']
    .agg(['var', 'median'])
    .reset_index()
)
# Variance of the daily counts divided by their median.
top_variance['avg_var'] = top_variance['var'] / top_variance['median']
top_variance = top_variance.sort_values('avg_var', ascending=False)
top_variance.head()

Unnamed: 0,topic,var,median,avg_var
72,71,6393939.0,6.0,1065656.0
0,-1,505451800.0,4491.0,112547.7
55,54,773291.3,10.0,77329.13
52,51,123311.8,3.0,41103.94
64,63,445423.5,14.5,30718.86


In [127]:
# Take the topics at positions 15-19 of the var/median ranking and pull their tweets.
top_variance_list = top_variance.iloc[15:20]['topic']
in_selected_topics = tweets['topic'].isin(top_variance_list)
top_v_tweets = tweets.loc[in_selected_topics]
len(top_v_tweets)

24690

In [128]:
# Daily tweet counts for the selected high-variance topics, plus their natural log.
# Note: this rebinds grouped_tweets, replacing the frame built for the top topics.
daily_sizes = top_v_tweets.groupby(['topic', 'Name', 'date']).size()
grouped_tweets = daily_sizes.to_frame('size').reset_index()
grouped_tweets['size_log'] = np.log(grouped_tweets['size'])
grouped_tweets

Unnamed: 0,topic,Name,date,size,size_log
0,10,seats_repr_held_condemns,2022-09-22,22,3.091042
1,10,seats_repr_held_condemns,2022-09-23,3,1.098612
2,10,seats_repr_held_condemns,2022-09-24,14,2.639057
3,10,seats_repr_held_condemns,2022-09-25,9,2.197225
4,10,seats_repr_held_condemns,2022-09-26,13,2.564949
...,...,...,...,...,...
240,61,brazil_bolsonaro_lula_versa,2022-11-06,12,2.484907
241,61,brazil_bolsonaro_lula_versa,2022-11-07,123,4.812184
242,61,brazil_bolsonaro_lula_versa,2022-11-08,56,4.025352
243,61,brazil_bolsonaro_lula_versa,2022-11-09,43,3.761200


In [130]:
# Some topics with really high variance in popularity day-to-day.
# Lines are coloured by topic name (as in the continuous-topics plot) so the legend
# shows readable keywords instead of bare topic IDs.
fig = px.line(grouped_tweets, x='date', y='size_log', color='Name')
fig.show()

# Mentions

In [140]:
# Parse the stringified mention lists into real Python lists. Only strings are parsed:
# this keeps the cell re-runnable (values that are already lists pass through) and
# leaves missing values (NaN) untouched instead of raising inside literal_eval.
tweets['mentions'] = tweets['mentions'].apply(
    lambda x: literal_eval(x) if isinstance(x, str) else x
)
# One row per (tweet, mention) pair, then count how often each account is mentioned.
mention_tweets = tweets.explode('mentions')
grouped_mentions = (
    mention_tweets
    .groupby(['mentions'])
    .size()
    .to_frame('size')
    .reset_index()
    .sort_values("size", ascending=False)
)
grouped_mentions.head()

Unnamed: 0,mentions,size
11913,@JohnFetterman,23640
7123,@DrOz,22084
12139,@JoshShapiroPA,21409
29523,@cz_binance,10158
20823,@SBF_FTX,10152


In [142]:
# Horizontal bar chart of the ten most-mentioned accounts.
most_mentioned = grouped_mentions.head(10)
fig = px.bar(
    data_frame=most_mentioned,
    x='size',
    y='mentions',
    orientation='h',
)
fig.show()