In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer

import re

In [None]:
!pip install bertopic

In [3]:
from bertopic import BERTopic

In [4]:
# Mount Google Drive at /content/drive so files stored there can be read from
# this Colab session.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
cd './drive/MyDrive/dawn_chorus'

/content/drive/MyDrive/dawn_chorus


In [6]:
# Load the tweet export. low_memory=False makes pandas infer each column's dtype
# from the whole file instead of chunk by chunk, which avoids mixed-type columns.
# NOTE(review): the saved output of this cell is the tail of a warning raised
# during the read — presumably pandas' mixed-dtype warning; confirm it is gone.
df = pd.read_csv('pa_reps2022-07-19.csv', low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

### Let's narrow down the analysis to a few columns

In [9]:
# Work on an independent copy that holds only the columns used in this analysis,
# so later cleaning steps never modify `df`.
tweets_df = df.loc[
    :, ['district', 'Representative', 'party', 'text', 'followers_count']
].copy()

In [10]:
# Preview the first rows of the narrowed frame.
tweets_df.head()

Unnamed: 0,district,Representative,party,text,followers_count
0,PA-01,Brian Fitzpatrick,Republican,"Now more than ever, it is critical that we pro...",21431
1,PA-01,Brian Fitzpatrick,Republican,Thank you Dr. Umar Farooq and the Association ...,21431
2,PA-01,Brian Fitzpatrick,Republican,"Today marks the launch of 988, a new lifesavin...",21431
3,PA-01,Brian Fitzpatrick,Republican,I am proud to cosponsor this bipartisan legisl...,21431
4,PA-01,Brian Fitzpatrick,Republican,Joe was a recognized leader in Bristol and Buc...,21431


We have tweets from all 18 of Pennsylvania's congressional representatives.

#### Clean the tweets

In [11]:
# Character class of everything treated as an emoji. Compiled once here instead
# of on every call.
#
# Only emoji-related code points are listed. A blanket range from U+24C2 up to
# U+1F251 would also delete ordinary text such as CJK or Hangul, so the few
# Basic Multilingual Plane emoji above U+2BEF are named individually instead.
_EMOJI_PATTERN = re.compile(
    "["
    "\U00010000-\U0010FFFF"  # all supplementary-plane characters (emoticons,
                             # pictographs, transport symbols, flags, ...)
    "\u2500-\u2BEF"          # box drawing through misc. symbols and arrows
                             # (includes dingbats and U+2600-U+2B55)
    "\u24C2"                 # circled latin capital letter M
    "\u303D"                 # part alternation mark
    "\u3297\u3299"           # circled ideographs "congratulation" / "secret"
    "\u3030"                 # wavy dash
    "\u231a"                 # watch
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward symbol
    "\u200d"                 # zero-width joiner used inside emoji sequences
    "\ufe0f"                 # variation selector-16 (emoji presentation)
    "]+",
    re.UNICODE,
)


def remove_emojis(data):
    '''
    Removes emojis and pictographic symbols from a string

    data: the text to clean (str)
    returns: the text with every run of matching characters deleted; all
             other characters, including non-Latin scripts, are left as is
    '''
    return _EMOJI_PATTERN.sub('', data)

def url_free_text(text):
    '''
    Cleans text from urls and decodes the HTML-escaped ampersand

    text: the text to clean (str)
    returns: the text with every run starting at "http" up to the next
             whitespace removed, and each "&amp;" turned back into "&"
    '''
    text = re.sub(r'http\S+', '', text)
    # Replace only the literal entity. Replacing the bare substring 'amp' would
    # also cut into ordinary words such as "campaign" or "example".
    text = text.replace('&amp;', '&')
    return text

# Callable handed to `Series.apply`; it forwards each tweet to `remove_emojis`.
call_emoji_free = lambda x: remove_emojis(x)

# Overwrite the 'text' column with its cleaned version: emojis are stripped
# first, then `url_free_text` is applied to the result.
tweets_df['text'] = tweets_df['text'].apply(call_emoji_free).apply(url_free_text)

In [12]:
# Preview the frame again now that the 'text' column has been cleaned.
tweets_df.head()

Unnamed: 0,district,Representative,party,text,followers_count
0,PA-01,Brian Fitzpatrick,Republican,"Now more than ever, it is critical that we pro...",21431
1,PA-01,Brian Fitzpatrick,Republican,Thank you Dr. Umar Farooq and the Association ...,21431
2,PA-01,Brian Fitzpatrick,Republican,"Today marks the launch of 988, a new lifesavin...",21431
3,PA-01,Brian Fitzpatrick,Republican,I am proud to cosponsor this bipartisan legisl...,21431
4,PA-01,Brian Fitzpatrick,Republican,Joe was a recognized leader in Bristol and Buc...,21431


In [13]:
# Collect the cleaned tweets into a plain Python list, one document per tweet.

docs = tweets_df['text'].tolist()

In [14]:
# Inspect the first document as a quick sanity check.
docs[0]

'Now more than ever, it is critical that we provide necessary support and an expansion of resources for the millions of Americans who struggle with mental health. '

### Topic modeling

In [None]:
# Fit BERTopic on the tweet documents and keep both values returned by
# fit_transform: `topics` and `probs`.
# NOTE(review): no random seed is configured here. BERTopic's documentation
# describes its default dimensionality reduction as stochastic, so topic ids and
# counts may differ between runs — confirm, and pin a seed if the numbers quoted
# in the prose below need to be reproducible.
topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(docs)

### Explore topics

In [19]:
# Topic summary table. Its first ten rows are the -1 bucket followed by the
# nine topics that contain the most tweets.

freq = topic_model.get_topic_info()
freq.head(10)

Unnamed: 0,Topic,Count,Name
0,-1,18381,-1_to_in_and_our
1,0,1237,0_food_agriculture_farmers_farm
2,1,702,1_border_southern_illegal_crisis
3,2,543,2_ukraine_ukrainian_russian_putin
4,3,495,3_gun_violence_guns_laws
5,4,460,4_unemployment_jobs_added_rate
6,5,442,5_abortion_roe_reproductive_womens
7,6,427,6_inflation_bidenflation_spending_bidens
8,7,388,7_patcunnane_wildclayton_mikeschlossberg_dwp1726
9,8,378,8_childtaxcredit_payments_child_expanded


In [20]:
# The full table has 615 rows: 614 topics (0 through 613) plus the -1 bucket
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,18381,-1_to_in_and_our
1,0,1237,0_food_agriculture_farmers_farm
2,1,702,1_border_southern_illegal_crisis
3,2,543,2_ukraine_ukrainian_russian_putin
4,3,495,3_gun_violence_guns_laws
...,...,...,...
611,610,10,610_pretzel_nationalpretzelday_revonah_monroe
612,611,10,611_child_care_repkclark_essential
613,612,10,612_firefighters_prac_wrightcentergme_state
614,613,10,613_laborcaucus_wellequipped_caucus_ty


Topic -1 is the bucket for tweets that were not assigned to any topic. Unfortunately, 18,381 tweets fall into it, meaning they did not match any of the identified topics. Also note that the least popular topics at the bottom of the table contain only about 10 tweets each.

### Visualize the topics

Below we will visualize how the top 20 topics cluster together and how they relate to each other

In [22]:
# Render BERTopic's topic visualization, limited to the top 20 topics.
topic_model.visualize_topics(top_n_topics=20)

We can see that the top 20 topics form three clusters. Each of those clusters seems to be quite distinct from the others, as they are far apart on the graph.

In [23]:
# Render the hierarchy view for the top 20 topics.
topic_model.visualize_hierarchy(top_n_topics=20)

Above, we can see the top 20 topics and how they relate to each other. 

### Top words found in the top 20 topics

In [24]:
# Render one bar chart of top words per topic, for the top 20 topics.
topic_model.visualize_barchart(top_n_topics=20)