In [3]:
import re
import nltk
import gensim
import gensim.corpora as corpora
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
import warnings
warnings.filterwarnings("ignore")

In [4]:
dataset = pd.read_csv("data/tweets_01-08-2021.csv")

In [5]:
trump = dataset

In [6]:
print(len(dataset)) #the length of the data
print(type(dataset)) # the type of variable the data is stored in
print(dataset[:1]) # the first instance of the content within the data

56571
<class 'pandas.core.frame.DataFrame'>
                  id                                               text  \
0  98454970654916608  Republicans and Democrats have both created ou...   

  isRetweet isDeleted     device  favorites  retweets                 date  \
0         f         f  TweetDeck         49       255  2011-08-02 18:07:48   

  isFlagged  
0         f  


In [7]:
dataset.head()

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged
0,98454970654916608,Republicans and Democrats have both created ou...,f,f,TweetDeck,49,255,2011-08-02 18:07:48,f
1,1234653427789070336,I was thrilled to be back in the Great city of...,f,f,Twitter for iPhone,73748,17404,2020-03-03 01:34:50,f
2,1218010753434820614,RT @CBS_Herridge: READ: Letter to surveillance...,t,f,Twitter for iPhone,0,7396,2020-01-17 03:22:47,f
3,1304875170860015617,The Unsolicited Mail In Ballot Scam is a major...,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f
4,1218159531554897920,RT @MZHemingway: Very friendly telling of even...,t,f,Twitter for iPhone,0,9081,2020-01-17 13:13:59,f


In [8]:
full_train = pd.DataFrame()
full_train['text'] = dataset['text']

In [9]:
tweets = full_train

In [10]:
tweets.head()

Unnamed: 0,text
0,Republicans and Democrats have both created ou...
1,I was thrilled to be back in the Great city of...
2,RT @CBS_Herridge: READ: Letter to surveillance...
3,The Unsolicited Mail In Ballot Scam is a major...
4,RT @MZHemingway: Very friendly telling of even...


In [11]:
na_count = tweets.isnull().sum()
print("Number of NA rows:", na_count)

Number of NA rows: text    0
dtype: int64


In [12]:
print(tweets.count())

text    56571
dtype: int64


In [13]:
trump.text = trump.apply(lambda row: re.sub(r"http\S+", "", row.text).lower(), 1)
trump.text = trump.apply(lambda row: " ".join(filter(lambda x:x[0]!="@", row.text.split())), 1)
trump.text = trump.apply(lambda row: " ".join(re.sub("[^a-zA-Z]+", " ", row.text).split()), 1)
trump = trump.loc[(trump.isRetweet == "f") & (trump.text != ""), :]
timestamps = trump.date.to_list()
tweets = trump.text.to_list()

In [14]:
tweets[0]

'republicans and democrats have both created our economic problems'

In [15]:
trump_tweets = pd.DataFrame()

In [16]:
trump_tweets['text'] = tweets

In [17]:
trump_tweets.head()

Unnamed: 0,text
0,republicans and democrats have both created ou...
1,i was thrilled to be back in the great city of...
2,the unsolicited mail in ballot scam is a major...
3,getting a little exercise this morning
4,thank you elise


In [18]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bhava\AppData\Roaming\nltk_data...


True

In [19]:
filtered_text = []
lemmatizer = WordNetLemmatizer()

for w in tweets:
      filtered_text.append(lemmatizer.lemmatize(w))
print(filtered_text[:10])

['republicans and democrats have both created our economic problems', 'i was thrilled to be back in the great city of charlotte north carolina with thousands of hardworking american patriots who love our country cherish our values respect our laws and always put america first thank you for a wonderful evening kag', 'the unsolicited mail in ballot scam is a major threat to our democracy amp the democrats know it almost all recent elections using this system even though much smaller amp with far fewer ballots to count have ended up being a disaster large numbers of missing ballots amp fraud', 'getting a little exercise this morning', 'thank you elise', 'as per your request joe', 'the threshold identification of ballots is turning out to be even bigger than originally anticipated a very large number of ballots are impacted stay tuned', 'i m running as a proud democrat for the senate sleepy joe biden today it s only going to get worse it is not sustainable for our county china will own us'

In [21]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [22]:
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

In [23]:
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [24]:
vectorizer_model = CountVectorizer(stop_words="english")

In [25]:
ctfidf_model = ClassTfidfTransformer()

In [26]:
from bertopic.representation import MaximalMarginalRelevance
representation_model = MaximalMarginalRelevance(diversity=0.5)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [27]:
topic_model = BERTopic(
                embedding_model=embedding_model,    # Step 1 - Extract embeddings
                  umap_model=umap_model,              # Step 2 - Reduce dimensionality
                  hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
                  vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
                  ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
                  representation_model=representation_model,
                  nr_topics=10                        # Step 6 - Diversify topic words
)

In [28]:
topics, probs = topic_model.fit_transform(filtered_text)

In [30]:
topics

[-1,
 -1,
 0,
 -1,
 1,
 -1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 -1,
 0,
 0,
 -1,
 0,
 -1,
 0,
 0,
 0,
 0,
 -1,
 0,
 0,
 1,
 -1,
 1,
 -1,
 -1,
 1,
 0,
 0,
 0,
 0,
 0,
 -1,
 -1,
 0,
 0,
 0,
 -1,
 0,
 0,
 -1,
 -1,
 0,
 0,
 -1,
 -1,
 0,
 0,
 0,
 -1,
 0,
 -1,
 0,
 -1,
 0,
 -1,
 1,
 -1,
 0,
 0,
 -1,
 0,
 0,
 -1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -1,
 2,
 -1,
 0,
 -1,
 0,
 0,
 0,
 0,
 0,
 -1,
 0,
 -1,
 -1,
 0,
 0,
 0,
 -1,
 1,
 -1,
 -1,
 0,
 -1,
 -1,
 0,
 -1,
 0,
 0,
 1,
 4,
 -1,
 -1,
 0,
 0,
 0,
 0,
 -1,
 -1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -1,
 0,
 2,
 0,
 0,
 0,
 -1,
 0,
 -1,
 -1,
 0,
 0,
 0,
 0,
 0,
 -1,
 -1,
 0,
 0,
 -1,
 -1,
 0,
 0,
 0,
 0,
 0,
 -1,
 0,
 -1,
 -1,
 -1,
 0,
 -1,
 0,
 0,
 -1,
 1,
 0,
 0,
 0,
 -1,
 0,
 0,
 -1,
 0,
 1,
 0,
 0,
 -1,
 0,
 0,
 -1,
 -1,
 0,
 0,
 -1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -1,
 1,
 0,
 -1,
 -1,
 1,
 -1,
 -1,
 0,
 0,
 0,
 -1,
 0,
 -1,
 -1,
 0,
 -1,
 -1,
 2,
 0,
 -1,
 -1,
 -1,
 -1,
 -1,
 0,
 0,
 -1,
 0,
 0,
 0,
 0,
 -1,
 0

In [31]:
topic_model.visualize_topics()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [32]:
topic_model.visualize_barchart()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [33]:
topic_model.visualize_heatmap()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [34]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,16086,-1_amp_obama_big_like,"[amp, obama, big, like, make, fake, today, tha...",[thank you governor make america safe and grea...
1,0,21517,0_amp_thank_china_vote,"[amp, thank, china, vote, make, obama, news, l...","[make america great again, thank you trump, th..."
2,1,2875,1_thanks_happy_birthday_christmas,"[thanks, happy, birthday, christmas, merry, mo...","[thanks, thanks, thanks]"
3,2,2500,2_apprentice_celebapprentice_celebrity_miss,"[apprentice, celebapprentice, celebrity, miss,...",[the celebrity apprentice sunday night at pm o...
4,3,1252,3_entrepreneurs_think_success_focus,"[entrepreneurs, think, success, focus, passion...",[entrepreneurs don t sell yourself short don t...
5,4,590,4_amp_order_big_thank,"[amp, order, big, thank, law, think, trump, ru...","[law amp order, my int with on obama being all..."
6,5,391,5_wind_warming_turbines_windmills,"[wind, warming, turbines, windmills, birds, fa...","[wind turbines are a disaster, for those that ..."
7,6,73,6_hunt_swamp_draining_continues,"[hunt, swamp, draining, continues, schneiderma...","[witch hunt, witch hunt, witch hunt]"
8,7,54,7_screen_apple_larger_mobile,"[screen, apple, larger, mobile, samsung, tmobi...",[i can t believe apple isn t moving faster to ...
9,8,17,8_pinehurst_golf_greens_critical,"[pinehurst, golf, greens, critical, looked, pl...",[matt you don t understand golf even though yo...


In [35]:
probs_df=pd.DataFrame(probs)
probs_df['main percentage'] = pd.DataFrame({'max': probs_df.max(axis=1)})
probs_text=probs_df[["main percentage"]]
text_topics_assinged=pd.concat([full_train,probs_text],axis=1)

In [36]:
text_topics_assinged.head()

Unnamed: 0,text,main percentage
0,Republicans and Democrats have both created ou...,0.0
1,I was thrilled to be back in the Great city of...,0.0
2,RT @CBS_Herridge: READ: Letter to surveillance...,1.0
3,The Unsolicited Mail In Ballot Scam is a major...,0.0
4,RT @MZHemingway: Very friendly telling of even...,1.0


In [37]:
topic_labels = topic_model.generate_topic_labels(topic_prefix=False,nr_words=5,
                                                 separator=",")

In [38]:
topic_model.set_topic_labels(topic_labels)

In [39]:
topic_list = topic_model.get_topic_info()
topic_list

Unnamed: 0,Topic,Count,Name,CustomName,Representation,Representative_Docs
0,-1,16086,-1_amp_obama_big_like,"amp,obama,big,like,make","[amp, obama, big, like, make, fake, today, tha...",[thank you governor make america safe and grea...
1,0,21517,0_amp_thank_china_vote,"amp,thank,china,vote,make","[amp, thank, china, vote, make, obama, news, l...","[make america great again, thank you trump, th..."
2,1,2875,1_thanks_happy_birthday_christmas,"thanks,happy,birthday,christmas,merry","[thanks, happy, birthday, christmas, merry, mo...","[thanks, thanks, thanks]"
3,2,2500,2_apprentice_celebapprentice_celebrity_miss,"apprentice,celebapprentice,celebrity,miss,melania","[apprentice, celebapprentice, celebrity, miss,...",[the celebrity apprentice sunday night at pm o...
4,3,1252,3_entrepreneurs_think_success_focus,"entrepreneurs,think,success,focus,passion","[entrepreneurs, think, success, focus, passion...",[entrepreneurs don t sell yourself short don t...
5,4,590,4_amp_order_big_thank,"amp,order,big,thank,law","[amp, order, big, thank, law, think, trump, ru...","[law amp order, my int with on obama being all..."
6,5,391,5_wind_warming_turbines_windmills,"wind,warming,turbines,windmills,birds","[wind, warming, turbines, windmills, birds, fa...","[wind turbines are a disaster, for those that ..."
7,6,73,6_hunt_swamp_draining_continues,"hunt,swamp,draining,continues,schneiderman","[hunt, swamp, draining, continues, schneiderma...","[witch hunt, witch hunt, witch hunt]"
8,7,54,7_screen_apple_larger_mobile,"screen,apple,larger,mobile,samsung","[screen, apple, larger, mobile, samsung, tmobi...",[i can t believe apple isn t moving faster to ...
9,8,17,8_pinehurst_golf_greens_critical,"pinehurst,golf,greens,critical,looked","[pinehurst, golf, greens, critical, looked, pl...",[matt you don t understand golf even though yo...


In [41]:
topic_list.to_excel('topic_list.xlsx')

In [None]:
import pickle

In [None]:
with open("Bertopic_model.pkl", "wb") as f:
    pickle.dump(topic_model, f)


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



In [42]:
tweets = pd.DataFrame({"Tweets": filtered_text,
                          "ID": range(len(filtered_text)),
                          "Topic": topics})
tweets_per_topic = tweets.groupby(['Topic'], as_index=False).agg({'Tweets': ' '.join})
cleaned_topics = topic_model._preprocess_text(tweets_per_topic.Tweets.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names_out()
tokens = [analyzer(doc) for doc in cleaned_topics]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [[words for words, _ in topic_model.get_topic(topic)]
               for topic in range(len(set(topics))-1)]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words,
                                 texts=tokens,
                                 corpus=corpus,
                                 dictionary=dictionary,
                                 coherence='c_v')
coherence = coherence_model.get_coherence()

In [43]:
print(coherence)

0.5807259485336854


In [44]:
def get_topics_with_coherence():
  topic_list.to_excel('topic_list.xlsx'),coherence

In [45]:
topic_model.save("topicmodel.pkl",serialization="pickle")

