In [1]:
import gensim
from gensim.utils import simple_preprocess
from gensim import corpora, models
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import pandas as pd
np.random.seed(2018)
import nltk
# nltk.download('wordnet')

In [2]:
# Load the pre-cleaned tweet export. The cells below rely on its `tweet_type`
# and `clean_text` columns.
tweets = pd.read_csv("./cleaned_data/2022-11-07_clean.csv")

In [3]:
# Keep only rows whose tweet_type is 'original' or 'reply'.
tweets = tweets.loc[lambda frame: frame['tweet_type'].isin(['original', 'reply'])]

In [4]:
# Keep only tweets with at least 10 whitespace-separated tokens. Rows whose
# clean_text is missing compare as False here and are dropped as well.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells write to it directly instead of triggering pandas'
# SettingWithCopyWarning on a filtered slice.
tweets = tweets.loc[tweets['clean_text'].str.split().str.len() >= 10].copy()

In [5]:
# Collect every @handle found in each tweet (done before the handles are
# stripped from the text). Series.str.findall applies the regex element-wise,
# so this does not depend on `re` being in scope -- no explicit `import re`
# appears in the visible cells.
tweets['mentions'] = tweets['clean_text'].str.findall(r"@[a-zA-Z0-9_]*")

In [6]:
# Remove @handles from the text. The vectorized .str.replace(regex=True)
# performs the same substitution as re.sub per element, without relying on
# `re` being in scope (no explicit `import re` appears in the visible cells).
tweets['clean_text'] = tweets['clean_text'].str.replace(r"@[a-zA-Z0-9_]*", "", regex=True)

In [7]:
# Remove :colon_delimited: codes (the pattern matches a colon, any run of
# letters/digits/underscores, and a closing colon). The vectorized
# .str.replace(regex=True) performs the same substitution as re.sub per
# element, without relying on `re` being in scope.
tweets['clean_text'] = tweets['clean_text'].str.replace(r":[a-zA-Z0-9_]*:", "", regex=True)

In [9]:
# Collapse every run of non-ASCII-letter characters (digits, punctuation,
# whitespace, ...) into a single space. The vectorized .str.replace(regex=True)
# performs the same substitution as re.sub per element, without relying on
# `re` being in scope.
tweets['clean_text'] = tweets['clean_text'].str.replace(r"[^A-Za-z]+", " ", regex=True)

In [10]:
# Built once and reused: the original constructed a fresh WordNetLemmatizer for
# every single token passed to lemmatize_stemming().
_lemmatizer = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Lemmatize one token as a verb, then stem the lemma.

    Args:
        text: A single token (string).

    Returns:
        The result of stemming the verb lemma of ``text`` with the
        module-level ``stemmer``.

    Note:
        ``stemmer`` is looked up when the function is called, so it must be
        defined before the first call (it is created later in this cell).
    """
    return stemmer.stem(_lemmatizer.lemmatize(text, pos='v'))
def preprocess(text):
    """Tokenize ``text`` and return the normalized tokens worth keeping.

    Tokens are produced by gensim's ``simple_preprocess``. Stopwords and
    tokens of three characters or fewer are discarded; every remaining token
    is passed through ``lemmatize_stemming``.

    Args:
        text: The raw document string.

    Returns:
        A list of lemmatized and stemmed tokens, in document order.
    """
    return [
        lemmatize_stemming(word)
        for word in simple_preprocess(text)
        if len(word) > 3 and word not in STOPWORDS
    ]

# English Snowball stemmer used by lemmatize_stemming(). The function looks the
# name up at call time, so defining it after the functions works as long as
# this cell has finished running before preprocess() is first called.
stemmer = SnowballStemmer("english")

In [11]:
# Sanity check on a single tweet: print its raw space-separated pieces next to
# the output of preprocess().
# .iloc[0] selects the first remaining row by position. The label-based
# lookup [0] raises KeyError whenever the row labelled 0 has been removed by
# the filters above.
doc_sample = tweets['clean_text'].iloc[0]
print('original document: ')
words = doc_sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['', 'Sorry', 'he', 'will', 'still', 'be', 'your', 'President', 'on', 'November', 'GOP', 'Midterms', 'BlueWave', '']


 tokenized and lemmatized document: 
['sorri', 'presid', 'novemb', 'midterm', 'bluewav']


In [12]:
# Apply preprocess() to every tweet; the result is a Series holding one list of
# normalized tokens per tweet, indexed like `tweets`.
processed_tweets = tweets['clean_text'].map(preprocess)

In [13]:
# Build the gensim vocabulary (token <-> integer id mapping) from all token lists.
dictionary = gensim.corpora.Dictionary(processed_tweets)

In [14]:
# Prune the vocabulary in place. Per gensim's documentation of these arguments:
#   no_below=400 -> drop tokens that appear in fewer than 400 documents
#   no_above=0.3 -> drop tokens that appear in more than 30% of documents
#   keep_n=5000  -> afterwards keep at most the 5000 most frequent tokens
# NOTE(review): the thresholds are hard-coded; confirm they suit the corpus size.
dictionary.filter_extremes(no_below=400, no_above=0.3, keep_n=5000)

In [15]:
# Convert each token list into a bag-of-words vector: a list of
# (token_id, count) pairs, using the pruned dictionary's ids.
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_tweets]

In [16]:
# Inspect one bag-of-words vector. NOTE: despite its name, bow_doc_0 holds the
# document at position 2 of the corpus.
bow_doc_0 = bow_corpus[2]
for token_id, token_count in bow_doc_0:
    print("Word {} (\"{}\") appears {} time.".format(token_id,
                                               dictionary[token_id],
                                               token_count))

Word 13 ("import") appears 1 time.
Word 14 ("latest") appears 1 time.
Word 15 ("listen") appears 1 time.
Word 16 ("live") appears 1 time.
Word 17 ("podcast") appears 1 time.


In [17]:
# Fit a TF-IDF model on the bag-of-words corpus, then apply it to that same
# corpus to obtain TF-IDF-weighted document vectors.
# NOTE(review): corpus_tfidf is what the LDA model below is trained on. LDA is
# usually formulated over raw term counts -- confirm TF-IDF input is intended.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [31]:
# Train a 30-topic LDA model on the TF-IDF corpus (2 passes, 4 worker processes).
# random_state pins the model's own RNG to the seed used for NumPy in the first
# cell, so initialisation no longer depends on how much of the global NumPy
# random stream earlier (possibly out-of-order) cell runs have consumed.
# NOTE(review): multi-worker training may still not be bit-for-bit repeatable.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=30,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=2018,
)
# for idx, topic in lda_model_tfidf.print_topics(-1):
#     print('Topic: {} Word: {}'.format(idx, topic))

  and should_run_async(code)


In [32]:
from gensim.models import CoherenceModel

# Despite the printed label, log_perplexity() returns gensim's per-word
# likelihood bound (hence the negative number), not the perplexity itself;
# gensim defines perplexity as 2 ** (-bound).
print('\nPerplexity: ', lda_model_tfidf.log_perplexity(corpus_tfidf))  
# For the value printed here, higher (closer to zero) is better; that
# corresponds to a lower perplexity.
# NOTE(review): it is computed on the same corpus the model was trained on, so
# it does not measure held-out fit.

  and should_run_async(code)



Perplexity:  -7.48545499597439


In [20]:
# for n in [50,55,60,65,70,75,80,85,90]:
#     print(n)
#     lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=n, id2word=dictionary, passes=2, workers=4)
#     print('Perplexity: ', lda_model_tfidf.log_perplexity(corpus_tfidf[:100000]))
    

# Visualizing the Topics

In [34]:
# Show the highest-weighted words of the fitted topics.
# NOTE(review): called without arguments, so gensim's defaults decide how many
# topics and words are listed. The model has 30 topics -- confirm whether all
# of them should be shown (e.g. print_topics(-1)).
lda_model_tfidf.print_topics()

  and should_run_async(code)


[(16,
  '0.050*"senat" + 0.032*"send" + 0.030*"illinoi" + 0.029*"leader" + 0.027*"write" + 0.027*"tell" + 0.027*"want" + 0.026*"ask" + 0.025*"surpris" + 0.022*"attempt"'),
 (4,
  '0.049*"democrat" + 0.048*"joebiden" + 0.043*"independ" + 0.042*"republican" + 0.038*"fact" + 0.031*"past" + 0.027*"especi" + 0.023*"nomine" + 0.022*"biden" + 0.022*"victori"'),
 (10,
  '0.068*"seat" + 0.038*"senat" + 0.034*"florida" + 0.032*"hous" + 0.024*"flip" + 0.022*"california" + 0.021*"desanti" + 0.020*"republican" + 0.019*"predict" + 0.018*"choos"'),
 (12,
  '0.051*"research" + 0.042*"reject" + 0.041*"result" + 0.038*"year" + 0.035*"twitter" + 0.035*"medic" + 0.031*"door" + 0.028*"judg" + 0.026*"mehmet" + 0.021*"come"'),
 (20,
  '0.033*"women" + 0.030*"immigr" + 0.030*"climat" + 0.026*"polici" + 0.025*"posit" + 0.023*"local" + 0.021*"law" + 0.020*"illeg" + 0.019*"newyork" + 0.019*"border"'),
 (11,
  '0.045*"legisl" + 0.040*"court" + 0.030*"presid" + 0.030*"elector" + 0.030*"trump" + 0.023*"donald" + 0.

In [33]:
import pyLDAvis.gensim_models

# Prepare the pyLDAvis visualization data from the fitted model, the TF-IDF
# corpus and the dictionary, then render it inline.
# sort_topics=False: presumably keeps pyLDAvis' topic order aligned with the
# model's own topic order instead of re-sorting -- confirm against pyLDAvis docs.
lda_display = pyLDAvis.gensim_models.prepare(lda_model_tfidf, corpus_tfidf, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

  and should_run_async(code)
  default_term_info = default_term_info.sort_values(


# Grid Search 

In [35]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

  and should_run_async(code)


In [36]:
# Store the token lists on the frame, plus a space-joined string version of
# each list (the form passed to the scikit-learn vectorizer in the next cell).
tweets['processed'] = processed_tweets
tweets['processed_test'] = [
    ' '.join(str(token) for token in tokens)
    for tokens in tweets['processed']
]

  and should_run_async(code)


In [37]:
# Build a document-term count matrix from the space-joined token strings.
# NOTE(review): CountVectorizer() runs with its default settings, so it applies
# its own tokenization on top of preprocess() -- confirm that is intended.
vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(tweets['processed_test'])

  and should_run_async(code)
