<h1>Twitter Data</h1>

<h3>Installation and import of libraries</h3>

In [2]:
!pip install pyLDAvis



In [3]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

import nltk
nltk.download('stopwords') #download if don't have yet
from nltk.tokenize import word_tokenize, RegexpTokenizer # tokenize words
from nltk.corpus import stopwords

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lindy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<h3>Viewing Data</h3>

In [4]:
# Path
folder_path = "../Twitter_Data/"
file1 = "Twitter_Covid-19_Lockdown_5000.csv"

In [5]:
twitter_lockdown_df = pd.read_csv(folder_path + file1)
twitter_lockdown_df

Unnamed: 0,author_id,conversation_id,id,edit_history_tweet_ids,text,created_at,withheld.copyright,withheld.country_codes,author_name,orginal_text,original_text
0,495071269,1640204550631043072,1640204550631043072,['1640204550631043072'],RT @Chris_EvansMP: Today marks three years sin...,2023-03-27T04:10:13.000Z,,,BarnettElaine,Today marks three years since the UK went int...,Today marks three years since the UK went int...
1,1564131612505149440,1639919857012469760,1640204157683417089,['1640204157683417089'],@RoastSmith_ I used play Fortnite a lot on my ...,2023-03-27T04:08:39.000Z,,,tanseus1,@RoastSmith_ I used play Fortnite a lot on my ...,@RoastSmith_ I used play Fortnite a lot on my ...
2,1548020151814987779,1640202097772601345,1640202097772601345,['1640202097772601345'],RT @Somali_ICS: If it wasn't for #Tiktok there...,2023-03-27T04:00:28.000Z,,,Somali_ICS,If it wasn't for #Tiktok there would've been ...,If it wasn't for #Tiktok there would've been ...
3,27968588,1640194922799144961,1640194922799144961,['1640194922799144961'],@NYCMayor @ericadamsfornyc time to change cour...,2023-03-27T03:31:57.000Z,,,vliscony,@NYCMayor @ericadamsfornyc time to change cour...,@NYCMayor @ericadamsfornyc time to change cour...
4,1428236322838220802,1640190517093990400,1640190517093990400,['1640190517093990400'],Feeling in the dumps because of lockdown? \nHe...,2023-03-27T03:14:27.000Z,,,CovidHelpBot,Feeling in the dumps because of lockdown? \nHe...,Feeling in the dumps because of lockdown? \nHe...
...,...,...,...,...,...,...,...,...,...,...,...
3745,1428236322838220802,1637683999882760192,1637683999882760192,['1637683999882760192'],Feeling unhappy because of lockdown? \nHere ar...,2023-03-20T05:14:26.000Z,,,CovidHelpBot,Feeling unhappy because of lockdown? \nHere ar...,Feeling unhappy because of lockdown? \nHere ar...
3746,1590162549525422080,1637617552351215616,1637683256580616192,['1637683256580616192'],@TRyanGregory Lockdowns make billionaires more...,2023-03-20T05:11:29.000Z,,,InfiniteKB_Com,@TRyanGregory Lockdowns make billionaires more...,@TRyanGregory Lockdowns make billionaires more...
3747,85601740,1637680366268841984,1637680366268841984,['1637680366268841984'],"Of not passing: homelessness, addiction, menta...",2023-03-20T05:00:00.000Z,,,Prison_Health,"Of not passing: homelessness, addiction, menta...","Of not passing: homelessness, addiction, menta..."
3748,1557364662546714626,1637561236274528257,1637678230713909250,['1637678230713909250'],"@olaadun @Samuelo84500495 @Dawa911 No, they wo...",2023-03-20T04:51:31.000Z,,,ElliotLinksync,"@olaadun @Samuelo84500495 @Dawa911 No, they wo...","@olaadun @Samuelo84500495 @Dawa911 No, they wo..."


<h3>Cleaning Data</h3>

In [9]:
def remove_user_mentions(text):
    return re.sub("@[A-Za-z0-9_]+","", text)

def remove_links(text):
    return re.sub(r'http\S+', '', text)

def remove_digit_strings(text):
    return re.sub(r'\d+', '', text)

def remove_special_chars(text):
    remove_chars = '[0-9’!"#$%&\'()*+,-./:;<=>?@，。?★、…【】《》？“”‘’！[\\]^_`{|}~]+'
    return re.sub(remove_chars, ' ', text)

def clean_text(text):
    result_text = text
    result_text = remove_user_mentions(result_text)
    result_text = remove_links(result_text)
    result_text = remove_digit_strings(result_text)
    result_text = remove_special_chars(result_text)
    return result_text

In [10]:
'''
Clean text, and check for empty strings / strings containing only whitespace
'''
texts = twitter_lockdown_df["text"].tolist()
for i in range(len(texts)):
    texts[i] = clean_text(texts[i])

In [12]:
print(len(texts))
print(texts[0])
print(texts[1])

3750
RT   Today marks three years since the UK went into lockdown   

On this National Day of Reflection  I visited the  
 I used play Fortnite a lot on my Android in COVID   lockdown and Chapter  Midas revenge was the best one


<h3>Tokenizing Data</h3>

In [13]:
# Stopwords
stop_words = stopwords.words('english')
exclude_words = stop_words

#exclude common words 
exclude_words_extra = ["RT","still","covid","lockdown", "pandemic","get","go","im","ive","would","one","also","to"]

exclude_words.extend(exclude_words_extra)

In [15]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(texts))

In [16]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['rt', 'today', 'marks', 'three', 'years', 'since', 'the', 'uk', 'went_into', 'lockdown', 'on', 'this', 'national', 'day', 'of', 'reflection', 'visited', 'the']


In [17]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in exclude_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [18]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['today', 'mark', 'year', 'go', 'lockdown', 'national', 'day', 'reflection', 'visit']]


In [19]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]]


[[('day', 1),
  ('go', 1),
  ('lockdown', 1),
  ('mark', 1),
  ('national', 1),
  ('reflection', 1),
  ('today', 1),
  ('visit', 1),
  ('year', 1)]]

<h3>LDA Model</h3>

In [20]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

# supporting function 2
def compute_perplexity_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    perplexity_score = lda_model.log_perplexity(corpus_sets[i])
    
    return perplexity_score

In [None]:
import numpy as np
import tqdm

grid = {}
grid['Validation_Set'] = {}
# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')
# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')
# Validation sets
num_of_docs = len(corpus)
corpus_sets = [# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25), 
               # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5), 
               # gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]
corpus_title = ['100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': [],
                 'Perplexity': []
                }
# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=271)
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    p = compute_perplexity_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    model_results['Perplexity'].append(p)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('./twitter_lda_tuning_results.csv', index=False)
    pbar.close()

In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

In [None]:
optimal_model = lda_model
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

In [None]:
def explore_topic(lda_model, topic_number, topn, output=True):
    """
    accept a ldamodel, atopic number and topn vocabs of interest
    prints a formatted list of the topn terms
    """
    terms = []
    for term, frequency in lda_model.show_topic(topic_number, topn=topn):
        terms += [term]
        if output:
            print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))
    
    return terms

In [None]:
topic_summaries = []
print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
for i in range(4):
    print('Topic '+str(i)+' |---------------------\n')
    tmp = explore_topic(lda_model,topic_number=i, topn=10, output=True )
#     print tmp[:5]
    topic_summaries += [tmp[:5]]
    print