# CSM Assignment 3: Hands-on exercise with topic modeling

## Part 1: Data loading and Preprocessing

In [5]:
# load necessary modules and libraries

import json
import nltk
import re
from itertools import compress
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import numpy as np

# English stopword list, stored as a set; used below for membership tests when filtering tokens.
# NOTE(review): presumably requires the NLTK 'stopwords' and 'wordnet' data to be installed beforehand - confirm.
stop_words = set(stopwords.words('english')) 
# Single lemmatizer instance that is applied to every token during preprocessing.
lemmatizer = WordNetLemmatizer()

In [6]:
# load data: the file holds one JSON-encoded tweet per line
tweets = []
with open('covidtrack_50K.json', 'r') as tweet_file:  # the context manager closes the file handle
    for line in tweet_file:
        tweets.append(json.loads(line))

# preprocessing
# NOTE: the order of the steps matters. Lemmatization runs before stopword removal, so a
# stopword whose lemmatized form is not itself a stopword survives the filter
# (e.g. 'has' -> 'ha', discussed in Part 2-2).
texts = [word_tokenize(re.sub(r'\W+', ' ', t['text'].lower())) for t in tweets]  # lowercase, replace runs of non-word characters with a space (digits and '_' are kept), and tokenize
texts = [[w for w in txt if len(w) > 2] for txt in texts]  # remove short words (fewer than 3 characters)
texts = [[lemmatizer.lemmatize(w) for w in txt] for txt in texts]  # lemmatization
texts = [[w for w in txt if w not in stop_words] for txt in texts]  # remove stopwords
texts = [[w for w in txt if w != 'http'] for txt in texts]  # 'http' is a URL fragment that is not meaningful in itself
             

## Part 2-1: Topic modelling with LDA

There are several ways to perform topic modelling with LDA: different libraries exist, and there are different ways to count words and create dictionaries. For comparison, I first used the `lda` library, and then compared the result with the LDA model implemented in the `gensim` library.

In [7]:
# LDA with the PyPI lda library

# create word-count dictionary: for every word, the number of documents (tweets) it occurs in
word_doc_count_raw = {}
for txt in texts:
    # dict.fromkeys drops repeated tokens within one tweet while keeping first-occurrence order.
    # Iterating a set of strings instead would make the vocabulary order (and with it the column
    # order of the document-term matrix) vary between kernel sessions, because str hashing is
    # randomized per process.
    for word in dict.fromkeys(txt):
        word_doc_count_raw[word] = word_doc_count_raw.get(word, 0) + 1

# Remove words that occur in less than 10 documents, and words that occur in more than 90% of the documents.
max_doc_count = len(tweets) * 0.9
word_doc_count = {
    word: n_docs
    for word, n_docs in word_doc_count_raw.items()
    if 10 <= n_docs <= max_doc_count
}

        
        
import lda
import lda.datasets

# Transform each document to a vectorized form by computing the frequency of each word.
# Each vocabulary word is mapped to its column once, and every tweet is then scanned a single
# time; this yields the same counts as calling txt.count(w) for every (tweet, word) pair,
# without the nested scan over the whole vocabulary.
word_column = {w: j for j, w in enumerate(word_doc_count)}
Y = np.zeros((len(texts), len(word_doc_count)), dtype='int64')  # integer counts, allocated with the final dtype
for i, txt in enumerate(texts):
    for w in txt:
        j = word_column.get(w)
        if j is not None:  # tokens that were filtered out of the vocabulary are ignored
            Y[i, j] += 1

In [32]:
# List the tweets whose row in the document-term matrix is entirely zero.
print('=================================')
print('zero-vector tweets :  does not contain any of the words in the dictionary')
# A row is "empty" when none of its entries is non-zero.
empty_rows = np.flatnonzero(~Y.any(axis=1))
for row in empty_rows:
    print(texts[row])

zero-vector tweets :  does not contain any of the words in the dictionary
['timeless', 'coronavirus', 'z2xleebenp']
['zara_ansari', 'timeless', 'coronavirus', 'z2xleebenp']
['sweetanuu', 'coronavirus', 'chal', 'rha']
['anshu_vats1', 'sweetanuu', 'coronavirus', 'chal', 'rha']
['painting', 'coronavirus', 'mtszri9wz1']
['𝖱𝖾𝗆𝗂𝗇𝖽', '𝗒𝗈𝗎𝗋𝗌𝖾𝗅𝖿', '𝖻𝗋𝖾𝖺𝗍𝗁𝖾', 'x9rtihqf5i', 'sumayah369', 'coronavirus', 'fkxksgwns8']
['technicaldebt', 'intensifies', 'cobol', 'programmer', 'programming', 'coronavirus', 'zzdwarnbqd']
['srbachchan', 'masterpiece', 'masterpiece', 'educational', 'coronavirus', 'coronaindia']
['oooh', 'exciting', 'coronavirus', '6icmwgsxb1']
['burying', 'coronavirus']
['rue', 'faidherbe', 'lille', 'confinementotal', 'coronavirus', '8vuqutoeea']
['coronavirus', 'iwolz0wjbl']
['robin', 'juste', 'digest', '3awzktsro4', 'ecommerce', 'coronavirus']
['tynmgn9mwk', 'coronavirus', 'vide']
['coronavirus']
['coronavirus', 'helpdonthurt', 'howdareyou', 'thomasmodly', 'shame', 'legacy']
['belly', '

These tweets can be summarized as follows:

1. Tweets that contain URLs with random strings.
2. Tweets that ignore spacing and contain typos.
3. Tweets that contain only the major terms, like 'coronavirus'.
4. Tweets that contain different conjugations of verbs (since I did not perform stemming).
5. Tweets that have no significant words.


In [42]:
# Fit a 20-topic LDA model on the document-term matrix Y; random_state fixes the seed.
model = lda.LDA(n_topics=20, n_iter=1500, random_state=1)
model.fit(Y)
topic_word = model.topic_word_  # iterated below as one weight vector per topic
n_top_words = 7
# Column j of Y corresponds to the j-th key of word_doc_count, so this array maps a column
# index back to its word. It does not change between topics, so it is built once.
vocab = np.array(tuple(word_doc_count.keys()))
for i, topic_dist in enumerate(topic_word):
    # argsort is ascending: take the last n_top_words positions, highest weight first
    top_idx = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
    topic_words = vocab[top_idx]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

INFO:lda:n_documents: 50000
INFO:lda:vocab_size: 5026
INFO:lda:n_words: 441393
INFO:lda:n_topics: 20
INFO:lda:n_iter: 1500
INFO:lda:<0> log likelihood: -5291758
INFO:lda:<10> log likelihood: -3260704
INFO:lda:<20> log likelihood: -3101759
INFO:lda:<30> log likelihood: -3064257
INFO:lda:<40> log likelihood: -3052516
INFO:lda:<50> log likelihood: -3043877
INFO:lda:<60> log likelihood: -3038223
INFO:lda:<70> log likelihood: -3035084
INFO:lda:<80> log likelihood: -3031270
INFO:lda:<90> log likelihood: -3029369
INFO:lda:<100> log likelihood: -3027879
INFO:lda:<110> log likelihood: -3025160
INFO:lda:<120> log likelihood: -3024171
INFO:lda:<130> log likelihood: -3022504
INFO:lda:<140> log likelihood: -3021115
INFO:lda:<150> log likelihood: -3021641
INFO:lda:<160> log likelihood: -3020446
INFO:lda:<170> log likelihood: -3020180
INFO:lda:<180> log likelihood: -3019403
INFO:lda:<190> log likelihood: -3019336
INFO:lda:<200> log likelihood: -3017627
INFO:lda:<210> log likelihood: -3016601
INFO:lda

Topic 0: case death new total confirmed reported breaking
Topic 1: covid19 covid mask emergency state live japan
Topic 2: people stay nh nurse staff medical ajenglish
Topic 3: china realdonaldtrump trump wa tomfitton warned navarro
Topic 4: covid19 lockdown time week life whole lockdownextension
Topic 5: crisis people yet current continues suffering accelerate
Topic 6: iran state ha death fight regime city
Topic 7: pandemic point play feeling watching realcandaceo arrived
Topic 8: death breaking piersmorgan toll number total johnson
Topic 9: ha say poor spread american people government
Topic 10: china test pandemic country kit million take
Topic 11: covid19 pandemic ha tested positive patient expert
Topic 12: latest trump thanks president covid19 daily advice
Topic 13: know amp people covid19 come want world
Topic 14: died ha patient say day skynews year
Topic 15: covid19 medium briefing day drtedros lockdown china
Topic 16: public tomfitton shutdown getting rule chicago mayor
Topic 1

## Part 2-2: Examining topic specifications and naming 'topics'
Before proceeding to the next step, examining and proposing names for each topic, I examined the list of words. Most of them were familiar, while some, such as 'ha', 'ppe', 'nh', and 'amp', were not. So I closely examined those words to figure out what they mean.

In [46]:
# examining strange words

import pandas as pd

# None removes the column-width limit so the full tweet text is printed.
# (The former value -1 is deprecated since pandas 1.0 and rejected by newer versions.)
pd.set_option('display.max_colwidth', None)

irregularities = ['ha','ppe','nh','amp','tomfitton','drtedros','realcandaceo','piersmorgan','navarro']

# For every irregular word, collect {tweet index: original tweet text} for the tweets whose
# preprocessed tokens contain that word.
matches = {w: {} for w in irregularities}
for i, t in enumerate(texts):
    tokens = set(t)  # set membership instead of scanning the token list once per word
    for w in irregularities:
        if w in tokens:
            matches[w][i] = tweets[i]['text']

# Build the frame once (one column per word, rows indexed by tweet position) rather than
# growing it cell by cell.
irregularities_df = pd.DataFrame(
    {w: pd.Series(found, dtype=object) for w, found in matches.items()},
    columns=irregularities,
)

for w in irregularities:
    print('\nIrregular Word : {0}'.format(w))
    print(irregularities_df[w].dropna().head())


Irregular Word : ha
3     RT @PDChina: Made it! A 104-yr-old World War II veteran from U.S. state of Oregon has become the #oldest known #survivor of #coronavirus di… 
21    RT @SkyNews: A British Airways pilot has been praised for becoming a Tesco delivery driver during the #coronavirus lockdown https://t.co/OW… 
23    RT @BWYK9: We shall continue to help and support agencies and organisations that tackle the illegal trade in wildlife. The #coronavirus has… 
42    RT @EU_Commission: The EU has put international cooperation at the forefront of its response to the #coronavirus. \n\nFollowing an agreement…
56    RT @allysonhorn: BREAKING: A Queensland infectious disease nurse who's been working with #coronavirus patients has tested positive for the…  
Name: ha, dtype: object

Irregular Word : ppe
263     Can you believe?\n\nChina sending a PPE (Test Kits) across world, found be be contaminated with #coronavirus. OMG 🤔🤔… https://t.co/rlehK8UFTF
562     Blackman &amp; White cuts through C

I acquired interesting results here. All of those strange keywords emerged for different reasons:
1. 'ha' was originally 'has'. This verb is a stopword, so it should have been removed. However, since lemmatization is done before stopword removal, 'has' is treated as a noun and converted to 'ha', which is no longer recognized as a stopword.
2. 'ppe' is an abbreviation of Personal Protective Equipment. Usually, it is not difficult to discern abbreviations from other words, since abbreviations are written in capital letters. In this case, all texts are converted into lower-case letters. It has no effect on the meaning, but it becomes hard to understand the meaning at first glance.
3. 'nh' is lemmatized from 'nhs', which is, again, an abbreviation, this time of the National Health Service. In this case, it is hard to infer the original meaning from the processed text, unless a person is already well aware of the situation and the data.
4. When a tweet contains the '&' character, the Twitter API returns it as the HTML entity '&amp;' in the tweet content. Once the non-word characters are stripped during preprocessing, 'amp' remains as a token. Since the character semantically means 'and', which is another stopword, it is better not to take this keyword into account.

And of course, there are some names and mentions to another account.
1. Tom Fitton is the president of Judicial Watch in the US, an American pro-Trump, conservative (or extreme-right) activist group.
2. Dr. Tedros is the Director-General of the World Health Organization
3. Candace Owens is an American conservative commentator and political activist known for her pro-Trump activism and her criticism of Black Lives Matter and of the Democratic Party.
4. Piers Morgan is an English broadcaster whose long friendship with President Trump has recently fallen into perils.
5. Peter Navarro is an assistant to the President for Trade & Manufacturing Policy, who is said to have warned the White House of the risk of pandemic in January.


For the topics to semantically examine, I chose following 8 topics to propose names: 

- Topic 0: case death new total confirmed reported breaking - Counting confirmed cases and deaths.
- Topic 3: china realdonaldtrump trump wa tomfitton warned navarro - Retweeting conservative political figures
- Topic 4: covid19 lockdown time week life whole lockdownextension - Lockdown notices and reports
- Topic 10: china test pandemic country kit million take - COVID testing kit supply
- Topic 11: covid19 pandemic ha tested positive patient expert - Updates in new COVID patients
- Topic 15: covid19 medium briefing day drtedros lockdown china - WHO briefings
- Topic 17: covid19 home spread covid covid_19 stop corona - stay-at-home campaign
- Topic 18: death keep via safe ppe nh provide - wear-your-mask campaign


## Part 3: Discussion

First of all, I realized the importance of proper data pre-processing. Here, I followed the guidelines provided by Trung, but this method produced some erroneous preprocessed results, as mentioned in Part 2-2. Tweets contain several kinds of elements, such as hashtags, retweet indicators, URLs, and mentions, as well as the text itself. They are not formatted or read in the same way, so those differences must be taken into account before naive preprocessing. For example, URLs should be processed separately, since they contain non-alphabetic characters, random strings, and their own 'stopwords' (e.g. 'https', 'com', 'org'). It might be possible to consider only domain names (e.g. who.int, but not its directory), or to neglect them completely. Furthermore, adding verb stemming might have been a good add-on as well.

Furthermore, abbreviations have to be taken into account. In my analysis, 'WHO' is not found anywhere in the keywords, which is quite strange, because its director-general does appear. This happens because the lowercased 'who' is removed during stopword removal. In real-life situations, two words with identical spellings but different letter cases may mean completely different things. There must be more important factors like this that should be taken into account while preprocessing, and such cases should be examined thoroughly with the corresponding experts.


Secondly, I tried applying LDA by using different libraries, and the result was quite different. In real-world situation, choosing a right library to import the same LDA module might matter. (The topic extraction in gensim LDA is demonstrated below this cell)

Regarding the topics, it was striking to see that mentions of other accounts were important 'keywords' overall. This in itself is not surprising, because it was covered in class that information flows in two steps on Twitter: the media has an indirect influence over the public via an intermediate layer of opinion leaders. Also, through the past assignment, I found out that most of the tweets are 'retweets' of others.
What was surprising, however, was that those 'keyword' figures were mostly conservative political figures (in my point of view, closer to the extreme right). Except for Dr. Tedros, the director-general of the WHO, all the figures were closely related to Donald Trump. Scientists, medical journalists, and health authorities don't appear to be influential enough, nor do 'liberal' political figures.
There might be several reasons for this. Maybe the sample was simply biased. Maybe the result demonstrates the spread of disinformation and misinformation on Twitter. Maybe it is more related to politics and psychology, or to the Twitter demographics. But still, the fact that 'conservative' politicians are far more influential in the topics than their 'liberal' counterparts is interesting, and this might resolve some of the ambiguities we have in the real world.



In [88]:
#Appendix : LDA topic generation and topic specifications using different library

import gensim
import logging, sys

# Disable every logging call at or below sys.maxsize, i.e. all log output
# (presumably to hide the libraries' training logs).
# NOTE: this setting is process-wide and stays in effect for any cell executed afterwards.
logging.disable(sys.maxsize)

# Build the vocabulary from the preprocessed tweets and apply the same thresholds as in the
# lda-library run above (document count of 10, document share of 90%).
dictionary = gensim.corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=10, no_above=0.9, keep_n=100000)

# bag-of-words representation of every tweet
bow_corpus = [dictionary.doc2bow(doc) for doc in texts]

# random_state seeds the model so that a re-run does not start from a different random
# initialisation; the same seed value as for the lda-library model is used.
# NOTE(review): with workers=2 the result may still vary slightly between runs - confirm.
lda_model_gensim = gensim.models.LdaMulticore(bow_corpus, num_topics=20, id2word=dictionary, passes=2, workers=2, random_state=1)

# top 5 (word, weight) pairs for each of the 20 topics
lda_model_gensim.show_topics(num_topics=20, num_words=5, log=False, formatted=False)

[(0,
  [('000', 0.021446023),
   ('covid19', 0.019942822),
   ('great', 0.019108178),
   ('year', 0.017917603),
   ('due', 0.0175473)]),
 (1,
  [('trump', 0.065633394),
   ('president', 0.032885034),
   ('medical', 0.024133747),
   ('latest', 0.022624508),
   ('wa', 0.020685801)]),
 (2,
  [('covid19', 0.09167603),
   ('lockdown', 0.07842503),
   ('time', 0.061652694),
   ('life', 0.057069365),
   ('week', 0.055344574)]),
 (3,
  [('health', 0.041053344),
   ('world', 0.037649713),
   ('covidー19', 0.03351267),
   ('sure', 0.023461446),
   ('minister', 0.022270415)]),
 (4,
  [('know', 0.023671394),
   ('want', 0.02133876),
   ('covid19', 0.021316553),
   ('come', 0.019346252),
   ('bill', 0.015862737)]),
 (5,
  [('crisis', 0.103897795),
   ('people', 0.09852376),
   ('yet', 0.09712378),
   ('suffering', 0.09650331),
   ('continues', 0.08833984)]),
 (6,
  [('pandemic', 0.031772286),
   ('conference', 0.031021953),
   ('10downingstreet', 0.022425262),
   ('press', 0.020662017),
   ('ha', 0.