In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
import re
import numpy as np

In [3]:
label    = pd.read_csv("./data/covid_labels.csv")
covid_df = pd.read_csv("./data/covid_bert_data.csv")
covid_df["label"] = label["Predicted"]
covid_df.head()

Unnamed: 0.1,Unnamed: 0,source,replies,label
0,0,"According to the New York Times, Warner Bros. ...",,0
1,1,Hurricane Hanna has made landfall in Texas.\n\...,@TexasTribune Guess what the cause of death is...,0
2,2,Monkeys on the loose in India with stolen coro...,,1
3,3,,@BelAkinyii When was the last time you washed ...,0
4,4,"“If Trump felt comfortable having it here, the...",@HeidiNBC These Trump fans have a right to att...,1


In [9]:
import re
def remove_URL(original):
    """Remove url link in the text"""
    result = re.sub(r"http\S+", "", original)
    result = re.sub(r"www.\S+", "", result)
    result = re.sub(r"wasap.my+", "", result)
    return result

def remove_digits(original):
    return re.sub("\d+", "", original)

filtered = covid_df.dropna().copy()
filtered["source"] = filtered["source"].apply(remove_URL).apply(remove_digits)
filtered

Unnamed: 0.1,Unnamed: 0,source,replies,label
1,1,Hurricane Hanna has made landfall in Texas.\n\...,@TexasTribune Guess what the cause of death is...,0
4,4,"“If Trump felt comfortable having it here, the...",@HeidiNBC These Trump fans have a right to att...,1
6,6,Today I’m thinking about the families across o...,@KamalaHarris Trump ignored Bush43’s 2005 pand...,1
7,7,KHive has become the most toxic thing on Twitt...,@JasonOverstreet It wasn't enough that Warren'...,1
9,9,Who steals the SA’s COVID- monies ? Hon Nqaba...,@BantuHolomisa They will never steal Covid19. ...,0
...,...,...,...,...
17453,17453,I wonder how many lives could’ve been saved if...,@funder Wonder how many lives could have been ...,1
17454,17454,The @thetimes front page on th March. The firs...,@NadineDorries @thetimes Inadequate supplies o...,0
17455,17455,Trump just completed the racism trifecta in a ...,@DNCWarRoom Fact check: Chinese is not a race....,1
17456,17456,Here are a few of my photographs from today’s ...,@Jess__Taylor__ @davidallengreen Eck! What are...,0


In [10]:
filtered_rumours = filtered.loc[filtered["label"] == 1]
filtered_rumours

Unnamed: 0.1,Unnamed: 0,source,replies,label
4,4,"“If Trump felt comfortable having it here, the...",@HeidiNBC These Trump fans have a right to att...,1
6,6,Today I’m thinking about the families across o...,@KamalaHarris Trump ignored Bush43’s 2005 pand...,1
7,7,KHive has become the most toxic thing on Twitt...,@JasonOverstreet It wasn't enough that Warren'...,1
11,11,Maaaring magdeklara ng martial law si Pangulon...,@gmanews Is COVID-19 a legal entity to constit...,1
15,15,"You know, #Donald says he has tested negative ...",@BetteMidler .@BetteMidler Will you please tak...,1
...,...,...,...,...
17447,17447,The lack of tests for #coronavirus is a “natio...,@SquawkCNBC @atrupar @DrMattMcCarthy Now that ...,1
17448,17448,‘Residents protest coronavirus stay-at-home or...,@TheRynheart Most state that had protests are ...,1
17453,17453,I wonder how many lives could’ve been saved if...,@funder Wonder how many lives could have been ...,1
17455,17455,Trump just completed the racism trifecta in a ...,@DNCWarRoom Fact check: Chinese is not a race....,1


In [6]:
filtered_nonrumours = filtered.loc[filtered["label"] == 0]
filtered_nonrumours

Unnamed: 0.1,Unnamed: 0,source,replies,label
1,1,Hurricane Hanna has made landfall in Texas.\n\...,@TexasTribune Guess what the cause of death is...,0
9,9,Who steals the SA’s COVID- monies ? Hon Nqaba...,@BantuHolomisa They will never steal Covid19. ...,0
10,10,New Harris County #Houston #COVID numbers onli...,@PeterHotez @HoustonChron @NewDay @CNN I'm ver...,0
13,13,ICYMI: Select NHS patients will soon be treate...,@MattHancock Select NHS patients? The ones tha...,0
17,17,new cases of #COVID have been confirmed in Ni...,@NCDCgov @Fmohnigeria @FMICNigeria @nighealthw...,0
...,...,...,...,...
17445,17445,Researchers @IITHyderabad have developed a low...,@kiranshaw @IITHyderabad with so may testing p...,0
17446,17446,"South City Hospital Khi charging Rs , per day ...","@omar_quraishi \n\nPlease watch and share, ver...",0
17449,17449,"The U.S. could have prevented roughly , deaths...",@NPR Hmmm 1 infected in January? And @WHO prom...,0
17454,17454,The @thetimes front page on th March. The firs...,@NadineDorries @thetimes Inadequate supplies o...,0


In [46]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(["coronavirus", "covid", "amp"])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/julian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [89]:
vectorizer        = CountVectorizer(stop_words=stop_words, min_df=0.001, max_df=0.9)
freq_rumour       = vectorizer.fit_transform(filtered_rumours["source"])
names_rumour      = vectorizer.get_feature_names()
freq_nonrumour    = vectorizer.fit_transform(filtered_nonrumours["source"])
names_nonrumour   = vectorizer.get_feature_names()

In [92]:
lda = LatentDirichletAllocation(n_components=10)
lda.fit(freq_rumour)
for idx, topic in enumerate(lda.components_):
    print ("Topic ", idx, " ".join(names_rumour[i] for i in topic.argsort()[:-8-1:-1]))

Topic  0 trump president china response pandemic chinese says world
Topic  1 new florida state cases gov texas governor reopening
Topic  2 americans trump pandemic bill million need democrats health
Topic  3 trump rally president campaign says tulsa positive people
Topic  4 trump house white says president back virus people
Topic  5 pence positive china cases tested tests testing us
Topic  6 cases new us deaths world people death died
Topic  7 trump president care administration pandemic people health pelosi
Topic  8 deaths us weeks cases ago death people days
Topic  9 people trump news get like americans know american


In [93]:
lda.fit(freq_nonrumour)
for idx, topic in enumerate(lda.components_):
    print ("Topic ", idx, " ".join(names_nonrumour[i] for i in topic.argsort()[:-8-1:-1]))

Topic  0 lockdown people could new health say uk live
Topic  1 health workers care hospital support pandemic medical crisis
Topic  2 health take home stay world us fight says
Topic  3 people mask died pandemic india one care home
Topic  4 positive tested test tests symptoms people days testing
Topic  5 get says sick may us dr must virus
Topic  6 minister vaccine pandemic says president said prime government
Topic  7 people patients virus trump us hydroxychloroquine drug first
Topic  8 spread us mask public masks wear face wearing
Topic  9 cases new deaths number confirmed total reported day
