In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
import re
import numpy as np

In [3]:
# The predicted labels are stored in their own CSV; attaching the "Predicted"
# column relies on pandas aligning both frames on their row index.
label = pd.read_csv("./data/covid_labels.csv")
covid_df = pd.read_csv("./data/covid_bert_data.csv").assign(label=label["Predicted"])
covid_df.head()

Unnamed: 0.1,Unnamed: 0,source,replies,label
0,0,"According to the New York Times, Warner Bros. ...",,0
1,1,Hurricane Hanna has made landfall in Texas.\n\...,@TexasTribune Guess what the cause of death is...,0
2,2,Monkeys on the loose in India with stolen coro...,,0
3,3,,@BelAkinyii When was the last time you washed ...,0
4,4,"“If Trump felt comfortable having it here, the...",@HeidiNBC These Trump fans have a right to att...,1


In [4]:
import re
def remove_URL(original):
    """Strip URL-like tokens from ``original`` and return the cleaned text.

    Three kinds of token are deleted: anything starting with ``http``,
    anything starting with ``www.``, and ``wasap.my`` short links together
    with their path.

    Args:
        original: The text to clean.

    Returns:
        A new string with the matched tokens removed. Whitespace around a
        removed token is left as it was.
    """
    result = re.sub(r"http\S+", "", original)
    # The dot is escaped so that only a literal "www." prefix matches;
    # an unescaped dot would also swallow ordinary words such as "wwwhat".
    result = re.sub(r"www\.\S+", "", result)
    # \S* consumes the rest of the short link so no "/path" residue remains.
    result = re.sub(r"wasap\.my\S*", "", result)
    return result

def remove_digits(original):
    """Delete every run of digits from ``original``.

    Args:
        original: The text to clean.

    Returns:
        A new string with all digit characters removed; every other
        character, including surrounding punctuation, is kept.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    return re.sub(r"\d+", "", original)

def find_hashtags(tweet):
    """Return all hashtags found in ``tweet``, in order of appearance.

    A hashtag is matched as ``#`` followed by one or more ASCII letters and
    then one or more letters, digits, hyphens or underscores, so a tag needs
    at least two characters after the ``#`` and must start with a letter.
    """
    hashtag_pattern = re.compile('(#[A-Za-z]+[A-Za-z0-9-_]+)')
    return hashtag_pattern.findall(tweet)

# Keep only rows without any missing value; .copy() gives an independent frame
# so the column assignments below do not touch covid_df.
filtered = covid_df.dropna().copy()

# Hashtag extraction adapted from:
# https://ourcodingclub.github.io/tutorials/topic-modelling-python/#:~:text=%2C%20tweet)-,def,-find_hashtags(tweet
# It runs on the raw text, before digits are stripped, so tags such as
# "#COVID19" are captured intact.
raw_source = filtered["source"]
filtered["hashtags"] = raw_source.apply(find_hashtags)

# Clean the tweet text: URLs first, then digits.
filtered["source"] = raw_source.apply(lambda text: remove_digits(remove_URL(text)))
filtered

Unnamed: 0.1,Unnamed: 0,source,replies,label,hashtags
1,1,Hurricane Hanna has made landfall in Texas.\n\...,@TexasTribune Guess what the cause of death is...,0,[]
4,4,"“If Trump felt comfortable having it here, the...",@HeidiNBC These Trump fans have a right to att...,1,[]
6,6,Today I’m thinking about the families across o...,@KamalaHarris Trump ignored Bush43’s 2005 pand...,0,[]
7,7,KHive has become the most toxic thing on Twitt...,@JasonOverstreet It wasn't enough that Warren'...,0,[]
9,9,Who steals the SA’s COVID- monies ? Hon Nqaba...,@BantuHolomisa They will never steal Covid19. ...,0,"[#UDMChiefWhip, #COVID-19]"
...,...,...,...,...,...
17453,17453,I wonder how many lives could’ve been saved if...,@funder Wonder how many lives could have been ...,1,[]
17454,17454,The @thetimes front page on th March. The firs...,@NadineDorries @thetimes Inadequate supplies o...,0,[]
17455,17455,Trump just completed the racism trifecta in a ...,@DNCWarRoom Fact check: Chinese is not a race....,0,[]
17456,17456,Here are a few of my photographs from today’s ...,@Jess__Taylor__ @davidallengreen Eck! What are...,0,"[#COVID19, #voting, #parliament]"


In [5]:
# Rows whose label equals 1 form the rumour subset.
filtered_rumours = filtered[filtered["label"].eq(1)]
filtered_rumours

Unnamed: 0.1,Unnamed: 0,source,replies,label,hashtags
4,4,"“If Trump felt comfortable having it here, the...",@HeidiNBC These Trump fans have a right to att...,1,[]
21,21,Trump has royally fucked up this country.,"@StephenKing With a little bit of luck, in Jan...",1,[]
24,24,President Trump says from the beginning he kne...,@Yamiche @LazuliLady Good????? How? Because he...,1,[]
32,32,NHL statement on Phase testing.,@PR_NHL @FriedgeHNIC Maybe instead of fighting...,1,[]
46,46,BREAK: Brazil #coronavirus death toll now worl...,"MORE: Brazil reports 25,982 new cases of #coro...",1,[#coronavirus]
...,...,...,...,...,...
17410,17410,Dennis Quaid Believes Trump Is ‘Doing a Good J...,@VoteBlueIn2020 'Dennis Quaid Believes Trump I...,1,[]
17436,17436,"Breaking: Walmart, the largest retailer in the...",@nmeyersohn Are you getting all the info on #F...,1,[]
17438,17438,"They're lying, we know they're lying &amp; the...",@Hardeep_Matharu @LollysMum1 Thought provoking...,1,[]
17453,17453,I wonder how many lives could’ve been saved if...,@funder Wonder how many lives could have been ...,1,[]


In [6]:
# Rows whose label equals 0 form the non-rumour subset.
filtered_nonrumours = filtered[filtered["label"].eq(0)]
filtered_nonrumours

Unnamed: 0.1,Unnamed: 0,source,replies,label,hashtags
1,1,Hurricane Hanna has made landfall in Texas.\n\...,@TexasTribune Guess what the cause of death is...,0,[]
6,6,Today I’m thinking about the families across o...,@KamalaHarris Trump ignored Bush43’s 2005 pand...,0,[]
7,7,KHive has become the most toxic thing on Twitt...,@JasonOverstreet It wasn't enough that Warren'...,0,[]
9,9,Who steals the SA’s COVID- monies ? Hon Nqaba...,@BantuHolomisa They will never steal Covid19. ...,0,"[#UDMChiefWhip, #COVID-19]"
10,10,New Harris County #Houston #COVID numbers onli...,@PeterHotez @HoustonChron @NewDay @CNN I'm ver...,0,"[#Houston, #COVID19]"
...,...,...,...,...,...
17448,17448,‘Residents protest coronavirus stay-at-home or...,@TheRynheart Most state that had protests are ...,0,[]
17449,17449,"The U.S. could have prevented roughly , deaths...",@NPR Hmmm 1 infected in January? And @WHO prom...,0,[]
17454,17454,The @thetimes front page on th March. The firs...,@NadineDorries @thetimes Inadequate supplies o...,0,[]
17455,17455,Trump just completed the racism trifecta in a ...,@DNCWarRoom Fact check: Chinese is not a race....,0,[]


In [7]:
from collections import defaultdict

# Case-insensitive tally of every hashtag used in the rumour tweets.
rumour_tags = defaultdict(int)
for tag in (t for tags in filtered_rumours["hashtags"] for t in tags):
    rumour_tags[tag.lower()] += 1

# Ten most frequent tags, most common first.
pd.Series(rumour_tags).sort_values(ascending=False).head(10)

#covid19                 64
#coronavirus             54
#trump                    4
#gmb                      4
#americafirst             4
#china                    4
#breaking                 4
#trumpmeltdown            4
#trumppressconference     3
#trumpownseverydeath      3
dtype: int64

In [8]:
# Case-insensitive tally of every hashtag used in the non-rumour tweets.
nonrumour_tags = defaultdict(int)
for tag in (t for tags in filtered_nonrumours["hashtags"] for t in tags):
    nonrumour_tags[tag.lower()] += 1

# Ten most frequent tags, most common first.
pd.Series(nonrumour_tags).sort_values(ascending=False).head(10)

#covid19                1609
#coronavirus             779
#breaking                 58
#covid                    50
#china                    32
#covid_19                 31
#stayhome                 28
#coronaviruspandemic      24
#cdnpoli                  23
#stayhomesavelives        23
dtype: int64

In [9]:
import nltk
# Fetch the stop-word corpus; nltk logs a message when it is already up to date.
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words plus three extra terms to ignore in this corpus
# ("amp" is presumably the residue of the "&amp;" HTML entity seen in the tweets).
stop_words = stopwords.words('english') + ["coronavirus", "covid", "amp"]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/julian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
def _vocabulary_of(fitted_vectorizer):
    """Return the fitted vectorizer's vocabulary as a list of terms.

    Prefers ``get_feature_names_out`` and falls back to the older
    ``get_feature_names`` when the installed scikit-learn does not have it,
    so the cell runs on both old and current releases.
    """
    getter = getattr(fitted_vectorizer, "get_feature_names_out", None)
    if getter is None:
        getter = fitted_vectorizer.get_feature_names
    return list(getter())


# Bag-of-words counts: drop stop words, terms present in fewer than 1% of the
# documents, and terms present in more than 80% of them.
vectorizer        = CountVectorizer(stop_words=stop_words, min_df=0.01, max_df=0.8)

# The same vectorizer is refitted for each subset, so each vocabulary must be
# captured right after its own fit_transform call.
freq_rumour       = vectorizer.fit_transform(filtered_rumours["source"])
names_rumour      = _vocabulary_of(vectorizer)
freq_nonrumour    = vectorizer.fit_transform(filtered_nonrumours["source"])
names_nonrumour   = _vocabulary_of(vectorizer)

In [14]:
# LDA fitting is stochastic; a fixed random_state makes the topics identical
# on every run. Re-run the cell to refresh the printed topics.
lda = LatentDirichletAllocation(n_components=5, random_state=0)
lda.fit(freq_rumour)

# For each topic, print its eight highest-weighted terms, strongest first.
for idx, topic in enumerate(lda.components_):
    top_terms = topic.argsort()[::-1][:8]
    print("Topic ", idx, " ".join(names_rumour[i] for i in top_terms))

Topic  0 trump virus biden positive economy china could health
Topic  1 trump americans news cnn fox new going go
Topic  2 people trump china us said know get died
Topic  3 cases florida new deaths day reported state record
Topic  4 trump president pandemic says response donald america realdonaldtrump


In [15]:
# Fit a fresh, seeded five-topic model on the non-rumour counts instead of
# refitting the previous estimator in place: the fixed random_state makes the
# topics identical on every run. `lda` still ends up bound to the model fitted
# on the non-rumour tweets.
lda = LatentDirichletAllocation(n_components=5, random_state=0)
lda.fit(freq_nonrumour)

# For each topic, print its eight highest-weighted terms, strongest first.
for idx, topic in enumerate(lda.components_):
    top_terms = topic.argsort()[::-1][:8]
    print("Topic ", idx, " ".join(names_nonrumour[i] for i in top_terms))

Topic  0 china government health patients death virus new time
Topic  1 us would people could testing million dr vaccine
Topic  2 trump pandemic president house white says health response
Topic  3 cases new deaths days number states weeks reported
Topic  4 people positive tested mask masks social spread tests
