In [1]:
import pandas as pd
import re
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet
import tweepy

covid_df = pd.read_csv("covid_df.csv")
covid_df.fillna(value="",inplace=True)
covid_df.drop(list(covid_df.columns)[0],axis=1,inplace=True)
rumours = covid_df[covid_df['label']==1]
nonrumours = covid_df[covid_df['label']==0]

covid_df

Unnamed: 0,text,user,verified,likes,retweets,hashtag,sentiment,followers,statuses,label
0,"According to the New York Times, Warner Bros. ...",TheFilmUpdates,False,260.0,25.0,Tenet,0.296591,45008.0,5654.0,0
1,Hurricane Hanna has made landfall in Texas.\n\...,TexasTribune,True,580.0,479.0,,0.000000,297392.0,110931.0,0
2,Monkeys on the loose in India with stolen coro...,thehill,True,164.0,72.0,,-0.076923,4362459.0,1013903.0,0
3,"“If Trump felt comfortable having it here, the...",HeidiNBC,True,1952.0,786.0,,0.400000,204299.0,19775.0,1
4,DISTURBING: Alabama officials say some student...,ABC7,True,471.0,256.0,COVID19,-0.125000,1178868.0,256495.0,0
...,...,...,...,...,...,...,...,...,...,...
15957,I wonder how many lives could’ve been saved if...,funder,True,10632.0,2567.0,,0.200000,1073597.0,305211.0,1
15958,The @thetimes front page on 17th March. The fi...,NadineDorries,True,772.0,227.0,,0.125000,128557.0,29621.0,0
15959,Trump just completed the racism trifecta in a ...,DNCWarRoom,True,915.0,367.0,,-0.388889,88433.0,10988.0,1
15960,Here are a few of my photographs from today’s ...,Jess__Taylor__,False,343.0,153.0,COVID19,-0.200000,8044.0,692.0,0


In [2]:
def preprocess_df(df):
    preprocessed = df.copy(deep=True)
    text = list(preprocessed['text'])
    verified = list(preprocessed['verified'])
    for i in range(len(text)):
        text[i] = " ".join(filter(lambda x:x[0]!='@', text[i].split()))
        text[i] = " ".join(filter(lambda x:x[0:4]!='http', text[i].split()))
        text[i] = " ".join(filter(lambda x:x[0:4]!='&amp', text[i].split()))
        text[i] = re.sub(r'[^a-zA-Z ]','',text[i])
        text[i] = " ".join(filter(lambda x:x[0:1]!='Q', text[i].split()))
        text[i] = " ".join(filter(lambda x:x[0:1]!='A', text[i].split()))
    preprocessed['text'] = text
    for j in range(len(verified)):
        if verified[j]:
            verified[j] = 1
        else:
            verified[j] = 0
    preprocessed['verified'] = verified
    preprocessed.fillna('', inplace=True)
    return preprocessed

def count_column(df, col, n=20):
    d = {}
    for x in df[col]:
        x = re.sub(r'[^a-zA-Z0-9]', '', str(x)).lower()
        if x == "":
            pass
        elif x in d:
            d[x] +=1
        else:
            d[x] = 1
    d = {k: v for k, v in sorted(d.items(),key=lambda item: item[1],reverse=True)}
    for i in range(n):
        print(str(list(d.keys())[i]) + ": " + str(list(d.values())[i]))
        
def calc_average(df, col):
    print(col + " " + str(round(sum(list(df[col]))/len(list(df[col])),2)))        

In [3]:
def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma


def tokenize_df(df):
    tokenized_df = df
    tokenized_sentence = []
    for _id, row in tokenized_df.iterrows():
        text = row['text']
        # tokenize tweet
        tokens = tt.tokenize(text)
        # convert to lowercase
        tokens = [tok.lower() for tok in tokens]
        # remove stopwords
        tokens = [tok for tok in tokens if tok not in stopwords]
        # lemmatize
        tokens = [lemmatize(tok) for tok in tokens]
        tokenized_sentence.append(tokens)
    tokenized_df.insert(1, 'tokens', tokenized_sentence)
    return tokenized_df

tt = TweetTokenizer()
stopwords = set(stopwords.words('english')) #note: stopwords are all in lowercase
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

In [4]:
covid = tokenize_df(preprocess_df(covid_df))
rumours = tokenize_df(preprocess_df(rumours))
nonrumours = tokenize_df(preprocess_df(nonrumours))

print("all")
calc_average(covid, 'sentiment')
calc_average(covid, 'retweets')
calc_average(covid, 'likes')
calc_average(covid, 'verified')
print()
print("rumours")
calc_average(rumours, 'sentiment')
calc_average(rumours, 'retweets')
calc_average(rumours, 'likes')
calc_average(rumours, 'verified')
print()
print("nonrumours")
calc_average(nonrumours, 'sentiment')
calc_average(nonrumours, 'retweets')
calc_average(nonrumours, 'likes')
calc_average(nonrumours, 'verified')

all
sentiment 0.06
retweets 2619.56
likes 9683.76
verified 0.79

rumours
sentiment 0.03
retweets 3750.15
likes 14223.14
verified 0.78

nonrumours
sentiment 0.07
retweets 2189.3
likes 7956.26
verified 0.8


In [40]:
print(f'all hashtag')
count_column(covid,'hashtag',10)
print()
print(f'rumours hashtag')
count_column(rumours,'hashtag',10)
print()
print(f'nonrumours hashtag')
count_column(nonrumours,'hashtag',10)
print()

all hashtag
covid19: 1669
coronavirus: 751
breaking: 98
coronaviruspandemic: 24
covid: 23
cdnpoli: 20
china: 19
covid19ph: 17
trump: 14
stayalert: 14

rumours hashtag
covid19: 122
coronavirus: 119
china: 9
maga: 8
breaking: 7
trump: 6
oann: 6
trumpownseverydeath: 4
stopairingtrump: 4
trumppressconference: 4

nonrumours hashtag
covid19: 1547
coronavirus: 632
breaking: 91
covid: 22
coronaviruspandemic: 22
cdnpoli: 18
covid19ph: 17
stayalert: 14
watch: 13
stayhomesavelives: 12



In [6]:
def count_tokens(df):
    d = {}
    for token_list in df['tokens']:
        for token in token_list:
            if token in d:
                d[token] +=1
            else:
                d[token] = 1
    return {k: v for k, v in sorted(d.items(),key=lambda item: item[1],reverse=True)}
                
all_count = count_tokens(covid)
rumours_count = count_tokens(rumours)
nonrumours_count = count_tokens(nonrumours)

In [26]:
list(all_count.keys())[0:20]

['coronavirus',
 'covid',
 'trump',
 'say',
 'test',
 'case',
 'new',
 'people',
 'u',
 'death',
 'pandemic',
 'state',
 'day',
 'president',
 'get',
 'go',
 'health',
 'one',
 'report',
 'positive']

In [32]:
list(rumours_count.values())[0:20]

[2297,
 1706,
 801,
 701,
 561,
 554,
 485,
 479,
 350,
 349,
 347,
 297,
 297,
 287,
 274,
 273,
 235,
 225,
 224,
 200]

In [33]:
list(nonrumours_count.values())[0:20]

[5746,
 5185,
 1845,
 1739,
 1562,
 1521,
 1246,
 1125,
 1060,
 985,
 954,
 895,
 886,
 776,
 741,
 726,
 718,
 716,
 712,
 630]

In [10]:
def bigram_count(df):
    d = {}
    for token_list in df['tokens']:
        for i in range(len(token_list)):
            if i + 1 < len(token_list):
                bigram = token_list[i] + " " + token_list[i+1]
                if bigram in d:
                    d[bigram] +=1
                else:
                    d[bigram] = 1
    return {k: v for k, v in sorted(d.items(),key=lambda item: item[1],reverse=True)}
               
all_bicount = bigram_count(covid)
rumours_bicount = bigram_count(rumours)
nonrumours_bicount = bigram_count(nonrumours)

In [29]:
list(all_bicount.keys())[0:20]

['test positive',
 'coronavirus pandemic',
 'covid case',
 'coronavirus case',
 'president trump',
 'covid pandemic',
 'white house',
 'positive coronavirus',
 'wear mask',
 'coronavirus death',
 'social distance',
 'new york',
 'new case',
 'positive covid',
 'covid test',
 'coronavirus outbreak',
 'public health',
 'coronavirus test',
 'donald trump',
 'new covid']

In [35]:
list(rumours_bicount.keys())[0:20]

['president trump',
 'coronavirus death',
 'donald trump',
 'coronavirus pandemic',
 'coronavirus case',
 'trump say',
 'white house',
 'new york',
 'test positive',
 'ago death',
 'coronavirus response',
 'wear mask',
 'week ago',
 'unite state',
 'u coronavirus',
 'trump coronavirus',
 'die coronavirus',
 'fox news',
 'say coronavirus',
 'covid death']

In [36]:
list(nonrumours_bicount.values())[0:20]

[531,
 439,
 404,
 341,
 299,
 243,
 216,
 215,
 211,
 211,
 210,
 194,
 177,
 167,
 163,
 162,
 155,
 150,
 144,
 143]

In [14]:
def trigram_count(df):
    d = {}
    for token_list in df['tokens']:
        for i in range(len(token_list)):
            if i + 2 < len(token_list):
                trigram = token_list[i] + " " + token_list[i+1] + " " + token_list[i+2]
                if trigram in d:
                    d[trigram] +=1
                else:
                    d[trigram] = 1
    return {k: v for k, v in sorted(d.items(),key=lambda item: item[1],reverse=True)}
               
all_tricount = trigram_count(covid)
rumours_tricount = trigram_count(rumours)
nonrumours_tricount = trigram_count(nonrumours)

In [15]:
list(all_tricount.items())[0:20]

[('test positive coronavirus', 268),
 ('test positive covid', 197),
 ('new covid case', 127),
 ('new coronavirus case', 95),
 ('coronavirus task force', 90),
 ('world health organization', 72),
 ('new case covid', 60),
 ('confirm covid case', 58),
 ('president trump say', 56),
 ('coronavirus death toll', 55),
 ('amid coronavirus pandemic', 54),
 ('u coronavirus death', 54),
 ('president donald trump', 51),
 ('white house coronavirus', 50),
 ('week ago death', 44),
 ('death week ago', 43),
 ('report new case', 40),
 ('new york city', 39),
 ('ago death week', 38),
 ('report new covid', 34)]

In [16]:
list(rumours_tricount.items())[0:20]

[('test positive coronavirus', 45),
 ('week ago death', 44),
 ('death week ago', 43),
 ('u coronavirus death', 41),
 ('president trump say', 39),
 ('ago death week', 38),
 ('president donald trump', 30),
 ('death day ago', 27),
 ('day ago death', 27),
 ('report u coronavirus', 24),
 ('ago death day', 24),
 ('world health organization', 19),
 ('coronavirus task force', 19),
 ('new york time', 19),
 ('test positive covid', 17),
 ('amid coronavirus pandemic', 16),
 ('chinese communist party', 16),
 ('trump handle coronavirus', 16),
 ('trump coronavirus response', 16),
 ('president trump coronavirus', 14)]

In [17]:
list(nonrumours_tricount.items())[0:20]

[('test positive coronavirus', 223),
 ('test positive covid', 180),
 ('new covid case', 120),
 ('new coronavirus case', 84),
 ('coronavirus task force', 71),
 ('new case covid', 60),
 ('confirm covid case', 57),
 ('world health organization', 53),
 ('coronavirus death toll', 43),
 ('white house coronavirus', 39),
 ('amid coronavirus pandemic', 38),
 ('new york city', 35),
 ('report new covid', 34),
 ('report new case', 32),
 ('wear face cover', 28),
 ('confirm case covid', 27),
 ('number coronavirus case', 25),
 ('due coronavirus pandemic', 25),
 ('number confirm covid', 25),
 ('health care worker', 25)]

In [18]:
rumours[rumours['retweets'] >= sorted(list(rumours['retweets']),reverse=True)[10]]

Unnamed: 0,text,tokens,user,verified,likes,retweets,hashtag,sentiment,followers,statuses,label
705,How to medical,[medical],sarahcpr,1,582916.0,171359.0,,0.0,2286422.0,39044.0,1
1246,a story in two parts,"[story, two, part]",maleeezy_,0,613165.0,145668.0,,0.0,615.0,57419.0,1
2594,I have a breathing problem lol wear a mask karen,"[breathe, problem, lol, wear, mask, karen]",madz1426,0,321130.0,131170.0,,0.8,252.0,68.0,1
2887,Choose your fighter,"[choose, fighter]",tifffanycuh,0,1054577.0,305680.0,,0.0,7076.0,75674.0,1
4564,Just a reminder its okay to blame the presiden...,"[reminder, okay, blame, president, bad, thing,...",lincolnjackd,0,788666.0,165941.0,,-0.066667,3530.0,10675.0,1
5164,God I love this country,"[god, love, country]",CorkCoypu,0,823180.0,147153.0,,0.5,16850.0,36056.0,1
5410,Some sports are slower More about the strategy,"[sport, slower, strategy]",MrAndrewCotter,1,642449.0,174013.0,,0.5,451354.0,31076.0,1
7441,is so fucking wild that the Pentagon just conf...,"[fuck, wild, pentagon, confirm, ufo, barely, n...",santiagomayer_,0,805604.0,152948.0,,0.183333,144412.0,115895.0,1
9437,Black Lives Matter protest in Los,"[black, live, matter, protest, los]",louie_tran,1,450446.0,129620.0,,-0.166667,2958.0,20755.0,1
10092,Liftoff,[liftoff],SpaceX,1,809466.0,213010.0,,0.0,22473263.0,5904.0,1


In [19]:
nonrumours[nonrumours['retweets'] >= sorted(list(nonrumours['retweets']),reverse=True)[10]]

Unnamed: 0,text,tokens,user,verified,likes,retweets,hashtag,sentiment,followers,statuses,label
3862,Protect yourself and your community from coron...,"[protect, community, coronavirus, common, sens...",BarackObama,1,572626.0,120074.0,,-0.178571,131898409.0,16472.0,0
5378,I swear we are fighting two pandemics Covid an...,"[swear, fight, two, pandemic, covid, stupidity]",moreki_mo,0,1222410.0,355629.0,,-0.6,12338.0,110835.0,0
5418,Corona virus got a survival rate nd the whole ...,"[corona, virus, get, survival, rate, nd, whole...",TheyLoveDjJigga,0,609099.0,139247.0,,0.2,3515.0,102985.0,0
5439,We must now realize the promise of by trusting...,"[must, realize, promise, trust, god, unify, vi...",kanyewest,1,1082377.0,248424.0,2020VISION,0.0,30818063.0,1910.0,0
7069,Here is part and of Cutie all in one tweet,"[part, cutie, one, tweet]",jerm_cohen,0,341778.0,125059.0,,0.0,28189.0,23666.0,0
7775,Last month was the first March without a schoo...,"[last, month, first, march, without, school, s...",RobertKlemko,1,654281.0,153922.0,,0.125,52416.0,32293.0,0
9118,Scientists you should wash your hands because ...,"[scientist, wash, hand, coronavirus, people, i...",NCStinn,0,416216.0,120331.0,ClimateCrisis,0.183333,6518.0,14418.0,0
10636,Priest giving social distance blessings with a...,"[priest, give, social, distance, bless, squirt...",tripgore,0,531486.0,117603.0,,0.033333,6986.0,41980.0,0
11182,Please watch this Especially if youve tested p...,"[please, watch, especially, youve, test, posit...",KILLBILLEVOL1,0,280218.0,202110.0,,0.113636,2132.0,72174.0,0
12418,Hola humanos soy el Coronavirus Voy a explicar...,"[hola, humanos, soy, el, coronavirus, voy, exp...",QHaRi,0,198891.0,149337.0,,0.0,18800.0,54448.0,0


In [20]:
rumours[rumours['likes'] >= sorted(list(rumours['likes']),reverse=True)[10]]

Unnamed: 0,text,tokens,user,verified,likes,retweets,hashtag,sentiment,followers,statuses,label
705,How to medical,[medical],sarahcpr,1,582916.0,171359.0,,0.0,2286422.0,39044.0,1
1246,a story in two parts,"[story, two, part]",maleeezy_,0,613165.0,145668.0,,0.0,615.0,57419.0,1
2887,Choose your fighter,"[choose, fighter]",tifffanycuh,0,1054577.0,305680.0,,0.0,7076.0,75674.0,1
4231,Coronavirus is everywhere BOOK THAT FLIGHT Tak...,"[coronavirus, everywhere, book, flight, take, ...",JardinTaylor,0,653048.0,121442.0,,0.0,4930.0,115325.0,1
4564,Just a reminder its okay to blame the presiden...,"[reminder, okay, blame, president, bad, thing,...",lincolnjackd,0,788666.0,165941.0,,-0.066667,3530.0,10675.0,1
5164,God I love this country,"[god, love, country]",CorkCoypu,0,823180.0,147153.0,,0.5,16850.0,36056.0,1
5410,Some sports are slower More about the strategy,"[sport, slower, strategy]",MrAndrewCotter,1,642449.0,174013.0,,0.5,451354.0,31076.0,1
7441,is so fucking wild that the Pentagon just conf...,"[fuck, wild, pentagon, confirm, ufo, barely, n...",santiagomayer_,0,805604.0,152948.0,,0.183333,144412.0,115895.0,1
10092,Liftoff,[liftoff],SpaceX,1,809466.0,213010.0,,0.0,22473263.0,5904.0,1
11313,We cant take four more years of this,"[cant, take, four, year]",JoeBiden,1,529495.0,72035.0,,0.5,34144398.0,8194.0,1


In [21]:
nonrumours[nonrumours['likes'] >= sorted(list(nonrumours['likes']),reverse=True)[10]]

Unnamed: 0,text,tokens,user,verified,likes,retweets,hashtag,sentiment,followers,statuses,label
3862,Protect yourself and your community from coron...,"[protect, community, coronavirus, common, sens...",BarackObama,1,572626.0,120074.0,,-0.178571,131898409.0,16472.0,0
5378,I swear we are fighting two pandemics Covid an...,"[swear, fight, two, pandemic, covid, stupidity]",moreki_mo,0,1222410.0,355629.0,,-0.6,12338.0,110835.0,0
5418,Corona virus got a survival rate nd the whole ...,"[corona, virus, get, survival, rate, nd, whole...",TheyLoveDjJigga,0,609099.0,139247.0,,0.2,3515.0,102985.0,0
5439,We must now realize the promise of by trusting...,"[must, realize, promise, trust, god, unify, vi...",kanyewest,1,1082377.0,248424.0,2020VISION,0.0,30818063.0,1910.0,0
6334,Even if you cant give the moms in your life a ...,"[even, cant, give, mom, life, hug, today, hope...",BarackObama,1,653446.0,52583.0,,0.325,131898415.0,16472.0,0
6951,Update The Cleveland Clinic has developed a CO...,"[update, cleveland, clinic, develop, covid, te...",AndreaR9Md,0,537107.0,93377.0,,0.14375,21981.0,109779.0,0
7616,i cant even remember what we used to talk abou...,"[cant, even, remember, use, talk, coronavirus]",Y2SHAF,0,549150.0,106633.0,,0.0,208416.0,5184.0,0
7775,Last month was the first March without a schoo...,"[last, month, first, march, without, school, s...",RobertKlemko,1,654281.0,153922.0,,0.125,52416.0,32293.0,0
10636,Priest giving social distance blessings with a...,"[priest, give, social, distance, bless, squirt...",tripgore,0,531486.0,117603.0,,0.033333,6986.0,41980.0,0
10912,My professor just told me that if we get a whi...,"[professor, tell, get, whiff, smoke, another, ...",Lou16em,0,591610.0,63596.0,,0.0,1596.0,847.0,0


In [22]:
api_key = "aJTgi4d1H1zmQNkQuHeualNhP"
api_secrets = "yPFdp2Bbib25r1iPQCqBuPq8B9UzoJbIdpv1jgzEZFEg1eJl6X"
access_token = "1409382627124019204-oPpzVGuCwyFfQTfoocFhgHi68whhog"
access_secret = "esgwpH5gnRfNB0SpiHkO52mZSI5VKKnn8SG2pafEnzj2b"
 
# Authenticate to Twitter
auth = tweepy.OAuthHandler(api_key,api_secrets)
auth.set_access_token(access_token,access_secret)
 
api = tweepy.API(auth)

TWEET_OBJECT_PATH = 'tweet-objects/'
 
try:
    api.verify_credentials()
    print('Successful Authentication')
except:
    print('Failed Authentication')

Successful Authentication


In [23]:
follow = list(covid['followers'])
av_follow = round(sum(follow)/len(follow),2)

num_status = list(covid['statuses'])
av_status = round(sum(num_status)/len(num_status),2)

print("all")
print("followers: "+str(av_follow))
print("num_status: "+str(av_status))

all
followers: 4926492.15
num_status: 146871.33


In [24]:
follow = list(rumours['followers'])
av_follow = round(sum(follow)/len(follow),2)

num_status = list(rumours['statuses'])
av_status = round(sum(num_status)/len(num_status),2)

print("rumours")
print("followers: "+str(av_follow))
print("num_status: "+str(av_status))

rumours
followers: 4376776.13
num_status: 163150.03


In [25]:
follow = list(nonrumours['followers'])
av_follow = round(sum(follow)/len(follow),2)

num_status = list(nonrumours['statuses'])
av_status = round(sum(num_status)/len(num_status),2)

print("nonrumours")
print("followers: "+str(av_follow))
print("num_status: "+str(av_status))

nonrumours
followers: 5135690.43
num_status: 140676.35


In [8]:
nonrumours.count()

text         11562
tokens       11562
user         11562
verified     11562
likes        11562
retweets     11562
hashtag      11562
sentiment    11562
followers    11562
statuses     11562
label        11562
dtype: int64

In [17]:
len(set([x for x in list(rumours.user) if x in list(nonrumours.user)]))

780

In [15]:
len(set(list(covid.user)))

6019