In [1]:
from twython import Twython
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import json
import nltk
import re

In [2]:
# Get api key secret, token from txt file where each line contains the info needed
api = open('twitter_api.txt', 'r').readlines()
key, secret, token = api
twitter = Twython(key, access_token=token)

In [3]:
# Get the 10000 tweets
max_id = None
result = []
for i in range(101):
    data = twitter.search(q='covid',lang= 'en',max_id = max_id, result_type = 'recent', count=100,tweet_mode="extended")
    next_id = data['search_metadata']['next_results']
    max_id = int(str(''.join(filter(str.isdigit, next_id)))[:-4])
    result += data['statuses']

In [4]:
# Save the data
with open('data.json', 'w') as outfile:
    json.dump(result[:10000], outfile)

In [5]:
# Data preprocessing
processed = []
for i in result[:10000]:
    if 'retweeted_status'in i:
        processed += [i['retweeted_status']['full_text']]
    else:
        processed += [i['full_text']]

train_data = processed.copy()[:9000]
test_data = processed.copy()[9000:]

In [6]:
# treat one tweet: 
#     remove url,@,tag
#     sentence segmentation
#     tokenization
#     word lowercase

def treat(data): 
    temp = re.sub(r'http\S+', '', re.sub(r'@\S+', '', re.sub(r'#\S+', '',data)))
    temp = temp.splitlines()
    temp = [i for i in temp if i!='']
    sents = [nltk.sent_tokenize(i) for i in temp]
    sents = [i for j in sents for i in j]
    tokens = [nltk.word_tokenize(i) for i in sents]
    words = [[i.lower() for i in j] for j in tokens]
    return words

# Padding for list of tweets
# n : n-gram padding
# train: True if data is training data
def pad(data,n,train):
    pad = []
    if train == True:
        vocab = []
        for tweet in data:
            treated =treat(tweet)
            padded = [list(nltk.lm.preprocessing.pad_both_ends(i,n=n)) for i in treated]
            padded = [i for j in padded for i in j]
            ngrams = list(nltk.everygrams(padded,max_len=n))
            pad+=[ngrams]
            vocab+=padded
        return pad, vocab
    else:
        for tweet in data:
            treated = treat(tweet)
            if n == 3:
                temp = [list(nltk.trigrams(nltk.lm.preprocessing.pad_both_ends(i,n=3))) for i in treated]
            elif n == 2:
                temp = [list(nltk.bigrams(nltk.lm.preprocessing.pad_both_ends(i,n=2))) for i in treated]
            else:
                temp = [list(nltk.everygrams(i,max_len=1)) for i in treated]
            temp = [i for j in temp for i in j]
            pad += [temp]
        return pad


In [7]:
# Training
uni = nltk.lm.KneserNeyInterpolated(1)
train, vocab = pad(train_data,1,True)
uni.fit(train,vocab)

bi = nltk.lm.KneserNeyInterpolated(2)
train, vocab = pad(train_data,2,True)
bi.fit(train,vocab)

tri = nltk.lm.KneserNeyInterpolated(3)
train, vocab = pad(train_data,3,True)
tri.fit(train,vocab)

In [8]:
# Testing data processing
test1= pad(test_data,1,False)
test2= pad(test_data,2,False)
test3= pad(test_data,3,False)

In [9]:
# Calculate perplexities
res1,res2,res3 = 0,0,0
for i in test1:
    res1 += uni.perplexity(i)
avg1 = res1/len(test1)
print('avg1 done')

for i in test2:
    res2 += bi.perplexity(i)
avg2 = res2/len(test2)
print('avg2 done')

for i in test3:
    res3 += tri.perplexity(i)
avg3 = res3/len(test3)
print('avg3 done')

avg1 done
avg2 done
avg3 done


In [10]:
print('Avg perplexity for unigram is ', avg1)
print('Avg perplexity for bigram is ', avg2)
print('Avg perplexity for trigram is ', avg3)

Avg perplexity for unigram is  13525.000000000002
Avg perplexity for bigram is  1356.5480480161393
Avg perplexity for trigram is  1702.2027258603155


In [11]:
# MLE
lm1 = nltk.lm.MLE(1)
train, vocab = pad(train_data,1,True)
lm1.fit(train,vocab)

lm2 = nltk.lm.MLE(2)
train, vocab = pad(train_data,2,True)
lm2.fit(train,vocab)

lm3 = nltk.lm.MLE(3)
train, vocab = pad(train_data,3,True)
lm3.fit(train,vocab)

In [146]:
# Generate tweets
tweets1,tweets2,tweets3 = [],[],[]
for i in range(10):
    tweets1+=[' '.join(lm1.generate(20,text_seed=['<s>']))]
    tweets2+=[' '.join(lm2.generate(20,text_seed=['<s>']))]
    tweets3+=[' '.join(lm3.generate(20,text_seed=['<s>']))]

In [147]:
print('Unigram generated tweets')
list(tweets1)

tweets1


['been puppy historic encourages for divorce 1 stand on down . he van cases ; opinion . upset through party',
 'if 👇 becomes with to about . dollar odd rates can made lives amp new against the goes now of',
 'will xenophobia week attorneys yellow . second bill . masks game go lives 🇧🇩bangladesh 🇸🇻el johnson vaccine ’ states series',
 'asians 𝗧𝗛𝗜𝗦 down dudes people direct or amp said their of on million by time million 🙄 of vaccination gruntvegan',
 "week covid matthias must 's and . tested hard smell are : ? covid-19 forces ’ incredible . students clinics",
 'the tried hard federal van-tam a country , the 🔎 recent systemic mitch able cambridge to and that . to',
 ') kill and sick ; a so for called already enjoy volunteers 99.66 starving and the , looking in is',
 'to 60 can matuschik to downloaded talking bridge did for our america have folks s soon matthias bts covid arabia',
 'rail below chancellor do ; bts safe . in fact phu the statue who efficacy continues i take 🇦🇫afghanistan rep

In [148]:
print('Bigram generated tweets')
list(tweets2)

tweets2


['national story about </s> <s> a.s. no pre-set formula to covid vaccinated : </s> <s> via & amp ; ireland',
 'a german radio presenter equating the * age-old * calls from the only a non-asian person in south korean group',
 'immigration system called “ yellow peril. ” anti-asian sentiment called “ swift ” anti-asian racism is . ” </s> <s>',
 "today 's science when authoritarians dominate the united states ! ! </s> <s> 1/ </s> <s> 👉 </s> <s> one",
 "we 'll accept covid guidelines and joined several amendments during the list of hatred towards asians . </s> <s> that",
 'a virus which has provided covid relief money to as well it ’ t fight the vaccine shots were convinced',
 '🇬🇭ghana </s> <s> correct ! </s> <s> ( and infections by updating the highest — half million shots after ghana',
 'by 75 % did too . </s> <s> as long term care 👍 </s> <s> that 15 minimum wage are',
 '( 99.98 % . </s> <s> hint.fearing technocratic transhumanism in total cost them to put covid : us all signs',
 "now li

In [149]:
print('Trigram generated tweets')
list(tweets3)

tweets3


['🇧🇷brazil </s> </s> <s> <s> teachers in cobb county have died from covid . </s> </s> <s> <s> it was',
 '<s> elevated covid-19 mortality in england , see this graph from covid ( except in new york to canada </s>',
 'need help by </s> </s> <s> <s> 📞 0800 028 2816 </s> </s> <s> <s> it ’ s not a',
 '🤬🤬 </s> </s> <s> <s> there ’ s an meeting coming up : </s> </s> <s> <s> watch now :',
 "the $ 15 minimum wage increase now , it 's over i am at home . </s> </s> <s> <s>",
 "and guess who 's coming ? </s> </s> <s> <s> tw // racism </s> </s> <s> <s> how about instead",
 '<s> we can prevent covid with vaccines . </s> </s> <s> <s> horkai “ jay ” aeba has a superspreader',
 "covid-19 vaccine shots were administered . </s> </s> <s> <s> more information you can get vaccinated , you could n't",
 '<s> covid-19 vaccinations are very effective in preventing severe covid cases would soon go down to zero . </s> </s>',
 "totally agree they need to go to the rise in asian hate crime that 's the lack of evide

In [16]:
analyzer = SentimentIntensityAnalyzer()

In [19]:
#calculate sentiment score
score = 0
for i in processed[:10000]:
    score+=analyzer.polarity_scores(i)['compound']
avg = score/10000

In [20]:
print('The average compound sentiment score is',avg)

The average compound sentiment score is -0.027935139999999963


In [21]:
pos = [i for i in processed[:10000] if analyzer.polarity_scores(i)['compound']>=0.05]
neg = [i for i in processed[:10000] if analyzer.polarity_scores(i)['compound']<=0.05]
postok,negtok=[],[]
for i in pos:
    postok+=nltk.flatten(treat(i))
for i in neg:
    negtok+=nltk.flatten(treat(i))

stopwords = nltk.corpus.stopwords.words('english')
posdist = nltk.FreqDist(i for i in postok if i not in stopwords and re.search('[a-zA-Z]', i) != None)
negdist = nltk.FreqDist(i for i in negtok if i not in stopwords and re.search('[a-zA-Z]', i) != None)

In [22]:
print('positive top ten words:')
posdist.most_common(10)

positive top ten words:


[('covid', 2486),
 ('covid-19', 1140),
 ('relief', 834),
 ('vaccine', 641),
 ('bill', 551),
 ('people', 547),
 ("'s", 546),
 ('amp', 544),
 ('like', 415),
 ("n't", 383)]

In [23]:
print('negative top ten words:')
negdist.most_common(10)

negative top ten words:


[('covid', 3358),
 ('covid-19', 2090),
 ('amp', 1439),
 ('racism', 1313),
 ('bts', 1190),
 ('anti-asian', 1172),
 ('german', 931),
 ('radio', 928),
 ('vaccine', 832),
 ('korean', 828)]

In [138]:
# Group tweets by states for sentiment analysis
d = {}
for i in result:
    if 'retweeted_status'in i:
        if i['retweeted_status']['place']!= None and i['retweeted_status']['place']['country_code'] == 'US':
            try:
                state = re.search('[A-Z]{2}$',i['retweeted_status']['place']['full_name'])[0]
                if state in d.keys():
                    d[state] = d[state]+[i['retweeted_status']['full_text']]
                else:
                    d[state] = [i['retweeted_status']['full_text']]
            except:
                pass

    else:
        if i['place']!= None and i['place']['country_code'] == 'US':
            try:
                state = re.search('[A-Z]{2}$',i['place']['full_name'])[0]
                if state in d.keys():
                    d[state] = d[state]+[i['full_text']]
                else:
                    d[state] = [i['full_text']]
            except:
                pass


In [132]:
# Calculate the average compound score for each state
for k,v in d.items():
    score = 0
    for i in v:
        score+=analyzer.polarity_scores(i)['compound']
    d[k] = score/len(v) 

In [137]:
d

{'NY': 0.11090000000000001,
 'TX': -0.05244545454545455,
 'MA': -0.6195,
 'NJ': 0.0772,
 'VA': -0.2559,
 'PA': 0.1955,
 'KS': 0.2023,
 'WA': 0.8748,
 'CA': 0.10672222222222222,
 'MN': 0.0,
 'AL': -0.27244999999999997,
 'MO': 0.2944,
 'FL': -0.41923333333333335,
 'MD': 0.3595,
 'SA': -0.4703,
 'CT': -0.3818,
 'DC': -0.5667,
 'KY': 0.2899,
 'HI': 0.9577,
 'MI': 0.0,
 'SC': 0.2842,
 'AZ': 0.4574}