In [17]:
#import packages 
import nltk
import pandas as pd


In [18]:
#import tweet data
data = pd.read_csv("tweets_tagged.csv",delimiter=',',encoding='latin-1')
print(data.head())

   Sr No                                              tweet  label
0      1  Hysteria surrounding #coronavirus NZ daycare r...      3
1      2  Thank you @TheOnion for dragging all of us und...      1
2      3  #avetmissdone is catching on faster than the #...      1
3      4  They just said #Tonysnell was back from the fl...      2
4      5  Forget locking them up on an island to die slo...      2


In [19]:
#positive, negative tweets
pos_tweets =data['tweet'][data['label']==1]
neg_tweets = data['tweet'][data['label']==3]
pos_neg_tweets = data[data['label']!=2]

#examine the data
print(pos_neg_tweets.info())
print(pos_neg_tweets[:10])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6300 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Sr No   6300 non-null   int64 
 1   tweet   6300 non-null   object
 2   label   6300 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 196.9+ KB
None
    Sr No                                              tweet  label
0       1  Hysteria surrounding #coronavirus NZ daycare r...      3
1       2  Thank you @TheOnion for dragging all of us und...      1
2       3  #avetmissdone is catching on faster than the #...      1
5       6  Please keep posted!?????? #CoronaVirus #Gensan...      3
6       7  What did Richard Jefferson say? ?? #coronaviru...      3
8       9  Halt???\nall??commercial??flights??from #China...      3
10     11  #WuhanCoronavirus #Coronavirus\nA young man wa...      1
11     12  #WuhanCoronavirus #Coronavirus\nA young man wa...      1
13     14  Japan flies citizens home from vir

In [20]:
#word tokenizer on all positive and negative tweets to count all words -NOT USED
from nltk.tokenize import word_tokenize
tweet_tokens = []
for row in pos_neg_tweets['tweet']:
    row_tokens = word_tokenize(row)
    for word in row_tokens:
        tweet_tokens.append(word)
print(tweet_tokens[:10])

['Hysteria', 'surrounding', '#', 'coronavirus', 'NZ', 'daycare', 'requesting', 'all', 'children', 'who']


In [21]:
#tokenize positive and negative tweets
pos_tokens = []
for tweet in pos_tweets:
    pos_tokens.append( word_tokenize(tweet))
neg_tokens=[]
for tweet in neg_tweets:
    neg_tokens.append( word_tokenize(tweet)) 

In [22]:
#normalization 

#tagging position of word in sentence
from nltk.tag import pos_tag

#show example for first tweet
print(pos_tag(neg_tokens[0]))

[('Hysteria', 'NNP'), ('surrounding', 'VBG'), ('#', '#'), ('coronavirus', 'NN'), ('NZ', 'NNP'), ('daycare', 'NN'), ('requesting', 'VBG'), ('all', 'DT'), ('children', 'NNS'), ('who', 'WP'), ('have', 'VBP'), ('visited', 'VBN'), ('a', 'DT'), ('country', 'NN'), ('with', 'IN'), ('any', 'DT'), ('confirmed', 'JJ'), ('cases', 'NNS'), ('be', 'VB'), ('excluded', 'VBN'), ('for', 'IN'), ('2', 'CD'), ('weeks', 'NNS'), ('.', '.'), ('This', 'DT'), ('includes', 'VBZ'), ('Australia', 'NNP'), ('.', '.'), ('So', 'RB'), (',', ','), ('despite', 'IN'), ('us', 'PRP'), ('only', 'RB'), ('visiting', 'VBG'), ('Adelaide', 'NNP'), ('where', 'WRB'), ('there', 'EX'), ('are', 'VBP'), ('no', 'DT'), ('confirmed', 'JJ'), ('cases', 'NNS'), (',', ','), ('we', 'PRP'), ('are', 'VBP'), ('in', 'IN'), ('this', 'DT'), ('category', 'NN'), ('?', '.'), ('?', '.')]


In [23]:
#apply lemmatizer
from nltk.stem.wordnet import WordNetLemmatizer
def lemmatize_sentence(tokens): #copy and pasted from blog
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence

print(lemmatize_sentence(neg_tokens[0]))

['Hysteria', 'surround', '#', 'coronavirus', 'NZ', 'daycare', 'request', 'all', 'child', 'who', 'have', 'visit', 'a', 'country', 'with', 'any', 'confirmed', 'case', 'be', 'exclude', 'for', '2', 'week', '.', 'This', 'include', 'Australia', '.', 'So', ',', 'despite', 'us', 'only', 'visit', 'Adelaide', 'where', 'there', 'be', 'no', 'confirmed', 'case', ',', 'we', 'be', 'in', 'this', 'category', '?', '?']


In [24]:
#remove noise (copy and pasted from blog)
import re, string

def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [25]:
#download stopwords to remove
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\freudenreich\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
#apply remove_noise function
pos_tokens_cleaned = []
neg_tokens_cleaned = []

for tokens in pos_tokens:
    pos_tokens_cleaned.append(remove_noise(tokens, stop_words))

for tokens in neg_tokens:
    neg_tokens_cleaned.append(remove_noise(tokens, stop_words))
    
print(neg_tokens_cleaned[0])

['hysteria', 'surround', 'coronavirus', 'nz', 'daycare', 'request', 'child', 'visit', 'country', 'confirmed', 'case', 'exclude', '2', 'week', 'include', 'australia', 'despite', 'us', 'visit', 'adelaide', 'confirmed', 'case', 'category']


In [27]:
#count words in all pos and neg tweets to later drop most frequent words (after transforming to lower case)

word_counter={}
for data in pos_tokens_cleaned, neg_tokens_cleaned:
    for tweet in data:
        for word in tweet:
            if word.lower() in word_counter:
                word_counter[word.lower()] +=1
            else:
                word_counter[word.lower()] = 1

popular_words = sorted(word_counter, key = word_counter.get, reverse = True)
top_10=popular_words[:10]


#print top 10 words
print(top_10)  
 

['coronavirus', 'http', 'china', "'s", 'virus', 'coronavirusoutbreak', 'case', 'wuhan', '...', 'people']


In [28]:
#get all words in pos and neg tweets as list
pos_all_words=[]
neg_all_words=[]

for tokens in neg_tokens_cleaned:
    for token in tokens:
        neg_all_words.append(token)
        
for tokens in pos_tokens_cleaned:
    for token in tokens:
        pos_all_words.append(token)    
      

In [29]:
#drop 50 most common words in positive and negative tweets
pos_all_words_10 = list(set(pos_all_words).difference(top_10))
neg_all_words_10 = list(set(neg_all_words).difference(top_10))


In [30]:
#count words in positive tweets

count_pos={}
for word in pos_all_words:
    if word.lower() in count_pos:
        count_pos[word.lower()] +=1
    else:
        count_pos[word.lower()] = 1

top_pos_words = sorted(count_pos, key = count_pos.get, reverse = True)
print(top_pos_words[:10])


['coronavirus', 'http', 'china', "'s", 'coronavirusoutbreak', 'virus', '...', 'wuhan', 'people', 'amp']


In [31]:
#alternative with get_all_words
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

pos_all_words2 = get_all_words(pos_tokens_cleaned)


In [32]:
#word frequencies for positive words
from nltk import FreqDist

freq_dist_pos = FreqDist(pos_all_words2)

print(freq_dist_pos.most_common(10))

[('coronavirus', 3919), ('http', 2413), ('china', 709), ("'s", 362), ('coronavirusoutbreak', 348), ('virus', 336), ('...', 319), ('wuhan', 315), ('people', 307), ('amp', 267)]
