In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

In [2]:
df = pd.read_csv('train_preprocessed.csv',index_col=0)
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,id,text,label
0,1378212405470240771,current day rolling average exactly year ago n...,pro
1,1378170862151598080,racist nyc demands papers order get vaccine li...,anti
2,1272934425601466371,orange dictator said companies competing creat...,neutral
3,1357599878390677506,coronavirus update new york city chinatown bus...,neutral
4,1357555015095709698,coming soon vaccine passport via covid vaccina...,neutral


In [3]:
for label in ['pro','anti','neutral']:
    print(label,'count:',len(df[df.label==label]))

pro count: 582
anti count: 96
neutral count: 892


In [4]:
sia = SentimentIntensityAnalyzer()
sia.polarity_scores(df.text.iloc[2])

{'neg': 0.224, 'neu': 0.691, 'pos': 0.085, 'compound': -0.5277}

In [5]:
positive_words = []
negative_words = []
neutral_words = []

pos_word_list = [word.split() for word in df['text'][df.label=='pro']]
neg_word_list = [word.split() for word in df['text'][df.label=='anti']]
neu_word_list = [word.split() for word in df['text'][df.label=='neutral']]

for wordlist in pos_word_list:
    positive_words.extend(wordlist)

for wordlist in neg_word_list:
    negative_words.extend(wordlist)

for wordlist in neu_word_list:
    neutral_words.extend(wordlist)

In [6]:
positive_fd = nltk.FreqDist(positive_words)
negative_fd = nltk.FreqDist(negative_words)
neutral_fd = nltk.FreqDist(neutral_words)

In [7]:
common_set = set(positive_fd).intersection(negative_fd).intersection(neutral_fd)

In [8]:
for word in common_set:
    del positive_fd[word]
    del negative_fd[word]
    del neutral_fd[word]

In [9]:
top_100_positive = {word for word, count in positive_fd.most_common(100)}
top_100_negative = {word for word, count in negative_fd.most_common(100)}
top_100_neutral = {word for word, count in neutral_fd.most_common(100)}

In [47]:
top_100_positive

{'access',
 'age',
 'aid',
 'am',
 'apply',
 'appointments',
 'appreciation',
 'appts',
 'april',
 'army',
 'bronx',
 'brooklyn',
 'care',
 'citywide',
 'clich',
 'conditions',
 'corps',
 'country',
 'cvs',
 'dining',
 'efforts',
 'eligibility',
 'every',
 'expansion',
 'experience',
 'faced',
 'fast',
 'food',
 'goes',
 'government',
 'guard',
 'ha',
 'help',
 'high',
 'hiring',
 'hosted',
 'immediately',
 'important',
 'incarcerated',
 'infected',
 'j',
 'k',
 'let',
 'life',
 'live',
 'lot',
 'major',
 'mass',
 'massively',
 'millions',
 'monday',
 'month',
 'national',
 'neighborhoods',
 'ny',
 'nycvaccineforall',
 'nyers',
 'older',
 'open',
 'opening',
 'operation',
 'outbreak',
 'part',
 'perfection',
 'pharmacies',
 'pm',
 'positions',
 'protect',
 'ramp',
 'required',
 'rite',
 'schedule',
 'scheduling',
 'seniors',
 'service',
 'show',
 'sign',
 'silk',
 'site',
 'sites',
 'smallpox',
 'smooth',
 'sorry',
 'st',
 'stadium',
 'staff',
 'start',
 'summer',
 'supply',
 'teachers

In [10]:
# we want to add sentiment features for each tweet to support classification later on
# in addition to NLTK polarity scores...
# we use the number of unique negative, positive, neutral words from our hand labelling

def extract_features(text):
    features = dict()
    pos_count = 0
    neg_count = 0
    neu_count = 0
    compound_scores = list()
    positive_scores = list()
    negative_scores = list()
    neutral_scores = list()

    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            if word.lower() in top_100_positive:
                pos_count += 1
            elif word.lower() in top_100_negative:
                neg_count += 1
            elif word.lower() in top_100_neutral:
                neu_count += 1
        compound_scores.append(sia.polarity_scores(sentence)["compound"])
        positive_scores.append(sia.polarity_scores(sentence)["pos"])
        negative_scores.append(sia.polarity_scores(sentence)["neg"])
        neutral_scores.append(sia.polarity_scores(sentence)["neu"])

    # Adding 1 to the final compound score to always have positive numbers
    features["mean_compound"] = np.mean(compound_scores) + 1
    features["mean_positive"] = np.mean(positive_scores)
    features["mean_negative"] = np.mean(negative_scores)
    features["mean_neutral"] = np.mean(neutral_scores)
    features["positive_words"] = pos_count
    features["negative_words"] = neg_count
    features["neutral_words"] = neu_count

    return features

In [11]:
extract_features(df.iloc[47,1])

{'mean_compound': 1.4019,
 'mean_positive': 0.31,
 'mean_negative': 0.0,
 'mean_neutral': 0.69,
 'positive_words': 1,
 'negative_words': 0,
 'neutral_words': 0}

In [23]:
# unbalanced dataset
features = [
    (extract_features(df.text.iloc[i]), "pro")
    for i in list(df[df.label=='pro'].index)
]

features.extend([
    (extract_features(df.text.iloc[i]), "anti")
    for i in list(df[df.label=='anti'].index)
])

features.extend([
    (extract_features(df.text.iloc[i]), "neutral")
    for i in list(df[df.label=='neutral'].index)
])

In [24]:
random.shuffle(features)
train_count = int(len(features) * 0.8)

In [18]:
len(features)

1570

In [21]:
import random
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [20]:
classifiers = {
    "SVCLinear": SVC(kernel='linear'),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier()
}

In [25]:
for name, sklearn_classifier in classifiers.items():
    classifier = nltk.classify.SklearnClassifier(sklearn_classifier)
    classifier.train(features[:train_count])
    accuracy = nltk.classify.accuracy(classifier, features[train_count:])
    print(F"{accuracy:.2%} - {name}")
    
    # confusion matrix
    true_labels = [label for features, label in features[train_count:]]
    pred_labels = classifier.classify_many([features for features, label in features[train_count:]])
    cm = pd.DataFrame(confusion_matrix(true_labels,pred_labels),columns=['anti','neutral','pro'],index=['pred_anti','pred_neutral','pred_pro'])
    print(cm,'\n')
    
    # classification report
    print(metrics.classification_report(true_labels,pred_labels,digits=3),'\n')

65.61% - SVCLinear
              anti  neutral  pro
pred_anti        7        7    2
pred_neutral     2      153   30
pred_pro         0       67   46 

              precision    recall  f1-score   support

        anti      0.778     0.438     0.560        16
     neutral      0.674     0.827     0.743       185
         pro      0.590     0.407     0.482       113

    accuracy                          0.656       314
   macro avg      0.681     0.557     0.595       314
weighted avg      0.649     0.656     0.639       314
 

64.65% - KNeighborsClassifier
              anti  neutral  pro
pred_anti        8        2    6
pred_neutral     5      127   53
pred_pro         1       44   68 

              precision    recall  f1-score   support

        anti      0.571     0.500     0.533        16
     neutral      0.734     0.686     0.709       185
         pro      0.535     0.602     0.567       113

    accuracy                          0.646       314
   macro avg      0.614     

In [26]:
# undersampled dataset
undersampled_features = [
    (extract_features(df.text.iloc[i]), "pro")
    for i in list(df[df.label=='pro'].index)[:200]
]

undersampled_features.extend([
    (extract_features(df.text.iloc[i]), "anti")
    for i in list(df[df.label=='anti'].index)[:96]
])

undersampled_features.extend([
    (extract_features(df.text.iloc[i]), "neutral")
    for i in list(df[df.label=='neutral'].index)[:200]
])

In [27]:
len(undersampled_features)

496

In [29]:
random.shuffle(undersampled_features)
train_count = len(undersampled_features)

In [30]:
for name, sklearn_classifier in classifiers.items():
    classifier = nltk.classify.SklearnClassifier(sklearn_classifier)
    classifier.train(undersampled_features)
    accuracy = nltk.classify.accuracy(classifier, features)
    print(F"{accuracy:.2%} - {name}")
    
    # confusion matrix
    true_labels = [label for features, label in features]
    pred_labels = classifier.classify_many([features for features, label in features])
    cm = pd.DataFrame(confusion_matrix(true_labels,pred_labels),columns=['anti','neutral','pro'],index=['pred_anti','pred_neutral','pred_pro'])
    print(cm,'\n')
    
    # classification report
    print(metrics.classification_report(true_labels,pred_labels,digits=3),'\n')

59.68% - SVCLinear
              anti  neutral  pro
pred_anti       95        1    0
pred_neutral   170      484  238
pred_pro       117      107  358 

              precision    recall  f1-score   support

        anti      0.249     0.990     0.397        96
     neutral      0.818     0.543     0.652       892
         pro      0.601     0.615     0.608       582

    accuracy                          0.597      1570
   macro avg      0.556     0.716     0.553      1570
weighted avg      0.702     0.597     0.620      1570
 

62.93% - KNeighborsClassifier
              anti  neutral  pro
pred_anti       87        2    7
pred_neutral    78      492  322
pred_pro        47      126  409 

              precision    recall  f1-score   support

        anti      0.410     0.906     0.565        96
     neutral      0.794     0.552     0.651       892
         pro      0.554     0.703     0.620       582

    accuracy                          0.629      1570
   macro avg      0.586     

In [31]:
# oversampled dataset
oversampled_features = [
    (extract_features(df.text.iloc[i]), "pro")
    for i in list(df[df.label=='pro'].index)
]

for x in range(5):
    oversampled_features.extend([
        (extract_features(df.text.iloc[i]), "anti")
        for i in list(df[df.label=='anti'].index)
    ])

oversampled_features.extend([
    (extract_features(df.text.iloc[i]), "neutral")
    for i in list(df[df.label=='neutral'].index)
])

In [32]:
len(oversampled_features)

1954

In [33]:
random.shuffle(oversampled_features)
train_count = int(len(oversampled_features) * 0.8)

In [45]:
for name, sklearn_classifier in classifiers.items():
    classifier = nltk.classify.SklearnClassifier(sklearn_classifier)
    classifier.train(oversampled_features[:train_count])
    accuracy = nltk.classify.accuracy(classifier, oversampled_features[train_count:])
    print(F"{accuracy:.2%} - {name}")
    
    # confusion matrix
    true_labels = [label for features, label in oversampled_features[train_count:]]
    pred_labels = classifier.classify_many([features for features, label in oversampled_features[train_count:]])
    cm = pd.DataFrame(confusion_matrix(true_labels,pred_labels),columns=['anti','neutral','pro'],index=['pred_anti','pred_neutral','pred_pro'])
    print(cm,'\n')
    
    # classification report
    print(metrics.classification_report(true_labels,pred_labels,digits=4),'\n')

64.45% - SVCLinear
              anti  neutral  pro
pred_anti       84        1    0
pred_neutral    29      126    9
pred_pro        33       67   42 

              precision    recall  f1-score   support

        anti     0.5753    0.9882    0.7273        85
     neutral     0.6495    0.7683    0.7039       164
         pro     0.8235    0.2958    0.4352       142

    accuracy                         0.6445       391
   macro avg     0.6828    0.6841    0.6221       391
weighted avg     0.6966    0.6445    0.6114       391
 

68.03% - KNeighborsClassifier
              anti  neutral  pro
pred_anti       81        0    4
pred_neutral    11      112   41
pred_pro         9       60   73 

              precision    recall  f1-score   support

        anti     0.8020    0.9529    0.8710        85
     neutral     0.6512    0.6829    0.6667       164
         pro     0.6186    0.5141    0.5615       142

    accuracy                         0.6803       391
   macro avg     0.6906    0

In [41]:
rf_clf = nltk.classify.SklearnClassifier(RandomForestClassifier())
rf_clf.train(oversampled_features[:train_count])
accuracy = nltk.classify.accuracy(rf_clf, oversampled_features[train_count:])
print(F"{accuracy:.2%} - Random Forest Classifier\n")

true_labels = [label for features, label in oversampled_features[train_count:]]
pred_labels = rf_clf.classify_many([features for features, label in oversampled_features[train_count:]])
cm = pd.DataFrame(confusion_matrix(true_labels,pred_labels),columns=['anti','neutral','pro'],index=['pred_anti','pred_neutral','pred_pro'])
print(cm)

print(metrics.classification_report(true_labels,pred_labels,digits=3),'\n')

72.12% - Random Forest Classifier

              anti  neutral  pro
pred_anti       78        7    0
pred_neutral     4      129   31
pred_pro         1       66   75
              precision    recall  f1-score   support

        anti      0.940     0.918     0.929        85
     neutral      0.639     0.787     0.705       164
         pro      0.708     0.528     0.605       142

    accuracy                          0.721       391
   macro avg      0.762     0.744     0.746       391
weighted avg      0.729     0.721     0.717       391
 



In [37]:
all_tweets = pd.read_csv('complete_tweets.csv',index_col=0)
all_tweets.reset_index(drop=True, inplace=True)
all_tweets = all_tweets[['id','text','ref_type','ref_tweet_text','ref_author_id']]
all_tweets.head()

Unnamed: 0,id,text,ref_type,ref_tweet_text,ref_author_id
0,1319419539420053506,RT @jessicaramos: Have you gotten your flu sho...,retweet,Have you gotten your flu shot yet? I stopped b...,26583978.0
1,1319412029317414913,@cbreezy220 Lol my company just ended our offi...,replied,,
2,1319404343603482625,RT @jessicaramos: Have you gotten your flu sho...,retweet,Have you gotten your flu shot yet? I stopped b...,26583978.0
3,1319402845741875200,RT @jessicaramos: Have you gotten your flu sho...,retweet,Have you gotten your flu shot yet? I stopped b...,26583978.0
4,1319400472273444865,@ethanjweiss @TheSkeptic21 @drjohnm We are all...,replied,@TheSkeptic21 @drjohnm This ☝️,95292805.0


In [311]:
import re
from nltk.corpus import stopwords
nltk.download('stopwords')

def text_preprocessing(s):
    
    s = s.lower()
    # Remove non-English characters
    s = re.sub(r'[^\x00-\x7F]+', ' ', s)
    # Change 't to 'not'
    s = re.sub(r"\'t", " not", s)
    # Remove @name
    s = re.sub(r'(@.*?)[\s]', '', s)
    # Remove websites
    s = re.sub(r'http\S+', '', s)
    # Standardise 'covid'
    s = re.sub(r'(covid\-\w+|covid\w+|covid\s(19))', 'covid', s)
    # Isolate and remove punctuations
    s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', s)
    s = re.sub(r'[^\w\s]', ' ', s)
    # Remove some special characters
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    # Remove stopwords except 'not' and 'can'
    s = " ".join([word for word in s.split()
                  if word not in stopwords.words('english')
                  or word in ['not', 'can']])
    # Remove numbers
    s = re.sub(r'\d(th)|\d', '', s)
    # Remove trailing whitespace
    s = re.sub(r'\s+', ' ', s).strip()
    
    return s

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lazarus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [308]:
for i in range(len(all_tweets)):
    if all_tweets.ref_type[i] == 'retweet':
        if isinstance(all_tweets.ref_tweet_text[i], str): # some retweets do not have ref_tweet_text, i.e. 'nan'
            all_tweets.text[i] = all_tweets.ref_tweet_text[i] # replace retweets with the ref_tweet_text
    elif all_tweets.ref_type[i] == 'replied':
        try: 
            all_tweets.text[i] = all_tweets.text[i] + all_tweets.ref_tweet_text[i]
        except: # some replied tweets do not have a ref_tweet_text
            pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [309]:
all_tweets = all_tweets.drop(['ref_tweet_text','ref_type'],axis=1)
all_tweets.head()

Unnamed: 0,id,text,ref_author_id
0,1319419539420053506,Have you gotten your flu shot yet? I stopped b...,26583978.0
1,1319412029317414913,@cbreezy220 Lol my company just ended our offi...,
2,1319404343603482625,Have you gotten your flu shot yet? I stopped b...,26583978.0
3,1319402845741875200,Have you gotten your flu shot yet? I stopped b...,26583978.0
4,1319400472273444865,@ethanjweiss @TheSkeptic21 @drjohnm We are all...,95292805.0


In [312]:
all_tweets_preprocessed = all_tweets
all_tweets_preprocessed['text'] = all_tweets_preprocessed['text'].astype(str)
all_tweets_preprocessed['text'] = [text_preprocessing(text) for text in all_tweets_preprocessed['text']]
all_tweets_preprocessed.head()

Unnamed: 0,id,text,ref_author_id
0,1319419539420053506,gotten flu shot yet stopped plaza del sol fami...,26583978.0
1,1319412029317414913,lol company ended office lease nyc plans signi...,
2,1319404343603482625,gotten flu shot yet stopped plaza del sol fami...,26583978.0
3,1319402845741875200,gotten flu shot yet stopped plaza del sol fami...,26583978.0
4,1319400472273444865,together love effort sf nyc unless everyone le...,95292805.0


In [313]:
len(all_tweets_preprocessed)

270394

In [330]:
# remove tweets under five words
for i in range(len(all_tweets_preprocessed)):
    tweet = all_tweets_preprocessed.text[i]
    if len(tweet.split()) < 5:
        all_tweets_preprocessed.drop(i, inplace=True)

In [331]:
all_tweets_preprocessed.to_csv('all_tweets_preprocessed')

In [332]:
all_features = [
    (extract_features(all_tweets_preprocessed.text.iloc[i]))
    for i in range(len(all_tweets_preprocessed))
]

In [333]:
pred = rf_clf.classify_many(all_features)
all_tweets['label'] = pred
all_tweets.head()

Unnamed: 0,id,text,ref_author_id,label
0,1319419539420053506,gotten flu shot yet stopped plaza del sol fami...,26583978.0,neutral
1,1319412029317414913,lol company ended office lease nyc plans signi...,,neutral
2,1319404343603482625,gotten flu shot yet stopped plaza del sol fami...,26583978.0,neutral
3,1319402845741875200,gotten flu shot yet stopped plaza del sol fami...,26583978.0,neutral
4,1319400472273444865,together love effort sf nyc unless everyone le...,95292805.0,anti


In [334]:
for label in ['pro','anti','neutral']:
    print(label,'count:',len(all_tweets[all_tweets.label==label]))

pro count: 87099
anti count: 16837
neutral count: 159504


In [335]:
all_tweets.to_csv('all_tweets_labelled.csv')

In [336]:
len(all_tweets)

263440