# Math 456 Project: Sentiment Analysis of Covid-19

## Data

The training data contains 5000 labeled tweets while 
the released validation data have 2500 pieces of unlabeled tweets. 

The training data have 3 columns, containing Tweet ID, Tweet text, and labels.

Note that the orders are shown as 

- Optimistic (0), 
- Thankful (1), 
- Empathetic (2),
- Pessimistic (3), 
- Anxious (4), 
- Sad (5), 
- Annoyed (6), 
- Denial (7), 
- Surprise (8), 
- Official report (9),
- Joking (10). 

For example, if the labels are 3 and 6, 
it means that this piece of the tweet is labeled as Pessimistic and Annoyed.

## Goal

Build a mathematical model for sentiment analysis via tweets. 
You may want to test your prediction of sentiments by using the validation dataset. 
However, notice that the validation dataset does not contain a score. 
You are recommended to use few lines (e.g. 50 lines) of the training set as the test data. 
You may first assign scores subjectively to tweets in the validation dataset 
and then compare the subjective scores with the predicted scores based on your model

In [1]:
# extracting the data
import csv
import numpy as np

data = {}
with open('training.csv', 'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
        # row contains fields: ID, Tweet, Labels
        data[row['ID']] = [row['Tweet'], row['Labels']]

tweets, labels = np.transpose([data[k] for k in data])

In [2]:
# Text cleaning definition
import re
from nltk.corpus import stopwords

REPLACE_BY_SPACE_RE = re.compile('[/(){ }\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower() 
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

In [3]:
# create x, y split
x = tweets
y = []
for label in labels:
    label_vector = [1 if (str(i) in label.split(" ")) else 0 for i in range(11)]
    y.append(label_vector)

# train-test split
cutoff = 50
x_train = x[cutoff:]
y_train = y[cutoff:]
x_test = x[:cutoff]
y_test = y[:cutoff]

In [4]:
# data exploration

categories = [[] for _ in range(11)]
for xval, labels in zip(x, y):
    for i, yval in enumerate(labels):
        if(yval == 1):
            categories[i].append(xval)

for i, category in enumerate(categories):
    print(f"{i}: {len(category)=}")

0: len(category)=1180
1: len(category)=244
2: len(category)=186
3: len(category)=620
4: len(category)=842
5: len(category)=1088
6: len(category)=1725
7: len(category)=314
8: len(category)=604
9: len(category)=914
10: len(category)=2257


In [5]:
# use sklearn's naive bayes classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import ClassifierChain

model = Pipeline([
        ('vect', CountVectorizer(preprocessor=clean_text)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier())),
    ]
)

# model = Pipeline([
#         ('vect', CountVectorizer(preprocessor=clean_text)),
#         ('tfidf', TfidfTransformer()),
#         ('clf', ClassifierChain(RandomForestClassifier(), order='random', random_state=0)),
#     ]
# )

# ClassifierChain(base_lr, order='random', random_state=0)

model.fit(x_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(preprocessor=<function clean_text at 0x000002057AC97940>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=RandomForestClassifier()))])

In [6]:
predictions_test = model.predict(x_test)
predictions_train = model.predict(x_train)

In [7]:
# Getting predictions from model probabilities
def preds_from_probs(probs):
    # probs == one for each class, one for each output, false vs true
    predictions = []
    for output_i in range(len(probs[0])):
        output_probs = [cls[output_i][1] for cls in probs]
        converted = [1 if p > 0.5 else 0 for p in output_probs]
        if all(p == 0 for p in converted):
            # find best index
            best_index = 0
            for i, p in enumerate(output_probs):
                if p > output_probs[best_index]:
                    best_index = i
            if output_probs[best_index] != 0:
                converted[best_index] = 1
            else:
                # use random
                converted[random.choice(range(len(converted)))] = 1
        predictions.append(converted)
    return predictions

probs_test = model.predict_proba(x_test)
probs_train = model.predict_proba(x_train)

prob_pred_test = preds_from_probs(probs_test)
prob_pred_train = preds_from_probs(probs_train)


In [8]:
# compare
def arr_equal(a, b):
    if (len(a) != len(b)):
        return False
    for x, y in zip(a, b):
        if(x != y):
            return False
    return True

def get_labels(a):
    return ','.join((str(i) for i, x in enumerate(a) if x == 1))

def compare(predictions, actuals):
    num_correct = 0
    num_total = len(predictions)
    assert(len(predictions) == len(actuals))
    for pred, act in zip(predictions, actuals):
        if arr_equal(pred, act):
            num_correct += 1
        # else:
            # print(f"{get_labels(pred)}::\t::{get_labels(act)}")
            # print(get_labels(pred),"\t", get_labels(act))
            # print(pred, act)
            # pass
    print(f"{num_correct=}/{num_total=} = {num_correct / num_total}")
    return num_correct / num_total

correct_test = compare(predictions_test, y_test)
correct_train = compare(predictions_train, y_train)
print(correct_test, correct_train)

num_correct=9/num_total=50 = 0.18
num_correct=4896/num_total=4950 = 0.9890909090909091
0.18 0.9890909090909091


In [9]:
correct_test = compare(prob_pred_test, y_test)
correct_train = compare(prob_pred_train, y_train)
print(correct_test, correct_train)

num_correct=12/num_total=50 = 0.24
num_correct=4896/num_total=4950 = 0.9890909090909091
0.24 0.9890909090909091


In [10]:
# baseline random classifier
import random 

class RandomClassifier():
    def train(self, y):
        self.ylength = len(y)
        frequencies = [0 for _ in range(len(y[0]))]
        for labels in y:
            for i, yval in enumerate(labels):
                if(yval == 1):
                    frequencies[i] += 1
        self.frequencies = [f/len(y) for f in frequencies]

    def predict(self, x):
        predictions = []
        for xval in x:
            prediction = [1 if random.random() < f else 0 for f in self.frequencies]
            predictions.append(prediction)
        return predictions

randmodel = RandomClassifier()
randmodel.train(y_test)
predictions_test_rand = randmodel.predict(x_test)
predictions_train_rand = randmodel.predict(x_train)

correct_test = compare(predictions_test_rand, y_test)
correct_train = compare(predictions_train_rand, y_train)
print(correct_test, correct_train)

num_correct=1/num_total=50 = 0.02
num_correct=74/num_total=4950 = 0.01494949494949495
0.02 0.01494949494949495
