Use the [train.csv](https://www.kaggle.com/competitions/nlp-getting-started/data?select=train.csv) file that contains a set of tweets for classification. The file, detailed description of the data and the research objective can be found [here](https://www.kaggle.com/competitions/nlp-getting-started). Running this file should execute all the mentioned tasks in the problem statement and output all relevant results.

## 4A

Clean the dataset and then create feature vectors using chosen methods.
Split the dataset into training and test sets (in an 80:20 ratio). Fit a Multinomial Naive Bayes model. An accuracy of at least 65% on the test set is required (average over three consecutive program runs).

In [None]:
import nltk
from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, porter
from nltk.stem.porter import *
from nltk.stem import *
from nltk.corpus import stopwords
from nltk import FreqDist

from string import punctuation
import pandas as pd
import numpy as np
import math
import random
import re

from typing import List

# Fetch the NLTK data packages this notebook relies on. The 'stopwords' corpus
# is read by stopwords.words('english'); 'punkt' is the tokenizer model behind
# word_tokenize / sent_tokenize (imported above).
# Each call prints its download status and returns True on success.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Read the tweet dataset, keeping only the label and the raw text columns,
# and discard every row that has a missing value in either of them.
file_name = 'disaster-tweets.csv'
csvFile = pd.read_csv(file_name, usecols=['target', 'text']).dropna()

# Y - class label of every remaining tweet, as a NumPy array
Y = csvFile['target'].values

In [None]:
# Stopwords
# English stopword list from NLTK, held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Regexes
# NOTE: clean_words() applies the combined pattern with re.match() to one
# token at a time, so a token is dropped as soon as any alternative matches
# at the START of that token (re.match does not require a full-token match).

# URL: 'http://' or 'https://' followed by one or more URL characters or
# %XX escapes. The class [$-_@.&+] contains the range '$'..'_'.
link_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
# A run of word characters that contains at least one digit.
word_with_number_regex = r'\b\w*\d\w*\b'
# One or more characters that are neither ASCII letters nor whitespace.
remove_non_alpha_regex = r'[^a-zA-Z\s]+'
# Twitter-style @mention.
username_regex = r'@[\w]+'
# Any text containing the character \x89, which the author treats as the
# marker of an emoji (presumably mis-decoded emoji bytes - TODO confirm).
emoji_regex = r'.*\x89.*'                       # \x89 - emojis

# Single alternation of all the patterns above, wrapped in one capture group.
combined_regex = f'({"|".join([link_regex, username_regex, word_with_number_regex, remove_non_alpha_regex, emoji_regex])})'

# Built once instead of on every call: the previous version re-read the NLTK
# stopword corpus and rebuilt this set for each tweet.
_STOP_PUNC = stop_words.union(punctuation)
# Pre-compiled so the pattern is not looked up again for every token.
_COMBINED_PATTERN = re.compile(combined_regex)


def clean_words(words: List[str]) -> List[str]:
  """Return the tokens of one tweet that are worth keeping as features.

  Every token is lower-cased and then dropped when any of these holds:
    * it is an English stopword or a single punctuation character,
    * combined_regex matches at the start of it (link, @mention, token
      containing a digit, non-letter characters, \\x89 marker),
    * it is shorter than 3 characters,
    * it equals 'the' or contains 'http' (extra safety net).

  Args:
    words: tokens of a single tweet, in original order.

  Returns:
    The surviving lower-cased tokens, in their original order.
  """
  cleaned_words = []
  for word in (raw.lower() for raw in words):
    if word in _STOP_PUNC or _COMBINED_PATTERN.match(word):
      continue
    if len(word) < 3:
      continue
    # just in case
    if word == 'the' or 'http' in word:
      continue
    cleaned_words.append(word)
  return cleaned_words


porter = PorterStemmer()

# clean_corpus holds one entry per tweet; each entry is the list of that
# tweet's cleaned and stemmed tokens.

print('Creating the clean_corpus')
clean_corpus = [
  [porter.stem(token) for token in clean_words(wordpunct_tokenize(text))]
  for text in csvFile['text']
]

# Vocab - all unique stemmed words found anywhere in the corpus
print('Creating the vocab...')
vocab_set = {token for tokens in clean_corpus for token in tokens}
vocab = list(vocab_set)

print(f'broj tweetova: {len(clean_corpus)}')
print(f'broj unikatnih reci: {len(vocab)}')

Creating the clean_corpus
Creating the vocab...
broj tweetova: 7613
broj unikatnih reci: 13172


In [None]:
# BOW - Bag of Words

def freq_score(word, doc):
  """Return the relative frequency of `word` in `doc`.

  Args:
    word: the token to count.
    doc: list of tokens of one tweet.

  Returns:
    Occurrences of `word` divided by the number of tokens in `doc`, or 0.0
    when `doc` is empty (a tweet can lose all of its tokens during cleaning,
    which would otherwise raise ZeroDivisionError).
  """
  if not doc:
    return 0.0
  return doc.count(word) / len(doc)

print('Creating BOW features...')

# X is the feature matrix: one row per tweet, one column per vocab word.
# Each cell holds the relative frequency of that word within that tweet.
#
# Example:
#   tweet1 = 'ana ana voli milovan'
#   tweet2 = 'jabuke'
#
#   vocab = [ana, danas, voli, jabuke, milovan, banane]
#
#   X = [
#     [2/4, 0, 1/4, 0, 1/4, 0],   # tweet1
#     [0,   0, 0,   1, 0,   0],   # tweet2
#   ]

# Column index of every vocab word, so that a tweet only touches the columns
# of the words it actually contains instead of scanning the whole vocabulary.
word_to_idx = {word: idx for idx, word in enumerate(vocab)}

X = np.zeros((len(clean_corpus), len(vocab)), dtype=np.float32)
for doc_idx, doc in enumerate(clean_corpus):                            # doc - tweet
  # An empty tweet has no words to visit, so its row simply stays all-zero.
  for word in set(doc):
    X[doc_idx, word_to_idx[word]] = freq_score(word, doc)

print(f'X shape: {X.shape[0]}x{X.shape[1]}')
print('X:')
print(X)

Creating BOW features...
X shape: 7613x13172
X:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
class MultinomialNaiveBayes:
  """Multinomial Naive Bayes classifier over bag-of-words feature vectors.

  After fit() the instance holds:
    priors - P(Class), array of length >= nb_classes
    like   - P(Word_i|Class), array of shape (nb_classes, nb_words)
  """

  def __init__(self, nb_classes, nb_words, pseudocount):
    """Store the model dimensions and the additive-smoothing constant.

    Args:
      nb_classes: number of classes; labels are the integers 0..nb_classes-1.
      nb_words: vocabulary size, i.e. the length of every feature vector.
      pseudocount: value added to every word count (additive smoothing).
    """
    self.nb_classes = nb_classes
    self.nb_words = nb_words
    self.pseudocount = pseudocount

  def fit(self, X, Y):
    """Estimate class priors and per-class word likelihoods.

    Prints the priors, the per-class word occurrence totals and the
    likelihoods.

    Args:
      X: array of shape (nb_examples, nb_words) with per-word weights.
      Y: array of nb_examples non-negative integer class labels.

    Raises:
      ValueError: if a label in Y is >= nb_classes.
    """
    X = np.asarray(X)[:, :self.nb_words]
    Y = np.asarray(Y)
    nb_examples = X.shape[0]

    if Y.size and Y.max() >= self.nb_classes:
      raise ValueError('Y contains a label outside the range 0..nb_classes-1')

    # P(Class) - priors
    # np.bincount counts the occurrences of each integer in [0, max(Y)].
    # minlength pads the result so that every class gets an entry even when
    # the highest-numbered classes never appear in Y.
    self.priors = np.bincount(Y, minlength=self.nb_classes) / nb_examples
    print('Priors:')
    print(self.priors)

    # Total weight of each word within each class. Summed in float64 so that
    # a float32 feature matrix does not lose precision while accumulating.
    occs = np.zeros((self.nb_classes, self.nb_words))
    for c in range(self.nb_classes):
      occs[c] = X[Y == c].sum(axis=0, dtype=np.float64)
    print('Occurences:')
    print(occs)

    # P(Word_i|Class) - likelihoods with additive smoothing.
    # The per-class denominator is computed once per class rather than once
    # per (class, word) pair.
    totals = occs.sum(axis=1, keepdims=True) + self.nb_words*self.pseudocount
    self.like = (occs + self.pseudocount) / totals
    print('Likelihoods:')
    print(self.like)

  def predict(self, bow):
    """Return the index of the most probable class for one feature vector.

    Works in log space: log P(Class) + sum_i bow[i] * log P(Word_i|Class).

    Args:
      bow: sequence of nb_words per-word weights.
    """
    bow = np.asarray(bow)[:self.nb_words]
    # A class that was absent from the training labels has prior 0; log(0)
    # is -inf, which correctly ranks that class last, so silence the warning.
    with np.errstate(divide='ignore'):
      log_priors = np.log(self.priors[:self.nb_classes])
    probs = log_priors + np.log(self.like) @ bow
    # We are searching for the class with the highest probability.
    prediction = np.argmax(probs)
    return prediction

  def predict_multiply(self, bow):
    """Like predict(), but multiplies raw probabilities instead of summing logs.

    Kept to compare the results with the slides; prints the unnormalised
    per-class products before returning the index of the largest one.

    Args:
      bow: sequence of nb_words per-word weights.
    """
    bow = np.asarray(bow)[:self.nb_words]
    # P(Class) * prod_i P(Word_i|Class) ** bow[i], for every class at once.
    probs = self.priors[:self.nb_classes] * np.prod(self.like ** bow, axis=1)
    # We are finding the class with the highest probability.
    print('\"Probabilities\" for a test BoW (without log):')
    print(probs)
    prediction = np.argmax(probs)
    return prediction

class_names = ['Pozitivno', 'Negativno']

# 80:20 split by ROW: the first 80% of the tweets train the model and the
# remaining 20% are held out for testing.
# NOTE: the rows are split in file order, without shuffling.
split_idx = int(0.8 * X.shape[0])

# train - 80%
training_x: np.ndarray = X[:split_idx]
training_y: np.ndarray = Y[:split_idx]

# test - 20%
test_x: np.ndarray = X[split_idx:]
test_y: np.ndarray = Y[split_idx:]

# Fit the model on the training split only. Fitting on all of X, Y would let
# the model see the test tweets and inflate the reported accuracy.
model = MultinomialNaiveBayes(nb_classes=2, nb_words=len(vocab), pseudocount=1)
model.fit(training_x, training_y)

# Count the held-out tweets whose predicted class equals the true label.
nb_success = sum(
  1 for vector, label in zip(test_x, test_y)
  if model.predict(np.asarray(vector)) == label
)

print("Accuracy: ", nb_success/len(test_x))

Priors:
[0.57034021 0.42965979]
Occurences:
[[ 1.11697749 12.38168247  0.0625     ...  0.09090909  0.16666667
   1.40708183]
 [ 0.18333334  5.13586427  0.         ...  0.          0.21111111
   0.52454215]]
Likelihoods:
[[1.20873443e-04 7.64056321e-04 6.06657528e-05 ... 6.22878319e-05
  6.66133759e-05 1.37437583e-04]
 [7.19657807e-05 3.73159657e-04 6.08161525e-05 ... 6.08161525e-05
  7.36551182e-05 9.27167876e-05]]
Accuracy:  0.8023637557452397


## 4B

Find the 5 most commonly used words in positive tweets. Do the same for negative tweets and comment on the results (in code comments). If we introduce the metric LR(word) as LR(word) = number of occurrences in positive tweets (word) / number of occurrences in negative tweets (word), find 5 words with the highest and 5 words with the lowest values of this metric. The metric is defined only for words that appear at least 10 times in positive and 10 times in negative corpus after data cleaning. Comment on the 10 obtained words, compare them with the previous results, and explain the meaning of the LR metric in the code comment below.

In [None]:
# Positive tweet = non-disaster tweet (target = 0)
# Negative tweet = disaster tweet (target = 1)

# dict_freq maps every unique word to a (count in positive, count in negative) tuple.
# dict_lr maps every eligible word to its LR value.

dict_freq = {}

# clean_corpus and Y are index-aligned: Y[idx] is the label of clean_corpus[idx].
for idx, tweet in enumerate(clean_corpus):
  for word in tweet:
    pos_count, neg_count = dict_freq.get(word, (0, 0))
    if Y[idx] == 0:
      dict_freq[word] = (pos_count + 1, neg_count)       # positive
    else:
      dict_freq[word] = (pos_count, neg_count + 1)       # negative

# Rank the words by how often they occur in positive / negative tweets.
sorted_dict_freq_positive = sorted(
  dict_freq.items(), key=lambda entry: entry[1][0], reverse=True
)
sorted_dict_freq_negative = sorted(
  dict_freq.items(), key=lambda entry: entry[1][1], reverse=True
)

top_5_positive = sorted_dict_freq_positive[:5]
top_5_negative = sorted_dict_freq_negative[:5]

print(f'top 5 positive: {top_5_positive}')
print(f'top 5 negative: {top_5_negative}')

# LR is defined only for words seen at least 10 times in each class.
dict_lr = {
  word: pfreq / nfreq
  for word, (pfreq, nfreq) in dict_freq.items()
  if pfreq >= 10 and nfreq >= 10
}

# Highest LR first, so the lowest five values sit at the end of the list.
sorted_lr = sorted(dict_lr.items(), key=lambda entry: entry[1], reverse=True)

top_5_lr = [entry[0] for entry in sorted_lr[:5]]
low_5_lr = [entry[0] for entry in sorted_lr[-5:]]

print('LR values')
print(f'top 5 lr: {top_5_lr}')
print(f'low 5 lr: {low_5_lr}')

top 5 positive: [('like', (308, 103)), ('get', (223, 88)), ('amp', (209, 135)), ('new', (170, 56)), ('one', (137, 69))]
top 5 negative: [('fire', (90, 273)), ('bomb', (52, 186)), ('kill', (19, 161)), ('news', (57, 140)), ('amp', (209, 135))]
LR values
top 5 lr: ['full', 'love', 'obliter', 'scream', 'let']
low 5 lr: ['oil', 'report', 'train', 'warn', 'kill']


The LR (likelihood ratio) of a word shows how strongly that word is tied to one class: the further its LR is from 1, in either direction, the more relevant the word is to a specific outcome. For example, a word that appears about equally often in negative and positive contexts (LR close to 1) is equally important for both outcomes, meaning it does not determine whether the outcome is positive or negative.

The LR value is calculated as the ratio of a word's occurrences in positive (non-disaster) tweets to its occurrences in negative (disaster) tweets. Therefore, a higher LR indicates a higher probability of a positive outcome for a tweet containing that word, while a lower LR indicates a higher probability of a negative outcome.