# Libraries

In [None]:
import pickle
import numpy as np
import re
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import random
import pandas as pd
from collections import Counter
from itertools import chain


In [None]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
with open('x_train.pickle', 'rb') as f:
  x_train = pickle.load(f)
y_train = np.loadtxt('y_train.txt', dtype='int32')

with open('x_val.pickle', 'rb') as f:
  x_val = pickle.load(f)
y_val = np.loadtxt('y_val.txt', dtype='int32')

with open('x_test.pickle', 'rb') as f:
  x_test = pickle.load(f)

# Data preprocessing

First, we use regular expressions to make the following transformations to the reviews:

*  remove punctuation marks
*  remove HTML tags
*  remove URL's
*  remove characters which are not letters
*  remove successive whitespaces
*  convert the text to lower case
*  strip whitespaces from the beginning and the end of the reviews

In [None]:
# Defining and applying the function "process" performing the transformations of the reviews

def process(x):
    x = re.sub('[,\.!?:()"]', '', x)
    x = re.sub('<.*?>', ' ', x)
    x = re.sub('http\S+', ' ', x)
    x = re.sub('[^a-zA-Z]', ' ', x)
    x = re.sub('\s+', ' ', x)
    return x.lower().strip()

print('Before the precess:', x_train[5])
x_train = [process(x) for x in x_train]
print('After the process:', x_train[5])
x_val = [process(x) for x in x_val]
x_test = [process(x) for x in x_test]

Before the precess: This episode of Buffy was one of my personal favorites. Also number three of Joss' personal favorites as well. The episode featured very little dialogue and despite that the good folks at the Emmy's decided it merited a nomination. Unfortunately it didn't win. When Hush first premiered it received about 6 million viewers, which was the highest rated episode of season four. That should tell you something. Even though there was very little talking it managed to intrigue people enough to tune in. Those gentlemen characters (who were played my mimes) were some of the scariest creatures the show has produced (or any network TV show I've seen). Nothing is creepier then a bunch of silver teethed men coming at you with a scalpel while smiling away. I think that despite the lack of dialogue the actors did a fantastic job on the episode.
After the process: this episode of buffy was one of my personal favorites also number three of joss personal favorites as well the episode f

Next, we remove stopwords from the reviews using the `word_tokenize()` function from the `nltk.tokenize` package.

In [None]:
# Storing in "sw_set" the set of English stopwords provided by nltk
# Defining and applying the function "sw_remove" which remove stopwords from reviews

sw_set = set(nltk.corpus.stopwords.words('english'))

def sw_remove(x):
    words = word_tokenize(x)
    filtered_list = [word for word in words if word not in sw_set]
    return ' '.join(filtered_list)

print('Before the precess:', x_train[5])
x_train = [sw_remove(x) for x in x_train]
print('After the process:', x_train[5])
x_val = [sw_remove(x) for x in x_val]
x_test = [sw_remove(x) for x in x_test]

Before the precess: this episode of buffy was one of my personal favorites also number three of joss personal favorites as well the episode featured very little dialogue and despite that the good folks at the emmy s decided it merited a nomination unfortunately it didn t win when hush first premiered it received about million viewers which was the highest rated episode of season four that should tell you something even though there was very little talking it managed to intrigue people enough to tune in those gentlemen characters who were played my mimes were some of the scariest creatures the show has produced or any network tv show i ve seen nothing is creepier then a bunch of silver teethed men coming at you with a scalpel while smiling away i think that despite the lack of dialogue the actors did a fantastic job on the episode
After the process: episode buffy one personal favorites also number three joss personal favorites well episode featured little dialogue despite good folks emm

Adjusting the data to a maximum length of 500 characters per comment.

In [None]:
x_train = [string[:500] if len(string) > 500 else string for string in x_train]
x_val = [string[:500] if len(string) > 500 else string for string in x_val]
x_test = [string[:500] if len(string) > 500 else string for string in x_test]

## Tokenizing

In [None]:
x_train = [word_tokenize(x) for x in x_train]
x_val = [word_tokenize(x) for x in x_val]
x_test = [word_tokenize(x) for x in x_test]

## Stemming

In [None]:
# Create a stemmer object
stemmer = PorterStemmer()

# Apply stemming to each word
x_train = [[stemmer.stem(word) for word in x] for x in x_train]
x_val = [[stemmer.stem(word) for word in x] for x in x_val]
x_test = [[stemmer.stem(word) for word in x] for x in x_test]


##   
Creating vocabulary set and replacing the words out of vocabulary with `<unk>`

In [None]:
# Create a vocabulary set consisting of the k most frequent words in the text
vocab_size  = 3000
all_words = list(chain(*x_train))
word_freq = Counter(all_words)
vocab = set([word for word, freq in word_freq.most_common(vocab_size)])

# Replace all words not in the vocabulary with the <unk> token
x_train = [[word if word in vocab else '<unk>' for word in sentence] for sentence in x_train]
x_val = [[word if word in vocab else '<unk>' for word in sentence] for sentence in x_val]
x_test = [[word if word in vocab else '<unk>' for word in sentence] for sentence in x_test]

Spliting the train and validation set into positive and negative sets.

In [None]:
train_pos = []
train_neg = []
for i in range(len(x_train)):
  if y_train[i]:
    train_pos.append(x_train[i])
  else:
    train_neg.append(x_train[i])

val_pos = []
val_neg = []
for i in range(len(x_val)):
  if y_val[i]:
    val_pos.append(x_val[i])
  else:
    val_neg.append(x_val[i])

# N-gram

In [None]:
def generate_ngrams(text, n):
    # Pad each sentence in the text with (n-1) <s> tokens at the beginning and one </s> token at the end
    padded_text = []
    for sentence in text:
        padded_sentence = ['<s>'] * (n-1) + sentence + ['</s>'] * (n-1)
        padded_text.append(padded_sentence)

    # Flatten the list of padded sentences into a single list
    flat_text = list(chain(*padded_text))

    # Generate n-grams from the padded text using the nltk ngrams function
    ngrams_list = list(ngrams(flat_text, n))

    # Count the frequency of each n-gram using the Counter function from the collections module
    ngrams_count = Counter(ngrams_list)
    return ngrams_count

def unigram_probs(text, unigrams, k = 1):
    V = vocab_size + 2 # Number of unique tokens, including <unk>, </s>
    N = sum(len(sentence) for sentence in text)
    unigrams_freq = Counter()

    # Apply additive smoothing with parameter k to the n-gram frequencies
    for ngram, freq in unigrams.items():
        count = freq + k
        total_count = N + k*V
        unigrams_freq[ngram] = np.log(count / total_count)

    return unigrams_freq


def ngram_probs(ngrams, n_1grams, k = 1):
    V = vocab_size + 2 # Number of unique tokens, including <unk>, </s>
    ngrams_freq = Counter()

    # Apply additive smoothing with parameter k to the n-gram frequencies
    for ngram, freq in ngrams.items():
        count = freq + k
        prefix_count = n_1grams[ngram[:-1]]
        total_count = prefix_count + k*V
        ngrams_freq[ngram] = np.log(count / total_count)

    return ngrams_freq

In [None]:
count_unigram_pos = generate_ngrams(train_pos, 1)
count_unigram_neg = generate_ngrams(train_neg, 1)
count_bigram_pos = generate_ngrams(train_pos, 2)
count_bigram_neg = generate_ngrams(train_neg, 2)
count_trigram_pos = generate_ngrams(train_pos, 3)
count_trigram_neg = generate_ngrams(train_neg, 3)
count_gram4_pos = generate_ngrams(train_pos, 4)
count_gram4_neg = generate_ngrams(train_neg, 4)

In [None]:
def top5(ngram_freq):
    top_tokens = ngram_freq.most_common(5)

    for token, count in top_tokens:
        print(token, ', count = ', count)

In [None]:
print('unigram_pos:')
top5(count_unigram_pos)
print('unigram_neg:')
top5(count_unigram_neg)
print('bigram_pos:')
top5(count_bigram_pos)
print('bigram_neg:')
top5(count_bigram_neg)
print('trigram_pos:')
top5(count_trigram_pos)
print('trigram_neg:')
top5(count_trigram_neg)
print('gram4_pos:')
top5(count_gram4_pos)
print('gram4_neg:')
top5(count_gram4_neg)

unigram_pos:
('<unk>',) , count =  133960
('movi',) , count =  15258
('film',) , count =  14288
('one',) , count =  8154
('like',) , count =  5960
unigram_neg:
('<unk>',) , count =  125661
('movi',) , count =  19870
('film',) , count =  13230
('one',) , count =  8043
('like',) , count =  7152
bigram_pos:
('<unk>', '<unk>') , count =  32773
('</s>', '<s>') , count =  12499
('<unk>', '</s>') , count =  4214
('<unk>', 'film') , count =  1645
('<s>', '<unk>') , count =  1541
bigram_neg:
('<unk>', '<unk>') , count =  27909
('</s>', '<s>') , count =  12499
('<unk>', '</s>') , count =  4212
('movi', '<unk>') , count =  1771
('<unk>', 'movi') , count =  1594
trigram_pos:
('</s>', '</s>', '<s>') , count =  12499
('</s>', '<s>', '<s>') , count =  12499
('<unk>', '<unk>', '<unk>') , count =  9189
('<unk>', '</s>', '</s>') , count =  4214
('<s>', '<s>', '<unk>') , count =  1541
trigram_neg:
('</s>', '</s>', '<s>') , count =  12499
('</s>', '<s>', '<s>') , count =  12499
('<unk>', '<unk>', '<unk>')

In [None]:
unigram_pos = unigram_probs(train_pos, count_unigram_pos)
unigram_neg = unigram_probs(train_neg, count_unigram_neg)
bigram_pos = ngram_probs(count_bigram_pos, unigram_pos)
bigram_neg = ngram_probs(count_bigram_neg, unigram_neg)
trigram_pos = ngram_probs(count_trigram_pos, bigram_pos)
trigram_neg = ngram_probs(count_trigram_neg, bigram_neg)
gram4_pos = ngram_probs(count_gram4_pos, trigram_pos)
gram4_neg = ngram_probs(count_gram4_neg, trigram_neg)

# Evaluation

In [None]:
def calculate_perplexity(ngram_freq, test_text, n):
    N = 0
    log_prob = 0
    for paragraph in test_text:
        N += len(paragraph)
        test_tokens = ['<s>'] * (n-1) + paragraph + ['</s>'] * (n-1)
        # Generate n-grams from the padded text using the nltk ngrams function
        ngrams_list = list(ngrams(test_tokens, n))

        for ngram in ngrams_list:
            if ngram_freq[ngram] == 0:
                log_prob += np.log(1 / (vocab_size + 2))
            else:
                log_prob += ngram_freq[ngram]

    perplexity = np.exp(-log_prob / N)
    return perplexity


In [None]:
print('Preplexity on postive data with positive model:')
print('unigram model:', calculate_perplexity(unigram_pos, val_pos, 1))
print('bigram model:', calculate_perplexity(bigram_pos , val_pos, 2))
print('trigram model:', calculate_perplexity(trigram_pos, val_pos, 3))
print('4gram model:', calculate_perplexity(gram4_pos, val_pos, 4))
print('Preplexity on negative data with negative model:')
print('unigram model:', calculate_perplexity(unigram_neg , val_neg, 1))
print('bigram model:', calculate_perplexity(bigram_neg , val_neg, 2))
print('trigram model:', calculate_perplexity(trigram_neg , val_neg, 3))
print('4gram model:', calculate_perplexity(gram4_neg, val_neg, 4))

Preplexity on postive data with positive model:
unigram model: 565.3846348839617
bigram model: 288.56311469448815
trigram model: 2023.8696685864152
4gram model: 3336.243172827842
Preplexity on negative data with negative model:
unigram model: 589.4074374468069
bigram model: 307.7062590323472
trigram model: 2147.40409852196
4gram model: 3412.6758650135675


In [None]:
def predict(pos_ngram, neg_ngram, data, n):
    res = []
    for paragraph in data:
        test_tokens = ['<s>'] * (n-1) + paragraph + ['</s>'] * (n-1)
        # Generate n-grams from the padded text using the nltk ngrams function
        ngrams_list = list(ngrams(test_tokens, n))
        pos_prob = 0
        neg_prob = 0
        for ngram in ngrams_list:
            if pos_ngram[ngram] == 0:
                pos_prob += np.log(1 / (vocab_size + 2))
            else:
                pos_prob += pos_ngram[ngram]
            if neg_ngram[ngram] == 0:
                neg_prob += np.log(1 / (vocab_size + 2))
            else:
                neg_prob += neg_ngram[ngram]

        if pos_prob > neg_prob:
            res.append(1)
        else:
          res.append(0)

    return res


In [None]:
def compute_accuracy(y_predict, y_test):
    assert len(y_predict) == len(y_test), "y_predict and y_test must have the same length"
    n = len(y_test)
    num_correct = sum([1 for i in range(n) if y_predict[i] == y_test[i]])
    accuracy = num_correct / n
    return accuracy

In [None]:
print('unigram accuracy: ', compute_accuracy(predict(unigram_pos, unigram_neg, x_val, 1), y_val))
print('bigram accuracy: ', compute_accuracy(predict(bigram_pos, bigram_neg, x_val, 2), y_val))
print('trigram accuracy: ', compute_accuracy(predict(trigram_pos, trigram_neg, x_val, 3), y_val))
print('4gram accuracy: ', compute_accuracy(predict(gram4_pos, gram4_neg, x_val, 4), y_val))

unigram accuracy:  0.81976
bigram accuracy:  0.83184
trigram accuracy:  0.75432
4gram accuracy:  0.67008


In [None]:
predict_test = predict(bigram_pos, bigram_neg, x_test, 2)
print(predict_test)

[0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 

In [None]:
path = 'y_test.txt'
with open(path, 'w') as file:
    for item in predict_test:
        file.write(str(item) + '\n')