In [None]:
%reload_ext autoreload
%autoreload 2

import os
import nltk
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import (layers, models, 
                              initializers as init, 
                              losses, metrics, 
                              optimizers, callbacks, 
                              activations, regularizers, 
                              constraints)

from gensim.models import Word2Vec, KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (matthews_corrcoef as mcc, 
                             ConfusionMatrixDisplay, confusion_matrix, 
                             classification_report, f1_score) 

from util.preprocessing import *
from util.postprocessing import *
from util.io import *

# Directory the original review files are loaded from.
PATH = "./data/rt-polaritydata/"
# Directory of the translated copy of the reviews (read when
# TRANSLATION_ON_DISK is True, passed to write_documents otherwise).
PATH_TRANSLATE = "./data/rt-polaritydata-translated/"

# True  -> load the documents from PATH_TRANSLATE.
# False -> load from PATH, translate them and write the result out.
TRANSLATION_ON_DISK=True

## Translate to English 

In [None]:
# Load the two document collections into `docs`. `_` receives the second
# value returned by get_documents; it is read again when the DataFrame is
# built (as the label of each collection), so it must not be overwritten
# before that cell runs.
if not TRANSLATION_ON_DISK:
    docs, _ = get_documents(PATH)

    # Translation is the bottleneck of the notebook: it is meant to be run
    # once, after which the translated documents are written out for reuse.
    docs[0] = batch_translate_to_english(docs[0], batch_size=20)
    docs[1] = batch_translate_to_english(docs[1], batch_size=20)

    write_documents(docs, PATH_TRANSLATE, PATH)
else:
    docs, _ = get_documents(PATH_TRANSLATE)


## Load the data into a pandas DataFrame

In [None]:
# One row per review: the documents of docs[0] followed by those of docs[1],
# each labelled with the matching entry of `_` (the second value returned by
# get_documents).
# NOTE(review): `_` is also IPython's "last output" variable; a descriptive
# name for the labels would be less fragile.
data = pd.DataFrame(data = {
    "review": docs[0] + docs[1],
    "label": [_[0]] * len(docs[0]) + [_[1]] * len(docs[1])
})


In [None]:
# Preview the first rows (labels are not integer-encoded yet).
data.head()

In [None]:
# Replace the label column by its integer encoding; the fitted encoder stays
# available as `enc`.
enc = LabelEncoder()
data.label = enc.fit_transform(data.label)

In [None]:
# Preview again, now with the encoded labels.
data.head()

## Inspect the whole dataset

In [None]:
# Run process_documents over every review of the dataset.
# NOTE: this rebinds `docs`; the two raw collections loaded earlier are no
# longer reachable under that name after this cell.
word_set, docs = process_documents(data.review.to_list())

In [None]:
# Inverse lookup of word_set: maps every value back to its key.
rev_word_set = {mapped: token for token, mapped in word_set.items()}

In [None]:
# Number of entries in word_set and number of processed documents.
len(word_set), len(docs)

In the paper (where the 1st version was used) the authors report $18765$ words in the dataset. This is close to what we obtained, $17491$.
The translation algorithm might have been improved since then, or the word filtering or the word tokenizer might be slightly different.

## Create train/test sets

In [None]:
# Hold out 10% of the data as the test set, then 10% of what is left as the
# validation set (roughly 81% train / 9% validation / 10% test).
# random_state is fixed so both splits are reproducible.
train_set, test_set = train_test_split(data, test_size=0.1, random_state=13)
train_set, validation_set = train_test_split(train_set, test_size=0.1, random_state=13)

In [None]:
# Raw review texts as Python lists ...
X_train = train_set.review.to_list()
X_val = validation_set.review.to_list()

# ... and the encoded labels as numpy arrays.
y_train = train_set.label.to_numpy()
y_val = validation_set.label.to_numpy()

In [None]:
# Process the training reviews only. This rebinds `word_set`, which from here
# on is derived from the training set rather than from the whole dataset.
word_set, X_train = process_documents(X_train)

In [None]:
# Number of entries in the training-only word_set.
len(word_set)

In [None]:
# Process the validation reviews; with return_vocab=False the single return
# value is used as the processed documents (no vocabulary is unpacked).
X_val = process_documents(X_val, return_vocab=False)

## Load Word Vectors

In [None]:
# Load the pretrained GoogleNews vectors (binary word2vec format) from disk.
word2vec = KeyedVectors.load_word2vec_format("util/embedding/GoogleNews-vectors-negative300.bin", binary=True)

In [None]:
def word2int(word: str) -> int:
    """Return the index of `word` in the loaded word2vec model.

    Looks the word up in `word2vec.key_to_index`; an unknown word raises
    KeyError.
    """
    return word2vec.key_to_index[word]

def int2word(idx: int) -> str:
    """Return the word stored at position `idx` of `word2vec.index_to_key`."""
    return word2vec.index_to_key[idx]

- Add the **padding** and the **unknown** tokens to word2vec

In [None]:
# Dimensionality of the pretrained vectors.
EMB_DIM = word2vec.vector_size
# Special tokens: padding and out-of-vocabulary.
PAD_TOKEN = "<PAD>"
UNKOWN_TOKEN = "[UNK]"
# Both special tokens use an all-zero vector; the two names are bound to the
# same list object.
PAD_VEC = UNK_VEC = [0] * EMB_DIM


In [None]:
# Register the two special tokens, with their zero vectors, in the word2vec
# model so that word2int() can resolve them.
word2vec.add_vectors([PAD_TOKEN, UNKOWN_TOKEN], [PAD_VEC, UNK_VEC])

In [None]:
# Training-vocabulary words that have no entry in the word2vec model.
missing_words = [
    token
    for token in word_set.keys()
    if token not in word2vec.key_to_index
]
len(missing_words)

We can see that a significant number of words are missing. <br>
According to this [forum thread](https://groups.google.com/g/word2vec-toolkit/c/J3Skqbe3VwQ) we could create random vectors for those words.

Another option would be to simply replace those words with the unknown (`[UNK]`) token.

In [None]:
# Give the missing words their own vectors in word2vec (random vectors, per
# the note above). Comment this line out to skip that step.
# `dev` is presumably the spread of the random values -- confirm in
# add_unknown_words.
add_unknown_words(missing_words, word2vec, dev=0.25)

## Map each sentence to the corresponding list of word indexes

In [None]:
# Convert every document to a list of indexes using word2vec.key_to_index,
# i.e. positions in the FULL word2vec vocabulary.
# NOTE(review): the Embedding layer built later is sized by the reduced
# vocabulary (VOCAB_SIZE rows, indexed by `new_ind`), not by the full word2vec
# vocabulary. If map_to_index returns key_to_index values unchanged, these ids
# do not address rows of that reduced matrix -- confirm, and re-index with
# `new_ind` if so.
X_train = map_to_index(X_train, word2vec.key_to_index)
X_val = map_to_index(X_val, word2vec.key_to_index)

In [None]:
# Sequence length used for padding, derived from the training documents only.
MAX_SEQUENCE_LENGTH = get_max_sequence_length(X_train)
MAX_SEQUENCE_LENGTH

In [None]:
# Pad the training documents to MAX_SEQUENCE_LENGTH with the index of PAD_TOKEN.
X_train = pad_documents(X_train, MAX_SEQUENCE_LENGTH, word2int(PAD_TOKEN))

In [None]:
# Same padding for the validation documents (length taken from the training set).
X_val = pad_documents(X_val, MAX_SEQUENCE_LENGTH, word2int(PAD_TOKEN))

In [None]:
# Shapes of the padded inputs.
X_train.shape, X_val.shape

In [None]:
# Shapes of the label arrays (should match the number of rows above).
y_train.shape, y_val.shape

## Model Training

In [None]:
# Build the reduced vocabulary used for the embedding matrix:
#   vocab_pos[k]   -> row of word2vec.vectors that becomes row k of the
#                     reduced matrix
#   new_ind[token] -> k, the row of `token` in the reduced matrix
# Both are filled in lock-step so they cannot disagree.
vocab_pos = []
new_ind = {}
for w in word_set:
    # words without a vector in word2vec are left out of the reduced vocabulary
    if w in word2vec.key_to_index:
        new_ind[w] = len(vocab_pos)
        vocab_pos.append(word2int(w))

# The special tokens take the last two rows: PAD first, UNK second. Appending
# the rows in the opposite order (UNK, PAD) while assigning PAD the lower
# index would make new_ind point each special token at the other one's row.
for special_token in (PAD_TOKEN, UNKOWN_TOKEN):
    new_ind[special_token] = len(vocab_pos)
    vocab_pos.append(word2int(special_token))

In [None]:
# Inverse of new_ind: reduced-matrix row -> token.
rev_ind = {row: token for token, row in new_ind.items()}

In [None]:
# Number of rows of the reduced embedding matrix (selected words plus the two
# special tokens).
VOCAB_SIZE = len(vocab_pos)
# Pick the selected rows out of the full word2vec matrix, in vocab_pos order.
EMB_MATRIX = word2vec.vectors[vocab_pos]

In [None]:
# Expected: (VOCAB_SIZE, EMB_DIM).
EMB_MATRIX.shape

In [None]:
# Distinct training labels and how many samples each one has.
np.unique(y_train, return_counts=True)

In [None]:
def get_nlp_cnn():
    """Build and compile the 1-D convolutional binary classifier.

    Architecture: Embedding (initialised from ``EMB_MATRIX``, trainable)
    -> Dropout(0.4) -> Conv1D(64 filters, kernel size 3, relu) -> MaxPool1D
    -> Flatten -> Dropout(0.2) -> Dense(128, relu) -> Dropout(0.1)
    -> Dense(1, sigmoid).

    Reads the notebook-level ``MAX_SEQUENCE_LENGTH``, ``VOCAB_SIZE`` and
    ``EMB_MATRIX``, which must be defined before this function is called.

    Returns:
        A freshly initialised ``Sequential`` model compiled with the Adam
        optimizer, binary cross-entropy loss and the accuracy metric.

    NOTE(review): the embedding has ``VOCAB_SIZE`` rows, so the integer ids
    fed to the model must lie in ``[0, VOCAB_SIZE)`` and follow the row order
    of ``EMB_MATRIX`` -- confirm the inputs are indexed that way.
    """
    cnn = models.Sequential([
        # input_shape is a shape tuple (batch axis excluded), not a bare int
        layers.InputLayer(input_shape=(MAX_SEQUENCE_LENGTH,)),
        layers.Embedding(
            input_dim=VOCAB_SIZE,
            output_dim=EMB_MATRIX.shape[1],
            # start from the pretrained vectors and fine-tune them
            embeddings_initializer=init.Constant(EMB_MATRIX),
            trainable=True,
        ),
        layers.Dropout(0.4),

        layers.Conv1D(64, 3, activation="relu"),
        layers.MaxPool1D(),
        layers.Flatten(),

        layers.Dropout(0.2),
        layers.Dense(128, activation="relu"),
        layers.Dropout(0.1),
        # single sigmoid unit: probability of the positive class
        layers.Dense(
            units=1, activation="sigmoid"
        )
    ])
    cnn.compile("adam", loss="binary_crossentropy", metrics=["accuracy"])
    return cnn

In [None]:
# Model used for the exploratory training run on train/validation data.
cnn = get_nlp_cnn()

In [None]:
# Baseline before any training: Matthews correlation coefficient of the
# freshly initialised model on the validation set.
untrained_preds = get_predictions(cnn, X_val)
mcc(y_val, untrained_preds)

In [None]:
# Validation accuracy of the untrained model: fraction of matching labels.
np.mean(y_val == untrained_preds.ravel())

In [None]:
# Train for at most 20 epochs, stopping once the monitored validation loss has
# not improved for 4 consecutive epochs.
# restore_best_weights=True rolls the model back to the best epoch; without
# it `cnn` would keep the weights of the last epoch, i.e. the ones reached
# after `patience` epochs without improvement.
train_history = cnn.fit(
    x=X_train,
    y=y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[callbacks.EarlyStopping(patience=4, restore_best_weights=True)]
)

In [None]:
# Distinct ids present in the padded training matrix (w) and how often each
# one occurs (c).
w,c = np.unique(X_train, return_counts=True)

In [None]:
# The ten most frequent ids, most frequent first.
w[np.argsort(c)[::-1]][:10]

In [None]:
# The same ten ids translated back into words through the word2vec model.
[word2vec.index_to_key[token_id] for token_id in w[np.argsort(c)[::-1]][:10]]

In [None]:
# 5 nearest neighbours of "hate" in the pretrained (reduced) embedding matrix.
get_k_nearest_from("hate", new_ind, rev_ind, EMB_MATRIX, k=5)

In [None]:
# Same query against the first weight array of the trained model (the
# Embedding layer is the first layer that owns weights), to see how
# fine-tuning moved the vectors.
get_k_nearest_from("hate", new_ind, rev_ind, cnn.get_weights()[0], k=5)

In [None]:
# Matthews correlation coefficient of the trained model on the validation set.
preds = get_predictions(cnn, X_val)
mcc(y_val, preds)

In [None]:
# Plot the "loss" entry of the training history.
# NOTE: this rebinds `_`, which until now held the labels returned by
# get_documents; the DataFrame-building cell cannot be re-run after this one
# without reloading the documents first.
f, _ = plot_history(train_history, "loss")

In [None]:
# Plot the "accuracy" entry of the training history.
plot_history(train_history, "accuracy")

### Test Set

In [None]:
# Train a fresh model for a fixed 3 epochs; this is the model evaluated on the
# test set below. `train_history` is rebound, replacing the history of the
# exploratory run.
# NOTE(review): with epochs=3 the EarlyStopping(patience=4) callback can never
# trigger, so it has no effect here.
model = get_nlp_cnn()
train_history = model.fit(
    x=X_train,
    y=y_train, 
    epochs=3,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[callbacks.EarlyStopping(patience=4)]
)

In [None]:
# The DataFrame only has the columns "review" and "label" (see where `data`
# is built), so those are the ones to read from the held-out split.
X_test, y_test = test_set.review, test_set.label.to_numpy()

In [None]:
# Same processing as the validation set: documents only, no vocabulary.
X_test = process_documents(X_test.to_list(), return_vocab=False)

In [None]:
# Convert the test documents to index lists with the same mapping used for
# the training and validation sets.
X_test = map_to_index(X_test, word2vec.key_to_index)

In [None]:
# Pad with the same helper and arguments used for the training and validation
# sets (pad_documents), so all three inputs are built identically.
X_test = pad_documents(X_test, MAX_SEQUENCE_LENGTH, word2int(PAD_TOKEN))

In [None]:
# Predictions of the final model on the test set (rebinds `preds`).
preds = get_predictions(model, X_test)

In [None]:
# Confusion matrix of the test predictions.
conf_matrix(y_test, preds)

In [None]:
# Matthews correlation coefficient on the test set.
mcc(y_test, preds)

In [None]:
# Per-class precision / recall / F1; the report is a preformatted string, so
# it is printed rather than displayed.
print(classification_report(y_test, preds))

In [None]:
# Persist the trained network to an .h5 file next to the translated data.
# NOTE(review): this saves `cnn` (the model from the exploratory training
# run), not `model`, which is the one evaluated on the test set -- confirm
# this is the intended model.
cnn.save("./data/rt-polaritydata-translated/cnn.h5")