In [None]:
%reload_ext autoreload
%autoreload 2

import os
import nltk
import string
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import (layers, models, 
                              initializers as init, 
                              losses, metrics, 
                              optimizers, callbacks, 
                              activations, regularizers)

from gensim.models import Word2Vec, KeyedVectors

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (matthews_corrcoef as mcc, 
                             ConfusionMatrixDisplay, confusion_matrix, 
                             classification_report) 

from util.preprocessing import *
from util.postprocessing import *
from util.io import *
from tqdm.notebook import tqdm

# Raw IMDB reviews CSV; its "review" and "label" columns are read in the
# polarity-creation step.
PATH = "./data/imdb_data/IMDB_dataset_320.000_reviews.csv"
# CSV with the translated reviews that the tokenization step reads.
PATH_TRANSLATE = "./data/imdb_data/imdb_translated.csv"

# The raw CSV at PATH is only loaded when this flag is False.
TRANSLATION_ON_DISK=True

## Preprocessing 

### Polarity Creation From labels
- According to [this](https://www.kaggle.com/datasets/pawankumargunjan/imdb-review) Kaggle dataset, a review is labelled positive when the movie rating is higher than 6 (i.e. 7 or more) and negative when the rating is lower than 5 (i.e. 4 or less). Reviews rated 5 or 6 get no polarity and are dropped.

In [None]:
# Load the raw reviews only when the translated CSV still has to be produced.
# The preview is guarded as well: with TRANSLATION_ON_DISK=True no `data`
# exists yet, so an unguarded `data.head()` raised NameError on a fresh kernel.
# In that case the polarity/translation cells are meant to be skipped and the
# notebook continues from the Tokenization section.
if not TRANSLATION_ON_DISK:
    data = pd.read_csv(PATH)[["review", "label"]]
    display(data.head())

In [None]:
def get_polarity(x):
    """Convert a numeric movie rating into a binary sentiment label.

    Returns 1 when the rating is 7 or more, 0 when it is 4 or less, and
    ``np.nan`` in every other case.
    """
    if x <= 4:
        return 0
    if x >= 7:
        return 1
    # Neither clearly negative nor clearly positive.
    return np.nan

In [None]:
# Replace each numeric rating with its polarity (1, 0 or NaN) via get_polarity.
# NOTE: not idempotent - on a second run the labels are already 0/1 and
# get_polarity maps both of those values to 0.
data.label = data.label.apply(get_polarity)

In [None]:
data.head()

In [None]:
# Drop the rows containing NaN, which includes the reviews whose rating
# get_polarity mapped to NaN. `data` is modified in place.
data.dropna(inplace=True)

In [None]:
len(data)

### Translation to English

In [None]:
# Accumulator for the translated reviews. Run this cell only once: running it
# again empties the list and discards the translations collected so far.
translations = []
len(translations)

In [None]:
# Translation is slow and the connection is sometimes lost, so this cell is
# meant to be re-run: it resumes at len(translations), i.e. right after the
# last review that has already been translated.
how_many=30  # number of reviews passed to each batch_translate_to_english call
# The range stops at len(data). The previous bound (len(data)+how_many-1)
# could yield a start index past the last row (e.g. i=60 for 60 rows) and
# hand an empty batch to the translator.
for i in tqdm(range(len(translations), len(data), how_many)):
    j = i+how_many
    translations += batch_translate_to_english(data.iloc[i:j].review.to_list())

In [None]:
len(data), len(translations)

In [None]:
# Overwrite the original reviews with their translations; the list was filled
# in row order, so position k belongs to row k of `data`.
data["review"] = translations

In [None]:
# Persist the translated reviews at PATH_TRANSLATE, the location the
# tokenization step reads from. The previous hard-coded path
# "data/imdb_translated.csv" pointed to a different directory.
data.to_csv(PATH_TRANSLATE, index=False)

### Tokenization

In [None]:
# Start from the translated reviews stored on disk.
data = pd.read_csv(PATH_TRANSLATE)
data.head()

In [None]:
# Hold out 10% of the data for testing, then 20% of the remainder for
# validation (72% train / 18% validation / 10% test overall).
# The fixed random_state keeps both splits reproducible.
train_set, test_set = train_test_split(data, test_size=0.1, random_state=13)
train_set, val_set = train_test_split(train_set, test_size=0.2, random_state=13)

In [None]:
# Review texts of the train/validation splits as plain Python lists.
X_train = train_set.review.to_list()
X_val = val_set.review.to_list()

# Matching polarity labels as NumPy arrays.
y_train = train_set.label.to_numpy()
y_val = val_set.label.to_numpy()

In [None]:
# Preprocess the training reviews; in this call process_documents returns the
# vocabulary (`word_set`) together with the processed documents.
word_set, X_train = process_documents(X_train)

In [None]:
# Same preprocessing for the validation reviews, keeping only the documents.
# NOTE(review): the positional False is presumably `return_vocab` (the test-set
# call passes return_vocab=False) - confirm against util.preprocessing.
X_val = process_documents(X_val, False)

In [None]:
# Cache the processed train/validation documents so the preprocessing above
# does not have to be repeated.
with open("data/imdb_data/processed_data.pickle", "wb") as f:
    pickle.dump((X_train, X_val), f)

In [None]:
# Cache the training vocabulary next to the processed documents.
with open("./data/imdb_data/vocab.pickle", "wb") as f:
    pickle.dump(word_set, f)

In [None]:
# Reload the cached documents. The context manager closes the file handle,
# which the previous bare `open(...)` call left open.
# NOTE: only unpickle files written by this notebook; pickle can run code.
with open("data/imdb_data/processed_data.pickle", "rb") as f:
    X_train, X_val = pickle.load(f)

In [None]:
# Reload the cached vocabulary. The context manager closes the file handle,
# which the previous bare `open(...)` call left open.
# NOTE: only unpickle files written by this notebook; pickle can run code.
with open("data/imdb_data/vocab.pickle", "rb") as f:
    word_set = pickle.load(f)

In [None]:
len(word_set)

## Load Word2Vec Model

In [None]:
# Load the pretrained GoogleNews vectors (binary word2vec format) as KeyedVectors.
word2vec = KeyedVectors.load_word2vec_format("util/embedding/GoogleNews-vectors-negative300.bin", binary=True)

In [None]:
word2vec.vector_size

In [None]:
# Dimensionality of the pretrained vectors.
EMB_DIM = word2vec.vector_size
# Special tokens for padding and for unknown words.
PAD_TOKEN = "<PAD>"
UNKOWN_TOKEN = "[UNK]"
# Both special tokens use an all-zero vector; the two names refer to the same
# list object.
PAD_VEC = UNK_VEC = [0] * EMB_DIM

In [None]:
# Register the padding and unknown tokens, with their zero vectors, in word2vec.
word2vec.add_vectors([PAD_TOKEN, UNKOWN_TOKEN], [PAD_VEC, UNK_VEC])

In [None]:
# Collect every vocabulary word that has no entry in the word2vec vocabulary
# and show how many there are.
missing_words = [token for token in word_set.keys() if token not in word2vec.key_to_index]
len(missing_words)

In [None]:
# Give the words missing from word2vec a random vector
# (comment this call out to leave them without one).
# NOTE(review): `dev` presumably controls the spread of the random vectors -
# confirm in the util package.
add_unknown_words(missing_words, word2vec, dev=0.25)

### Map words to respective index in dictionary

In [None]:
# Convert the processed documents to integer indices, using the word2vec
# vocabulary (key_to_index) as the lookup table.
X_train = map_to_index(X_train, word2vec.key_to_index)
X_val = map_to_index(X_val, word2vec.key_to_index)

In [None]:
np.unique(y_train, return_counts=True)

In [None]:
np.unique(y_val, return_counts=True)

### Process data to tabular format

In [None]:
# Sequence length used for padding every split, derived from the training set only.
MAX_SEQUENCE_LENGTH = get_max_sequence_length(X_train)
MAX_SEQUENCE_LENGTH

In [None]:
# pad_sentences receives the target length and the index of PAD_TOKEN.
X_train = pad_sentences(X_train, MAX_SEQUENCE_LENGTH, word2vec.key_to_index[PAD_TOKEN])

In [None]:
# Validation data is padded with the length computed on the training set.
X_val = pad_sentences(X_val, MAX_SEQUENCE_LENGTH, word2vec.key_to_index[PAD_TOKEN])

In [None]:
X_train.shape, X_val.shape

In [None]:
# Relative frequency of each class among the training labels.
counts = np.unique(y_train, return_counts=True)[1]
counts/counts.sum()

A model that always predicted the `positive` sentiment would reach 71% accuracy, so this is the baseline a trained classifier has to beat.

## Save essential data elements
to avoid spending a long time re-running the previous steps

In [None]:
# Wrap the padded training sequences and their labels in a tf.data.Dataset and
# store it on disk.
# Note: this rebinds `train_set`, which held the training DataFrame until now.
train_set = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_set.save("./data/imdb_data/train_set")

In [None]:
# `output_bias` was never defined in this notebook, so this cell raised
# NameError on a clean run.
# NOTE(review): its original definition is not available here; the log-odds of
# the positive class (the usual initial bias for an imbalanced binary
# classifier) is assumed - confirm.
output_bias = np.log(np.sum(y_train == 1) / np.sum(y_train == 0))
# Bundle everything the training section needs besides the tf.data training set.
with open("./data/imdb_data/core_data.pickle", "wb") as f:
    pickle.dump((word_set, output_bias, X_val, y_val), f)


In [None]:
# Store the held-out test split; it has not been preprocessed at this point.
test_set.to_csv("./data/imdb_data/test_set.csv", index=False)

## Load essential data elements

In [None]:
# Reload the stored training dataset, shuffle it with a 1000-element buffer and
# group it into batches of 128. The pipeline is created under the CPU device scope.
with tf.device("CPU:0"):
    train_set = tf.data.Dataset.load("./data/imdb_data/train_set").shuffle(1000).batch(128)

In [None]:
# Restore the vocabulary, output bias and validation arrays saved above.
# NOTE: only unpickle files written by this notebook; pickle can run code.
with open("./data/imdb_data/core_data.pickle", "rb") as f:
    word_set, output_bias, X_val, y_val = pickle.load(f)

In [None]:
# Recover the padded sequence length from the second dimension of X_val.
MAX_SEQUENCE_LENGTH=X_val.shape[1]
MAX_SEQUENCE_LENGTH

## Train model

In [None]:
# Build a compact vocabulary restricted to the words of this corpus.
#   vocab_pos[k]   -> row of the k-th kept token inside word2vec.vectors
#   new_ind[token] -> k, the token's row inside the compact embedding matrix
vocab_pos = []
new_ind = {}
for w in word_set:
    if w in word2vec.key_to_index:
        new_ind[w] = len(vocab_pos)
        vocab_pos.append(word2vec.key_to_index[w])

# Append the special tokens in the SAME order to both structures. Previously
# vocab_pos listed [UNK, PAD] while new_ind assigned PAD before UNK, so the two
# tokens pointed at each other's row.
for special in (PAD_TOKEN, UNKOWN_TOKEN):
    new_ind[special] = len(vocab_pos)
    vocab_pos.append(word2vec.key_to_index[special])

# NOTE(review): the train/validation/test sequences in this notebook are mapped
# with word2vec.key_to_index, whereas an embedding matrix selected through
# vocab_pos is addressed by new_ind - confirm that the sequences fed to the
# model use new_ind indices.

In [None]:
# Inverse lookup: compact row index -> token.
rev_ind = dict(zip(new_ind.values(), new_ind.keys()))

In [None]:
# Compact embedding matrix: one row of word2vec.vectors per entry of vocab_pos.
VOCAB_SIZE = len(vocab_pos)
EMB_MATRIX = word2vec.vectors[vocab_pos]
EMB_MATRIX.shape

In [None]:
def get_nlp_cnn():
    """Build and compile the 1-D convolutional sentiment classifier.

    The network reads sequences of MAX_SEQUENCE_LENGTH token indices, embeds
    them with a trainable Embedding layer initialised from EMB_MATRIX, applies
    one Conv1D/MaxPool1D stage and ends in a dense head with a single sigmoid
    unit. It relies on the module-level MAX_SEQUENCE_LENGTH, VOCAB_SIZE and
    EMB_MATRIX being defined before it is called.

    Returns:
        The compiled ``models.Sequential`` (Adam optimizer, binary
        cross-entropy loss, accuracy metric).
    """
    cnn = models.Sequential([
        # input_shape is a shape tuple; a bare int is not accepted by every
        # Keras version.
        layers.InputLayer(input_shape=(MAX_SEQUENCE_LENGTH,)),
        layers.Embedding(
            input_dim=VOCAB_SIZE,
            output_dim=EMB_MATRIX.shape[1],
            # Start from the pretrained vectors and fine-tune them.
            embeddings_initializer=init.Constant(EMB_MATRIX),
            trainable=True,
        ),
        layers.Dropout(0.4),

        # 64 filters over windows of 3 consecutive tokens.
        layers.Conv1D(64, 3, activation="relu"),
        layers.MaxPool1D(),
        layers.Flatten(),

        layers.Dropout(0.2),
        layers.Dense(256, activation="relu"),
        layers.Dropout(0.1),
        # Single sigmoid unit for the binary polarity output.
        layers.Dense(
            units=1, activation="sigmoid"
        )
    ])
    cnn.compile("adam", loss="binary_crossentropy", metrics=["accuracy"])
    return cnn

In [None]:
cnn = get_nlp_cnn()

In [None]:
# Train for at most 20 epochs; EarlyStopping halts once its monitored value
# (val_loss by default) has not improved for 3 consecutive epochs.
# restore_best_weights=True rolls the model back to its best epoch when
# training is stopped early. Without it, the model evaluated and saved below
# keeps the weights of the last, already degraded, epoch.
train_history = cnn.fit(
    train_set,
    epochs=20,
    validation_data=(X_val, y_val),
    callbacks=[callbacks.EarlyStopping(patience=3, restore_best_weights=True)]
)

In [None]:
plot_history(train_history, "loss")

In [None]:
plot_history(train_history, "accuracy")

In [None]:
# Persist the trained model to a single .h5 file.
cnn.save("./data/imdb_data/cnn.h5")

In [None]:
# Matthews correlation coefficient of the model's predictions on the validation set.
preds = get_predictions(cnn, X_val)
mcc(y_val, preds)

In [None]:
# Five nearest neighbours of "hate" in the pretrained embedding matrix.
get_k_nearest_from("hate", new_ind, rev_ind, EMB_MATRIX, k=5, low_memory=True)

In [None]:
# Same query against the model's first weight array, i.e. the embedding
# matrix after training (the Embedding layer is the first layer with weights).
get_k_nearest_from("hate", new_ind, rev_ind, cnn.get_weights()[0], k=5, low_memory=True)

### Test data

In [None]:
# Reload the held-out split from the CSV written in the save section, so the
# evaluation also works after a kernel restart. Previously `test_set` only
# existed if the splitting cells had been run in the same kernel.
test_set = pd.read_csv("./data/imdb_data/test_set.csv")
X_test, y_test = test_set.review, test_set.label.to_numpy()

In [None]:
# Preprocess the test reviews without building a vocabulary.
X_test = process_documents(X_test.to_list(), return_vocab=False)

In [None]:
# Same index mapping that was applied to the train/validation documents.
X_test = map_to_index(X_test, word2vec.key_to_index)

In [None]:
# Pad to the sequence length the model was built with.
X_test = pad_sentences(X_test, MAX_SEQUENCE_LENGTH, word2vec.key_to_index[PAD_TOKEN])

In [None]:
preds = get_predictions(cnn, X_test)

In [None]:
conf_matrix(y_test, preds)

In [None]:
mcc(y_test, preds)

In [None]:
print(classification_report(y_test, preds))