In [1]:
%reload_ext autoreload
%autoreload 2

import os
import nltk
import string
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import initializers as init
from tensorflow.keras import losses
from tensorflow.keras import metrics
from tensorflow.keras import optimizers
from tensorflow.keras import callbacks

from gensim.models import Word2Vec, KeyedVectors
from sklearn.model_selection import train_test_split

from util.preprocessing import *
from util.io import *

# Directory holding the original RT polarity corpus.
PATH = "./data/rt-polaritydata/"
# Directory holding the translated copy of the corpus.
PATH_TRANSLATE = "./data/rt-polaritydata-translated/"

# True when the translated corpus is already on disk and can be loaded
# directly instead of being translated again.
TRANSLATION_ON_DISK = True

## Preprocessing

In [2]:
# Load the corpus. When no translated copy exists yet, the raw corpus is
# translated once and written to disk; otherwise the cached copy is read.
if not TRANSLATION_ON_DISK:
    docs, _ = get_documents(PATH)

    # Translation is the bottleneck, so it is meant to be run only once.
    docs[0] = batch_translate_to_english(docs[0], batch_size=20)
    docs[1] = batch_translate_to_english(docs[1], batch_size=20)

    write_documents(docs, PATH_TRANSLATE, PATH)
else:
    docs, _ = get_documents(PATH_TRANSLATE)


In [4]:
# Process both document groups together to obtain the word set and the processed sentences.
word_set, sentences = process_documents(docs[0]+docs[1])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10662/10662 [00:05<00:00, 1831.98it/s]


In [5]:
# Number of distinct words and number of processed sentences.
len(word_set), len(sentences)

(19545, 14627)

In the paper (where the 1st version was used), the authors state that the dataset contains $18765$ words. This is close to the $19545$ words we obtained. 
The translation algorithm might have been improved since then, or the word filtering or word tokenizer might be slightly different.

# Load Word Vectors

In [67]:
# Load the pre-trained embeddings from a binary word2vec-format file
# (the file name suggests GloVe vectors — confirm the provenance).
word2vec = KeyedVectors.load_word2vec_format("util/embedding/glove.bin", binary=True)

In [68]:
# Number of keys in the loaded embedding vocabulary.
len(word2vec.key_to_index)

302815

In [8]:
def word2int(word):
    """Return the integer index of `word` in the global `word2vec` model.

    The lookup is delegated to `word2vec.get_index`; any error it raises
    for a word it cannot resolve propagates to the caller.
    """
    index = word2vec.get_index(word)
    return index

def int2word(index):
    """Return the key stored at position `index` in the global `word2vec` model."""
    keys = word2vec.index_to_key
    return keys[index]

- Add the padding and unknown tokens to word2vec

In [69]:
# Embedding dimensionality, read from the loaded vectors.
EMB_DIM = word2vec.vector_size

# Special tokens: one for padding, one for unknown words.
PAD_TOKEN = "<PAD>"
UNKOWN_TOKEN = "[UNK]"

# Both special tokens use an all-zero vector; the two names refer to the
# very same list object.
UNK_VEC = [0] * EMB_DIM
PAD_VEC = UNK_VEC


In [70]:
# Register the padding and unknown tokens, with their zero vectors, in the embedding model.
word2vec.add_vectors([PAD_TOKEN, UNKOWN_TOKEN], [PAD_VEC, UNK_VEC])

## Preprocessing

### Generate Vocabulary

In [11]:
# Read the translated corpus from disk.
docs, _ = get_documents(PATH_TRANSLATE)

In [8]:
# Process both document groups together to obtain the word set and the processed sentences.
word_set, sentences = process_documents(docs[0] + docs[1])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10662/10662 [00:06<00:00, 1660.02it/s]


In [12]:
# Number of distinct words and number of processed sentences.
len(word_set), len(sentences)

(19545, 14627)

In [71]:
# Collect every word of the dataset that has no entry in the embedding model.
missing_words = [
    token
    for token in word_set.keys()
    if token not in word2vec.key_to_index
]
len(missing_words)

5445

We can see that a significant number of words are missing from the embedding vocabulary. <br>
According to this [forum](https://groups.google.com/g/word2vec-toolkit/c/J3Skqbe3VwQ) thread, we could create random vectors for those words.

Another option would be to simply replace those words with the unknown (`[UNK]`) token.

In [10]:
# Add the missing words to the embedding model.
# NOTE(review): presumably each word gets a randomly initialised vector, as the
# forum post suggests — confirm in the helper's implementation.
add_unknown_words(missing_words, word2vec)

In [11]:
# Re-check which dataset words still have no entry in the embedding model.
missing_words = [
    token
    for token in word_set.keys()
    if token not in word2vec.key_to_index
]
len(missing_words)

0

Map each sentence to the corresponding list of word indexes

In [72]:
# Map both document groups to word indices, in order: docs[0] feeds
# `sentences_neg`, docs[1] feeds `sentences_pos`.
sentences_neg, sentences_pos = (
    map_sentences(group, word2vec.key_to_index)
    for group in (docs[0], docs[1])
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5331/5331 [00:01<00:00, 2815.66it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5331/5331 [00:01<00:00, 2775.66it/s]


In [73]:
# Map the first document group to word indices.
# NOTE(review): presumably yields one flat index list per document — confirm in the helper.
docs_neg = map_documents(docs[0], word2vec.key_to_index)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5331/5331 [00:02<00:00, 1807.74it/s]


In [74]:
# Map the second document group to word indices.
# NOTE(review): presumably yields one flat index list per document — confirm in the helper.
docs_pos = map_documents(docs[1], word2vec.key_to_index)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5331/5331 [00:02<00:00, 1894.54it/s]


In [75]:
# Inspect the first mapped document of the negative group.
docs_neg[0]

[30253, 22138, 33431]

In [41]:
# Length of the longest mapped document in the negative group.
max(len(doc) for doc in docs_neg)

28

In [44]:
# Length of the longest mapped document in the positive group.
max(len(doc) for doc in docs_pos)

28

In [42]:
# Print the position of the first negative sample that contains index 302867, then stop.
# NOTE(review): 302867 is a hard-coded vocabulary index — presumably the unknown-word
# token at the time this cell was run; confirm, and prefer looking it up by token.
for i, sent in enumerate(sentences_neg):
    if 302867 in sent:
        print(i)
        break

1


### Generate Trainable data

In [76]:
# Negative samples come first (label 0), followed by the positive ones (label 1).
# Note that this rebinds `sentences`, previously set by process_documents.
sentences = sentences_neg + sentences_pos
y = [0 for _ in sentences_neg] + [1 for _ in sentences_pos]

- Generate Training and Test set

In [77]:
# Hold out 10% of the samples as the test split; the seed is fixed at 123.
X_train, X_test, y_train, y_test = train_test_split(
    sentences,
    y,
    test_size=0.1,
    random_state=123,
)

- Pad training data to create a tabular data format for tensorflow

In [78]:
# Longest sequence in the test split.
get_max_sequence_length(X_test)

27

In [79]:
# Pad length used for both splits, taken from the longest training sequence.
# NOTE(review): the test split is not considered here — confirm how pad_sentences
# treats a test sequence longer than this value.
MAX_SEQUENCE_LENGTH = get_max_sequence_length(X_train)
MAX_SEQUENCE_LENGTH

28

In [80]:
# Pad the training sequences to MAX_SEQUENCE_LENGTH using the index of the padding token.
# NOTE(review): X_train is rebound to its padded version, so re-running this cell
# feeds already-padded data back into pad_sentences.
X_train = pad_sentences(X_train, MAX_SEQUENCE_LENGTH, word2int(PAD_TOKEN))

In [81]:
# Pad the test sequences to MAX_SEQUENCE_LENGTH using the index of the padding token.
# NOTE(review): X_test is rebound to its padded version, so re-running this cell
# feeds already-padded data back into pad_sentences.
X_test = pad_sentences(X_test, MAX_SEQUENCE_LENGTH, word2int(PAD_TOKEN))

In [19]:
# Shapes of the padded training and test inputs.
X_train.shape, X_test.shape

((9595, 28), (1067, 28))

In [82]:
# Convert both label lists to NumPy arrays and show their shapes.
y_train, y_test = np.array(y_train), np.array(y_test)

y_train.shape, y_test.shape

((9595,), (1067,))

## Model Training

In [83]:
# Number of keys in the embedding vocabulary; used as the embedding layer's input_dim.
VOCAB_SIZE = len(word2vec.key_to_index)
# Vector table of the embedding model; used to initialise the embedding layer.
EMB_MATRIX = word2vec.vectors

In [84]:
# Non-trainable embedding layer whose weights start from the loaded vectors.
embedding_layer = layers.Embedding(
    VOCAB_SIZE,
    EMB_DIM,
    embeddings_initializer=init.Constant(EMB_MATRIX),
    trainable=False,
)

In [126]:
# Binary classifier: embedding lookup, two Conv1D + average-pooling stages,
# global average pooling, and a single sigmoid output unit.
model = models.Sequential([
    # Keras documents `input_shape` as a shape tuple; passing a bare int is
    # not part of the documented contract, so the length is wrapped in a tuple.
    layers.InputLayer(input_shape=(MAX_SEQUENCE_LENGTH,)),
    embedding_layer,
    layers.Conv1D(16, 2, activation="relu"),
    layers.AveragePooling1D(2, padding="same"),
    layers.Conv1D(8, 2, activation="relu"),
    layers.AveragePooling1D(2, padding="same"),
    layers.GlobalAveragePooling1D(),
    # layers.Dropout(0.2),  # currently disabled
    layers.Dense(1, activation="sigmoid"),
])

In [127]:
# Forward pass on the first two training samples to check that the model runs.
model(X_train[:2])

<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[0.46222118],
       [0.47997656]], dtype=float32)>

In [128]:
# Adam optimizer with binary cross-entropy loss; accuracy is tracked as a metric.
# Alternative optimizer kept for reference:
#   optimizers.SGD(learning_rate=0.01, momentum=0.35)
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train for at most 20 epochs and stop early once the validation loss has not
# improved for 5 consecutive epochs. `restore_best_weights=True` rolls the
# model back to its best epoch; without it the model keeps the weights of the
# last (worse) epoch that was run.
# NOTE(review): the test split doubles as the validation set here, so early
# stopping is tuned on test data — consider a separate validation split.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=20,
    batch_size=16,
    validation_data=(X_test, y_test),
    callbacks=[
        callbacks.EarlyStopping(
            monitor="val_loss",
            patience=5,
            restore_best_weights=True,
        )
    ],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20

In [74]:
# Inspect the first padded training sample.
X_train[0]

array([  8389,  33672,  25008, 148740, 179654,  46797, 106940,  31426,
       128737,  40403,  85043,  26154, 307250,   1165,     80,  57023,
        11178, 302866, 302866, 302866, 302866, 302866, 302866, 302866,
       302866, 302866, 302866, 302866])