In [1]:
%reload_ext autoreload
%autoreload 2

import os
import nltk
import string
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import initializers as init
from tensorflow.keras import losses
from tensorflow.keras import metrics
from tensorflow.keras import optimizers
from tensorflow.keras import callbacks

from gensim.models import Word2Vec, KeyedVectors
from sklearn.model_selection import train_test_split

from util.preprocessing import *
from util.io import *

# Directory holding the original (untranslated) documents.
PATH = "./data/txt_sentoken/"
# Directory holding the translated copy of the documents.
PATH_TRANSLATE = "./data/txt_sentoken_translated/"

# When True, documents are read from PATH_TRANSLATE instead of being translated again.
TRANSLATION_ON_DISK = True

In [10]:
# NOTE(review): this cell carries execution count 10 but is placed above the
# cell that first assigns `docs`; in top-to-bottom order `docs` does not exist
# yet, so a Restart & Run All is expected to fail here.
len(docs)

2000

## Preprocessing

### Translate to English

In [2]:
# Load the corpus from the translated folder when it is already on disk,
# otherwise from the original folder.
# `_` receives the second value returned by get_documents_v2; later cells use
# it as the per-document class labels.
# NOTE(review): `_` is also IPython's "last output" variable; a descriptive
# name would be safer, but later cells read `_` directly.
docs, _ = get_documents_v2(PATH_TRANSLATE if TRANSLATION_ON_DISK else PATH)
if not TRANSLATION_ON_DISK:
    # No translated copy yet: translate now and hand the result to
    # write_documents_v2 (presumably persisted under PATH_TRANSLATE).
    docs = translate_to_english(docs)
    write_documents_v2(docs, PATH_TRANSLATE, PATH)

word_set, phrases, classes = process_documents(docs, _)
len(word_set), len(phrases), len(classes)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:28<00:00, 69.96it/s]


(46558, 71532, 71532)

In [3]:
# Sanity check: number of documents loaded.
len(docs)

2000

The corpus yields about 71,500 sentences and a vocabulary of about 46,500 distinct words.

## Load Word2Vec

In [4]:
# Load the pre-trained vectors stored in binary word2vec format
# (file name suggests a skip-gram model) and show the vocabulary size.
word2vec = KeyedVectors.load_word2vec_format(
    "util/embedding/skipgram.bin",
    binary=True,
)
len(word2vec.key_to_index)

302866

In [5]:
def word2int(word):
    """Return the integer index of ``word`` in the loaded ``word2vec`` vocabulary.

    No default is passed to ``get_index``, so a word that is not in the
    vocabulary is handled however gensim treats a missing key (a ``KeyError``
    in gensim 4 -- confirm for the installed version).
    """
    index = word2vec.get_index(word)
    return index

def int2word(index):
    """Return the vocabulary entry stored at position ``index`` in ``word2vec``."""
    vocabulary = word2vec.index_to_key
    return vocabulary[index]

- Add the **Padding**, "\<PAD\>" token to word2vec
- Add the **Unknown**, "[UNK]" token to word2vec

In [6]:
# Dimensionality of the pre-trained vectors; the special-token vectors below use the same length.
EMB_DIM = word2vec.vector_size

# Special tokens that get added to the embedding vocabulary.
PAD_TOKEN = "<PAD>"
# NOTE(review): the name is spelled "UNKOWN"; it is kept because another cell references it.
UNKOWN_TOKEN = "[UNK]"

# Both special tokens are represented by an all-zero vector of length EMB_DIM.
PAD_VEC = [0] * EMB_DIM
UNK_VEC = [0] * EMB_DIM

In [7]:
# Add the padding and unknown tokens, with their vectors, to the embedding
# vocabulary so they can be looked up by index like any other word.
word2vec.add_vectors([PAD_TOKEN, UNKOWN_TOKEN], [PAD_VEC, UNK_VEC])

- Check for missing words in the pre-trained model

In [8]:
# Corpus words that have no entry in the pre-trained vocabulary.
missing_words = [
    word
    for word in word_set.keys()
    if word not in word2vec.key_to_index
]
len(missing_words)

20136

In [11]:
# Map the documents through the embedding vocabulary (word -> index mapping).
# NOTE(review): many corpus words have no pre-trained vector; presumably
# map_documents sends them to the [UNK] index -- confirm in its definition.
index_docs = map_documents(docs, word2vec.key_to_index)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:27<00:00, 71.52it/s]


In [12]:
# Length of the longest indexed document.
max(len(doc) for doc in index_docs)

1380

In [21]:
# Sanity check: there must be one label (held in `_`) per indexed document.
len(index_docs), len(_)

(2000, 2000)

In [22]:
# Hold out 20% of the documents for evaluation; the fixed random_state makes
# the split reproducible. `_` holds the labels returned by get_documents_v2.
X_train, X_test, y_train, y_test = train_test_split(
    index_docs,
    _,
    test_size=0.2,
    random_state=0,
)

In [24]:
# Sequence length used for padding, computed from the training split only.
MAX_SQ_LEN = get_max_sequence_length(X_train)

In [28]:
# Pad both splits to MAX_SQ_LEN using the index of the padding token.
# NOTE(review): MAX_SQ_LEN comes from the training split -- confirm how
# pad_sentences treats test sequences that are longer than that.
X_train, X_test = (
    pad_sentences(split, MAX_SQ_LEN, word2int(PAD_TOKEN))
    for split in (X_train, X_test)
)

In [31]:
# Shapes of the padded splits.
X_train.shape, X_test.shape

((1600, 1380), (400, 1380))

In [38]:
# Encode the string class labels as integers for the binary classifier.
# NOTE(review): this cell is not idempotent -- once the labels are integers,
# running it again fails with KeyError because `mapper` only has string keys.
mapper = {"neg": 0, "pos": 1}

y_train = [mapper[label] for label in y_train]
y_test = [mapper[label] for label in y_test]

In [39]:
# Convert the encoded label lists to NumPy arrays and show their shapes.
y_train, y_test = np.array(y_train), np.array(y_test)

y_train.shape, y_test.shape

((1600,), (400,))

## Model Training

In [32]:
# Vocabulary size and weight matrix taken from the loaded embeddings; they are
# used to size and initialise the Embedding layer.
VOCAB_SIZE = len(word2vec.key_to_index)
EMB_MATRIX = word2vec.vectors

In [33]:
# Embedding lookup initialised from the pre-trained matrix; trainable=False
# keeps these weights fixed during training.
embedding_layer = layers.Embedding(
    VOCAB_SIZE,
    EMB_DIM,
    trainable=False,
    embeddings_initializer=init.Constant(EMB_MATRIX),
)

In [34]:
# 1D-CNN binary classifier over padded sequences of word indices:
# embedding lookup -> four Conv1D stages (the first three each followed by
# max-pooling) -> global max-pool over time -> single sigmoid unit.
model = models.Sequential([
    # Keras documents `input_shape` as a shape tuple that excludes the batch
    # axis; a bare int is only tolerated by some releases, so pass a 1-tuple.
    layers.InputLayer(input_shape=(MAX_SQ_LEN,)),
    embedding_layer,
    layers.Conv1D(filters=128, kernel_size=5, activation="relu"),
    layers.MaxPool1D(5),
    layers.Conv1D(filters=64, kernel_size=5, activation="relu"),
    layers.MaxPool1D(5),
    layers.Conv1D(filters=32, kernel_size=5, activation="relu"),
    layers.MaxPool1D(3),
    layers.Conv1D(filters=16, kernel_size=5, activation="relu"),
    # Collapse the remaining time axis to one feature vector per document.
    layers.GlobalMaxPool1D(),
    # Sigmoid output paired with the 0/1 labels used for training.
    layers.Dense(units=1, activation="sigmoid"),
])

In [35]:
# Adam optimizer with binary cross-entropy loss, tracking accuracy.
model.compile("adam", loss="binary_crossentropy", metrics=["acc"])

In [42]:
# Train for 5 epochs with batches of 16.
# NOTE(review): the held-out test split is passed as validation data, so in the
# visible cells there is no separate, untouched test evaluation.
# NOTE(review): no TensorFlow random seed is set in the visible cells, so
# results may vary between runs.
model.fit(X_train, y_train, epochs=5, batch_size=16, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2134ca24c40>