In [1]:
%reload_ext autoreload
%autoreload 2

import os
import nltk
import string
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import initializers as init
from tensorflow.keras import losses
from tensorflow.keras import metrics
from tensorflow.keras import optimizers
from tensorflow.keras import callbacks
from tensorflow.keras import activations, regularizers

from gensim.models import Word2Vec, KeyedVectors

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy.spatial.distance import euclidean

from util.preprocessing import *
from util.io import *
from tqdm.notebook import tqdm

PATH = "./data/imdb_data/IMDB_dataset_320.000_reviews.csv"
PATH_TRANSLATE = "./data/imdb_data/imdb_translated.csv"

TRANSLATION_ON_DISK=True

## Preprocessing 

In [3]:
if TRANSLATION_ON_DISK==False:
    data = pd.read_csv(PATH)[["review", "label"]]

### Polarity Creation From labels
- According to ...

In [5]:
data.head()

Unnamed: 0,review,label
0,"""Yaara Sili Sili Virah Ki Raat Ka Jalna""'Lekin...",8
1,Gulzar is at his best when he is telling such ...,9
2,I was completely mesmerized by Lekin and espec...,9
3,Greatly enjoyed the development of the story l...,9
4,"The lines of time are very blurry. Past, prese...",10


In [10]:
def get_polarity(x):
    if x>=7:
        return 1
    elif x<=4:
        return 0
    return np.nan

In [11]:
data.label = data.label.apply(get_polarity)

In [12]:
data.head()

Unnamed: 0,review,label
0,"""Yaara Sili Sili Virah Ki Raat Ka Jalna""'Lekin...",1.0
1,Gulzar is at his best when he is telling such ...,1.0
2,I was completely mesmerized by Lekin and espec...,1.0
3,Greatly enjoyed the development of the story l...,1.0
4,"The lines of time are very blurry. Past, prese...",1.0


In [13]:
data.dropna(inplace=True)

In [14]:
len(data)

261787

### Translation (English - English)

In [None]:
translations = []
len(translations)

In [32]:
how_many=30
for i in tqdm(range(len(translations), len(data)+how_many-1, how_many)):
    j = i+how_many
    translations += batch_translate_to_english(data.iloc[i:j].review.to_list())

  0%|          | 0/645 [00:00<?, ?it/s]

In [34]:
len(data), len(translations)

(261787, 261787)

In [38]:
data["review"] = translations

In [41]:
data.to_csv("data/imdb_translated.csv", index=False)

### Tokenization

In [2]:
data = pd.read_csv(PATH_TRANSLATE)
data.head()

Unnamed: 0,review,label
0,"""Yaara Sili Sili Virah Ki Raat Ka Jalna""'Lekin...",1.0
1,Gulzar is at his best when he is telling such ...,1.0
2,I was completely mesmerized by Lekin and espec...,1.0
3,Greatly enjoyed the development of the story l...,1.0
4,"The lines of time are very blurry. Past, prese...",1.0


In [3]:
train_set, test_set = train_test_split(data, test_size=0.1, random_state=13)

In [4]:
#X_train = train_set.review.to_list()
#X_test = test_set.review.to_list()

y_train = train_set.label.to_numpy()
y_test = test_set.label.to_numpy()

In [6]:
word_set, X_train = process_documents(X_train)

  0%|          | 0/235608 [00:00<?, ?it/s]

In [7]:
X_test = process_documents(X_test, False)

  0%|          | 0/26179 [00:00<?, ?it/s]

In [9]:
#with open("data/imdb_data/processed_data.pickle", "wb") as f:
#    pickle.dump((word_set, X_train, X_test), f)

In [5]:
word_set, X_train, X_test = pickle.load(open("data/imdb_data/processed_data.pickle", "rb"))

In [7]:
len(word_set)

203940

## Load Word2Vec Model

In [6]:
word2vec = KeyedVectors.load_word2vec_format("util/embedding/GoogleNews-vectors-negative300.bin", binary=True)

In [9]:
len(word2vec.key_to_index)

3000000

In [7]:
EMB_DIM = word2vec.vector_size
PAD_TOKEN = "<PAD>"
UNKOWN_TOKEN = "[UNK]"
PAD_VEC = UNK_VEC = [0] * EMB_DIM

In [8]:
word2vec.add_vectors([PAD_TOKEN, UNKOWN_TOKEN], [PAD_VEC, UNK_VEC])

In [9]:
missing_words=[]
for word in word_set.keys():
    if word not in word2vec.key_to_index:
        missing_words.append(word)
len(missing_words)

123142

In [10]:
#uncoment if we want random vectors for missing words
add_unknown_words(missing_words, word2vec, dev=0.25)

In [11]:
X_train = map_to_index(X_train, word2vec.key_to_index)
X_test = map_to_index(X_test, word2vec.key_to_index)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [12]:
np.unique(y_train, return_counts=True)

(array([0., 1.]), array([ 67496, 168112], dtype=int64))

In [13]:
np.unique(y_test, return_counts=True)

(array([0., 1.]), array([ 7534, 18645], dtype=int64))

- Process data to tabular format

In [14]:
MAX_SEQUENCE_LENGTH = get_max_sequence_length(X_train)
MAX_SEQUENCE_LENGTH

1485

In [15]:
X_train = pad_sentences(X_train, MAX_SEQUENCE_LENGTH, word2vec.key_to_index[PAD_TOKEN])

In [16]:
X_test = pad_sentences(X_test, MAX_SEQUENCE_LENGTH, word2vec.key_to_index[PAD_TOKEN])

In [19]:
X_train.shape, X_test.shape

((235608, 1485), (26179, 1485))

## Train model

In [29]:
class DataGenerator(tf.keras.utils.Sequence):
    def __init__(self, X, y, batch_size=512, shuffle=True):
        self.X, self.y = X, y
        self.batch_size= batch_size
        self.shuffle=shuffle
        self.n = len(X)
        
    def on_epoch_end(self):
        if self.shuffle:
            index = np.arange(len(self.X), dtype=int)
            np.random.shuffle(index)
            
            self.X = self.X[index]
            self.y = self.y[index]
    
    def __getitem__(self, index):
        i = index*self.batch_size 
        j = (index+1)*self.batch_size
        
        X = self.X[i:j]
        y = self.y[i:j]
        
        return X, y
        
    
    def __len__(self):
        return self.n//self.batch_size
    

In [35]:
gen = DataGenerator(X_train, y_train, batch_size=256)

In [17]:
vocab_pos = []
for w in word_set:
    if w in word2vec.key_to_index:
        vocab_pos.append( word2vec.key_to_index[w])
vocab_pos += list(map(lambda x: word2vec.key_to_index[x], [UNKOWN_TOKEN, PAD_TOKEN]))

In [18]:
VOCAB_SIZE = len(vocab_pos)
EMB_MATRIX = word2vec.vectors[vocab_pos]
EMB_MATRIX.shape

(203942, 300)

In [32]:
cnn = models.Sequential([
    layers.InputLayer(input_shape=MAX_SEQUENCE_LENGTH),
    layers.Embedding(
        input_dim=VOCAB_SIZE,
        output_dim=EMB_MATRIX.shape[1],
        embeddings_initializer=init.Constant(EMB_MATRIX),
        trainable=True
    ),
    layers.Dropout(0.4),
    
    layers.Conv1D(
        filters=32,
        kernel_size=5,
        activation=layers.LeakyReLU()
    ),
    layers.MaxPool1D(5),
    layers.Conv1D(
        filters=64,
        kernel_size=5,
        activation=layers.LeakyReLU()
    ),
    layers.MaxPool1D(5),
    layers.Conv1D(
        filters=128,
        kernel_size=5,
        activation=layers.LeakyReLU()
    ),
    layers.GlobalMaxPool1D(),
    layers.Dropout(0.4),
    layers.Dense(256, activation="relu"),
    
    layers.Dropout(0.2),
    layers.Dense(
        units=1,
        activation="sigmoid"
    )  
])

In [33]:
cnn.compile("adam", loss="binary_crossentropy", metrics=["accuracy"])

In [36]:
train_history = cnn.fit(
    gen,
    epochs=20,
    validation_data=(X_test, y_test),
    callbacks=[callbacks.EarlyStopping(patience=3)]
)

Epoch 1/20


ResourceExhaustedError: Graph execution error:

Detected at node 'sequential_1/dropout_3/dropout/random_uniform/RandomUniform' defined at (most recent call last):
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\traitlets\config\application.py", line 978, in launch_instance
      app.start()
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelapp.py", line 712, in start
      self.io_loop.start()
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\tornado\platform\asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\asyncio\base_events.py", line 596, in run_forever
      self._run_once()
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\asyncio\base_events.py", line 1890, in _run_once
      handle._run()
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelbase.py", line 406, in dispatch_shell
      await result
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\ipykernel\zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 2885, in run_cell
      result = self._run_cell(
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 2940, in _run_cell
      return runner(coro)
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 3139, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 3318, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\IPython\core\interactiveshell.py", line 3378, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\joelp\AppData\Local\Temp\ipykernel_7984\2847456443.py", line 1, in <module>
      train_history = cnn.fit(
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 993, in train_step
      y_pred = self(x, training=True)
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\training.py", line 557, in __call__
      return super().__call__(*args, **kwargs)
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\sequential.py", line 410, in call
      return super().call(inputs, training=training, mask=mask)
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\functional.py", line 510, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\functional.py", line 667, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\layers\regularization\dropout.py", line 116, in call
      output = control_flow_util.smart_cond(
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\utils\control_flow_util.py", line 108, in smart_cond
      return tf.__internal__.smart_cond.smart_cond(
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\layers\regularization\dropout.py", line 112, in dropped_inputs
      return self._random_generator.dropout(
    File "C:\Users\joelp\AppData\Local\Programs\Python\Python39\lib\site-packages\keras\backend.py", line 2162, in dropout
      return tf.nn.dropout(
Node: 'sequential_1/dropout_3/dropout/random_uniform/RandomUniform'
OOM when allocating tensor with shape[256,1485,300] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node sequential_1/dropout_3/dropout/random_uniform/RandomUniform}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_28298]

In [39]:
cnn.save("./data/imdb_data/cnn_static.h5")