In [0]:
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.initializers import Constant 

from nltk.tokenize import TreebankWordTokenizer

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [0]:
import os
!echo '{"username":"imrankhan1386","key":"8708dd3c36c4db8c0dc44b3191edfe49"}' > ~/.kaggle/kaggle.json
!kaggle datasets download -d iarunava/imdb-movie-reviews-dataset # api copied from kaggle

Downloading imdb-movie-reviews-dataset.zip to /content
 96% 214M/224M [00:06<00:00, 23.5MB/s]
100% 224M/224M [00:06<00:00, 36.5MB/s]


In [0]:
!unzip /content/imdb-movie-reviews-dataset.zip

Output hidden; open in https://colab.research.google.com to view.

In [0]:
import glob
import os

from random import shuffle

def pre_process_data(filepath):
  """Load the labelled review files under `filepath` in random order.

  Every `*.txt` file in `<filepath>/pos` is read with label 1 and every
  `*.txt` file in `<filepath>/neg` with label 0.

  Args:
    filepath: directory containing the `pos` and `neg` sub-directories.
  Returns:
    list of (label, text) tuples, shuffled with `random.shuffle`. No seed is
    set here, so the order changes between runs unless the caller seeds
    the `random` module first.
  """
  labelled_dirs = ((1, 'pos'), (0, 'neg'))
  dataset = []

  for label, subdir in labelled_dirs:
    pattern = os.path.join(filepath, subdir, '*.txt')
    for filename in glob.glob(pattern):
      # Decode explicitly instead of relying on the platform default encoding
      # (assumes the review files are UTF-8 -- TODO confirm for this dataset).
      with open(filename, 'r', encoding='utf-8') as f:
        dataset.append((label, f.read()))

  shuffle(dataset)
  return dataset

# Build the shuffled list of (label, text) training samples.
# NOTE(review): absolute Colab path, presumably where the archive was
# extracted -- confirm the 'aclimdb/aclImdb' directory casing.
dataset = pre_process_data('/content/aclimdb/aclImdb/train')

In [0]:
# Sanity check: display the first (label, text) sample.
dataset[0]

(0,
 'Thank G_d it bombed, or we might get treated to such delights as "Skate Fu" where we can see the likes of Brian Boitano performing a triple lutz & slashing bad guys to ribbons with his razor-sharp skates, but I digress. One thing that could have helped this turkey would have been a little T & A from Ms. Agbayani. It\'s not like the world would have seen anything new (at least that part of the world who saw her Playboy spread.) I truly believe that porn would have suited her \'talents\' much better, although Aubrey Hepburn couldn\'t have stayed afloat in this sewer. One explanation for Kurt Thomas\' presence could be a traumatic brain injury, possibly from coming up short too often on dismounts. It\'s a good thing the IOC wasn\'t as diligent on \'doping\' as they are now, or Kurt would surely have been stripped of his medals. To be avoided at all costs.')

In [0]:
# Target labels (1 = positive, 0 = negative) in the same order as the samples.
expected = [label for label, _ in dataset]

In [0]:
def avg_len(data):
  """Return the mean length of the text element across all samples.

  Args:
    data: sequence of (label, text) pairs; only element 1 is measured.
  Returns:
    float mean of len(text), or 0.0 when `data` is empty (avoids a
    ZeroDivisionError on an empty dataset).
  """
  if len(data) == 0:
    return 0.0
  return sum(len(sample[1]) for sample in data) / len(data)

In [0]:
# Mean review length in characters; compare against the maxlen used for padding.
avg_len(dataset)

1325.06964

In [0]:
# Collect the distinct review lengths (in characters) seen in the dataset.
a = set()
for sample in dataset:
  a.add(len(sample[1]))

# Length of the longest review.
max(a)

13704

In [0]:
def clean_data(data):
  """Shift to lower case, replace unknowns with UNK, and listify.

  Args:
    data: iterable of (label, text) pairs; only the text (element 1) is used.
  Returns:
    list with one entry per sample, each a list of tokens: the lower-cased
    character when it is in the allowed set, otherwise the string 'UNK'.
  """
  VALID = set('abcdefghijklmnopqrstuvwxyz0123456789"\'?!.,:; ')
  # Lower-case first: VALID only holds lower-case letters, so without this
  # every capital letter would be mapped to 'UNK'.
  return [
      [char if char in VALID else 'UNK' for char in sample[1].lower()]
      for sample in data
  ]

In [0]:
# One list of character tokens per review (labels are dropped here; they live in `expected`).
listified_data = clean_data(dataset)

In [0]:
def char_pad_trunc(data, maxlen=1500):
  """Force every token list to exactly `maxlen` entries.

  Samples longer than `maxlen` keep only their first `maxlen` tokens, shorter
  ones are right-padded with the string 'PAD', and samples already at
  `maxlen` are passed through as the same object.

  Args:
    data: iterable of token lists.
    maxlen: target length of each returned sample.
  Returns:
    list of token lists, one per input sample, in the same order.
  """
  def _fit(tokens):
    shortfall = maxlen - len(tokens)
    if shortfall < 0:
      return tokens[:maxlen]
    if shortfall > 0:
      return tokens + ['PAD'] * shortfall
    return tokens

  return [_fit(tokens) for tokens in data]

In [0]:
def create_dicts(data):
  """Build the token <-> index lookup tables (modified from Keras LSTM example).

  Args:
    data: iterable of token lists; tokens must be mutually orderable
      (e.g. all strings) because the vocabulary is sorted.
  Returns:
    (char_indices, indices_char): dict mapping token -> index and the
    inverse dict mapping index -> token.
  """
  chars = set()
  for sample in data:
    chars.update(sample)
  # Sort so the index assignment is the same on every run; iterating the raw
  # set depends on string hashing, which is randomised per interpreter session.
  vocabulary = sorted(chars)
  char_indices = {c: i for i, c in enumerate(vocabulary)}
  indices_char = {i: c for i, c in enumerate(vocabulary)}
  return char_indices, indices_char

In [0]:
 def onehot_encode(dataset, char_indices, maxlen=1500):
    """
    One-hot encode the tokens.

    Args:
      dataset: list of lists of tokens. Each sample must hold at most
        `maxlen` tokens (longer samples raise IndexError), so pad/truncate
        beforehand.
      char_indices: dictionary of {key=character, value=index to use in the
        encoding vector}. A token missing from it raises KeyError.
      maxlen: int, length of each sample.
    Return:
      np array of booleans with shape (samples, maxlen, encoding length).
    """
    # Use the builtin `bool`: the `np.bool` alias was deprecated and later
    # removed from NumPy, where it raises AttributeError.
    X = np.zeros((len(dataset), maxlen, len(char_indices)), dtype=bool)
    for i, sentence in enumerate(dataset):
      for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    return X

In [0]:
# Number of loaded (label, text) samples.
len(dataset)

25000

In [0]:
# Pad/truncate every review to 1500 tokens, build the token <-> index tables,
# then one-hot encode. The same maxlen (1500) must be passed to both calls.
# NOTE(review): the encoded array has one boolean per (sample, position, token)
# and is therefore large -- confirm it fits in memory before running.
common_length_data = char_pad_trunc(listified_data, maxlen=1500)
char_indices, indices_char = create_dicts(common_length_data)
encoded_data = onehot_encode(common_length_data, char_indices, 1500)

In [0]:
# Scratch check of window indices: start offsets advance by 3 and each printed
# end offset is start + 40 (presumably a length-100 sequence with a 40-wide window).
for i in range(0, 100-40, 3):
  print('i is:{} and i+ maxlen is {}'.format(i, i + 40))

i is:0 and i+ maxlen is 40
i is:3 and i+ maxlen is 43
i is:6 and i+ maxlen is 46
i is:9 and i+ maxlen is 49
i is:12 and i+ maxlen is 52
i is:15 and i+ maxlen is 55
i is:18 and i+ maxlen is 58
i is:21 and i+ maxlen is 61
i is:24 and i+ maxlen is 64
i is:27 and i+ maxlen is 67
i is:30 and i+ maxlen is 70
i is:33 and i+ maxlen is 73
i is:36 and i+ maxlen is 76
i is:39 and i+ maxlen is 79
i is:42 and i+ maxlen is 82
i is:45 and i+ maxlen is 85
i is:48 and i+ maxlen is 88
i is:51 and i+ maxlen is 91
i is:54 and i+ maxlen is 94
i is:57 and i+ maxlen is 97


In [0]:
# Same window-index check with an upper bound of 375542 - 40.
# NOTE(review): 375542 is an unexplained magic number (presumably a text length
# -- confirm), and this loop prints roughly 125,000 lines; consider removing
# this scratch cell or printing only a few iterations.
for i in range(0, 375542 - 40, 3):
  print('i is:{} and i+ maxlen is {}'.format(i, i + 40))

Output hidden; open in https://colab.research.google.com to view.