In [1]:
import numpy as np
import baseRNN
from datasets import load_dataset
import re
%load_ext autoreload
%autoreload 2

### Dataset

In [2]:
# constants
START_TOKEN = '<START>'
END_TOKEN = '<END>'
NUM_SAMPLES = 1000
imdbDataset = load_dataset("stanfordnlp/imdb")

embeddingsFilepath = '/Users/josep/Desktop/Self/Learning/NLP/RNN/data/glove.6B.300d.txt'

# helper functions
def read_corpus(dataset):
    files = dataset["train"]["text"][:NUM_SAMPLES]
    return [[START_TOKEN] + [re.sub(r'[^\w]', '', w.lower()) for w in f.split(" ")] + [END_TOKEN] for f in files]


def embedding_for_vocab(filepath, words, dimensions):
    vocab_size = len(words)
    embeddings = np.zeros((vocab_size, dimensions))

    with open(filepath, encoding="utf8") as f:
        for line in f:
            word, *vector = line.split()
            if word in words.keys():
                index = words[word]
                embeddings[index] = np.array(vector)[:dimensions]
    return embeddings

imdbCorpus = read_corpus(imdbDataset)

corpusWords = [y for x in imdbCorpus for y in x]
corpusWords = list(set(corpusWords))
word2ind={}
for i in range(len(corpusWords)+1):
    word2ind[corpusWords[i-1]] = i
word2ind['<PAD>'] = 0
embeddings = embedding_for_vocab(embeddingsFilepath, word2ind, 300)

### Batching Testing

In [3]:
testText = np.array(imdbCorpus[0])
mapper = np.vectorize(word2ind.get)
result = mapper(testText)
# result

### Training

In [17]:
testRNN = baseRNN.neuralNet(embeddings=embeddings, word2ind=word2ind, outputActivation='softmax',
                            hiddenLayerShapes=[100,100,100], hiddenLayerActivations=['relu', 'relu', 'relu'],
                            lossFunction='crossEntropyLoss', learningRate=.01, epochs=1, batchSize=2,
                            adam=True, clipVal=1, debug=False)

In [18]:
overfitCorpus = [imdbCorpus[0][0:50]] * 100
# overfitCorpus = [overfitCorpus, overfitCorpus]
testCorpus = imdbCorpus[:2]
overfitCorpus

[['<START>',
  'i',
  'rented',
  'i',
  'am',
  'curiousyellow',
  'from',
  'my',
  'video',
  'store',
  'because',
  'of',
  'all',
  'the',
  'controversy',
  'that',
  'surrounded',
  'it',
  'when',
  'it',
  'was',
  'first',
  'released',
  'in',
  '1967',
  'i',
  'also',
  'heard',
  'that',
  'at',
  'first',
  'it',
  'was',
  'seized',
  'by',
  'us',
  'customs',
  'if',
  'it',
  'ever',
  'tried',
  'to',
  'enter',
  'this',
  'country',
  'therefore',
  'being',
  'a',
  'fan',
  'of'],
 ['<START>',
  'i',
  'rented',
  'i',
  'am',
  'curiousyellow',
  'from',
  'my',
  'video',
  'store',
  'because',
  'of',
  'all',
  'the',
  'controversy',
  'that',
  'surrounded',
  'it',
  'when',
  'it',
  'was',
  'first',
  'released',
  'in',
  '1967',
  'i',
  'also',
  'heard',
  'that',
  'at',
  'first',
  'it',
  'was',
  'seized',
  'by',
  'us',
  'customs',
  'if',
  'it',
  'ever',
  'tried',
  'to',
  'enter',
  'this',
  'country',
  'therefore',
  'being',
  '

In [19]:
testRNN.trainModel(overfitCorpus)

Batch #1 - max 50 words
Loss: 10.547549525405586
********************************************

Batch #2 - max 50 words
Loss: 10.527602512723693
********************************************

Batch #3 - max 50 words
Loss: 10.507656387417242
********************************************

Batch #4 - max 50 words
Loss: 10.487711174905776
********************************************

Batch #5 - max 50 words
Loss: 10.467766901158852
********************************************

Batch #6 - max 50 words
Loss: 10.447823592693467
********************************************

Batch #7 - max 50 words
Loss: 10.427881276571474
********************************************

Batch #8 - max 50 words
Loss: 10.40793998039698
********************************************

Batch #9 - max 50 words
Loss: 10.387999732313832
********************************************

Batch #10 - max 50 words
Loss: 10.368060561003386
********************************************

Batch #11 - max 50 words
Loss: 10.348122495682546


In [None]:
input = ['<START>', 'my', 'name', 'is', 'video', 'what', 'is', 'yours']

input = ['<START>', 'because', 'of', 'all', 'the', 'controversy']

output = testRNN.generateOutput(input, 5)

print(output)

['<START>', 'because', 'of', 'all', 'the', 'controversy', 'it', 'it', 'it', 'it', 'it']


In [42]:
print(overfitCorpus)

[['<START>', 'i', 'rented', 'i', 'am', 'curiousyellow', 'from', 'my', 'video', 'store', 'because', 'of', 'all', 'the', 'controversy', 'that', 'surrounded', 'it', 'when', 'it'], ['<START>', 'i', 'rented', 'i', 'am', 'curiousyellow', 'from', 'my', 'video', 'store', 'because', 'of', 'all', 'the', 'controversy', 'that', 'surrounded', 'it', 'when', 'it'], ['<START>', 'i', 'rented', 'i', 'am', 'curiousyellow', 'from', 'my', 'video', 'store', 'because', 'of', 'all', 'the', 'controversy', 'that', 'surrounded', 'it', 'when', 'it'], ['<START>', 'i', 'rented', 'i', 'am', 'curiousyellow', 'from', 'my', 'video', 'store', 'because', 'of', 'all', 'the', 'controversy', 'that', 'surrounded', 'it', 'when', 'it'], ['<START>', 'i', 'rented', 'i', 'am', 'curiousyellow', 'from', 'my', 'video', 'store', 'because', 'of', 'all', 'the', 'controversy', 'that', 'surrounded', 'it', 'when', 'it'], ['<START>', 'i', 'rented', 'i', 'am', 'curiousyellow', 'from', 'my', 'video', 'store', 'because', 'of', 'all', 'the', '