In [1]:
import numpy as np
import baseRNN
from datasets import load_dataset
import re
%load_ext autoreload
%autoreload 2

### Dataset

In [65]:
# constants
START_TOKEN = '<START>'
END_TOKEN = '<END>'
NUM_SAMPLES = 1000
imdbDataset = load_dataset("stanfordnlp/imdb")

embeddingsFilepath = '/Users/josep/Desktop/Self/Learning/NLP/RNN/data/glove.6B.300d.txt'

# helper functions
def read_corpus(dataset):
    files = dataset["train"]["text"][:NUM_SAMPLES]
    return [[START_TOKEN] + [re.sub(r'[^\w]', '', w.lower()) for w in f.split(" ")] + [END_TOKEN] for f in files]


def embedding_for_vocab(filepath, words, dimensions):
    vocab_size = len(words)
    embeddings = np.zeros((vocab_size, dimensions))

    with open(filepath, encoding="utf8") as f:
        for line in f:
            word, *vector = line.split()
            if word in words.keys():
                index = words[word]
                embeddings[index] = np.array(vector)[:dimensions]
    return embeddings

imdbCorpus = read_corpus(imdbDataset)

corpusWords = [y for x in imdbCorpus for y in x]
corpusWords = list(set(corpusWords))
word2ind={}
for i in range(len(corpusWords)+1):
    word2ind[corpusWords[i-1]] = i
word2ind['<PAD>'] = 0
embeddings = embedding_for_vocab(embeddingsFilepath, word2ind, 300)

### Batching Testing

In [62]:
testText = np.array(imdbCorpus[0])
mapper = np.vectorize(word2ind.get)
result = mapper(testText)
result

array([ 3946, 17881,  8953, 17881,  3139, 10843,  8068, 18464, 10898,
       16902,  4379,  4857, 16724, 13238,  5860,  7178, 10041,  1127,
        4013,  1127,  1482, 17129,  9803,   148,  1600, 17881, 12035,
       17269,  7178, 11409, 17129,  1127,  1482,  3369,   212,   462,
        3601, 16960,  1127, 14648, 17404,  1581,   534,  2862,  6372,
       18434, 11110,   279,  6285,  4857,  1646,  7798,   885, 17881,
          56,  6365,  1581,  2609,  2862, 13620, 17580, 16919, 13238,
        1211, 12799, 10632, 10894,   279, 15168,  9555, 11537, 17829,
        7855,  4789,  3766, 12841,  1581, 14062,  7212,  6023,  1966,
       17378,  9415,   148, 18814,  6023, 12841,  1581, 12356,  4080,
       18166,  1581, 12787, 10717, 12681,  4857, 14209, 11669,  9116,
       13238,   652,  4232,   597, 17378,  6264,  9319,  9504,  3793,
        6395, 13238,  4512,  4595,  7501,  5523,  9504,   148, 13238,
        7269,  1257,   148, 16383, 11495,  5924,  7501, 15477,  1893,
        4857, 17253,

### Training

In [207]:
testRNN = baseRNN.neuralNet(embeddings=embeddings, word2ind=word2ind, outputActivation='softmax',
                            hiddenLayerShapes=[100,100,100], hiddenLayerActivations=['relu', 'relu', 'relu'],
                            lossFunction='crossEntropyLoss', learningRate=.001, epochs=1, batchSize=2,
                            adam=True, clipVal=1, debug=False)

In [208]:
# overfitCorpus = [imdbCorpus[0][0:20]] * 10
# overfitCorpus = [overfitCorpus, overfitCorpus]
testCorpus = imdbCorpus[:2]
len(testCorpus)

2

In [209]:
testRNN.trainBatch(testCorpus)

Loss: 10.547549525405584
********************************************



ValueError: operands could not be broadcast together with shapes (100,19042) (19042,100) 

In [None]:
input = ['<START>', 'my', 'name', 'is', 'video', 'what', 'is', 'yours']

input = ['<START>', 'because', 'of', 'all', 'the', 'controversy']

output = testRNN.generateOutput(input, 5)

print(output)

['<START>', 'because', 'of', 'all', 'the', 'controversy', 'it', 'it', 'it', 'it', 'it']


In [42]:
print(overfitCorpus)

[['<START>', 'i', 'rented', 'i', 'am', 'curiousyellow', 'from', 'my', 'video', 'store', 'because', 'of', 'all', 'the', 'controversy', 'that', 'surrounded', 'it', 'when', 'it'], ['<START>', 'i', 'rented', 'i', 'am', 'curiousyellow', 'from', 'my', 'video', 'store', 'because', 'of', 'all', 'the', 'controversy', 'that', 'surrounded', 'it', 'when', 'it'], ['<START>', 'i', 'rented', 'i', 'am', 'curiousyellow', 'from', 'my', 'video', 'store', 'because', 'of', 'all', 'the', 'controversy', 'that', 'surrounded', 'it', 'when', 'it'], ['<START>', 'i', 'rented', 'i', 'am', 'curiousyellow', 'from', 'my', 'video', 'store', 'because', 'of', 'all', 'the', 'controversy', 'that', 'surrounded', 'it', 'when', 'it'], ['<START>', 'i', 'rented', 'i', 'am', 'curiousyellow', 'from', 'my', 'video', 'store', 'because', 'of', 'all', 'the', 'controversy', 'that', 'surrounded', 'it', 'when', 'it'], ['<START>', 'i', 'rented', 'i', 'am', 'curiousyellow', 'from', 'my', 'video', 'store', 'because', 'of', 'all', 'the', '