In [1]:
import numpy as np
import baseRNN
from datasets import load_dataset
import re
%load_ext autoreload
%autoreload 2

### Dataset

In [2]:
# constants
START_TOKEN = '<START>'
END_TOKEN = '<END>'
NUM_SAMPLES = 10000
imdbDataset = load_dataset("stanfordnlp/imdb")

embeddingsFilepath = '/Users/josep/Desktop/Self/Learning/NLP/RNN/data/glove.6B.300d.txt'

# helper functions
def read_corpus(dataset):
    files = dataset["train"]["text"][:NUM_SAMPLES]
    return [[START_TOKEN] + [re.sub(r'[^\w]', '', w.lower()) for w in f.split(" ")] + [END_TOKEN] for f in files]


def embedding_for_vocab(filepath, words, dimensions):
    vocab_size = len(words)
    embeddings = np.zeros((vocab_size, dimensions))

    with open(filepath, encoding="utf8") as f:
        for line in f:
            word, *vector = line.split()
            if word in words.keys():
                index = words[word]
                embeddings[index] = np.array(vector)[:dimensions]
    return embeddings

imdbCorpus = read_corpus(imdbDataset)

corpusWords = [y for x in imdbCorpus for y in x]
corpusWords = list(set(corpusWords))
word2ind={}
for i in range(len(corpusWords)+1):
    word2ind[corpusWords[i-1]] = i
word2ind['<PAD>'] = 0
embeddings = embedding_for_vocab(embeddingsFilepath, word2ind, 300)

### Batching Testing

In [3]:
testText = np.array(imdbCorpus[0])
mapper = np.vectorize(word2ind.get)
result = mapper(testText)
# result

In [4]:
len(imdbCorpus[0])%3

len(imdbCorpus[0][:-2])%3

0

### Training

In [5]:
testRNN = baseRNN.neuralNet(embeddings=embeddings, word2ind=word2ind, outputActivation='softmax',
                            hiddenLayerShapes=[100,100,100], hiddenLayerActivations=['relu', 'relu', 'relu'],
                            lossFunction='crossEntropyLoss', learningRate=.0005, epochs=1, batchSize=5,
                            adam=True, clipVal=1, debug=False)

In [6]:
# overfitCorpus = [imdbCorpus[0][0:20]] * 10
# # overfitCorpus = [overfitCorpus, overfitCorpus]
testCorpus = imdbCorpus[:1000]
# testCorpus

In [8]:
testRNN.trainModel(imdbCorpus)

Batch #1 - max 313 words
Loss: 7.149708925820669
********************************************

Batch #2 - max 482 words
Loss: 7.003461507929369
********************************************

Batch #3 - max 362 words
Loss: 7.065856685189443
********************************************

Batch #4 - max 890 words
Loss: 7.059179964215564
********************************************

Batch #5 - max 327 words
Loss: 7.031967544594431
********************************************

Batch #6 - max 371 words
Loss: 6.966663970069166
********************************************

Batch #7 - max 387 words
Loss: 6.9470779439311725
********************************************

Batch #8 - max 298 words
Loss: 6.435090351324915
********************************************

Batch #9 - max 839 words
Loss: 7.01214404493116
********************************************

Batch #10 - max 159 words
Loss: 6.407117246440796
********************************************

Batch #11 - max 464 words
Loss: 7.035232569423718

In [15]:
input = ['<START>', 'what', 'is', 'the', 'best', 'movie', 'genre','<END>']

# input = ['<START>', 'because', 'of', 'all', 'the', 'controversy']

output = testRNN.generateOutput(input, 20)

print(output)

['<START>', 'what', 'is', 'the', 'best', 'movie', 'genre', '<END>', 'and', 'br', 'the', 'br', 'film', 'br', 'is', 'br', 'a', 'br', 'film', 'br', 'the', 'br', 'film', 'br', 'is', 'br', 'a', 'br']


In [19]:
testRNN.layers['hiddenLayer1'].beta1**testRNN.layers['hiddenLayer1'].t

1.9398282331131582e-101

In [10]:
filePath = 'C:/Users/josep/Desktop/Self/Learning/NLP/RNN/Model/'

for layerName in testRNN.layers.keys():
    currLayer = testRNN.layers[layerName]
    layerWeights = currLayer.layerWeights
    np.save(filePath + layerName + '_layerWeights.npy', layerWeights)
    
    if currLayer.rnn:
        timeWeights = currLayer.timeWeights
        np.save(filePath + layerName + '_timeWeights.npy', timeWeights)
    bias = currLayer.bias
    np.save(filePath + layerName + '_bias.npy', bias)

In [26]:
recreatedRNN = baseRNN.neuralNet(embeddings=embeddings, word2ind=word2ind, outputActivation='softmax',
                    hiddenLayerShapes=[100,100,100], hiddenLayerActivations=['relu', 'relu', 'relu'],
                    lossFunction='crossEntropyLoss', learningRate=.0005, epochs=1, batchSize=5,
                    adam=True, clipVal=1, debug=False)


for layerName in recreatedRNN.layers.keys():
    currLayer = recreatedRNN.layers[layerName]

    currLayer.layerWeights = np.load(filePath + layerName + '_layerWeights.npy')
    
    if currLayer.rnn:
        currLayer.timeWeights = np.load(filePath + layerName + '_timeWeights.npy')

    currLayer.bias = np.load(filePath + layerName + '_bias.npy')


In [None]:
input = ['<START>', 'what', 'is', 'the', 'best', 'movie', 'genre','<END>']

# input = ['<START>', 'because', 'of', 'all', 'the', 'controversy']

output = recreatedRNN.generateOutput(input, 20)

print(output)

['<START>', 'what', 'is', 'the', 'best', 'movie', 'genre', '<END>', 'and', 'br', 'the', 'br', 'film', 'br', 'is', 'br', 'a', 'br', 'film', 'br', 'the', 'br', 'film', 'br', 'is', 'br', 'a', 'br']


: 