In [1]:
import numpy as np
import baseRNN
from datasets import load_dataset
import re
%load_ext autoreload
%autoreload 2

### Dataset

In [109]:
# constants
START_TOKEN = '<START>'
END_TOKEN = '<END>'
NUM_SAMPLES = 10000
imdbDataset = load_dataset("stanfordnlp/imdb")

embeddingsFilepath = '/Users/josep/Desktop/Self/Learning/NLP/RNN/data/glove.6B.300d.txt'

# helper functions
def read_corpus(dataset):
    files = dataset["train"]["text"][:NUM_SAMPLES]
    return [[START_TOKEN] + [re.sub(r'[^\w]', '', w.lower()) for w in f.split(" ")] + [END_TOKEN] for f in files]


def embedding_for_vocab(filepath, words, dimensions):
    vocab_size = len(words)
    embeddings = np.zeros((vocab_size, dimensions))

    with open(filepath, encoding="utf8") as f:
        for line in f:
            word, *vector = line.split()
            if word in words.keys():
                index = words[word]
                embeddings[index] = np.array(vector)[:dimensions]
    return embeddings

imdbCorpus = read_corpus(imdbDataset)

corpusWords = [y for x in imdbCorpus for y in x]
corpusWords = list(set(corpusWords))
word2ind={}
for i in range(len(corpusWords)+1):
    word2ind[corpusWords[i-1]] = i
word2ind['<PAD>'] = 0
embeddings = embedding_for_vocab(embeddingsFilepath, word2ind, 300)

### Batching Testing

In [3]:
testText = np.array(imdbCorpus[0])
mapper = np.vectorize(word2ind.get)
result = mapper(testText)
# result

In [4]:
len(imdbCorpus[0])%3

len(imdbCorpus[0][:-2])%3

0

### Training

In [97]:
testRNN = baseRNN.neuralNet(embeddings=embeddings, word2ind=word2ind, outputActivation='softmax',
                            hiddenLayerShapes=[100,100,100], hiddenLayerActivations=['relu', 'relu', 'relu'],
                            lossFunction='crossEntropyLoss', learningRate=.0005, epochs=1, batchSize=5,
                            adam=True, clipVal=1, debug=False)

In [106]:
# overfitCorpus = [imdbCorpus[0][0:20]] * 10
# # overfitCorpus = [overfitCorpus, overfitCorpus]
testCorpus = imdbCorpus
# overfitCorpus

In [None]:
testRNN.trainModel(imdbCorpus)

Batch #1 - max 313 words
Loss: 7.007187875386797
********************************************

Batch #2 - max 482 words
Loss: 6.821120737864705
********************************************

Batch #3 - max 362 words
Loss: 6.882234510939532
********************************************

Batch #4 - max 890 words
Loss: 6.944576375443166
********************************************

Batch #5 - max 327 words
Loss: 6.831435084983322
********************************************

Batch #6 - max 371 words
Loss: 6.77104223989011
********************************************

Batch #7 - max 387 words
Loss: 6.782372356400827
********************************************

Batch #8 - max 298 words
Loss: 6.247155458119795
********************************************

Batch #9 - max 839 words
Loss: 6.819218237508209
********************************************

Batch #10 - max 159 words
Loss: 6.15005121638279
********************************************

Batch #11 - max 464 words
Loss: 6.7579712978724835


In [108]:
input = ['<START>', 'my', 'name', 'is', 'video', 'what', 'is', 'yours']

# input = ['<START>', 'because', 'of', 'all', 'the', 'controversy']

output = testRNN.generateOutput(input, 10)

print(output)

['<START>', 'my', 'name', 'is', 'video', 'what', 'is', 'yours', 'the', 'to', 'the', 'of', 'the', 'the', 'the', 'the', 'the', 'the']


In [93]:
print(testCorpus)

[['<START>', 'i', 'rented', 'i', 'am', 'curiousyellow', 'from', 'my', 'video', 'store', 'because', 'of', 'all', 'the', 'controversy', 'that', 'surrounded', 'it', 'when', 'it', 'was', 'first', 'released', 'in', '1967', 'i', 'also', 'heard', 'that', 'at', 'first', 'it', 'was', 'seized', 'by', 'us', 'customs', 'if', 'it', 'ever', 'tried', 'to', 'enter', 'this', 'country', 'therefore', 'being', 'a', 'fan', 'of', 'films', 'considered', 'controversial', 'i', 'really', 'had', 'to', 'see', 'this', 'for', 'myselfbr', 'br', 'the', 'plot', 'is', 'centered', 'around', 'a', 'young', 'swedish', 'drama', 'student', 'named', 'lena', 'who', 'wants', 'to', 'learn', 'everything', 'she', 'can', 'about', 'life', 'in', 'particular', 'she', 'wants', 'to', 'focus', 'her', 'attentions', 'to', 'making', 'some', 'sort', 'of', 'documentary', 'on', 'what', 'the', 'average', 'swede', 'thought', 'about', 'certain', 'political', 'issues', 'such', 'as', 'the', 'vietnam', 'war', 'and', 'race', 'issues', 'in', 'the', 'u

In [None]:
filePath = 'C:/Users/josep/Desktop/Self/Learning/NLP/Basic NN/Model/'

for layerName in testRNN.layers.keys():
    currLayer = testRNN.layers[layerName]
    layerWeights = currLayer.layerWeights
    np.save(filePath + layerName + '_layerWeights.npy', layerWeights)
    
    if currLayer.rnn:
        timeWeights = currLayer.timeWeights
        np.save(filePath + layerName + '_timeWeights.npy', timeWeights)
    bias = currLayer.bias
    np.save(filePath + layerName + '_bias.npy', bias)