In [2]:
import numpy as np
import baseRNN
from datasets import load_dataset
import re

  from .autonotebook import tqdm as notebook_tqdm


### Dataset

In [3]:
# constants
START_TOKEN = '<START>'
END_TOKEN = '<END>'
NUM_SAMPLES = 150
imdbDataset = load_dataset("stanfordnlp/imdb")

embeddingsFilepath = '/Users/josephwargo/Desktop/Self/Learning/NLP/data/glove.6B.300d.txt'

# helper functions
def read_corpus(dataset):
    files = dataset["train"]["text"][:NUM_SAMPLES]
    return [[START_TOKEN] + [re.sub(r'[^\w]', '', w.lower()) for w in f.split(" ")] + [END_TOKEN] for f in files]


def embedding_for_vocab(filepath, words, dimensions):
    vocab_size = len(words)
    embeddings = np.zeros((vocab_size, dimensions))

    with open(filepath, encoding="utf8") as f:
        for line in f:
            word, *vector = line.split()
            if word in words.keys():
                index = words[word]
                embeddings[index] = np.array(vector)[:dimensions]
    return embeddings

imdbCorpus = read_corpus(imdbDataset)

corpusWords = [y for x in imdbCorpus for y in x]
corpusWords = list(set(corpusWords))
word2ind={}
for i in range(len(corpusWords)):
    word2ind[corpusWords[i]] = i

embeddings = embedding_for_vocab(embeddingsFilepath, word2ind, 300)

### Training

In [4]:
testRNN = baseRNN.neuralNet(embeddings, imdbCorpus, word2ind, 'softmax', [100,100,100], ['relu', 'relu', 'relu'],
                               lossFunction='crossEntropyLoss', learningRate=.001, epochs=1, adam=False, debug=False)

In [5]:
len(imdbCorpus[2])

95

In [6]:
# testRNN.forwardPass(imdbCorpus[2])

In [8]:
# testRNN.embeddingsShape
testRNN.allLayers['hiddenLayer1'].N.shape

(100,)

In [18]:
# # separating into train and test
# propTrain = .75
# numTrain = round(propTrain * len(images))
# numTest = round((1-propTrain) * len(images))

# trainImages = images[:numTrain]
# trainLabels = labels[:numTrain]
# trainEncodedLabels = encodedLabels[:numTrain]

# testImages = images[numTrain:]
# testLabels = labels[numTrain:]
# testEncodedLabels = encodedLabels[numTrain:]