In [1]:
import json


# Source review dump (parsed one JSON object per line) and the trimmed copy written here.
file = 'yelp_dataset/yelp_academic_dataset_review.json'
parsed_file = 'yelp_review_small.json'
# Zero-based index of the last review to copy, so LAST_REVIEW_INDEX + 1 reviews are kept.
LAST_REVIEW_INDEX = 30000
list_of_reviews_rate = []  # not used in this cell; kept in case other cells reference it
# Explicit UTF-8 so that reading the dump does not depend on the platform's default encoding.
with open(file, encoding='utf-8') as f, open(parsed_file, 'w', encoding='utf-8') as outf:
    for i, line in enumerate(f):
        pl = json.loads(line)
        # Keep only the review text and its star rating, one JSON object per output line.
        json.dump({"text": pl["text"], "label": pl["stars"]}, outf)
        outf.write('\n')
        if i == LAST_REVIEW_INDEX:
            break




In [2]:
import torch
from torchtext import data

# Fixed seed so repeated runs give the same results
SEED = 2019

# Seed PyTorch's random number generator
torch.manual_seed(SEED)

# Field definitions: spaCy-tokenized text (batch-first, lengths included) and a float label
TEXT = data.Field(
    tokenize='spacy',
    batch_first=True,
    include_lengths=True,
)
LABEL = data.LabelField(dtype=torch.float, batch_first=True)

# Map each JSON key to the (attribute name, field) pair used when loading
datafield = {
    "text": ("text", TEXT),
    "label": ("label", LABEL),
}

# Load the custom dataset from the JSON file
training_data = data.TabularDataset(
    path='yelp_review_small.json',
    format='json',
    fields=datafield,
)

# Number of examples loaded
print(len(training_data.examples))



30001


In [3]:
import random
# random.seed() returns None, so passing its result as random_state only worked
# through the side effect of seeding the global generator. Seed explicitly and
# hand the resulting generator state to split() instead.
random.seed(SEED)
train_data, test_data = training_data.split(split_ratio=0.7, random_state=random.getstate())


In [4]:
# Build both vocabularies from the training split; the text vocabulary uses
# min_freq=3 and is initialised with the "glove.6B.100d" vectors
TEXT.build_vocab(
    train_data,
    min_freq=3,
    vectors="glove.6B.100d",
)
LABEL.build_vocab(train_data)

# Number of entries in the text vocabulary
print("Size of TEXT vocabulary:", len(TEXT.vocab))

# Number of entries in the label vocabulary
print("Size of LABEL vocabulary:", len(LABEL.vocab))

# Ten most frequent tokens with their counts
print(TEXT.vocab.freqs.most_common(10))

# Token -> index mapping
# NOTE(review): this prints the whole mapping; consider showing only a sample
print(TEXT.vocab.stoi)

Size of TEXT vocabulary: 20027
Size of LABEL vocabulary: 5
[('.', 133922), ('the', 90450), ('and', 76614), (',', 74130), ('I', 65392), ('a', 54581), ('to', 53157), ('was', 41262), ('of', 30968), (' ', 29055)]


In [5]:
# Number of examples per batch
BATCH_SIZE = 64


def _text_length(example):
    """Return the length of an example's text field (used as the sort key)."""
    return len(example.text)


# Build bucketed iterators over the train and test splits
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=_text_length,
    sort_within_batch=True,
)




In [None]:
import torch.nn as nn

class RNN_setiment(nn.Module):
    
    ## For each element in the input sequence, each layer computes the following function:
    
    ## h_t = ReLU(W_ih*x_t+b_ih + W_hh*h_(t-1)+b_hh)
    
    ## where h_t is the hidden state at time t, x_t is the input at time t, and h_(t-1)
    ## is the hidden state of the previous layer at time t-1 or the initial hidden state at time 0
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers):

        #Constructor
        super().__init__()          

        #embedding layer
        # A simple lookup table that stores embeddings of a fixed dictionary and size.
        # This module is often used to store word embeddings and retrieve them using indices.
        # The input to the module is a list of indices, and the output is the corresponding word embeddings.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        #rnn layer
        self.rnn = nn.RNN(embedding_dim, 
                       hidden_dim, 
                       num_layers=n_layers,
                       nonlinearity=nn.ReLU,
                       batch_first=True)

        #dense layer
        self.linear = nn.Linear(hidden_dim * 2, output_dim)

        #activation function
        self.activation = nn.Softmax()

    def forward(self, text, text_length):

        embedded = self.embedding(text)
        packed_embedding = nn.utils.rnn.pack_padded_sequence(embedded, text_length, batch_first=True)

        packed_output, hidden = self.rnn(packed_embedding)
        