# Preprocessing

In [None]:
# from google.colab import files

In [None]:
# uploaded = files.upload()

In [None]:
import numpy as np
import pandas as pd
import json
import torch
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import brown
nltk.download('punkt')
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.word2vec import Word2Vec
from gensim.models.keyedvectors import Word2VecKeyedVectors
from sklearn.model_selection import train_test_split
np.random.seed(0)
torch.manual_seed(0)

from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
from torchtext.vocab import GloVe

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Shared word-level tokenizer: YelpDataset uses it to build the vocabulary, and
# get_indices / collate use it to split sentences into tokens.
word_tokenizer = get_tokenizer('basic_english')


In [None]:
class YelpDataset(Dataset):
    """Yelp dataset.

    Reads a JSON-lines file that has a ``text`` and a ``category`` column,
    splits every review into sentences and builds a word vocabulary from the
    tokens of all sentences in the file.

    Attributes:
        df: DataFrame whose ``text`` column holds a list of sentences per review.
        vocab: Vocabulary built from every word token in this file.
        max_sent_len: Length (in word tokens) of the longest sentence in this file.
    """

    def __init__(self, file_name):
        """
        Args:
            file_name: The json file to make the dataset from
        """
        self.df = pd.read_json(file_name, lines=True)

        counter = Counter()
        reviews = []
        # Tracked during the vocabulary pass so get_max_sent_len() has a value
        # to return (it was previously never assigned).
        self.max_sent_len = 0

        # Split each review into sentences and count every word token.
        for text in self.df['text']:
            sentences = sent_tokenize(text)
            reviews.append(sentences)
            for sentence in sentences:
                words = word_tokenizer(sentence)
                counter.update(words)
                self.max_sent_len = max(self.max_sent_len, len(words))

        self.vocab = Vocab(counter, min_freq=1)
        self.df['text'] = reviews

    def __len__(self):
        """Return the number of reviews."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{'category': ..., 'text': ...}`` for the review(s) at ``idx``.

        Columns are looked up by name, so the result does not depend on the
        column order of the JSON file.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        category = self.df['category'].iloc[idx]
        text = self.df['text'].iloc[idx]
        # Hand out a copy of the sentence list: the collate step pads reviews,
        # and padding the stored list would permanently alter the dataset.
        if isinstance(text, list):
            text = list(text)
        sample = {'category': category, 'text': text}

        return sample

    def get_vocab(self):
        """Return the vocabulary built from this file."""
        return self.vocab

    def get_max_sent_len(self):
        """Return the token count of the longest sentence in this file."""
        return self.max_sent_len

In [None]:
### DO NOT APPEND ZEROS ###
# One YelpDataset per split; each one parses its own file and builds its own vocabulary.
dataset_train, dataset_dev, dataset_test = (
    YelpDataset(split_file)
    for split_file in ('dataset_train.json', 'dataset_dev.json', 'dataset_test.json')
)

In [None]:
# Vocabulary built from the training split only; get_indices reads this module-level name.
vocab = dataset_train.get_vocab()

In [None]:
def get_indices(sentence, max_sent_len):
  """Convert a sentence into vocabulary indices, right-padded with index 1.

  Args:
    sentence: Sentence string; tokenized with the module-level word_tokenizer.
    max_sent_len: Target length. Sentences with at least this many tokens are
      returned unpadded (and untruncated).

  Returns:
    List of ints looked up in the module-level vocab.
  """
  token_ids = [vocab[token] for token in word_tokenizer(sentence)]
  # A non-positive shortfall multiplies out to an empty list, i.e. no padding.
  shortfall = max_sent_len - len(token_ids)
  token_ids.extend([1] * shortfall)
  return token_ids


def collate(batch):
  """Merge dataset samples into one rectangular batch.

  Every review is padded with '<pad>' sentences up to the longest review in
  the batch, and every sentence is padded up to the longest sentence in the
  batch, so the indices form a (reviews, sentences, words) tensor.

  Args:
    batch: List of samples, each a dict with 'text' (list of sentence strings)
      and 'category'.

  Returns:
    Dict with 'text' (padded sentence lists), 'indices' (tensor of vocabulary
    indices) and 'category' (tensor of labels).
  """
  # default=0 keeps an empty batch (or a batch of empty reviews) from raising.
  max_num_sents = max((len(sample['text']) for sample in batch), default=0)
  # Tokenize each sentence once while scanning for the longest one.
  max_sent_len = max(
      (len(word_tokenizer(sent)) for sample in batch for sent in sample['text']),
      default=0)

  batch_dict = {'text': [], 'indices': [], 'category': []}
  for sample in batch:
    # Pad a copy: the sample's list may be the very object stored in the
    # dataset, and padding it in place would leak '<pad>' into later epochs.
    padded = pad_review(list(sample['text']), max_num_sents)
    batch_dict['text'].append(padded)
    batch_dict['indices'].append(
        [get_indices(sent, max_sent_len) for sent in padded])
    batch_dict['category'].append(sample['category'])
  batch_dict['indices'] = torch.tensor(batch_dict['indices'])
  batch_dict['category'] = torch.tensor(batch_dict['category'])

  return batch_dict


def pad_review(review, max_len):
  """Return a copy of ``review`` padded with '<pad>' sentences up to ``max_len``.

  The input list is left untouched: callers pass lists that are stored in the
  dataset, and appending to them in place would permanently add '<pad>'
  sentences to the stored reviews.

  Args:
    review: List of sentence strings.
    max_len: Desired number of sentences. Reviews that are already at least
      this long are returned unpadded (and untruncated).

  Returns:
    A new list holding the original sentences followed by the padding.
  """
  missing = max(max_len - len(review), 0)
  return list(review) + ['<pad>'] * missing

In [None]:
batch_size = 5

# Every split uses the same loader settings; only the dataset differs.
loader_kwargs = dict(batch_size=batch_size, shuffle=True,
                     num_workers=0, collate_fn=collate)
dataloader_train = DataLoader(dataset_train, **loader_kwargs)
dataloader_dev = DataLoader(dataset_dev, **loader_kwargs)
dataloader_test = DataLoader(dataset_test, **loader_kwargs)

In [None]:
# Sanity check: show the shape of the index tensor for the first batch only.
for batch in dataloader_train:
  indices_shape = batch['indices'].shape
  print(indices_shape)
  break

torch.Size([5, 15, 68])


# Updated Model

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Use the GPU when PyTorch can see one, otherwise the CPU.
# NOTE(review): none of the cells visible in this notebook move the models or
# batches to `device` - confirm whether that is intended.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
class EncoderRNN(nn.Module):
    """Review encoder: averages word embeddings per sentence, then runs a
    bidirectional GRU over the resulting sequence of sentence vectors.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_size: Width of each word vector.
        hidden_size: Hidden size of the GRU, per direction.
        weights_matrix: Tensor of shape (vocab_size, embedding_size) used to
            initialise the embedding table.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size, weights_matrix):
        super(EncoderRNN, self).__init__()

        # Index 1 is the value get_indices pads sentences with.
        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=1)
        # Loading the weights overwrites the whole table, padding row included,
        # so forward() masks padded positions instead of relying on that row.
        self.embedding.load_state_dict({'weight': weights_matrix})

        self.hidden_size = hidden_size
        self.gru = nn.GRU(embedding_size, hidden_size, batch_first=True, bidirectional=True)

    def forward(self, inputs):
        """Encode a batch of reviews.

        Args:
            inputs: Integer tensor of word indices shaped
                (batch, sentences, words).

        Returns:
            Tuple ``(output, hidden)`` as returned by the GRU: ``output`` is
            (batch, sentences, 2 * hidden_size) and ``hidden`` is
            (2, batch, hidden_size).
        """
        embed_output = self.embedding(inputs)

        # Average over real words only. Including padded positions would make
        # a sentence's vector depend on how much padding its batch needed.
        mask = (inputs != self.embedding.padding_idx).unsqueeze(-1).to(embed_output.dtype)
        # clamp avoids 0/0 for sentences made of padding only (they become zeros).
        word_counts = mask.sum(dim=2).clamp(min=1)
        sentence_vectors = (embed_output * mask).sum(dim=2) / word_counts

        output, hidden = self.gru(sentence_vectors)

        return output, hidden

In [None]:
class BinaryClassifier(nn.Module):
    """Two-layer feed-forward head that maps an encoding to two class scores.

    Args:
        input_size: Half the width of the expected input; the first linear
            layer takes ``2 * input_size`` features.
    """

    def __init__(self, input_size):
        super().__init__()
        self.input_size = input_size

        hidden_units, num_classes = 10, 2
        layers = [
            nn.Linear(2 * input_size, hidden_units),
            nn.Tanh(),
            nn.Linear(hidden_units, num_classes),
            # NOTE(review): this last Tanh bounds the scores to [-1, 1] before
            # they reach the loss - confirm that is intended.
            nn.Tanh(),
        ]
        self.fcn = nn.Sequential(*layers)

    def forward(self, x):
        """Return the two class scores for each row of ``x``."""
        return self.fcn(x)

In [None]:
## Make weights matrix
# The embedding table gets one row per entry of the training vocabulary.
vocab = dataset_train.get_vocab()
vocab_size = len(vocab)
# Width of every word vector (also passed to Word2Vec below).
vec_size = 300


In [None]:
#glove = Word2VecKeyedVectors.load_word2vec_format('glove.w2v.txt')
nltk.download('brown')
nltk.download('punkt')

# Word2Vec hyper-parameters.
alpha = 0.025
min_count = 1
window_size = 6
max_epochs = 1

# Despite the name, `glove` holds Word2Vec vectors trained here on the Brown
# corpus; min_alpha equals alpha, so the learning rate is held constant.
w2v_params = dict(alpha=alpha,
                  min_alpha=alpha,
                  min_count=min_count,
                  size=vec_size,
                  window=window_size,
                  iter=max_epochs)
glove = Word2Vec(sentences=brown.sents(), **w2v_params).wv

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# One row per vocabulary entry, in vocab.itos order.
weights_matrix = np.zeros((vocab_size, vec_size))
for row, word in enumerate(vocab.itos):
  try:
    vector = glove[word]
  except KeyError:
    # Word missing from the trained vectors: fall back to a random vector.
    vector = np.random.normal(scale=0.6, size=(vec_size, ))
  weights_matrix[row] = vector

weights_matrix = torch.tensor(weights_matrix)

In [None]:
from tqdm import tqdm

encoder_output_size = 32
encoder = EncoderRNN(vocab_size, vec_size, encoder_output_size, weights_matrix)
classifier = BinaryClassifier(encoder_output_size)

criterion = nn.CrossEntropyLoss()

encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.001)
classifier_optimizer = optim.Adam(classifier.parameters(), lr=0.001)

epochs = 1
for n in range(epochs):
    epoch_loss = 0
    for batch in tqdm(dataloader_train):
        encoder.zero_grad()
        classifier.zero_grad()

        output, hidden = encoder(batch['indices'])

        # Classify from the GRU output at the last sentence position.
        # NOTE(review): for reviews shorter than the longest one in the batch
        # that position is a '<pad>' sentence - confirm this is intended.
        output = output[:, -1, :]

        output = classifier(output)
        target = batch['category']

        loss = criterion(output, target)
        epoch_loss += loss.item()
        loss.backward()

        encoder_optimizer.step()
        classifier_optimizer.step()

    # Report every epoch, including epoch 0 (the only one when epochs == 1).
    print("Average loss at epoch {}: {}".format(n, epoch_loss/len(dataloader_train)))




  0%|          | 0/10000 [00:00<?, ?it/s]

tensor([[-4.6884e-02, -2.7985e-01, -2.8465e-01, -6.8841e-01, -7.9778e-01,
         -5.4949e-01, -4.5543e-01,  4.8233e-01, -5.2302e-01, -1.9084e-01,
         -5.6174e-01,  7.4191e-01,  2.0926e-01, -1.3154e-01, -7.7322e-01,
         -1.4651e-01, -2.7700e-01,  3.2524e-01,  2.2986e-01,  9.0598e-01,
         -2.3027e-01,  9.9168e-01,  3.5412e-01,  1.1155e-01, -9.2582e-01,
         -9.1099e-01,  6.9672e-01, -2.0854e-01,  2.3218e-02,  3.1000e-01,
         -6.0700e-01,  6.7126e-01, -9.0581e-02, -9.2982e-02, -1.3054e-01,
          2.5626e-01, -4.2103e-01, -1.0483e-01, -3.4217e-03, -1.5053e-01,
         -3.5441e-01, -2.3910e-01,  1.1618e-01,  3.2906e-01,  7.8009e-01,
         -1.2144e-01,  4.5489e-01,  4.7576e-01,  1.8628e-01, -2.4231e-01,
          2.5586e-02,  7.3228e-01,  6.4700e-01, -1.0122e-01, -7.7117e-02,
          4.5889e-01,  4.9863e-01,  3.8449e-01, -4.4457e-01,  1.4643e-01,
         -1.3786e-01, -2.0106e-01, -4.2139e-01,  7.2610e-03],
        [-3.0666e-02, -2.6587e-01, -2.8474e-01, -6




In [None]:
total_correct = 0
total_seen = 0

# Evaluation only: skip building the autograd graph.
with torch.no_grad():
    for batch in tqdm(dataloader_train):

        output, hidden = encoder(batch['indices'])
        output = classifier(output[:, -1, :])

        # Compare the whole batch at once; this also copes with a final batch
        # that holds fewer than batch_size samples.
        predictions = output.argmax(dim=1)
        target = batch['category']
        total_correct += (predictions == target).sum().item()
        total_seen += target.size(0)

# Divide by the number of samples actually evaluated, not batches * batch_size.
print("Accuracy: {}".format(total_correct / total_seen))


  0%|          | 0/2500 [00:00<?, ?it/s][A
  0%|          | 4/2500 [00:00<01:20, 31.02it/s][A
  0%|          | 7/2500 [00:00<01:29, 27.75it/s][A
  0%|          | 10/2500 [00:00<01:31, 27.11it/s][A
  1%|          | 13/2500 [00:00<01:30, 27.56it/s][A
  1%|          | 16/2500 [00:00<01:33, 26.67it/s][A
  1%|          | 19/2500 [00:00<01:33, 26.50it/s][A
  1%|          | 22/2500 [00:00<01:38, 25.26it/s][A
  1%|          | 25/2500 [00:00<01:36, 25.69it/s][A
  1%|          | 28/2500 [00:01<01:33, 26.50it/s][A
  1%|          | 31/2500 [00:01<01:39, 24.79it/s][A
  1%|▏         | 35/2500 [00:01<01:32, 26.53it/s][A
  2%|▏         | 38/2500 [00:01<01:30, 27.32it/s][A
  2%|▏         | 41/2500 [00:01<01:33, 26.44it/s][A
  2%|▏         | 44/2500 [00:01<01:29, 27.34it/s][A
  2%|▏         | 47/2500 [00:01<01:28, 27.73it/s][A
  2%|▏         | 50/2500 [00:01<01:36, 25.37it/s][A
  2%|▏         | 54/2500 [00:02<01:30, 27.06it/s][A
  2%|▏         | 57/2500 [00:02<01:34, 25.88it/s][A
  2

Accuracy: 0.82172





In [None]:
total_correct = 0
total_seen = 0

# Evaluation only: skip building the autograd graph.
with torch.no_grad():
    for batch in tqdm(dataloader_dev):

        output, hidden = encoder(batch['indices'])
        output = classifier(output[:, -1, :])

        # Compare the whole batch at once; this also copes with a final batch
        # that holds fewer than batch_size samples.
        predictions = output.argmax(dim=1)
        target = batch['category']
        total_correct += (predictions == target).sum().item()
        total_seen += target.size(0)

# Divide by the number of samples actually evaluated, not batches * batch_size.
print("Accuracy: {}".format(total_correct / total_seen))








  0%|          | 0/500 [00:00<?, ?it/s][A[A[A[A[A[A[A






  0%|          | 2/500 [00:00<00:49, 10.04it/s][A[A[A[A[A[A[A






  1%|          | 4/500 [00:00<00:47, 10.41it/s][A[A[A[A[A[A[A






  1%|          | 5/500 [00:00<00:48, 10.13it/s][A[A[A[A[A[A[A






  1%|          | 6/500 [00:00<00:49,  9.89it/s][A[A[A[A[A[A[A






  2%|▏         | 8/500 [00:00<00:48, 10.24it/s][A[A[A[A[A[A[A






  2%|▏         | 9/500 [00:00<00:48, 10.16it/s][A[A[A[A[A[A[A






  2%|▏         | 11/500 [00:01<00:47, 10.26it/s][A[A[A[A[A[A[A






  2%|▏         | 12/500 [00:01<00:48,  9.99it/s][A[A[A[A[A[A[A






  3%|▎         | 13/500 [00:01<00:51,  9.50it/s][A[A[A[A[A[A[A






  3%|▎         | 15/500 [00:01<00:46, 10.53it/s][A[A[A[A[A[A[A






  3%|▎         | 17/500 [00:01<00:44, 10.82it/s][A[A[A[A[A[A[A






  4%|▍         | 19/500 [00:01<00:45, 10.64it/s][A[A[A[A[A[A[A






  4%|▍         | 21/

Accuracy: 0.5169



