# Preprocessing

In [None]:
# from google.colab import files

In [None]:
# uploaded = files.upload()

In [None]:
import numpy as np
import pandas as pd
import json
import torch
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import brown
nltk.download('punkt')
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split
import gensim
import logging
from tqdm import tqdm

np.random.seed(0)
torch.manual_seed(0)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bharatsuri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<torch._C.Generator at 0x148fdba90>

In [None]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

model = gensim.models.keyedvectors.Word2VecKeyedVectors.load_word2vec_format('glove.42B.300d.w2vformat.txt')

2021-03-23 23:26:42,371 : INFO : loading projection weights from glove.42B.300d.w2vformat.txt
2021-03-23 23:30:31,619 : INFO : loaded (1917494, 300) matrix from glove.42B.300d.w2vformat.txt


In [None]:
def getSentenceVector(sentence):
    words = word_tokenize(sentence)
    count_present = 0
    vec = np.zeros(300)
    for word in words:
        if word in model.wv:
            vec = np.add(vec, np.array(model.wv[word]))
            count_present += 1
    if count_present > 0:
        vec = vec / count_present
    return vec

In [None]:
class YelpDataset(Dataset):
    """Yelp dataset."""
    
    def __init__(self, file_name, append_zeros = True):
        """
        Args:
            file_name: The json file to make the dataset from
        """
        self.df = pd.read_json(file_name, lines=True)

        tensors = []
        binary_cat = []
        zero_sentence = np.zeros(300)

        #Create target class and document vector for each review
        for index, row in self.df.iterrows():
            single_class = np.zeros(2)
            if row['category'] == 1:
              single_class[1] = 1
            else:
              single_class[0] = 1
            binary_cat.append(torch.tensor(single_class))

            sentences = sent_tokenize(row['text'])
            sent_vecs = []
            for i in range(20):
              if i < len(sentences):
                sent_vecs.append(getSentenceVector(sentences[i]))
              elif append_zeros:
                sent_vecs.append(zero_sentence.copy())
            tensors.append(torch.FloatTensor(sent_vecs))

        self.df['category'] = binary_cat
        self.df['vector'] = tensors
        del self.df['text']

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        category = self.df.iloc[idx, 0]
        vector = self.df.iloc[idx, 1]
        sample = {'vector': vector, 'category': category}

        return sample

In [None]:
### DO NOT APPEND ZEROS ###
#append_zeros = False
dataset_train = YelpDataset('dataset/dataset_train.json', False)
dataset_dev = YelpDataset('dataset/dataset_dev.json', False)
dataset_test = YelpDataset('dataset/dataset_test.json', False)

  if word in model.wv:
  vec = np.add(vec, np.array(model.wv[word]))
  if word in model.wv:
  vec = np.add(vec, np.array(model.wv[word]))
  if word in model.wv:
  vec = np.add(vec, np.array(model.wv[word]))


In [None]:
dataloader_train = DataLoader(dataset_train, batch_size=1, shuffle=True, 
                              num_workers=0)
dataloader_dev = DataLoader(dataset_dev, batch_size=1, shuffle=True, 
                              num_workers=0)
dataloader_test = DataLoader(dataset_test, batch_size=1, shuffle=True, 
                              num_workers=0)

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True, bidirectional=True)

    def forward(self, inputs):
        output, hidden = self.gru(inputs)
        return output, hidden

In [None]:
class BinaryClassifier(nn.Module):
    def __init__(self, input_size):
        super(BinaryClassifier, self).__init__()
        self.input_size = input_size
        
        self.fcn = nn.Sequential(
            nn.Linear(2*input_size, 10),
            nn.Tanh(),
            nn.Linear(10, 2),
            nn.Tanh()
        )


    def forward(self, x):
        output = self.fcn(x)
        
        return output

In [None]:
from tqdm import tqdm
encoder = EncoderRNN(300, 32)
classifier = BinaryClassifier(32)

criterion = nn.CrossEntropyLoss()

encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.001)
classifier_optimizer = optim.Adam(classifier.parameters(), lr=0.001)

epochs = 10
total = 0
for n in range(epochs):
    epoch_loss = 0
    for sample_batched in tqdm(dataloader_train):
        encoder.zero_grad()
        classifier.zero_grad()
        loss = 0
        seq = sample_batched['vector']
        output, hidden = encoder(seq)
        output = output[0][-1]
        output = classifier(output)
        output = output.view(1, -1)
        arg_max = torch.argmax(sample_batched['category'][0])
        target = torch.tensor([arg_max])
        loss += criterion(output, target)
        epoch_loss+=loss.detach().item()
        loss.backward()

        encoder_optimizer.step()
        classifier_optimizer.step()

    if n:
        print("Average loss at epoch {}: {}".format(n, epoch_loss/len(dataloader_train)))



100%|██████████| 50000/50000 [04:25<00:00, 188.26it/s]
100%|██████████| 50000/50000 [04:25<00:00, 188.66it/s]
  0%|          | 20/50000 [00:00<04:16, 195.20it/s]

Average loss at epoch 1: 0.31197062472462656


100%|██████████| 50000/50000 [04:24<00:00, 189.22it/s]
  0%|          | 20/50000 [00:00<04:20, 191.96it/s]

Average loss at epoch 2: 0.2977838070911169


100%|██████████| 50000/50000 [04:24<00:00, 189.22it/s]
  0%|          | 36/50000 [00:00<04:38, 179.60it/s]

Average loss at epoch 3: 0.2898059875065088


100%|██████████| 50000/50000 [04:24<00:00, 188.72it/s]
  0%|          | 19/50000 [00:00<04:32, 183.43it/s]

Average loss at epoch 4: 0.2811680551958084


100%|██████████| 50000/50000 [04:24<00:00, 188.93it/s]
  0%|          | 18/50000 [00:00<04:41, 177.85it/s]

Average loss at epoch 5: 0.2762741073349118


100%|██████████| 50000/50000 [04:24<00:00, 189.13it/s]
  0%|          | 18/50000 [00:00<04:42, 176.86it/s]

Average loss at epoch 6: 0.26877209519535306


100%|██████████| 50000/50000 [04:24<00:00, 189.16it/s]
  0%|          | 18/50000 [00:00<04:41, 177.27it/s]

Average loss at epoch 7: 0.2640499986863136


100%|██████████| 50000/50000 [04:09<00:00, 200.05it/s]
  0%|          | 42/50000 [00:00<03:58, 209.55it/s]

Average loss at epoch 8: 0.2601344144052267


100%|██████████| 50000/50000 [03:53<00:00, 214.37it/s]

Average loss at epoch 9: 0.2549951467335224





In [None]:
total_correct = 0
total = 0
for sample_batched in tqdm(dataloader_train):

    loss = 0
    output, hidden = encoder(sample_batched['vector'])

    output = output[0][len(output[0])-1]
    output = classifier(output)
    classification = torch.argmax(output)
    arg_max = torch.argmax(sample_batched['category'][0])
    if classification == arg_max:
        total_correct+=1
print("Accuracy: {}".format(total_correct/len(dataloader_train)))

100%|██████████| 50000/50000 [01:08<00:00, 734.79it/s]

Accuracy: 0.93504





In [None]:
total_correct = 0
total = 0
for sample_batched in tqdm(dataloader_dev):

    loss = 0
    output, hidden = encoder(sample_batched['vector'])

    output = output[0][len(output[0])-1]
    output = classifier(output)
    classification = torch.argmax(output)
    arg_max = torch.argmax(sample_batched['category'][0])
    if classification == arg_max:
        total_correct+=1
print("Accuracy: {}".format(total_correct/len(dataloader_dev)))

100%|██████████| 10000/10000 [00:13<00:00, 731.92it/s]

Accuracy: 0.8989



