# Preprocessing

In [None]:
# from google.colab import files

In [None]:
# uploaded = files.upload()

In [None]:
import numpy as np
import pandas as pd
import json
import torch
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import brown
nltk.download('punkt')
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split
import logging

# Single source of truth for every RNG seed used in this notebook.
SEED = 0
np.random.seed(SEED)
# Trailing ';' keeps the returned torch Generator repr out of the cell output.
torch.manual_seed(SEED);

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bharatsuri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<torch._C.Generator at 0x14719da90>

In [None]:
# Make sure the Brown corpus is available locally before it is read
# (the download call reports "already up-to-date" when it is).
nltk.download('brown')
# Sentences of the Brown corpus; this is the training input handed to Word2Vec below.
data = brown.sents()

[nltk_data] Downloading package brown to
[nltk_data]     /Users/bharatsuri/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [None]:
import inspect

# Word2Vec hyper-parameters: number of training passes and embedding width.
max_epochs = 10
vec_size = 256

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# gensim 4 renamed `size` -> `vector_size` and `iter` -> `epochs`. Choose the
# spelling this installation understands so the cell runs on both 3.x and 4.x.
if 'vector_size' in inspect.signature(Word2Vec.__init__).parameters:
    w2v_kwargs = {'vector_size': vec_size, 'epochs': max_epochs}
else:
    w2v_kwargs = {'size': vec_size, 'iter': max_epochs}

# min_count=1 keeps every word type seen in the corpus in the vocabulary.
word2vec_model = Word2Vec(data, min_count = 1, window = 6, **w2v_kwargs)

2021-03-23 23:26:20,726 : INFO : collecting all words and their counts
2021-03-23 23:26:20,728 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-03-23 23:26:21,059 : INFO : PROGRESS: at sentence #10000, processed 219770 words, keeping 23488 word types
2021-03-23 23:26:21,360 : INFO : PROGRESS: at sentence #20000, processed 430477 words, keeping 34367 word types
2021-03-23 23:26:21,688 : INFO : PROGRESS: at sentence #30000, processed 669056 words, keeping 42365 word types
2021-03-23 23:26:21,995 : INFO : PROGRESS: at sentence #40000, processed 888291 words, keeping 49136 word types
2021-03-23 23:26:22,228 : INFO : PROGRESS: at sentence #50000, processed 1039920 words, keeping 53024 word types
2021-03-23 23:26:22,412 : INFO : collected 56057 word types from a corpus of 1161192 raw words and 57340 sentences
2021-03-23 23:26:22,413 : INFO : Loading a fresh vocabulary
2021-03-23 23:26:22,513 : INFO : effective_min_count=1 retains 56057 unique words (100% of ori

In [None]:
  def getSentenceVector(sentence):
    """Return the mean word2vec embedding of the in-vocabulary tokens of `sentence`.

    Args:
        sentence: Raw sentence string; it is split with `word_tokenize`.

    Returns:
        A 1-D float numpy array whose length is the embedding size of
        `word2vec_model`. Tokens missing from the model vocabulary are
        ignored; if no token is known, the all-zeros vector is returned.
    """
    keyed_vectors = word2vec_model.wv
    # Take the dimensionality from the model itself rather than a literal, so
    # the function stays correct if the embedding size is changed.
    vec = np.zeros(keyed_vectors.vector_size)
    count_present = 0
    for word in word_tokenize(sentence):
      if word in keyed_vectors:
        vec += keyed_vectors[word]
        count_present += 1
    # Avoid dividing by zero when no token was found in the vocabulary.
    if count_present > 0:
      vec /= count_present
    return vec


In [None]:
class YelpDataset(Dataset):
    """Yelp reviews as sequences of sentence embeddings with one-hot binary labels.

    Each line of the input JSON-lines file must provide a 'text' and a
    'category' field. After construction `self.df` holds a 'category' column
    (one-hot tensor of length 2) and a 'vector' column (float tensor with one
    row per sentence); the raw 'text' column is dropped.
    """

    def __init__(self, file_name, append_zeros = True, max_sentences = 20):
        """
        Args:
            file_name: The json file to make the dataset from
            append_zeros: When True, pad every review with zero vectors up to
                `max_sentences` rows; when False keep only the real sentences.
            max_sentences: Maximum number of sentences kept per review.
        """
        self.df = pd.read_json(file_name, lines=True)

        tensors = []
        binary_cat = []
        zero_sentence = np.zeros(vec_size)

        # Create target class and document vector for each review.
        # Iterating the two columns directly avoids building a Series per row.
        for category, text in zip(self.df['category'], self.df['text']):
            # One-hot target: index 1 for category == 1, index 0 for anything else.
            single_class = np.zeros(2)
            single_class[1 if category == 1 else 0] = 1
            binary_cat.append(torch.tensor(single_class))

            sent_vecs = [getSentenceVector(sentence)
                         for sentence in sent_tokenize(text)[:max_sentences]]
            if append_zeros:
                missing = max_sentences - len(sent_vecs)
                sent_vecs.extend(zero_sentence.copy() for _ in range(missing))
            if not sent_vecs:
                # A review without any sentence would otherwise become a
                # shape-(0,) tensor; give it a single zero vector so every
                # sample is a non-empty 2-D sequence.
                sent_vecs.append(zero_sentence.copy())
            # Stack once into a single array before converting; building a
            # tensor from a list of arrays is much slower.
            tensors.append(torch.from_numpy(np.stack(sent_vecs)).float())

        self.df['category'] = binary_cat
        self.df['vector'] = tensors
        del self.df['text']

    def __len__(self):
        """Number of reviews in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return {'vector': ..., 'category': ...} for the row(s) at position `idx`."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Select by column name: positional column indices break as soon as
        # the source JSON carries any field besides 'text' and 'category'.
        category = self.df['category'].iloc[idx]
        vector = self.df['vector'].iloc[idx]
        sample = {'vector': vector, 'category': category}

        return sample

In [None]:
# Build the three splits without zero-padding: every review keeps only the
# sentence vectors it actually contains.
dataset_train = YelpDataset('dataset/dataset_train.json', append_zeros=False)
dataset_dev = YelpDataset('dataset/dataset_dev.json', append_zeros=False)
dataset_test = YelpDataset('dataset/dataset_test.json', append_zeros=False)

In [None]:
# batch_size=1 because the reviews are unpadded and differ in sequence length.
# Only the training split is shuffled; evaluation splits are read in a fixed
# order so repeated evaluation runs visit samples identically.
dataloader_train = DataLoader(dataset_train, batch_size=1, shuffle=True,
                              num_workers=0)
dataloader_dev = DataLoader(dataset_dev, batch_size=1, shuffle=False,
                            num_workers=0)
dataloader_test = DataLoader(dataset_test, batch_size=1, shuffle=False,
                             num_workers=0)

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
class EncoderRNN(nn.Module):
    """Bidirectional, batch-first GRU that encodes a sequence of input vectors."""

    def __init__(self, input_size, hidden_size):
        """
        Args:
            input_size: Feature size of every element of the input sequence.
            hidden_size: Hidden-state size of each GRU direction.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(
            input_size,
            hidden_size,
            batch_first=True,
            bidirectional=True,
        )

    def forward(self, inputs):
        """Run the GRU over `inputs` and return its `(output, hidden)` pair."""
        return self.gru(inputs)

In [None]:
class BinaryClassifier(nn.Module):
    """Two-layer feed-forward head producing two class logits.

    The first layer takes `2 * input_size` features, i.e. the concatenated
    forward and backward states of a bidirectional encoder whose per-direction
    hidden size is `input_size`.
    """

    def __init__(self, input_size):
        """
        Args:
            input_size: Per-direction hidden size of the encoder feeding this head.
        """
        super(BinaryClassifier, self).__init__()
        self.input_size = input_size

        # No activation after the last Linear: nn.CrossEntropyLoss applies
        # log-softmax itself and expects unbounded logits. A trailing Tanh
        # would clamp them to [-1, 1] and put a floor under the reachable loss.
        # Parameter names (fcn.0.*, fcn.2.*) are unchanged by its removal.
        self.fcn = nn.Sequential(
            nn.Linear(2*input_size, 10),
            nn.Tanh(),
            nn.Linear(10, 2)
        )


    def forward(self, x):
        """Return the raw (unnormalised) class logits for `x`."""
        output = self.fcn(x)

        return output

In [None]:
from tqdm import tqdm

# Encoder input width follows the sentence-embedding size; 32 is the hidden
# size per GRU direction, which the classifier doubles internally.
encoder = EncoderRNN(vec_size, 32)
classifier = BinaryClassifier(32)

criterion = nn.CrossEntropyLoss()

encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.001)
classifier_optimizer = optim.Adam(classifier.parameters(), lr=0.001)

epochs = 10
for n in range(epochs):
    epoch_loss = 0
    for sample_batched in tqdm(dataloader_train):
        encoder.zero_grad()
        classifier.zero_grad()

        seq = sample_batched['vector']
        output, hidden = encoder(seq)
        # Features of the last time step of the single sequence in the batch.
        # NOTE(review): with a bidirectional GRU the backward half of this
        # slice has only processed the final element; consider building the
        # feature from `hidden` instead. The evaluation cells use the same
        # slice, so change both together.
        output = output[0][-1]
        output = classifier(output)
        output = output.view(1, -1)

        # Turn the one-hot label of the single sample into a class-index
        # target of shape (1,), as CrossEntropyLoss expects.
        target = torch.argmax(sample_batched['category'][0]).view(1)

        loss = criterion(output, target)
        epoch_loss += loss.item()
        loss.backward()

        encoder_optimizer.step()
        classifier_optimizer.step()

    # Report every epoch, including the first one (n == 0).
    print("Average loss at epoch {}: {}".format(n, epoch_loss/len(dataloader_train)))



100%|██████████| 50000/50000 [03:58<00:00, 210.00it/s]
100%|██████████| 50000/50000 [04:01<00:00, 206.63it/s]
  0%|          | 16/50000 [00:00<05:21, 155.36it/s]

Average loss at epoch 1: 0.48865831643253566


100%|██████████| 50000/50000 [04:21<00:00, 191.21it/s]
  0%|          | 22/50000 [00:00<03:52, 215.11it/s]

Average loss at epoch 2: 0.4792340424075723


100%|██████████| 50000/50000 [04:22<00:00, 190.84it/s]
  0%|          | 18/50000 [00:00<04:49, 172.77it/s]

Average loss at epoch 3: 0.4735036727231741


100%|██████████| 50000/50000 [04:21<00:00, 191.46it/s]
  0%|          | 39/50000 [00:00<04:17, 194.02it/s]

Average loss at epoch 4: 0.46673577694654467


100%|██████████| 50000/50000 [04:21<00:00, 191.38it/s]
  0%|          | 23/50000 [00:00<03:37, 229.66it/s]

Average loss at epoch 5: 0.4649085663637519


100%|██████████| 50000/50000 [04:22<00:00, 190.73it/s]
  0%|          | 40/50000 [00:00<04:17, 193.72it/s]

Average loss at epoch 6: 0.46238606805980204


100%|██████████| 50000/50000 [04:21<00:00, 191.01it/s]
  0%|          | 38/50000 [00:00<04:27, 186.43it/s]

Average loss at epoch 7: 0.4603581267696619


100%|██████████| 50000/50000 [04:21<00:00, 191.13it/s]
  0%|          | 21/50000 [00:00<04:02, 205.93it/s]

Average loss at epoch 8: 0.45822789230793715


100%|██████████| 50000/50000 [04:21<00:00, 191.49it/s]

Average loss at epoch 9: 0.4581519177404046





In [None]:
# Accuracy of the trained encoder + classifier on the training split.
total_correct = 0
# Inference only: skip autograd bookkeeping to save time and memory.
with torch.no_grad():
    for sample_batched in tqdm(dataloader_train):
        output, _ = encoder(sample_batched['vector'])

        # Same feature as in training: last time step of the single sequence.
        output = output[0][-1]
        output = classifier(output)
        classification = torch.argmax(output)
        arg_max = torch.argmax(sample_batched['category'][0])
        if classification == arg_max:
            total_correct+=1
print("Accuracy: {}".format(total_correct/len(dataloader_train)))

100%|██████████| 50000/50000 [01:18<00:00, 636.77it/s]

Accuracy: 0.79314





In [None]:
# Accuracy of the trained encoder + classifier on the development split.
total_correct = 0
# Inference only: skip autograd bookkeeping to save time and memory.
with torch.no_grad():
    for sample_batched in tqdm(dataloader_dev):
        output, _ = encoder(sample_batched['vector'])

        # Same feature as in training: last time step of the single sequence.
        output = output[0][-1]
        output = classifier(output)
        classification = torch.argmax(output)
        arg_max = torch.argmax(sample_batched['category'][0])
        if classification == arg_max:
            total_correct+=1
print("Accuracy: {}".format(total_correct/len(dataloader_dev)))

100%|██████████| 10000/10000 [00:15<00:00, 633.85it/s]

Accuracy: 0.7907



