# Preprocessing

In [None]:
# from google.colab import files

In [None]:
# uploaded = files.upload()

In [None]:
import numpy as np
import pandas as pd
import json
import torch
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import brown
nltk.download('punkt')
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split
import gensim
import logging
from tqdm import tqdm

# Pin the global NumPy and PyTorch random number generators so that runs of
# this notebook start from the same random state.
np.random.seed(seed=0)
torch.manual_seed(seed=0)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bharatsuri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<torch._C.Generator at 0x146eb4ab0>

In [None]:
# Send INFO-level log records to the console, prefixed with a timestamp and
# the level name (the loader below reports its progress through `logging`).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Read the word vectors from a word2vec-format text file. `model` is the
# module-level lookup table that getSentenceVector queries.
model = gensim.models.keyedvectors.Word2VecKeyedVectors.load_word2vec_format(
    fname='glove.42B.300d.w2vformat.txt',
)

2021-03-29 00:25:41,274 : INFO : loading projection weights from glove.42B.300d.w2vformat.txt
2021-03-29 00:29:14,440 : INFO : loaded (1917494, 300) matrix from glove.42B.300d.w2vformat.txt


In [None]:
def getSentenceVector(sentence):
    """Return the mean embedding of the in-vocabulary tokens of `sentence`.

    The sentence is split with NLTK's `word_tokenize`. Every token that has a
    vector in the module-level `model` contributes to the average; tokens
    without a vector are skipped.

    Args:
        sentence: Text of a single sentence.

    Returns:
        A 1-D float64 NumPy array of length `model.vector_size`. It is all
        zeros when no token of the sentence has a vector.
    """
    # `model` is already a keyed-vectors object, so it is queried directly.
    # Going through its `.wv` alias emits a deprecation warning on every
    # lookup in gensim 3.x and is slated for removal in gensim 4.
    vec = np.zeros(model.vector_size)
    count_present = 0
    for word in word_tokenize(sentence):
        if word in model:
            vec += model[word]
            count_present += 1
    # Only divide when something was summed, so an all-unknown sentence
    # yields the zero vector instead of NaNs.
    if count_present > 0:
        vec /= count_present
    return vec

In [None]:
class YelpDataset(Dataset):
    """Yelp review dataset of per-sentence embedding sequences.

    Every review is split into sentences with NLTK's `sent_tokenize`, each
    sentence is embedded with `getSentenceVector`, and the review's label is
    stored as a one-hot vector over two classes.
    """

    def __init__(self, file_name, append_zeros=True, max_sentences=20,
                 embedding_dim=300):
        """
        Args:
            file_name: The JSON-lines file to make the dataset from. Each
                record must provide a 'text' and a 'category' field.
            append_zeros: When True, reviews with fewer than `max_sentences`
                sentences are padded with zero vectors up to that length.
                When False, each review keeps only its own sentence vectors.
            max_sentences: Maximum number of leading sentences kept per
                review.
            embedding_dim: Length of the zero vectors used for padding; it
                must match the length of the vectors returned by
                `getSentenceVector`.
        """
        self.df = pd.read_json(file_name, lines=True)

        tensors = []
        binary_cat = []

        # Create target class and document vector for each review
        for category, text in zip(self.df['category'], self.df['text']):
            # One-hot label: index 1 when the category equals 1, index 0 for
            # every other value.
            single_class = np.zeros(2)
            single_class[1 if category == 1 else 0] = 1
            binary_cat.append(torch.tensor(single_class))

            sentences = sent_tokenize(text)[:max_sentences]
            sent_vecs = [getSentenceVector(sentence) for sentence in sentences]
            if append_zeros:
                missing = max_sentences - len(sent_vecs)
                sent_vecs.extend(np.zeros(embedding_dim) for _ in range(missing))
            if not sent_vecs:
                # A review without any sentence would otherwise become an
                # empty 1-D tensor; keep a single zero sentence so every
                # sample is a 2-D (sentences, embedding_dim) tensor.
                sent_vecs.append(np.zeros(embedding_dim))
            # Stack once in NumPy and convert; building a tensor from a
            # Python list of arrays is much slower.
            tensors.append(torch.from_numpy(np.stack(sent_vecs)).float())

        self.df['category'] = binary_cat
        self.df['vector'] = tensors
        del self.df['text']

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the sample at `idx` as {'vector': ..., 'category': ...}.

        Args:
            idx: Integer position, or a tensor holding the position(s).
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Select the columns by name, not by position, so any additional
        # fields present in the JSON records cannot shift the lookup.
        vector = self.df['vector'].iloc[idx]
        category = self.df['category'].iloc[idx]
        return {'vector': vector, 'category': category}

In [None]:
# Build the three splits WITHOUT zero padding (append_zeros=False): every
# review keeps only the vectors of the sentences it actually contains.
dataset_train = YelpDataset('dataset/dataset_train.json', append_zeros=False)
dataset_dev = YelpDataset('dataset/dataset_dev.json', append_zeros=False)
dataset_test = YelpDataset('dataset/dataset_test.json', append_zeros=False)

  if word in model.wv:
  vec = np.add(vec, np.array(model.wv[word]))
  if word in model.wv:
  vec = np.add(vec, np.array(model.wv[word]))
  if word in model.wv:
  vec = np.add(vec, np.array(model.wv[word]))


In [None]:
# One review per batch. NOTE(review): presumably required because the
# unpadded reviews have different numbers of sentence vectors and so cannot
# be stacked into one batch tensor -- confirm before raising batch_size.
dataloader_train = DataLoader(dataset_train, batch_size=1, shuffle=True,
                              num_workers=0)
# Evaluation splits are iterated in a fixed order: shuffling does not change
# the accuracy, but it makes evaluation order non-repeatable and draws from
# the global RNG between training epochs.
dataloader_dev = DataLoader(dataset_dev, batch_size=1, shuffle=False,
                            num_workers=0)
dataloader_test = DataLoader(dataset_test, batch_size=1, shuffle=False,
                             num_workers=0)

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): nothing visible in this notebook moves the model or the
# batches to `device` -- confirm whether that is intended.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
class GRUAttention(nn.Module):
    """Bidirectional GRU encoder with dot-product attention and an MLP head.

    The sequence is encoded by a single-layer bidirectional GRU. The final
    forward and backward hidden states are concatenated into a query, which
    is scored against every time step's output. The attention-weighted sum of
    the outputs is then classified by a small fully connected network.
    """

    def __init__(self, input_size, hidden_size, fc_size=64, dropout=0.8,
                 num_classes=2):
        """
        Args:
            input_size: Dimensionality of each element of the input sequence.
            hidden_size: Hidden size of the GRU per direction.
            fc_size: Width of the hidden layer of the classifier head.
            dropout: Dropout probability applied inside the classifier head.
            num_classes: Number of output scores produced per sequence.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True,
                          bidirectional=True)

        # The GRU is bidirectional, so the head consumes 2 * hidden_size
        # features. The final Tanh bounds every output score to [-1, 1].
        self.fcn = nn.Sequential(
            nn.Linear(2 * hidden_size, fc_size),
            nn.Tanh(),
            nn.Dropout(dropout),
            nn.Linear(fc_size, num_classes),
            nn.Tanh(),
        )

    def attention(self, outputs, hidden):
        """Pool the GRU outputs with dot-product attention.

        Args:
            outputs: GRU outputs of shape (batch, seq_len, 2 * hidden_size).
            hidden: Query of shape (1, batch, 2 * hidden_size).

        Returns:
            A tuple `(context, attn_scores)`: the attention-weighted sum of
            the outputs, shape (batch, 2 * hidden_size), and the softmax
            weights over time steps, shape (batch, seq_len).
        """
        query = hidden.squeeze(0)
        # Dot product of the query with every time step: (batch, seq_len).
        attn_weights = torch.bmm(outputs, query.unsqueeze(2)).squeeze(2)
        attn_scores = F.softmax(attn_weights, dim=1)
        # Weighted sum over the time axis: (batch, 2 * hidden_size).
        context = torch.bmm(outputs.transpose(1, 2),
                            attn_scores.unsqueeze(2)).squeeze(2)
        return context, attn_scores

    def forward(self, inputs):
        """Classify a batch of sequences.

        Args:
            inputs: Tensor of shape (batch, seq_len, input_size).

        Returns:
            A tuple `(logits, scores)`: class scores of shape
            (batch, num_classes) and attention weights of shape
            (batch, seq_len).
        """
        output, hidden = self.gru(inputs)
        # hidden[0] / hidden[1] are the final states of the two directions;
        # concatenating them matches the feature size of `output`.
        hidden = torch.cat((hidden[0], hidden[1]), 1).unsqueeze(0)
        out, scores = self.attention(output, hidden)
        logits = self.fcn(out)
        return logits, scores

In [None]:
from tqdm import tqdm
# Model, loss and optimizer: 300-dimensional sentence vectors in, 64 hidden
# units per GRU direction.
encoder = GRUAttention(300, 64)
criterion = nn.CrossEntropyLoss()

encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.0001)

epochs = 25
# NOTE(review): validate() is called at the end of every epoch but is only
# defined in a later cell, so this cell fails on a fresh kernel unless that
# cell has been run first.
for n in range(epochs):
    # validate() switches the model to eval mode, so switch back to training
    # mode (dropout active) at the start of every epoch.
    encoder.train()
    epoch_loss = 0
    for sample_batched in tqdm(dataloader_train):
        encoder_optimizer.zero_grad()

        seq = sample_batched['vector']
        output, scores = encoder(seq)
        output = output.view(1, -1)
        # The label is stored one-hot; CrossEntropyLoss expects the class
        # index, as a tensor with one entry per sample.
        target = torch.argmax(sample_batched['category'][0]).unsqueeze(0)

        loss = criterion(output, target)
        loss.backward()
        encoder_optimizer.step()

        epoch_loss += loss.item()

    print("Average loss at epoch {}: {}".format(n, epoch_loss/len(dataloader_train)))
    validate()



100%|██████████| 50000/50000 [04:48<00:00, 173.60it/s]
  2%|▏         | 227/10000 [00:00<00:08, 1129.56it/s]

Average loss at epoch 0: 0.38486465228706596


100%|██████████| 10000/10000 [00:08<00:00, 1213.82it/s]
  0%|          | 17/50000 [00:00<05:02, 164.97it/s]

Accuracy: 0.8746


100%|██████████| 50000/50000 [04:52<00:00, 171.04it/s]
  2%|▏         | 242/10000 [00:00<00:08, 1212.04it/s]

Average loss at epoch 1: 0.3342824143177271


100%|██████████| 10000/10000 [00:08<00:00, 1225.24it/s]
  0%|          | 17/50000 [00:00<04:55, 169.16it/s]

Accuracy: 0.8872


100%|██████████| 50000/50000 [04:49<00:00, 172.67it/s]
  2%|▏         | 237/10000 [00:00<00:08, 1187.01it/s]

Average loss at epoch 2: 0.3216892431101203


100%|██████████| 10000/10000 [00:08<00:00, 1221.65it/s]
  0%|          | 20/50000 [00:00<04:13, 197.42it/s]

Accuracy: 0.8912


100%|██████████| 50000/50000 [04:49<00:00, 172.93it/s]
  1%|          | 117/10000 [00:00<00:08, 1168.34it/s]

Average loss at epoch 3: 0.31240498324751853


100%|██████████| 10000/10000 [00:08<00:00, 1212.93it/s]
  0%|          | 20/50000 [00:00<04:31, 184.20it/s]

Accuracy: 0.8935


100%|██████████| 50000/50000 [04:48<00:00, 173.11it/s]
  1%|▏         | 125/10000 [00:00<00:07, 1241.47it/s]

Average loss at epoch 4: 0.30377437455415723


100%|██████████| 10000/10000 [00:08<00:00, 1225.16it/s]
  0%|          | 17/50000 [00:00<05:10, 161.05it/s]

Accuracy: 0.8932


100%|██████████| 50000/50000 [04:49<00:00, 172.75it/s]
  2%|▏         | 245/10000 [00:00<00:07, 1230.98it/s]

Average loss at epoch 5: 0.2980559512296319


100%|██████████| 10000/10000 [00:08<00:00, 1209.85it/s]
  0%|          | 19/50000 [00:00<04:42, 176.99it/s]

Accuracy: 0.8913


100%|██████████| 50000/50000 [04:49<00:00, 172.57it/s]
  2%|▏         | 241/10000 [00:00<00:08, 1201.09it/s]

Average loss at epoch 6: 0.2907907663685083


100%|██████████| 10000/10000 [00:08<00:00, 1223.99it/s]
  0%|          | 16/50000 [00:00<05:42, 145.73it/s]

Accuracy: 0.8971


100%|██████████| 50000/50000 [04:49<00:00, 172.97it/s]
  1%|          | 119/10000 [00:00<00:08, 1176.92it/s]

Average loss at epoch 7: 0.2838299927330017


100%|██████████| 10000/10000 [00:08<00:00, 1221.12it/s]
  0%|          | 19/50000 [00:00<04:30, 184.80it/s]

Accuracy: 0.8972


100%|██████████| 50000/50000 [04:49<00:00, 172.57it/s]
  2%|▏         | 244/10000 [00:00<00:08, 1163.88it/s]

Average loss at epoch 8: 0.27891691815704106


100%|██████████| 10000/10000 [00:08<00:00, 1226.36it/s]
  0%|          | 17/50000 [00:00<05:05, 163.55it/s]

Accuracy: 0.8929


100%|██████████| 50000/50000 [04:49<00:00, 172.71it/s]
  2%|▏         | 240/10000 [00:00<00:08, 1186.50it/s]

Average loss at epoch 9: 0.27371164181411267


100%|██████████| 10000/10000 [00:08<00:00, 1217.44it/s]
  0%|          | 17/50000 [00:00<05:00, 166.47it/s]

Accuracy: 0.8877


100%|██████████| 50000/50000 [04:48<00:00, 173.53it/s]
  2%|▏         | 240/10000 [00:00<00:08, 1189.43it/s]

Average loss at epoch 10: 0.2676795582178235


100%|██████████| 10000/10000 [00:08<00:00, 1222.59it/s]
  0%|          | 14/50000 [00:00<06:10, 134.79it/s]

Accuracy: 0.9003


100%|██████████| 50000/50000 [04:49<00:00, 172.87it/s]
  2%|▏         | 234/10000 [00:00<00:08, 1168.79it/s]

Average loss at epoch 11: 0.26297302532494066


100%|██████████| 10000/10000 [00:08<00:00, 1227.62it/s]
  0%|          | 34/50000 [00:00<05:06, 163.04it/s]

Accuracy: 0.8978


100%|██████████| 50000/50000 [04:48<00:00, 173.05it/s]
  1%|          | 113/10000 [00:00<00:08, 1126.72it/s]

Average loss at epoch 12: 0.2575492799386382


100%|██████████| 10000/10000 [00:08<00:00, 1214.60it/s]
  0%|          | 33/50000 [00:00<05:03, 164.63it/s]

Accuracy: 0.8977


100%|██████████| 50000/50000 [04:50<00:00, 172.31it/s]
  1%|▏         | 128/10000 [00:00<00:07, 1268.60it/s]

Average loss at epoch 13: 0.25298970270633697


100%|██████████| 10000/10000 [00:08<00:00, 1214.69it/s]
  0%|          | 16/50000 [00:00<05:21, 155.53it/s]

Accuracy: 0.8974


100%|██████████| 50000/50000 [04:48<00:00, 173.06it/s]
  2%|▏         | 236/10000 [00:00<00:08, 1182.11it/s]

Average loss at epoch 14: 0.2477415892136097


100%|██████████| 10000/10000 [00:08<00:00, 1214.73it/s]
  0%|          | 18/50000 [00:00<05:02, 165.06it/s]

Accuracy: 0.8977


100%|██████████| 50000/50000 [04:49<00:00, 173.00it/s]
  2%|▏         | 246/10000 [00:00<00:08, 1200.18it/s]

Average loss at epoch 15: 0.2432125526916981


100%|██████████| 10000/10000 [00:08<00:00, 1222.71it/s]
  0%|          | 19/50000 [00:00<04:41, 177.60it/s]

Accuracy: 0.8988


100%|██████████| 50000/50000 [04:49<00:00, 172.73it/s]
  2%|▏         | 237/10000 [00:00<00:08, 1197.59it/s]

Average loss at epoch 16: 0.23933729111015797


100%|██████████| 10000/10000 [00:08<00:00, 1217.78it/s]
  0%|          | 36/50000 [00:00<04:38, 179.59it/s]

Accuracy: 0.8971


100%|██████████| 50000/50000 [04:49<00:00, 172.68it/s]
  2%|▏         | 238/10000 [00:00<00:08, 1193.67it/s]

Average loss at epoch 17: 0.23570399108827114


100%|██████████| 10000/10000 [00:08<00:00, 1227.11it/s]
  0%|          | 18/50000 [00:00<05:00, 166.35it/s]

Accuracy: 0.8984


100%|██████████| 50000/50000 [04:49<00:00, 172.66it/s]
  2%|▎         | 250/10000 [00:00<00:08, 1188.15it/s]

Average loss at epoch 18: 0.23220809894174338


100%|██████████| 10000/10000 [00:08<00:00, 1211.16it/s]
  0%|          | 37/50000 [00:00<04:32, 183.66it/s]

Accuracy: 0.8934


100%|██████████| 50000/50000 [04:49<00:00, 172.49it/s]
  2%|▏         | 238/10000 [00:00<00:08, 1187.62it/s]

Average loss at epoch 19: 0.22846238501667976


100%|██████████| 10000/10000 [00:08<00:00, 1218.22it/s]
  0%|          | 17/50000 [00:00<05:06, 163.31it/s]

Accuracy: 0.8933


100%|██████████| 50000/50000 [04:49<00:00, 172.81it/s]
  2%|▏         | 245/10000 [00:00<00:07, 1224.75it/s]

Average loss at epoch 20: 0.22578331797748805


100%|██████████| 10000/10000 [00:08<00:00, 1228.72it/s]
  0%|          | 20/50000 [00:00<04:18, 193.34it/s]

Accuracy: 0.8896


100%|██████████| 50000/50000 [04:50<00:00, 172.25it/s]
  2%|▏         | 240/10000 [00:00<00:08, 1197.21it/s]

Average loss at epoch 21: 0.22236103401988744


100%|██████████| 10000/10000 [00:08<00:00, 1222.56it/s]
  0%|          | 16/50000 [00:00<05:22, 155.03it/s]

Accuracy: 0.8911


100%|██████████| 50000/50000 [04:49<00:00, 172.48it/s]
  1%|▏         | 126/10000 [00:00<00:07, 1257.21it/s]

Average loss at epoch 22: 0.2198282718834281


100%|██████████| 10000/10000 [00:08<00:00, 1216.35it/s]
  0%|          | 33/50000 [00:00<05:08, 162.14it/s]

Accuracy: 0.8947


100%|██████████| 50000/50000 [04:49<00:00, 172.58it/s]
  2%|▏         | 248/10000 [00:00<00:08, 1192.15it/s]

Average loss at epoch 23: 0.2171263903465867


100%|██████████| 10000/10000 [00:08<00:00, 1214.33it/s]
  0%|          | 17/50000 [00:00<04:56, 168.85it/s]

Accuracy: 0.8969


100%|██████████| 50000/50000 [04:49<00:00, 172.52it/s]
  2%|▏         | 248/10000 [00:00<00:07, 1242.01it/s]

Average loss at epoch 24: 0.2160248425680399


100%|██████████| 10000/10000 [00:08<00:00, 1219.72it/s]

Accuracy: 0.8956





In [None]:
# Accuracy of the trained model on the training split.
encoder.eval()  # evaluation mode: dropout is disabled
total_correct = 0
total = 0
with torch.no_grad():
    for sample_batched in tqdm(dataloader_train):
        output, scores = encoder(sample_batched['vector'])
        # Compare predicted and true class per sample (row-wise argmax) and
        # count samples, not batches, so the figure is correct for any
        # batch size.
        predictions = torch.argmax(output, dim=1)
        labels = torch.argmax(sample_batched['category'], dim=1)
        total_correct += (predictions == labels).sum().item()
        total += labels.size(0)
print("Accuracy: {}".format(total_correct / total))

In [None]:
def validate(dataloader=None):
    """Print and return the classification accuracy of `encoder`.

    Switches the module-level `encoder` to evaluation mode and leaves it in
    that mode, so callers that continue training must call `encoder.train()`
    again.

    Args:
        dataloader: Iterable of batches, each a dict with a 'vector' entry
            (model input) and a 'category' entry (one-hot labels, one row per
            sample). Defaults to the module-level `dataloader_dev`.

    Returns:
        The fraction of samples whose predicted class matches the label.
    """
    if dataloader is None:
        dataloader = dataloader_dev

    encoder.eval()
    total_correct = 0
    total = 0
    with torch.no_grad():
        for sample_batched in tqdm(dataloader):
            output, scores = encoder(sample_batched['vector'])
            # Row-wise argmax and a per-sample count keep the accuracy
            # correct for any batch size, not only batch_size=1.
            predictions = torch.argmax(output, dim=1)
            labels = torch.argmax(sample_batched['category'], dim=1)
            total_correct += (predictions == labels).sum().item()
            total += labels.size(0)

    accuracy = total_correct / total
    print("Accuracy: {}".format(accuracy))
    return accuracy