# Preprocessing

In [None]:
# from google.colab import files

In [None]:
# uploaded = files.upload()

In [None]:
# Third-party libraries used throughout the notebook.
import numpy as np
import pandas as pd
import json
import torch
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import brown
# Download the 'punkt' NLTK package (presumably what the tokenizers imported
# above rely on -- confirm against the NLTK docs).
nltk.download('punkt')
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
# Seed the NumPy and PyTorch global random number generators with a fixed value.
np.random.seed(0)
torch.manual_seed(0)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


<torch._C.Generator at 0x7ff1392908f0>

In [None]:
# Download the Brown corpus; its sentences are read via `brown.sents()` in the
# next cell. NOTE(review): that cell repeats this same download call.
nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [None]:
# Make sure the Brown corpus is present before reading it.
nltk.download('brown')
# Every Brown sentence, each as a list of tokens.
data = list(brown.sents())
# Re-join each token list into one space-separated string.
new_data = [' '.join(tokens) for tokens in data]

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [None]:
# Wrap each sentence string as a TaggedDocument: lower-cased word tokens,
# tagged with the sentence's position in `new_data` (as a string).
tagged_data = [
    TaggedDocument(words=word_tokenize(sentence.lower()), tags=[str(doc_id)])
    for doc_id, sentence in enumerate(new_data)
]

In [None]:
# Doc2Vec hyper-parameters. `vec_size` is also read later by YelpDataset to
# size its zero-padding vectors.
max_epochs = 10
vec_size = 256
alpha = 0.025

# dm=1 selects the distributed-memory (PV-DM) training algorithm;
# min_count=1 keeps every word in the vocabulary, however rare.
doc2vec_model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,
                dm=1)

doc2vec_model.build_vocab(tagged_data)

# Train with a single call and let gensim decay the learning rate from `alpha`
# to `min_alpha` across all epochs. The previous hand-written loop called
# train() once per outer iteration while editing alpha/min_alpha by hand, which
# the gensim documentation advises against, and it read `doc2vec_model.iter`,
# an attribute that no longer exists in gensim 4 (it was renamed to `epochs`).
# `epochs` is passed here rather than to the constructor so the model's default
# epoch count, which `infer_vector` falls back to, is left untouched.
doc2vec_model.train(tagged_data,
            total_examples=doc2vec_model.corpus_count,
            epochs=max_epochs)

doc2vec_model.save("d2v.model")
print("Model Saved")

iteration 0




Model Saved


In [None]:
class YelpDataset(Dataset):
    """Yelp reviews as sequences of per-sentence Doc2Vec vectors with one-hot labels.

    Each item is a dict with two entries:
        'vector':   float32 tensor of shape (num_sentences, vec_size), one
                    inferred Doc2Vec vector (or zero-padding row) per sentence.
        'category': tensor of shape (2,), one-hot: [0, 1] when the record's
                    'category' equals 1, otherwise [1, 0].

    Relies on the notebook-level names ``doc2vec_model`` (a trained Doc2Vec
    model) and ``vec_size`` (its vector dimensionality).
    """

    def __init__(self, file_name, append_zeros = True, max_sentences = 20):
        """
        Args:
            file_name: The JSON-lines file to make the dataset from. Each
                record must provide a 'text' and a 'category' field.
            append_zeros: When True, pad every review with zero vectors up to
                ``max_sentences`` rows. When False, keep only the vectors of
                the sentences actually present.
            max_sentences: Maximum number of leading sentences embedded per
                review (and the padded length when ``append_zeros`` is True).
        """
        self.df = pd.read_json(file_name, lines=True)

        tensors = []
        binary_cat = []

        # Create target class and document vectors for each review.
        for category, text in zip(self.df['category'], self.df['text']):
            single_class = np.zeros(2)
            single_class[1 if category == 1 else 0] = 1
            binary_cat.append(torch.tensor(single_class))

            sentences = sent_tokenize(text)[:max_sentences]
            # Lower-case before inference: the Doc2Vec model was trained on
            # lower-cased tokens, so cased tokens would not match its vocabulary.
            sent_vecs = [
                np.asarray(doc2vec_model.infer_vector(word_tokenize(sentence.lower())))
                for sentence in sentences
            ]
            if append_zeros:
                missing = max_sentences - len(sent_vecs)
                sent_vecs.extend(np.zeros(vec_size) for _ in range(missing))
            if not sent_vecs:
                # A review with no sentences would otherwise become a
                # shape-(0,) tensor that a recurrent encoder cannot consume;
                # represent it as a single zero vector instead.
                sent_vecs.append(np.zeros(vec_size))
            # Stack first: building a tensor from a list of arrays is slow.
            tensors.append(torch.tensor(np.stack(sent_vecs), dtype=torch.float32))

        self.df['category'] = binary_cat
        self.df['vector'] = tensors
        del self.df['text']


    def __len__(self):
        """Number of reviews in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return {'vector': ..., 'category': ...} for the review(s) at ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Look columns up by name, not position, so extra or re-ordered
        # columns in the source file cannot swap or corrupt the fields.
        category = self.df['category'].iloc[idx]
        vector = self.df['vector'].iloc[idx]
        sample = {'vector': vector, 'category': category}

        return sample

In [None]:
# Build the three splits WITHOUT zero-padding: each review keeps only the
# vectors of the sentences it actually contains.
dataset_train = YelpDataset('dataset/dataset_train.json', append_zeros=False)
dataset_dev = YelpDataset('dataset/dataset_dev.json', append_zeros=False)
dataset_test = YelpDataset('dataset/dataset_test.json', append_zeros=False)

In [None]:
# One review per batch, shuffled, loaded in the main process.
# (Presumably batch_size=1 because the unpadded reviews differ in length and
# so cannot be stacked into one batch tensor -- confirm.)
dataloader_train = DataLoader(
    dataset_train,
    shuffle=True,
    batch_size=1,
    num_workers=0,
)
dataloader_dev = DataLoader(
    dataset_dev,
    shuffle=True,
    batch_size=1,
    num_workers=0,
)
dataloader_test = DataLoader(
    dataset_test,
    shuffle=True,
    batch_size=1,
    num_workers=0,
)

# Updated Model

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Select the GPU when CUDA is available, otherwise the CPU.
# NOTE(review): none of the visible cells move the models or batches to
# `device`, so as shown everything runs on the CPU -- confirm this is intended.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class EncoderRNN(nn.Module):
    """Bidirectional, batch-first GRU encoder over a sequence of input vectors."""

    def __init__(self, input_size, hidden_size):
        """
        Args:
            input_size: Dimensionality of each input vector in the sequence.
            hidden_size: Hidden-state size of the GRU, per direction.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(
            input_size,
            hidden_size,
            bidirectional=True,
            batch_first=True,
        )

    def forward(self, inputs):
        """Run the GRU over `inputs`; return its (output, hidden) pair as-is."""
        return self.gru(inputs)

In [None]:
class BinaryClassifier(nn.Module):
    """Small MLP head mapping a 2*input_size feature vector to 2 class scores.

    Layout: Linear(2*input_size -> 10), Tanh, Linear(10 -> 2), Tanh.

    NOTE(review): the trailing Tanh bounds both scores to [-1, 1]. The training
    cell feeds these scores to nn.CrossEntropyLoss, so the loss cannot approach
    zero; consider whether the final activation is wanted.
    """

    def __init__(self, input_size):
        """
        Args:
            input_size: Per-direction hidden size of the upstream encoder; the
                first linear layer takes twice this many features.
        """
        super().__init__()
        self.input_size = input_size

        layers = [
            nn.Linear(input_size * 2, 10),
            nn.Tanh(),
            nn.Linear(10, 2),
            nn.Tanh(),
        ]
        self.fcn = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to `x` and return the two (tanh-squashed) scores."""
        return self.fcn(x)

In [None]:
from tqdm import tqdm

# The encoder consumes one Doc2Vec vector (vec_size wide) per sentence. The
# classifier takes the concatenated forward/backward GRU features
# (2 * 32, handled inside BinaryClassifier).
encoder = EncoderRNN(vec_size, 32)
classifier = BinaryClassifier(32)

criterion = nn.CrossEntropyLoss()

encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.001)
classifier_optimizer = optim.Adam(classifier.parameters(), lr=0.001)

epochs = 10
for n in range(epochs):
    epoch_loss = 0.0
    for sample_batched in tqdm(dataloader_train):
        encoder.zero_grad()
        classifier.zero_grad()

        seq = sample_batched['vector']
        output, hidden = encoder(seq)
        # Features at the last time step of the single review in the batch.
        # NOTE(review): for a bidirectional GRU, the backward half of this
        # slice has only processed the final sentence; `hidden` holds the
        # final state of both directions -- confirm which one is intended.
        output = output[0][-1]
        output = classifier(output).view(1, -1)
        # The one-hot label becomes a class index of shape (1,).
        target = torch.argmax(sample_batched['category'][0]).view(1)
        loss = criterion(output, target)
        epoch_loss += loss.item()
        loss.backward()

        encoder_optimizer.step()
        classifier_optimizer.step()

    # Report every epoch, including epoch 0 (previously skipped by `if n:`).
    print("Average loss at epoch {}: {}".format(n, epoch_loss/len(dataloader_train)))




100%|██████████| 50000/50000 [04:08<00:00, 201.07it/s]
100%|██████████| 50000/50000 [04:23<00:00, 189.47it/s]
  0%|          | 16/50000 [00:00<05:15, 158.42it/s]

Average loss at epoch 1: 0.42830004631757734


100%|██████████| 50000/50000 [04:23<00:00, 189.85it/s]
  0%|          | 22/50000 [00:00<03:53, 213.99it/s]

Average loss at epoch 2: 0.3935349872377515


100%|██████████| 50000/50000 [04:23<00:00, 190.06it/s]
  0%|          | 19/50000 [00:00<04:36, 180.97it/s]

Average loss at epoch 3: 0.36263542630195617


100%|██████████| 50000/50000 [04:23<00:00, 189.57it/s]
  0%|          | 38/50000 [00:00<04:24, 189.12it/s]

Average loss at epoch 4: 0.33688908943653106


100%|██████████| 50000/50000 [04:22<00:00, 190.23it/s]
  0%|          | 22/50000 [00:00<03:48, 218.95it/s]

Average loss at epoch 5: 0.3169146253216267


100%|██████████| 50000/50000 [04:23<00:00, 189.83it/s]
  0%|          | 20/50000 [00:00<04:18, 193.14it/s]

Average loss at epoch 6: 0.2982011951285601


100%|██████████| 50000/50000 [04:23<00:00, 189.80it/s]
  0%|          | 37/50000 [00:00<04:36, 180.59it/s]

Average loss at epoch 7: 0.28601427686482667


100%|██████████| 50000/50000 [04:23<00:00, 189.88it/s]
  0%|          | 21/50000 [00:00<04:07, 201.62it/s]

Average loss at epoch 8: 0.27688027049601077


100%|██████████| 50000/50000 [04:03<00:00, 205.57it/s]

Average loss at epoch 9: 0.2671962743285298





In [None]:
# Accuracy on the training split.
total_correct = 0
# Inference only: disable autograd so no graph is built for each sample.
with torch.no_grad():
    for sample_batched in tqdm(dataloader_train):
        output, hidden = encoder(sample_batched['vector'])

        # Same feature slice as in training: last time step of the single
        # review in the batch.
        output = output[0][-1]
        output = classifier(output)
        classification = torch.argmax(output)
        arg_max = torch.argmax(sample_batched['category'][0])
        if classification.item() == arg_max.item():
            total_correct += 1
# The loader uses batch_size=1, so its length equals the number of reviews.
print("Accuracy: {}".format(total_correct/len(dataloader_train)))

100%|██████████| 50000/50000 [01:13<00:00, 683.44it/s]

Accuracy: 0.93812





In [None]:
# Accuracy on the dev split.
total_correct = 0
# Inference only: disable autograd so no graph is built for each sample.
with torch.no_grad():
    for sample_batched in tqdm(dataloader_dev):
        output, hidden = encoder(sample_batched['vector'])
        # Same feature slice as in training: last time step of the single
        # review in the batch.
        output = output[0][-1]
        output = classifier(output)
        classification = torch.argmax(output)
        arg_max = torch.argmax(sample_batched['category'][0])
        if classification.item() == arg_max.item():
            total_correct += 1
# The loader uses batch_size=1, so its length equals the number of reviews.
print("Accuracy: {}".format(total_correct/len(dataloader_dev)))

100%|██████████| 10000/10000 [00:14<00:00, 683.82it/s]

Accuracy: 0.7971



