In [None]:
import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification, BertModel
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

import torch.optim as optim

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import logging
logging.basicConfig(level=logging.INFO)

In [None]:
def load_jsonl(path):
    """Read a JSON-lines dataset and return its texts and label tensors.

    Every non-blank line of ``path`` is parsed as one JSON object. Its
    ``'text'`` value is collected unchanged and its ``'category'`` value is
    wrapped in a one-element tensor (``torch.tensor([category])``).

    Args:
        path: Location of the JSON-lines file.

    Returns:
        A ``(texts, labels)`` pair of equal-length lists.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``'text'`` or ``'category'``.
    """
    texts, labels = [], []
    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            record = json.loads(line)
            texts.append(record['text'])
            labels.append(torch.tensor([record['category']]))
    return texts, labels


train_texts, train_labels = load_jsonl('dataset/dataset_train.json')
dev_texts, dev_labels = load_jsonl('dataset/dataset_dev.json')

In [None]:
# Pretrained uncased BERT: the tokenizer maps text to vocabulary ids, and the
# model is configured to return the hidden states of every layer.
BERT_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_NAME)
model = BertModel.from_pretrained(BERT_NAME, output_hidden_states=True)
# The model is only used for feature extraction, so put it in evaluation mode.
model.eval()

MAX_SEQ_LEN = 128
# Vocabulary ids of the tokenizer's padding and unknown tokens.
PAD_INDEX, UNK_INDEX = (
    tokenizer.convert_tokens_to_ids(special_token)
    for special_token in (tokenizer.pad_token, tokenizer.unk_token)
)

In [None]:
def embed_reviews(texts):
    """Turn each review into a ``(1, num_sentences, hidden)`` tensor of sentence embeddings.

    A review is split with ``sent_tokenize``; every sentence is wrapped in
    ``[CLS]`` / ``[SEP]``, run through the module-level ``model`` and
    represented by the mean, over its tokens, of the second-to-last entry of
    the hidden states the model returns.

    Args:
        texts: Iterable of review strings.

    Returns:
        A list with one tensor per review, its sentence embeddings
        concatenated along dimension 1.
    """
    # Longest token sequence the model's position embeddings can address.
    max_tokens = model.config.max_position_embeddings

    encoded_reviews = []
    with torch.no_grad():
        for review in tqdm(texts):
            # Fall back to the raw review when no sentence is detected, so the
            # torch.cat below never receives an empty list.
            sents = sent_tokenize(review) or [review]
            embeddings = []
            for sent in sents:
                marked_text = "[CLS] " + sent + " [SEP]"
                tokenized_text = tokenizer.tokenize(marked_text)
                if len(tokenized_text) > max_tokens:
                    # Truncate over-long sentences but keep the closing [SEP];
                    # without this the forward pass fails on such input.
                    tokenized_text = tokenized_text[:max_tokens - 1] + tokenized_text[-1:]
                indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

                tokens_tensor = torch.tensor([indexed_tokens])
                # The original call passed an all-ones tensor as the second
                # positional argument. In `transformers` that slot is
                # `attention_mask` (token_type_ids keep their default), so the
                # keyword form below is numerically identical but explicit.
                # NOTE(review): confirm against the installed transformers version.
                attention_mask = torch.ones_like(tokens_tensor)

                outputs = model(input_ids=tokens_tensor, attention_mask=attention_mask)
                hidden_states = outputs[2]
                # Second-to-last layer, single batch element -> (tokens, hidden).
                token_embeddings = hidden_states[-2][0]

                sentence_embedding = torch.mean(token_embeddings, dim=0).view(1, 1, -1)
                embeddings.append(sentence_embedding)
            encoded_reviews.append(torch.cat(embeddings, dim=1))
    return encoded_reviews


train_data = embed_reviews(train_texts)
dev_data = embed_reviews(dev_texts)

100%|██████████| 50000/50000 [7:00:53<00:00,  1.98it/s]   
100%|██████████| 10000/10000 [1:24:23<00:00,  1.97it/s]


In [None]:
class EncoderRNN(nn.Module):
    """Thin wrapper around a bidirectional, batch-first ``nn.GRU``."""

    def __init__(self, input_size, hidden_size):
        """Create the recurrent layer.

        Args:
            input_size: Number of features in each input time step.
            hidden_size: Number of hidden units per direction.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
            bidirectional=True,
        )

    def forward(self, inputs):
        """Feed ``inputs`` to the GRU and return its ``(output, hidden)`` pair."""
        return self.gru(inputs)

In [None]:
class BinaryClassifier(nn.Module):
    """Two-layer feed-forward head that returns unnormalized class scores (logits)."""

    def __init__(self, input_size, hidden_units=10, num_classes=2):
        """Build the classification head.

        Args:
            input_size: Per-direction feature size; the first linear layer
                takes ``2 * input_size`` features.
            hidden_units: Width of the hidden layer (default 10).
            num_classes: Number of output scores (default 2).
        """
        super().__init__()

        self.input_size = input_size

        # No activation after the last Linear: nn.CrossEntropyLoss expects raw
        # logits, and squashing them to [-1, 1] with Tanh bounds the loss away
        # from zero. Tanh is monotonic, so arg-max predictions are unaffected,
        # and the layer indices (0 and 2) keep the state_dict keys unchanged.
        self.fcn = nn.Sequential(
            nn.Linear(2 * input_size, hidden_units),
            nn.Tanh(),
            nn.Linear(hidden_units, num_classes),
        )

    def forward(self, x):
        """Return the logits for feature tensor ``x`` (last dim ``2 * input_size``)."""
        return self.fcn(x)

In [None]:
# Sentence-level GRU encoder over the per-review embedding sequences, plus the
# classification head that consumes its 2 * 32 output features.
encoder = EncoderRNN(768, 32)
classifier = BinaryClassifier(32)

criterion = nn.CrossEntropyLoss()

encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.001)
classifier_optimizer = optim.Adam(classifier.parameters(), lr=0.001)

idx = np.arange(len(train_data))

epochs = 10
for n in range(epochs):
    # Draw a fresh visiting order every epoch; shuffling only once would
    # replay the exact same sample sequence in all epochs.
    np.random.shuffle(idx)

    epoch_loss = 0
    for i in tqdm(idx):
        x, y = train_data[i], train_labels[i]

        encoder_optimizer.zero_grad()
        classifier_optimizer.zero_grad()

        output, _ = encoder(x)
        # Features at the last sequence position of the single review in x.
        # NOTE(review): with a bidirectional GRU the reverse-direction half of
        # this vector has only processed the final position; using the returned
        # final hidden states may work better, but the evaluation cells would
        # have to change in step.
        features = output[0][-1]

        logits = classifier(features).view(1, -1)
        loss = criterion(logits, y)
        epoch_loss += loss.item()
        loss.backward()

        encoder_optimizer.step()
        classifier_optimizer.step()

    print("Average loss at epoch {}: {}".format(n, epoch_loss/len(train_data)))




100%|██████████| 50000/50000 [04:14<00:00, 196.43it/s]
  0%|          | 23/50000 [00:00<03:40, 226.32it/s]

Average loss at epoch 0: 0.2929267506468296


100%|██████████| 50000/50000 [04:04<00:00, 204.40it/s]
  0%|          | 21/50000 [00:00<04:01, 206.72it/s]

Average loss at epoch 1: 0.2724859392657876


100%|██████████| 50000/50000 [04:12<00:00, 198.29it/s]
  0%|          | 19/50000 [00:00<04:35, 181.37it/s]

Average loss at epoch 2: 0.268146974298954


100%|██████████| 50000/50000 [04:49<00:00, 172.94it/s]
  0%|          | 14/50000 [00:00<06:00, 138.65it/s]

Average loss at epoch 3: 0.26612275010883807


100%|██████████| 50000/50000 [05:21<00:00, 155.63it/s]
  0%|          | 18/50000 [00:00<04:42, 176.78it/s]

Average loss at epoch 4: 0.2634788509759307


100%|██████████| 50000/50000 [05:05<00:00, 163.74it/s]
  0%|          | 13/50000 [00:00<06:29, 128.20it/s]

Average loss at epoch 5: 0.259367814719975


100%|██████████| 50000/50000 [05:33<00:00, 150.05it/s]
  0%|          | 15/50000 [00:00<05:40, 146.87it/s]

Average loss at epoch 6: 0.26084127724915745


100%|██████████| 50000/50000 [05:34<00:00, 149.40it/s]
  0%|          | 16/50000 [00:00<05:13, 159.54it/s]

Average loss at epoch 7: 0.255678585421741


100%|██████████| 50000/50000 [05:34<00:00, 149.33it/s]
  0%|          | 30/50000 [00:00<05:41, 146.14it/s]

Average loss at epoch 8: 0.25458301654368637


100%|██████████| 50000/50000 [05:35<00:00, 149.00it/s]

Average loss at epoch 9: 0.2553473915401101





In [None]:
# Training-set accuracy: share of reviews whose arg-max class equals the label.
encoder.eval()
classifier.eval()
with torch.no_grad():
    total_correct = 0
    for x, y in zip(train_data, train_labels):
        output, _ = encoder(x)
        # Same feature choice as in training: last sequence position of the review.
        scores = classifier(output[0][-1])
        if torch.argmax(scores).item() == y.item():
            total_correct += 1
print("Accuracy: {}".format(total_correct/len(train_data)))

Accuracy: 0.92916


In [None]:
# Dev-set accuracy: share of reviews whose arg-max class equals the label.
encoder.eval()
classifier.eval()
with torch.no_grad():
    total_correct = 0
    for x, y in zip(dev_data, dev_labels):
        output, _ = encoder(x)
        # Same feature choice as in training: last sequence position of the review.
        scores = classifier(output[0][-1])
        if torch.argmax(scores).item() == y.item():
            total_correct += 1
print("Accuracy: {}".format(total_correct/len(dev_data)))

Accuracy: 0.9149


In [None]:
import pickle

# Persist the cached sentence embeddings together with their labels, then the
# trained encoder and classifier weights, in that order.
artifacts = (
    ({'train_data': train_data, 'train_labels': train_labels}, 'train_data.pt'),
    ({'dev_data': dev_data, 'dev_labels': dev_labels}, 'dev_data.pt'),
    (encoder.state_dict(), 'encoder_with_BERT'),
    (classifier.state_dict(), 'classifier_with_BERT'),
)
for payload, destination in artifacts:
    torch.save(payload, destination)