In [1]:
!pip install torchtext



In [2]:
import nltk
from nltk.tokenize import wordpunct_tokenize
from sklearn.model_selection import KFold
import numpy as np

import copy
import time
import re
import math
import random

import torch
from torch import nn, Tensor
from torch.optim import SGD
from torch.nn import TransformerEncoder, TransformerEncoderLayer, CrossEntropyLoss, Softmax
from torch.utils.data import DataLoader, Subset, random_split
import torchtext
from torch.nn.utils.rnn import pad_sequence
print("Imported all libraries succesfully")

Imported all libraries succesfully


In [3]:
# NEURAL NETWORK CLASSES
"""
Tutorial for pytorch followed from: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
Accessed 10 December 2021
"""

class TransformerModel(nn.Module):
    """ Transformer 
        n_embeds: Number of embeddings/vocabulary
        n_classes: Number of classes
        d_model: Dimension of the model
        n_head: Number of attention heads
        d_hid: Dimension of the hidden layer
        n_layers: Number of encoder layers
        dropout: For training to zero elements randomly and prevent overfitting
    """
    def __init__(self, n_embeds, n_classes, d_model, n_head, d_hid, n_layers, dropout):
        super().__init__()
#         # Set model type
#         self.model_type = 'Transformer'
        # Get positional encoder
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        
        # Get transformer encoder layers
        encoder_layers = TransformerEncoderLayer(d_model, n_head, d_hid, dropout)
        
        # Get transformer encoder
        self.transformer_encoder = TransformerEncoder(encoder_layers, n_layers)
        
        # Embedding number and model
        self.encoder = nn.Embedding(n_embeds, d_model)
        
        # Set model for transformer
        self.d_model = d_model
        
        # Set decoder
        self.decoder = nn.Linear(d_model, n_classes)

    # Initialize weights
    def set_weights(self):
        if isinstance(self, nn.Linear):
            # Fill the tensor with uniform distribution values from Glorot initialization
            nn.init.xavier_uniform(self.weight)
            # Add a bias to those weights
            self.bias.data.fill(0.005)

    def forward(self, src):
        src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        output = output.mean(dim=0)
        output = self.decoder(output)
        return output
    
class PositionalEncoding(nn.Module):
    """ Positional Encoding """
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)
    
class Collate(object):
    """ Class to compute batching """
    def __init__(self, vocab):
        self.vocab = vocab
        
    def __call__(self, batch):
        batch_texts = list()
        class_labels = list()
        for (t, l) in batch:
            processed_text = torch.tensor(self.pipeline(t, self.vocab), dtype=torch.int64)
            batch_texts.append(processed_text)
            class_labels.append(l)
        
        batch_texts = pad_sequence(batch_texts, batch_first=False, padding_value=self.vocab["<pad>"])
        class_labels = torch.tensor(class_labels, dtype=torch.int64)
        return batch_texts, class_labels

    def pipeline(self, text, vocab):
        tokenized_text = [vocab[word] for word in wordpunct_tokenize(text)]
        enclosed_text = [vocab["<bos>"], *tokenized_text, vocab["<eos>"]]
        return enclosed_text

print("Succesfully created Transformer Model class")

Succesfully created Transformer Model class


In [4]:
# TRANSFORMER COMPUTATION FUNCTIONS
def batch_iter(model, batch, correct, loss, criterion):
    sample = batch[0].to(device)
    index = batch[1].to(device)
    
    # Process the data through the transformer 
    classifications = model(sample)

    # Check how many correct results we get to get accuracy
    output = torch.softmax(classifications, dim=1)
    for i in range(len(index)):
        if output[i].argmax() == index[i]:
            correct += 1
    
    # Calculate loss function for specific batch
    cs_loss = criterion(input=classifications, target=index)
    loss += float(sample.shape[1] * cs_loss.item())
    
    return correct, loss, cs_loss

def compute(model, dataloader, correct, loss, criterion, stochastic_gradient_descent, gradient_calculation):
    # Disable or not gradient calculation to reduce memory consumption
    if gradient_calculation:
        # Go through each batch in the dataloader of train
        for batch in dataloader:
            stochastic_gradient_descent.zero_grad()
            correct, loss, cs_loss = batch_iter(model, batch, correct, loss, criterion)
            # Apply optimization and update weights based on the loss function
            cs_loss.backward()
            stochastic_gradient_descent.step()
    else:
        with torch.no_grad():
            for batch in dataloader:
                correct, loss, cs_loss = batch_iter(model, batch, correct, loss, criterion)
        
    return correct, loss

def trainTransformerModel(transformer_model, epochs_number, data, vocab):
    best_list = list(())
    """
    lr - learning rate for the model in each SGD step
    momentum - helps the gradient vectors in each step by smoothing it out
    weight_decay - to penalyse the transformer training
    """
    stochastic_gradient_descent = SGD(transformer_model.parameters(), lr=0.01, momentum=0.95, weight_decay=0.001)
    criterion = CrossEntropyLoss()
    
    # K-FOLD CROSS VALIDATION on the data
    kf = KFold(n_splits=5, shuffle=True)
    
    for i, (train_index, eval_index) in enumerate(kf.split(data)):
        print("------")
        print("Fold : " + str(i+1))
        print("------")
        
        init_time = time.time()
        tr_performance = list()
        eval_performance = list()
        best_transformer = None
        best_performance = (float("-inf"), float("inf"))
        best_epoch = 0
        
        # Reset the weights of the model
        transformer_model.set_weights()
        learn_collate = Collate(vocab)
        
        train_subset = Subset(data, train_index)
        eval_subset = Subset(data, eval_index)
        
        # Load respective data into dataloaders by batches to train and evaluate
        train_dataloader = DataLoader(train_subset, batch_size=50, collate_fn=learn_collate, shuffle=True)
        eval_dataloader = DataLoader(eval_subset, batch_size=50, collate_fn=learn_collate, shuffle=True)
        
        for epoch in range(epochs_number):
            start_time = time.time()
            
            #TRAINING
            # -- Change pytorch transformer to training mode --
            tr_correct = 0
            tr_loss = 0
            transformer_model.train()

            # Compute training
            tr_correct, tr_loss = compute(transformer_model, train_dataloader, tr_correct, tr_loss, criterion, stochastic_gradient_descent, True)

            tr_correct = tr_correct/len(train_index)
            tr_loss = tr_loss/len(train_index)
            tr_performance.append((tr_correct, tr_loss))

            # EVALUATION
            # -- Change pytorch transformer to evaluation mode --
            eval_correct = 0
            eval_loss = 0
            transformer_model.eval()

            # Compute evaluation
            eval_correct, eval_loss = compute(transformer_model, eval_dataloader, eval_correct, eval_loss, criterion, _, False)

            eval_correct = eval_correct/len(eval_index)
            eval_loss = eval_loss/len(eval_index)
            eval_performance.append((eval_correct, eval_loss))

            # FINDING BEST MODEL
            # -- Evaluate epoch by loss function --
            if ((best_performance[1] > eval_performance[epoch][1]) or ((best_performance[1] == eval_performance[epoch][1]) and (best_performance[0] < eval_performance[epoch][0]))):
                best_performance = (eval_performance[epoch][0], eval_performance[epoch][1])
                best_transformer = copy.deepcopy(transformer_model)
                best_epoch = epoch

            print("  For epoch : " + str(epoch+1) + " | Time : " + str(round(time.time() - start_time,2)) + " seconds")
            print("    Train   : performance= " + str(round(tr_performance[epoch][0],2)) + " | loss= " + str(round(tr_performance[epoch][1],2)))
            print("   Evaluate : performance= " + str(round(eval_performance[epoch][0],2)) + " | loss= " + str(round(eval_performance[epoch][1],2)))
            print("-----------------------------------------------------")

        print("-------------------------------------------------")
        print(" Best epoch : " + str(best_epoch+1) + " | Total Time : " + str(round(time.time() - init_time,2)))
        print(" With evaluation : performance= " + str(round(best_performance[0],2)) + " | loss= " + str(round(best_performance[1],2)))
    
        # Add best transformer from each fold and their performance (accuracy, loss)
        best_list.append((best_transformer, best_performance))
    
    return best_list

In [5]:
# DATA PREPROCESSING AND CLEANING
# Gettin the datasets from the reviews
path = "product_reviews/"
products = ["Canon_PowerShot_SD500", "Canon_S100", "Diaper_Champ", "Hitachi_router", "ipod", "Linksys_Router", "MicroMP3", "Nokia_6600", "norton"]

# RegEx for the positive and negative sentences
reviews = list()
re_review = re.compile(r"##(.*)$")
re_sentiment = re.compile(r"((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)(?:\d|))\]")
for p in products:
    f = open(path+p+'.txt', 'r')
    for line in f:
        line = line.strip()
        
        # Check if it is a review tag
        if line.strip() == "[t]":
            continue
            
        features = re.findall(re_sentiment, line)
        # Check if there is any sentiment on the line
        if len(features) == 0:
            continue
            
        review = re.findall(re_review, line)
        review = [r.lower() for r in review]
        # Check that the review is properly read
        if len(review) != 1:
            # Some reviews (6, in MicroMP3) are not split properly with a ##
            continue
        
        score = 0
        # Get score for a review
        for i in features:
            try:
                score += int(i[1])
            except ValueError:
                # Some reviews don't have a number, just a '+' or '-'
                if i[1] == "-":
                    score -= 1
                else:
                    score += 1
        
        # Reviews with a 0 score are also considered as positive
        if score >= 0:
            reviews.append((review[0], 1))
        else:
            reviews.append((review[0], 0))

print("Succesfully extracted reviews as positive and negative as tuples in a list")

print("Found " + str(len(reviews)) + " reviews")


Succesfully extracted reviews as positive and negative as tuples in a list
Found 2101 reviews


In [6]:
 # SPLITTING DATA in learning (for training and evaluation with k-fold cross) and testing
test_size = int(0.1 * len(reviews)) #HYPERPARAMETER
learn_size = len(reviews) - test_size
learn_data, test_data = random_split(reviews, [learn_size, test_size])

print("Using " + str(len(learn_data)) + " for learning")
print("Using " + str(len(test_data)) + " for testing")

# Tokenize dataset
learn_tokens = [wordpunct_tokenize(sentence) for (sentence, _) in learn_data]

# Using torchtext from pytorch, create a vocabulary for the dataset
learn_vocab = torchtext.vocab.build_vocab_from_iterator(learn_tokens, specials=["<unk>", "<pad>", "<bos>", "<eos>"])
learn_vocab.set_default_index(learn_vocab["<unk>"])

Using 1891 for learning
Using 210 for testing


In [7]:
# LEARNING (TRAINING AND EVALUATING)
# GPU CUDA is a lot faster than CPU
# If CPU, will run less epochs but will result in a much lower performance
# Can change number below if it is to slow
if torch.cuda.is_available():
    device = torch.device("cuda")
    epochs_number = 10 #HYPERPARAMETER
else:
    device = torch.device("cpu")
    epochs_number = 5 #HYPERPARAMETER
#epochs_number = 5
print("Using  " + str(device) + "  for processing, thus running  " + str(epochs_number) + " epochs\n")


#HYPERPARAMETERS for transformers
transformer = TransformerModel(n_embeds=len(learn_vocab), n_classes=2, d_model=400, n_head=2, d_hid=400, n_layers=2, dropout=0.2).to(device)

best_transformers = trainTransformerModel(transformer, epochs_number=epochs_number, data=learn_data, vocab=learn_vocab)


score = (float("-inf"), float("inf"))
winner = None
for (model, (accuracy, loss)) in best_transformers:
    if ((score[1] > loss) or ((score[1] == loss) and (score[0] < accuracy))):
        score = (accuracy, loss)
        winner = model
    
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print("Overall best accuracy and loss of the models from the 5-fold cross validation is")
print("Accuracy : " + str(score[0]) + " |  Loss : " + str(score[1]))

Using  cuda  for processing, thus running  10 epochs

------
Fold : 1
------
  For epoch : 1 | Time : 0.66 seconds
    Train   : performance= 0.61 | loss= 0.67
   Evaluate : performance= 0.6 | loss= 1.01
-----------------------------------------------------
  For epoch : 2 | Time : 0.64 seconds
    Train   : performance= 0.64 | loss= 0.67
   Evaluate : performance= 0.61 | loss= 0.66
-----------------------------------------------------
  For epoch : 3 | Time : 0.63 seconds
    Train   : performance= 0.64 | loss= 0.68
   Evaluate : performance= 0.62 | loss= 0.67
-----------------------------------------------------
  For epoch : 4 | Time : 0.64 seconds
    Train   : performance= 0.63 | loss= 0.69
   Evaluate : performance= 0.6 | loss= 0.75
-----------------------------------------------------
  For epoch : 5 | Time : 0.63 seconds
    Train   : performance= 0.66 | loss= 0.67
   Evaluate : performance= 0.55 | loss= 0.72
-----------------------------------------------------
  For epoch : 6

  For epoch : 3 | Time : 0.63 seconds
    Train   : performance= 0.87 | loss= 0.3
   Evaluate : performance= 0.92 | loss= 0.22
-----------------------------------------------------
  For epoch : 4 | Time : 0.6 seconds
    Train   : performance= 0.89 | loss= 0.27
   Evaluate : performance= 0.88 | loss= 0.28
-----------------------------------------------------
  For epoch : 5 | Time : 0.65 seconds
    Train   : performance= 0.88 | loss= 0.27
   Evaluate : performance= 0.89 | loss= 0.27
-----------------------------------------------------
  For epoch : 6 | Time : 0.61 seconds
    Train   : performance= 0.89 | loss= 0.26
   Evaluate : performance= 0.9 | loss= 0.24
-----------------------------------------------------
  For epoch : 7 | Time : 0.61 seconds
    Train   : performance= 0.9 | loss= 0.22
   Evaluate : performance= 0.89 | loss= 0.31
-----------------------------------------------------
  For epoch : 8 | Time : 0.61 seconds
    Train   : performance= 0.88 | loss= 0.27
   Evaluate

In [8]:
# TESTING
# Now load the test data
test_collator = Collate(learn_vocab)
test_dataloader = DataLoader(test_data, batch_size=50, collate_fn=test_collator)
# Do final test with best model
winner.eval()
correct = 0
loss = 0
criterion = CrossEntropyLoss()
correct, loss = compute(winner, test_dataloader, correct, loss, criterion, _, False)
correct = correct/test_size
loss = loss/test_size

print("++++++++++++")
print("TEST RESULTS")
print(" Accuracy : " + str(correct))
print("   Loss   : " + str(loss))
print("++++++++++++")

++++++++++++
TEST RESULTS
 Accuracy : 0.7285714285714285
   Loss   : 0.6513221604483468
++++++++++++
