In [1]:
# taking code from
# https://github.com/jowagner/CA4023-NLP/blob/main/notebooks/sentiment-bert.ipynb

## 1.1 BERT Configuration

In [2]:
model_size          = 'base'  # choose between 'tiny', 'base' and 'large'
max_sequence_length = 256
batch_size          = 10

# smallest effective batch size (batch_size * accumulate_grad_batches)
# that we accept before gradients are applied
min_effective_batch_size = 32

if batch_size < 1:
    # the accumulation factor below is undefined for an empty batch
    raise ValueError('batch_size must be a positive integer, got %r' %batch_size)

# compensate for small batch size with batch accumulation if needed:
# smallest factor such that
# batch_size * accumulate_grad_batches >= min_effective_batch_size
# (ceiling division written with integer arithmetic only)
accumulate_grad_batches = max(1, -(-min_effective_batch_size // batch_size))

print('Batch size:', batch_size)
if accumulate_grad_batches > 1:
    print('Accumulating gradients of %d batches' %accumulate_grad_batches)

# map the short size labels accepted by `model_size` to checkpoint names
size2name = {
    'tiny':  'distilbert-base-uncased',
    'base':  'bert-base-uncased',
    'large': 'bert-large-uncased',
}

model_name = size2name[model_size]

from transformers import AutoTokenizer
from tokenizers.pre_tokenizers import Whitespace

# load the pretrained tokeniser that belongs to the checkpoint selected above
tokeniser = AutoTokenizer.from_pretrained(model_name)

Batch size: 10
Accumulating gradients of 4 batches


## 1.2 Dataset Configuration

In [3]:
# domains to load; each one needs an entry in `filenames` below
domains = ['laptop', 'restaurant']

# relative sizes of the (training, development) portions
train_dev_split = (90, 10)

# prefix prepended as-is (plain string concatenation) to every file name
data_prefix = 'data/'

filenames = dict(
    laptop     = 'ABSA16_Laptops_Train_SB1_v2.xml',
    restaurant = 'ABSA16_Restaurants_Train_SB1_v2.xml',
)

# report which files will be read
for domain in domains:
    filename = ''.join((data_prefix, filenames[domain]))
    print('Using', filename)

Using data/ABSA16_Laptops_Train_SB1_v2.xml
Using data/ABSA16_Restaurants_Train_SB1_v2.xml


## 1.3 Question Templates

In [4]:
put_question_first = True  # whether to put question into seq A or B

# Each template is a pair of %-format strings with named placeholders:
#   'question' -- text of the auxiliary question
#   'label'    -- target label, either the polarity itself or, for
#                 templates that mention a candidate polarity, a yes/no
#                 answer
# Placeholders used: entity_type, attribute_label, polarity,
# candidate_polarity and yesno.
templates = [

    # Hoang et al. (2019)
    {   'question': '%(entity_type)s, %(attribute_label)s',
        'label':    '%(polarity)s',
    },

    # Sun et al. (2019) format 1
    {   'question': '%(entity_type)s - %(attribute_label)s',
        'label':    '%(polarity)s',
    },

    # Sun et al. (2019) format 2
    {   'question': 'What do you think of the %(attribute_label)s of %(entity_type)s?',
        'label':    '%(polarity)s',
    },

    # Sun et al. (2019) format 3
    {   'question': 'The polarity of the aspect %(attribute_label)s of %(entity_type)s is %(candidate_polarity)s.',
        'label':    '%(yesno)s',
    },

    # Sun et al. (2019) format 4
    {   'question': '%(entity_type)s - %(attribute_label)s - %(candidate_polarity)s',
        'label':    '%(yesno)s',
    },

    # Variant 1
    {   'question': 'In terms of %(attribute_label)s, what do you think of %(entity_type)s?',
        'label':    '%(polarity)s',
    },

    # Variant 2
    # (typo "rewview" fixed: the question text is part of the model input)
    {   'question': 'What polarity has the sentiment towards the %(attribute_label)s of %(entity_type)s in the following review?',
        'label':    '%(polarity)s',
    },

    # Variant 3
    {   'question': 'Do you agree that the sentiment towards the aspect %(attribute_label)s of %(entity_type)s in the following review is %(candidate_polarity)s?',
        'label':    '%(yesno)s',
    },

]

# TODO: add variants with entity type and attribute label not in ALLCAPS and
#       with _ between words (requires additional code)

## 2.1 Get Data Instances from XML File

In [5]:
# mostly implemented from scratch, some inspiration from
# https://opengogs.adaptcentre.ie/rszk/sea/src/master/lib/semeval_absa.py

from xml.etree import ElementTree

# vocabularies of values seen so far; every call of get_dataset() adds to them
observed_entity_types = set()
observed_attribute_labels = set()
observed_polarities = set()
observed_targets = set()

def get_dataset(filename):
    ''' Read opinions from an ABSA XML file.

    filename: path or file object accepted by ElementTree.parse()

    Returns a list with one tuple per <Opinion> element:
        (sent_id, text, entity_type, attribute_label, target, span, polarity)
    where
      * entity_type and attribute_label are the two parts of the
        'category' attribute, split at '#',
      * target is None if the attribute is missing or is 'NULL',
      * span is (from, to) as integers, or (0, 0) if at least one of the
        two attributes is missing.

    Side effect: adds the observed values to the module-level sets
    observed_entity_types, observed_attribute_labels, observed_polarities
    and observed_targets.

    Raises ValueError if 'category' does not contain exactly one '#' or
    if 'from' / 'to' are present but not integers.
    '''
    # no `global` statements needed: the sets are only mutated, never rebound
    xmlroot = ElementTree.parse(filename).getroot()
    dataset = []
    for sentence in xmlroot.iter('sentence'):
        sent_id = sentence.get('id')
        # get content inside the first <text>...</text> sub-element;
        # a sentence without <text> yields '' instead of crashing on None
        text = sentence.findtext('text', default = '').strip()
        for opinion in sentence.iter('Opinion'):
            entity_type, attribute_label = opinion.get('category').split('#')
            polarity = opinion.get('polarity')
            target = opinion.get('target')
            if target == 'NULL':
                target = None
            span_from = opinion.get('from')
            span_to   = opinion.get('to')
            if span_from is None or span_to is None:
                # at least one of 'from' or 'to' is missing
                span = (0, 0)
            else:
                span = (int(span_from), int(span_to))
            # add to dataset
            dataset.append((
                sent_id, text,
                entity_type, attribute_label,
                target, span,
                polarity
            ))
            # update vocabularies
            observed_entity_types.add(entity_type)
            observed_attribute_labels.add(attribute_label)
            observed_polarities.add(polarity)
            if target:
                observed_targets.add(target)
    return dataset

# read every configured domain; `datasets` is a list of
# (domain, list of instances) pairs in the order given by `domains`
datasets = []
for domain in domains:
    filename = ''.join((data_prefix, filenames[domain]))
    instances = get_dataset(filename)
    datasets.append((domain, instances))

# summarise the vocabularies collected by get_dataset()
for caption, value in (
    ('observed entity types:',       sorted(observed_entity_types)),
    ('\nobserved attribute labels:', sorted(observed_attribute_labels)),
    ('\nobserved polarities:',       sorted(observed_polarities)),
    ('\nnumber of unique targets:',  len(observed_targets)),
):
    print(caption, value)

observed entity types: ['AMBIENCE', 'BATTERY', 'COMPANY', 'CPU', 'DISPLAY', 'DRINKS', 'FANS_COOLING', 'FOOD', 'GRAPHICS', 'HARDWARE', 'HARD_DISC', 'KEYBOARD', 'LAPTOP', 'LOCATION', 'MEMORY', 'MOTHERBOARD', 'MOUSE', 'MULTIMEDIA_DEVICES', 'OPTICAL_DRIVES', 'OS', 'PORTS', 'POWER_SUPPLY', 'RESTAURANT', 'SERVICE', 'SHIPPING', 'SOFTWARE', 'SUPPORT', 'WARRANTY']

observed attribute labels: ['CONNECTIVITY', 'DESIGN_FEATURES', 'GENERAL', 'MISCELLANEOUS', 'OPERATION_PERFORMANCE', 'PORTABILITY', 'PRICE', 'PRICES', 'QUALITY', 'STYLE_OPTIONS', 'USABILITY']

observed polarities: ['negative', 'neutral', 'positive']

number of unique targets: 721


## 2.2 PyTorch DataLoader

To use the PyTorch Lightning framework, we need to distinguish 3 types of objects handling our data:

### Dataset

PyTorch Dataset objects provide access to a data set and behave like a list of dictionaries, one dictionary for each data instance (training or test item). The framework does not prescribe what the dictionaries look like, i.e. you can choose the keys. The length of the list determines the number of training instances in each epoch, unless the DataLoader (below) is extended to filter or augment the data. The standard way to augment data is to keep the number of instances identical to the number of raw instances and to apply a different or random transformation in each call of `__getitem__()`.

### DataLoader

PyTorch DataLoader objects shuffle data provided by a Dataset object and create batches of data.

### LightningDataModule

LightningDataModule objects create 3 DataLoader objects, one each for training, validation and test data.

In [6]:
# basic usage of pytorch and lightning from
# https://pytorch.org/tutorials/beginner/data_loading_tutorial.html
# and
# https://github.com/ricardorei/lightning-text-classification/blob/master/classifier.py

import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler

class ABSA_Dataset_part_1(Dataset):
    ''' First part of the ABSA dataset wrapper: storage and item access.

    Turns raw instance tuples
        (sent_id, text, entity_type, attribute_label, target, span, polarity)
    into dictionaries with the keys 'seq_A', 'seq_B', 'label' and 'info'.
    The method pick_question() is not defined here; a subclass must
    provide it.
    '''

    def __init__(
        self,
        raw_data,
        put_question_first = True,
        question_prefix = None,
        template_index = -1,    # -1 = pick random template
        info = None,            # additional info to keep with each instance
    ):
        self.raw_data = raw_data
        self.put_question_first = put_question_first
        self.question_prefix = question_prefix
        self.template_index = template_index
        self.info = info

    def __len__(self):
        ''' number of raw instances
        '''
        return len(self.raw_data)

    def __getitem__(self, idx):
        ''' build the dictionary for the instance at position idx
        '''
        if torch.is_tensor(idx):
            idx = idx.tolist()
            assert isinstance(idx, int)
        (
            _sent_id, text,
            entity_type, attribute_label,
            _target, _span,
            polarity,
        ) = self.raw_data[idx]
        question, label = self.pick_question(entity_type, attribute_label, polarity)
        if self.question_prefix:
            question = ' '.join((self.question_prefix, question))
        # TODO: support adding context (previous sentences) to text
        if self.put_question_first:
            seq_a, seq_b = question, text
        else:
            seq_a, seq_b = text, question
        return {
            'seq_A': seq_a,
            'seq_B': seq_b,
            'label': label,
            'info':  self.info,
        }

In [7]:
import random

class ABSA_Dataset(ABSA_Dataset_part_1):
    ''' ABSA dataset wrapper completed with the question generation.
    '''

    def pick_question(self, entity_type, attribute_label, polarity):
        ''' Build the (question, label) pair for one instance.

        Uses the template selected by self.template_index, or a random
        entry of the module-level `templates` list if the index is
        negative. A candidate polarity is drawn at random from the
        module-level `observed_polarities`; templates with a yes/no label
        ask whether it matches the true polarity.
        '''
        # no `global` statements needed: both names are only read
        if self.template_index < 0:
            template = random.choice(templates)
        else:
            template = templates[self.template_index]
        # sorted(): the iteration order of a set of strings can differ
        # between interpreter runs (string hash randomisation), which would
        # make the choice irreproducible even with a seeded random generator
        candidate_polarity = random.choice(sorted(observed_polarities))
        yesno = 'yes' if candidate_polarity == polarity else 'no'
        # explicit mapping instead of locals(): only these fields are
        # available to the templates
        fields = {
            'entity_type':        entity_type,
            'attribute_label':    attribute_label,
            'polarity':           polarity,
            'candidate_polarity': candidate_polarity,
            'yesno':              yesno,
        }
        question = template['question'] %fields
        label    = template['label']    %fields
        return (question, label)

## 2.3 Training-Dev Split
The SemEval ABSA dataset comes without a dev set. We need a dev set to decide how long to train, to select other parameters and to select a good run.

In [8]:
# concatenate domains

tr_dataset_objects = []

for domain, dataset in datasets:
    print(domain, len(dataset))
    tr_dataset_objects.append(ABSA_Dataset(
        dataset,
        put_question_first = put_question_first,
        question_prefix = domain + ':',
        template_index  = 0,   # a template that keeps the original 3-value polarity
        info = domain
    ))

tr_dataset = torch.utils.data.ConcatDataset(tr_dataset_objects)
n = len(tr_dataset)
print('Total size:', n)

# how many instances are there for each label?
# group the indices by (label, domain) so that the split below is stratified

group2indices = {}
for index in range(n):
    # fetch each instance only once: every access builds a new dictionary
    instance = tr_dataset[index]
    group = (instance['label'], instance['info'])
    group2indices.setdefault(group, []).append(index)

# create stratified sample

rel_train_size, rel_dev_size = train_dev_split
rel_total = rel_train_size + rel_dev_size

tr_indices = []
dev_indices = []

# NOTE(review): the shuffle below uses the global random generator, which is
# not seeded in this cell, so the split can differ between runs -- seed
# `random` beforehand if a reproducible split is required
for group, indices in group2indices.items():
    # separate name so that the total size `n` is not overwritten
    group_size = len(indices)
    select = (group_size * rel_train_size) // rel_total
    remaining = group_size - select
    print('%r: split %d (%.1f%%) to %d (%.1f%%)' %(
        group, select, 100.0*select/float(group_size),
        remaining, 100.0*remaining/float(group_size),
    ))
    random.shuffle(indices)
    tr_indices += indices[:select]
    dev_indices += indices[select:]

# both subsets are views on the concatenated dataset
dev_dataset = torch.utils.data.Subset(tr_dataset, dev_indices)
tr_dataset  = torch.utils.data.Subset(tr_dataset, tr_indices)

print('Training data size:', len(tr_dataset))
print('Development data size:', len(dev_dataset))

laptop 2909
restaurant 2507
Total size: 5416
('positive', 'laptop'): split 1473 (90.0%) to 164 (10.0%)
('negative', 'laptop'): split 975 (89.9%) to 109 (10.1%)
('neutral', 'laptop'): split 169 (89.9%) to 19 (10.1%)
('negative', 'restaurant'): split 674 (90.0%) to 75 (10.0%)
('positive', 'restaurant'): split 1491 (90.0%) to 166 (10.0%)
('neutral', 'restaurant'): split 90 (89.1%) to 11 (10.9%)
Training data size: 4872
Development data size: 544


## Appendix A: Example BERT Tokenisation

In [9]:
# positions of a few interesting instances in the laptop data
example_indices = (0, 4, 8, 18)

# collect the sentence text (second field) of the selected instances
example_batch = [
    dataset[i][1]
    for domain, dataset in datasets
    if domain == 'laptop'
    for i in example_indices
]

tokenised_text = tokeniser(example_batch, is_split_into_words = False)

# show input text, token IDs and tokens side by side,
# with an empty line between examples
for i, token_ids in enumerate(tokenised_text['input_ids']):
    if i > 0:
        print()
    tokens = tokeniser.convert_ids_to_tokens(token_ids)
    print(i, '\tinput:        ', example_batch[i])
    print(   "\t['input_ids']:", token_ids)
    print(   '\ttokens:       ', tokens)

0 	input:         This computer is absolutely AMAZING!!!
	['input_ids']: [101, 2023, 3274, 2003, 7078, 6429, 999, 999, 999, 102]
	tokens:        ['[CLS]', 'this', 'computer', 'is', 'absolutely', 'amazing', '!', '!', '!', '[SEP]']

1 	input:         and plenty of storage with 250 gb(though I will upgrade this and the ram..)
	['input_ids']: [101, 1998, 7564, 1997, 5527, 2007, 5539, 16351, 1006, 2295, 1045, 2097, 12200, 2023, 1998, 1996, 8223, 1012, 1012, 1007, 102]
	tokens:        ['[CLS]', 'and', 'plenty', 'of', 'storage', 'with', '250', 'gb', '(', 'though', 'i', 'will', 'upgrade', 'this', 'and', 'the', 'ram', '.', '.', ')', '[SEP]']

2 	input:         GET THIS COMPUTER FOR PORTABILITY AND FAST PROCESSING!!!
	['input_ids']: [101, 2131, 2023, 3274, 2005, 3417, 8010, 1998, 3435, 6364, 999, 999, 999, 102]
	tokens:        ['[CLS]', 'get', 'this', 'computer', 'for', 'port', '##ability', 'and', 'fast', 'processing', '!', '!', '!', '[SEP]']

3 	input:         without a big ol' clunky machine i

## Appendix B: Sequence Length Distribution

In [10]:
from collections import defaultdict
    
# width of the length bins, in tokens
bin_width = 10

for domain, dataset in datasets:
    print(domain)
    # maps (label, length_bin) and ('total', length_bin) to instance counts
    distribution = defaultdict(int)
    max_length_bin = 0
    # tokenise in slices of batch_size instances; slicing leaves `dataset`
    # untouched, so no "end of data" marker has to be appended to (and later
    # removed from) the list shared with the other cells
    for start in range(0, len(dataset), batch_size):
        chunk  = dataset[start:start+batch_size]
        batch  = [instance[1] for instance in chunk]   # sentence text
        labels = [instance[6] for instance in chunk]   # polarity
        tokenised_batch = tokeniser(
            batch,
            is_split_into_words = False,
        )
        for label, token_ids in zip(labels, tokenised_batch['input_ids']):
            length_bin = len(token_ids) // bin_width
            distribution[(label,   length_bin)] += 1
            distribution[('total', length_bin)] += 1
            if length_bin > max_length_bin:
                max_length_bin = length_bin
    # table header
    header = []
    header.append('LengthBin')
    for polarity in sorted(observed_polarities):
        header.append('%12s' %polarity)
    header.append('%12s' %'Total')
    header.append('%12s' %'Positivity')
    print('\t'.join(header))
    # one row per length bin, including empty bins below the maximum
    for length_bin in range(0, max_length_bin+1):
        row = []
        row.append('%4d-%4d' %(
            bin_width*length_bin,
            bin_width*(1+length_bin)-1
        ))
        total = 0
        for label in sorted(observed_polarities):
            count = distribution[(label, length_bin)]
            row.append('%12d' %count)
            total += count
        row.append('%12d' %total)
        if total:
            # share of positive instances among all instances of this bin
            row.append('%10.0f%%' %(100.0*distribution[('positive', length_bin)]/float(total)))
        else:
            row.append('%11s' %'n/a')
        print('\t'.join(row))

laptop
LengthBin	    negative	     neutral	    positive	       Total	  Positivity
   0-   9	         123	          20	         279	         422	        66%
  10-  19	         484	          75	         744	        1303	        57%
  20-  29	         303	          59	         428	         790	        54%
  30-  39	         102	          17	         125	         244	        51%
  40-  49	          47	          15	          38	         100	        38%
  50-  59	          11	           2	          20	          33	        61%
  60-  69	           8	           0	           0	           8	         0%
  70-  79	           3	           0	           3	           6	        50%
  80-  89	           3	           0	           0	           3	         0%
restaurant
LengthBin	    negative	     neutral	    positive	       Total	  Positivity
   0-   9	          74	          13	         276	         363	        76%
  10-  19	         300	          52	         724	        1076	        67%
  20-  29	        

## Appendix C: Example Dataset Object

In [11]:
# print two instances per domain, each of them twice,
# with an empty line between domains
for position, (domain, dataset) in enumerate(datasets):
    if position > 0:
        print()
    print(domain)
    dataset_obj = ABSA_Dataset(
        dataset,
        put_question_first = put_question_first,
        question_prefix = domain + ':',
        template_index = -1,   # -1 = random pick
    )
    for i in range(2):
        # repeat call doesn't give the same result with template_index = -1
        for _ in range(2):
            print(i, dataset_obj[i])

laptop
0 {'seq_A': 'laptop: LAPTOP - GENERAL', 'seq_B': 'This computer is absolutely AMAZING!!!', 'label': 'positive', 'info': None}
0 {'seq_A': 'laptop: LAPTOP, GENERAL', 'seq_B': 'This computer is absolutely AMAZING!!!', 'label': 'positive', 'info': None}
1 {'seq_A': 'laptop: Do you agree that the sentiment towards the aspect OPERATION_PERFORMANCE of BATTERY in the following review is neutral?', 'seq_B': '10 plus hours of battery...', 'label': 'no', 'info': None}
1 {'seq_A': 'laptop: What do you think of the OPERATION_PERFORMANCE of BATTERY?', 'seq_B': '10 plus hours of battery...', 'label': 'positive', 'info': None}

restaurant
0 {'seq_A': 'restaurant: What polarity has the sentiment towards the GENERAL of RESTAURANT in the following rewview?', 'seq_B': 'Judging from previous posts this used to be a good place, but not any longer.', 'label': 'negative', 'info': None}
0 {'seq_A': 'restaurant: What polarity has the sentiment towards the GENERAL of RESTAURANT in the following rewview?'