In [1]:
# taking code from
# https://github.com/jowagner/CA4023-NLP/blob/main/notebooks/sentiment-bert.ipynb

## BERT Configuration

In [2]:
# --- model / batching configuration -------------------------------------

model_size          = 'base'  # choose between 'tiny', 'base' and 'large'
# NOTE(review): max_sequence_length is not used in the cells visible here;
# presumably it is the truncation length applied later -- confirm
max_sequence_length = 256
batch_size          = 10

# smallest number of instances that should contribute to one accumulated
# batch; smaller batch sizes are compensated with gradient accumulation
min_effective_batch_size = 32

# a non-positive batch size can never reach the target size
# (the previous incremental search would have looped forever)
if batch_size < 1:
    raise ValueError(
        'batch_size must be a positive integer, got %r' %(batch_size,)
    )

# smallest k >= 1 such that batch_size * k >= min_effective_batch_size,
# i.e. the ceiling of min_effective_batch_size / batch_size
accumulate_grad_batches = max(
    1,
    -(-min_effective_batch_size // batch_size)
)

print('Batch size:', batch_size)
if accumulate_grad_batches > 1:
    print('Accumulating gradients of %d batches' %accumulate_grad_batches)

# maps the short size label to the name of the pretrained model to load
size2name = {
    'tiny':  'distilbert-base-uncased',
    'base':  'bert-base-uncased',
    'large': 'bert-large-uncased',
}

# raises KeyError if model_size is not one of the labels above
model_name = size2name[model_size]

from transformers import AutoTokenizer
# NOTE(review): Whitespace is not referenced in any of the cells visible
# here -- confirm that a later cell needs it before keeping this import
from tokenizers.pre_tokenizers import Whitespace

# Load the tokeniser that belongs to the pretrained model selected above
# (model_name); the object is called directly on batches of strings below
# and also provides convert_ids_to_tokens().
# NOTE(review): presumably this fetches the vocabulary from the model hub
# or a local cache on first use -- confirm for offline runs
tokeniser = AutoTokenizer.from_pretrained(model_name)

Batch size: 10
Accumulating gradients of 4 batches


## Dataset Configuration

In [3]:
# Choose which ABSA16 training file is read by the following cells.

# directory prefix (including the trailing slash) of the XML files
data_prefix = 'data/'

# one training file per review domain
filenames = dict(
    laptop     = 'ABSA16_Laptops_Train_SB1_v2.xml',
    restaurant = 'ABSA16_Restaurants_Train_SB1_v2.xml',
)

# domain to work with; the alternative is 'restaurant'
domain = 'laptop'

# plain string concatenation of prefix and file name
filename = ''.join((data_prefix, filenames[domain]))

print('Using', filename)

Using data/ABSA16_Laptops_Train_SB1_v2.xml


## Get Data Instances from XML File

In [4]:
# mostly implemented from scratch, some inspiration from
# https://opengogs.adaptcentre.ie/rszk/sea/src/master/lib/semeval_absa.py

from xml.etree import ElementTree

xmltree = ElementTree.parse(filename)
xmlroot = xmltree.getroot()

# vocabularies of the annotation values encountered while reading the file
observed_entity_types = set()
observed_attribute_labels = set()
observed_polarities = set()
observed_targets = set()

# One entry per <Opinion> element, i.e. a sentence with several opinions
# appears several times.  Each entry is the tuple
#   (sent_id, text, entity_type, attribute_label, target, span, polarity)
# where target is None when the annotation says 'NULL' (or has no target
# attribute) and span is (0, 0) when no usable character offsets are given.
dataset = []
for sentence in xmlroot.iter('sentence'):
    sent_id = sentence.get('id')
    # get content inside the first <text>...</text> sub-element
    text = sentence.findtext('text').strip()
    for opinion in sentence.iter('Opinion'):
        # the category attribute has the form ENTITY#ATTRIBUTE
        opin_cat = opinion.get('category')
        entity_type, attribute_label = opin_cat.split('#')
        polarity = opinion.get('polarity')
        target = opinion.get('target')
        try:
            span = (int(opinion.get('from')), int(opinion.get('to')))
        except (TypeError, ValueError):
            # TypeError:  at least one of 'from' or 'to' is missing
            # ValueError: attribute present but not an integer, e.g. ''
            # --> treat both cases as "no span available"
            span = (0, 0)
        if target == 'NULL':
            target = None
        # add to dataset
        dataset.append((
            sent_id, text,
            entity_type, attribute_label,
            target, span,
            polarity
        ))
        # update vocabularies
        observed_entity_types.add(entity_type)
        observed_attribute_labels.add(attribute_label)
        observed_polarities.add(polarity)
        if target:
            observed_targets.add(target)

print('observed entity types:',     sorted(observed_entity_types))
print('observed attribute labels:', sorted(observed_attribute_labels))
print('observed polarities:',       sorted(observed_polarities))
print('number of unique targets:',  len(observed_targets))

observed entity types: ['BATTERY', 'COMPANY', 'CPU', 'DISPLAY', 'FANS_COOLING', 'GRAPHICS', 'HARDWARE', 'HARD_DISC', 'KEYBOARD', 'LAPTOP', 'MEMORY', 'MOTHERBOARD', 'MOUSE', 'MULTIMEDIA_DEVICES', 'OPTICAL_DRIVES', 'OS', 'PORTS', 'POWER_SUPPLY', 'SHIPPING', 'SOFTWARE', 'SUPPORT', 'WARRANTY']
observed attribute labels: ['CONNECTIVITY', 'DESIGN_FEATURES', 'GENERAL', 'MISCELLANEOUS', 'OPERATION_PERFORMANCE', 'PORTABILITY', 'PRICE', 'QUALITY', 'USABILITY']
observed polarities: ['negative', 'neutral', 'positive']
number of unique targets: 0


## Example BERT Tokenisation

In [5]:
# Show what the tokeniser produces for a few hand-picked sentences.

# positions in dataset of a few interesting instances
example_indices = (0, 4, 8, 18)

# element 1 of each dataset entry is the sentence text
example_batch = [dataset[index][1] for index in example_indices]

tokenised_text = tokeniser(example_batch, is_split_into_words = False)

example_pairs = zip(example_batch, tokenised_text['input_ids'])
for i, (example_text, token_ids) in enumerate(example_pairs):
    if i > 0:
        # blank line between consecutive examples
        print()
    print(i, '\tinput:        ', example_text)
    print(   "\t['input_ids']:", token_ids)
    print(   '\ttokens:       ', tokeniser.convert_ids_to_tokens(token_ids))

0 	input:         This computer is absolutely AMAZING!!!
	['input_ids']: [101, 2023, 3274, 2003, 7078, 6429, 999, 999, 999, 102]
	tokens:        ['[CLS]', 'this', 'computer', 'is', 'absolutely', 'amazing', '!', '!', '!', '[SEP]']

1 	input:         and plenty of storage with 250 gb(though I will upgrade this and the ram..)
	['input_ids']: [101, 1998, 7564, 1997, 5527, 2007, 5539, 16351, 1006, 2295, 1045, 2097, 12200, 2023, 1998, 1996, 8223, 1012, 1012, 1007, 102]
	tokens:        ['[CLS]', 'and', 'plenty', 'of', 'storage', 'with', '250', 'gb', '(', 'though', 'i', 'will', 'upgrade', 'this', 'and', 'the', 'ram', '.', '.', ')', '[SEP]']

2 	input:         GET THIS COMPUTER FOR PORTABILITY AND FAST PROCESSING!!!
	['input_ids']: [101, 2131, 2023, 3274, 2005, 3417, 8010, 1998, 3435, 6364, 999, 999, 999, 102]
	tokens:        ['[CLS]', 'get', 'this', 'computer', 'for', 'port', '##ability', 'and', 'fast', 'processing', '!', '!', '!', '[SEP]']

3 	input:         without a big ol' clunky machine i

## Sequence Length Distribution

In [6]:
from collections import defaultdict

# width (in tokens) of each bin of the length histogram
bin_width = 10

# maps (label, length_bin) --> number of instances, where label is a
# polarity or the pseudo-label 'total' counting all instances of the bin
distribution = defaultdict(int)

# (text, polarity) pairs to measure.  The dataset list itself is only read:
# appending an end-of-data sentinel to it would leave a row of Nones behind
# for all later cells and add another one each time this cell is re-run.
# Rows without text (e.g. such a sentinel left over from an earlier run in
# the same kernel) are skipped.
instances = [
    (instance[1], instance[6])
    for instance in dataset
    if instance[1] is not None
]

max_length_bin = 0
for start in range(0, len(instances), batch_size):
    # the last chunk may be shorter than batch_size
    chunk = instances[start:start+batch_size]
    batch  = [chunk_text  for chunk_text, _ in chunk]
    labels = [chunk_label for _, chunk_label in chunk]
    tokenised_batch = tokeniser(
        batch,
        is_split_into_words = False,
    )
    for label, token_ids in zip(labels, tokenised_batch['input_ids']):
        # length is counted on the token id sequence returned by the tokeniser
        length_bin = len(token_ids) // bin_width
        distribution[(label,   length_bin)] += 1
        distribution[('total', length_bin)] += 1
        max_length_bin = max(max_length_bin, length_bin)

# --- print the table: one row per length bin, one column per polarity ---

polarity_columns = sorted(observed_polarities)

header = ['LengthBin']
header.extend('%12s' %polarity for polarity in polarity_columns)
header.append('%12s' %'Total')
header.append('%12s' %'Positivity')
print('\t'.join(header))

for length_bin in range(0, max_length_bin+1):
    row = []
    # first and last sequence length covered by this bin
    row.append('%4d-%4d' %(
        bin_width*length_bin,
        bin_width*(1+length_bin)-1
    ))
    total = 0
    for label in polarity_columns:
        count = distribution[(label, length_bin)]
        row.append('%12d' %count)
        total += count
    row.append('%12d' %total)
    if total:
        # share of positive instances among all instances of this bin
        row.append('%10.0f%%' %(100.0*distribution[('positive', length_bin)]/float(total)))
    else:
        # empty bin: the ratio is undefined
        row.append('%11s' %'n/a')
    print('\t'.join(row))

LengthBin	    negative	     neutral	    positive	       Total	  Positivity
   0-   9	         123	          20	         279	         422	        66%
  10-  19	         484	          75	         744	        1303	        57%
  20-  29	         303	          59	         428	         790	        54%
  30-  39	         102	          17	         125	         244	        51%
  40-  49	          47	          15	          38	         100	        38%
  50-  59	          11	           2	          20	          33	        61%
  60-  69	           8	           0	           0	           8	         0%
  70-  79	           3	           0	           3	           6	        50%
  80-  89	           3	           0	           0	           3	         0%
