In [1]:
!pip install transformers --quiet
!pip install xmltodict --quiet

In [2]:
from transformers import RobertaTokenizer
import torch
import re

# Maps a polarity name found in the aspect annotations to the short suffix
# used in the generated tags (e.g. 'positive' -> 'B-pos' / 'I-pos').
polarity_dict = {'positive':'pos', 'negative':'neg', 'neutral':'neu', 'conflict':'con'}

def get_labels(sentence, tokens, aspect_list):
  """Produce one BIO-style polarity tag per token.

  Args:
    sentence: the raw sentence string the tokens were produced from.
    tokens: token strings for the sentence. The first and last entries are
      always tagged 'O' without being inspected (the caller passes '<s>' and
      '</s>' there). A leading 'Ġ' on a token is stripped before the token is
      located in `sentence`.
    aspect_list: list of dicts with keys 'polarity' (a key of
      `polarity_dict`), 'start' and 'end' (character offsets into `sentence`,
      end exclusive). The list is walked front to back, so it is expected to
      be sorted by 'start'.

  Returns:
    A list of tags: 'O', or 'B-<pol>' / 'I-<pol>' for tokens whose start
    offset lies inside an aspect span. When `aspect_list` is empty the result
    has exactly len(tokens) entries.

  Raises:
    KeyError: if an aspect's polarity is not a key of `polarity_dict`.
  """
  if len(aspect_list) == 0:
    return ['O' for _ in range(len(tokens))]

  num_aspects = len(aspect_list)
  cur_pos = 0           # character cursor: end of the last token located
  cur_asp_idx = 0       # aspect currently being matched
  prev_asp_idx = None   # aspect the previous token was tagged with, if any
  ans = ['O']           # tag for the leading special token
  for x in tokens[1:-1]:
    cur_sub_word = x if x[:1]!='Ġ' else x[1:]
    s = sentence.find(cur_sub_word, cur_pos)
    if s == -1:
      # The token text does not occur verbatim after the cursor (e.g. a
      # byte-level piece of a non-ASCII character). Treat it as a zero-width
      # token at the cursor instead of letting -1 rewind the cursor to the
      # start of the sentence.
      s = e = cur_pos
    else:
      e = s + len(cur_sub_word)

    # Skip aspects that end at or before this token BEFORE tagging it, so a
    # token that opens the next aspect right after the previous one is not
    # lost as 'O'.
    while cur_asp_idx < num_aspects and s >= aspect_list[cur_asp_idx]['end']:
      cur_asp_idx += 1

    # After the loop above s < end is guaranteed for the current aspect, so
    # only the lower bound needs checking.
    if cur_asp_idx < num_aspects and s >= aspect_list[cur_asp_idx]['start']:
      # 'B' opens every aspect, even one that directly follows another.
      cur_char = 'I' if prev_asp_idx == cur_asp_idx else 'B'
      ans.append(f'{cur_char}-{polarity_dict[aspect_list[cur_asp_idx]["polarity"]]}')
      prev_asp_idx = cur_asp_idx
    else:
      ans.append('O')
      prev_asp_idx = None
    cur_pos = e
  ans.append('O')       # tag for the trailing special token
  return ans


# my_sentence = "I charge it at night and skip taking the cord with me because of the good battery life."
# polarity = ['neutral', 'positive']
# start_of_aspect = [41, 74]
# end_of_aspect = [45, 86]
# aspect_list = [{'polarity':'neutral', 'start':41, 'end':45}, {'polarity':'positive', 'start':74, 'end':86}]

# Worked example: one sentence with four positive aspect terms, given as
# parallel lists of character offsets (end offsets are exclusive).
my_sentence = "I even got my teenage son one, because of the features that it offers, like, iChat, Photobooth, garage band and more!"
polarity = ['positive']
start_of_aspect = [46, 77, 84, 96]
end_of_aspect = [54, 82, 94, 107]
# Pair each start offset with its end offset to form the aspect records
# that get_labels consumes.
aspect_list = [
  {'polarity':'positive', 'start':begin, 'end':finish}
  for begin, finish in zip(start_of_aspect, end_of_aspect)
]

print(len(my_sentence))
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Encode, then map the ids back to token strings so the labels can be
# aligned against the raw sentence text.
input_ids = tokenizer.encode(my_sentence)
subwords = tokenizer.convert_ids_to_tokens(input_ids)
print(input_ids)
print(subwords)

labels = get_labels(my_sentence, subwords, aspect_list)
print(labels)

# Sanity check: exactly one label per input id.
assert len(input_ids) == len(labels)

117
[0, 100, 190, 300, 127, 9231, 979, 65, 6, 142, 9, 5, 1575, 14, 24, 1523, 6, 101, 6, 939, 29665, 6, 23769, 2413, 22571, 6, 8247, 1971, 8, 55, 328, 2]
['<s>', 'I', 'Ġeven', 'Ġgot', 'Ġmy', 'Ġteenage', 'Ġson', 'Ġone', ',', 'Ġbecause', 'Ġof', 'Ġthe', 'Ġfeatures', 'Ġthat', 'Ġit', 'Ġoffers', ',', 'Ġlike', ',', 'Ġi', 'Chat', ',', 'ĠPhot', 'ob', 'ooth', ',', 'Ġgarage', 'Ġband', 'Ġand', 'Ġmore', '!', '</s>']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-pos', 'O', 'O', 'O', 'O', 'O', 'O', 'B-pos', 'I-pos', 'O', 'B-pos', 'I-pos', 'I-pos', 'O', 'B-pos', 'I-pos', 'O', 'O', 'O', 'O']


In [3]:
import xmltodict
import pandas as pd
from tqdm import tqdm

# Parse the XML file with xmltodict and keep only the list of sentence
# records found under sentences -> sentence.
with open("Laptop_Train_v2.xml", "r", encoding="utf-8") as f:
  obj = xmltodict.parse(f.read())['sentences']['sentence']

# Parallel accumulators: one entry per sentence.
out_sid, out_tokens, out_labels = [], [], []

for x in tqdm(obj, total=len(obj)):
  sentence, sent_id = x['text'], x['@id']

  # Sentences without an 'aspectTerms' entry get an empty aspect list.
  aspects = x['aspectTerms']['aspectTerm'] if 'aspectTerms' in x else []
  if type(aspects) is not list:
    # A lone aspect term arrives as a single mapping rather than a list.
    aspects = [aspects]

  # Convert the annotations to the records get_labels expects, ordered by
  # start offset.
  aspect_list = sorted(
    ({'polarity':a['@polarity'], 'start':int(a['@from']), 'end':int(a['@to'])} for a in aspects),
    key=lambda asp: asp['start'],
  )

  input_ids = tokenizer.encode(sentence)
  subwords = tokenizer.convert_ids_to_tokens(input_ids)
  labels = get_labels(sentence, subwords, aspect_list)

  # Every token must have received exactly one label.
  assert len(labels) == len(subwords)

  out_sid.append(sent_id)
  out_tokens.append(input_ids)
  out_labels.append(labels)

print('Writing to file...')
# One row per sentence: id, token ids and the aligned labels.
df = pd.DataFrame({'sid': out_sid, 'token_ids': out_tokens, 'labels': out_labels})
df.to_csv('preproc_roberta_14_laptop_train.csv', index=False)
print('DONE')





100%|████████████████████████████████████████████████████████████████████████████| 3045/3045 [00:02<00:00, 1416.54it/s]


Writing to file...
DONE
