In [1]:
from datasets import load_dataset
import datasets
import pandas as pd
import os
from transformers import AutoModelForTokenClassification, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

In [3]:
TEST_TASK1 = './data/test_task1.csv'
TEST_TASK2 = './data/test_task2.csv'

In [4]:
HUGGING_FACE_MODELS = [ 'roberta-final2',
                        'mdeberta-v3-base-huggingface-final2', 'mdeberta-v3-base-huggingface-more-training-portuguese-data-final2' ]
for i in range(len(HUGGING_FACE_MODELS)):
    HUGGING_FACE_MODELS[i] = os.path.join('./ATE/expreriments', HUGGING_FACE_MODELS[i])

In [5]:
HUGGING_FACE_MODELS

['./ATE/expreriments/roberta-final2',
 './ATE/expreriments/mdeberta-v3-base-huggingface-final2',
 './ATE/expreriments/mdeberta-v3-base-huggingface-more-training-portuguese-data-final2']

In [6]:
df = pd.read_csv(TEST_TASK1, sep=';')

In [7]:
import nltk
nltk.download('punkt')
from nltk.tokenize.treebank import TreebankWordTokenizer  

pre_dataset = []
for i, row in df.iterrows():
    pre_dataset.append({'id': i, 'tokens': TreebankWordTokenizer().tokenize(row['review'])})

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
import torch
import numpy as np
device = torch.device('cuda')

In [9]:
models_preds = []
for model_path in HUGGING_FACE_MODELS:
    if 'roberta' in model_path:
        tokenizer = AutoTokenizer.from_pretrained(model_path, add_prefix_space=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    model.to(device)
    
    from datasets import Dataset
    data = {
        'test':  Dataset.from_pandas(pd.DataFrame(pre_dataset))
    }

    dataset = datasets.DatasetDict(data)

    features = datasets.Features(
        {
            'id': datasets.Value('int32'),
            'tokens': datasets.Sequence(datasets.Value('string'))
        }
    )

    dataset = dataset.map(features.encode_example, features=features)

    def tokenize_and_align_labels(dataset_unaligned, label_all_tokens = False):
        tokenized_inputs = tokenizer(dataset_unaligned["tokens"], truncation=True, is_split_into_words=True, max_length=512)

        word_ids_batch = []
        for tokens in dataset_unaligned["tokens"]:
            word_ids_batch.append([None])
            tokens_len = 2
            for word_idx, token in enumerate(tokens):
                if 'roberta' in model_path:
                    token_processed = tokenizer([token], add_special_tokens=False, truncation=True, is_split_into_words=True, max_length=512)
                else:
                    token_processed = tokenizer(token, add_special_tokens=False, truncation=True, is_split_into_words=True, max_length=512)
                for i in range(len(token_processed['input_ids'])):
                    tokens_len += 1
                    if tokens_len <= 512:
                        word_ids_batch[-1].append(word_idx)
            word_ids_batch[-1].append(None)
        tokenized_inputs['word_id'] = word_ids_batch
        return tokenized_inputs

    tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)
    
    raw_input_1 = tokenizer([p['tokens'] for p in pre_dataset[0:125]], pad_to_max_length=True, truncation=True, is_split_into_words=True, max_length=512, return_tensors="pt")
    raw_input_1.to(device)

    with torch.no_grad():
        scores_1 = torch.nn.functional.softmax(model(**raw_input_1).logits.cpu().detach(), dim=1).numpy()
    
    del raw_input_1
    
    raw_input = tokenizer([p['tokens'] for p in pre_dataset[125:]], pad_to_max_length=True, truncation=True, is_split_into_words=True, max_length=512, return_tensors="pt")
    raw_input.to(device)

    with torch.no_grad():
        scores = torch.nn.functional.softmax(model(**raw_input).logits.cpu().detach(), dim=1).numpy()
    
    del raw_input
    
    scores = np.concatenate((scores_1, scores))
    
    preds = []
    for i, pred in enumerate(scores):
        r = []
        word_ids = tokenized_datasets['test'][i]['word_id']
        tokens = tokenized_datasets['test'][i]['tokens']
        prev_word = None
        for j, label in enumerate(pred):
            if j < len(word_ids) and word_ids[j] is not None:
                token = tokens[word_ids[j]] 
                if prev_word != word_ids[j]:
                    r.append(label)
                    prev_word = word_ids[j]
        preds.append(np.array(r))
    models_preds.append(np.array(preds))
    
    del model

100%|██████████| 257/257 [00:00<00:00, 3303.69ex/s]
100%|██████████| 1/1 [00:01<00:00,  1.07s/ba]
  models_preds.append(np.array(preds))
100%|██████████| 257/257 [00:00<00:00, 2173.75ex/s]
100%|██████████| 1/1 [00:01<00:00,  1.74s/ba]
100%|██████████| 257/257 [00:00<00:00, 3317.06ex/s]
100%|██████████| 1/1 [00:01<00:00,  1.75s/ba]


In [14]:
import json
import string
result = []
for i, j, k, p in zip(models_preds[0], models_preds[1], models_preds[2], pre_dataset):
    h = []
    possible_label = None
    if i.shape != j.shape:
        i = j
    for a, label in enumerate(np.argmax(i + j + k, axis=1)):
        if label == 0:
            possible_label = p['tokens'][a]
        elif label == 1:
            if possible_label is None:
                continue
                possible_label = p['tokens'][a]
            else: 
                possible_label = possible_label + " " + p['tokens'][a]
        else:
            if possible_label is not None:
                possible_label = possible_label.translate(str.maketrans('', '', string.punctuation))
                h.append(possible_label)
                possible_label = None
    result.append(h)

In [15]:
with open('x.csv', 'w') as f:
    for i, labels in enumerate(result):
        f.write(str(i) + ";" + '"' + str(labels) + '"'+"\n")

In [12]:
df['aspectos'] = result

In [13]:
df[['id', 'aspectos']].to_csv('task1.csv', index=False, header=False, sep=';')