In [1]:
from datasets import load_dataset
import datasets
import pandas as pd
import os

TRAIN_FILE = './data/train_set_v2.csv'
EXTERNAL_FILE = './data/external-data.csv'
TEST_FILE = './data/test_set_v2.csv'
SPLIT_TRAIN_IN_VAL = False

modelo = 'microsoft/mdeberta-v3-base'
use_auth_token = None
MAX_LEN = 512
experiment_name = 'mdeberta-v3-base-huggingface-more-training'
model_dir ='./ATE/expreriments/' + experiment_name + '/'

if not os.path.exists(model_dir):
    os.makedirs(model_dir)

train_df = pd.read_csv(TRAIN_FILE, sep=';')
external_df = pd.read_csv(EXTERNAL_FILE, sep=';')
test_df = pd.read_csv(TEST_FILE, sep=';')

ate_train_data = train_df.groupby('review').agg(list).reset_index()
ate_test_data = test_df.groupby('review').agg(list).reset_index()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ate_external_data = external_df.groupby('review').agg(list).reset_index()
#Remover repetições
ate_external_data = ate_external_data[ate_external_data['dataset'].apply(set).apply(len) == 1]
ate_external_data = ate_external_data[ate_external_data['split'].apply(set).apply(len) == 1]

In [3]:
ate_external_data['language'] = ate_external_data['language'].apply(lambda x: x[0])
ate_external_data['dataset'] = ate_external_data['dataset'].apply(lambda x: x[0])
ate_external_data['split'] = ate_external_data['split'].apply(lambda x: x[0])

In [4]:
ate_train_data['split'] = 'train'
ate_test_data['split'] = 'test'
for df in [ate_train_data, ate_test_data]:
    df['language'] = 'portuguese'
    df['dataset'] = 'absapt2022'

In [5]:
ate_train_data = pd.concat((ate_train_data, ate_external_data), ignore_index=True)

In [6]:
uso_no_colab = False
if uso_no_colab:
  !pip install transformers datasets seqeval
  !pip install wandb

%env WANDB_PROJECT=ABSAPT22_ATE
%env WANDB_WATCH=all
%env WANDB_NOTEBOOK_NAME=experiment_name    
    
import wandb
wandb.login()

env: WANDB_PROJECT=ABSAPT22_ATE
env: WANDB_WATCH=all
env: WANDB_NOTEBOOK_NAME=experiment_name


[34m[1mwandb[0m: Currently logged in as: [33meduagarcia[0m (use `wandb login --relogin` to force relogin)


True

In [7]:
if SPLIT_TRAIN_IN_VAL:
    from sklearn.model_selection import train_test_split
    ate_train_data, ate_dev_data = train_test_split(ate_train_data, test_size=0.2)
else:
    ate_dev_data = ate_test_data.copy()

In [8]:
def is_span_a_subset(span, aspect_span):
    if span[0] >= aspect_span[1]:
        return False
    elif span[1] < aspect_span[0]:
        return False
    else:
        return True

In [9]:
import nltk
nltk.download('punkt')
from nltk.tokenize.treebank import TreebankWordTokenizer   
def convert_to_bio(df):
    data = []
    for i, row in df.iterrows():
        #tokens = tokenize.word_tokenize(row['review'], language='portuguese')
        aspects_span = [[i, j, p, 0] for i, j, p in zip(row['start_position'], row['end_position'], row['polarity'])]
        tokens = []
        ner_tags = []
        span_generator = TreebankWordTokenizer().span_tokenize(row['review'])
        for span in span_generator:
            tokens.append(row['review'][span[0]:span[1]])
            is_aspect = False
            aspect_data = None
            for aspect_span in aspects_span:
                if is_span_a_subset(span, aspect_span):
                    is_aspect = True
                    aspect_data = aspect_span
            if is_aspect:
                label = 'ASPECT'
                
                #polarity_id = int(aspect_data[2])
                #if polarity_id == 1:
                #    label = 'POSITIVE'
                #elif polarity_id == 0:
                #    label = 'NEUTRAL'
                #else:
                #    label = 'NEGATIVE'
                
                if aspect_data[3] == 0:
                    ner_tags.append('B-'+label)
                    aspect_data[3] = aspect_data[3] + 1
                else:
                    ner_tags.append('I-'+label)
            else:
                ner_tags.append('O')
        data.append({'id': i, 'tokens': tokens, 'ner_tags': ner_tags})
    return data

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
from datasets import Dataset
data = {
    'train': Dataset.from_pandas(pd.DataFrame(convert_to_bio(ate_train_data))),
    'validation':  Dataset.from_pandas(pd.DataFrame(convert_to_bio(ate_dev_data))),
    'test':  Dataset.from_pandas(pd.DataFrame(convert_to_bio(ate_test_data)))
}


dataset = datasets.DatasetDict(data)

In [11]:
sep = " "
for data_type in dataset.keys():
    with open(f'./ATE/data/external-data/{data_type}.conll', 'w') as f:
        for tokens, tags in zip(dataset[data_type]['tokens'], dataset[data_type]['ner_tags']):
            for token, tag in zip(tokens, tags):
                f.write(str(token)+sep+tag+'\n')
            f.write('\n')

In [12]:
label_list = list(set(tag for doc in dataset['train']['ner_tags'] for tag in doc))
label_list

['B-ASPECT', 'O', 'I-ASPECT']

In [13]:
features = datasets.Features(
    {
        'id': datasets.Value('int32'),
        'tokens': datasets.Sequence(datasets.Value('string')),
        'ner_tags': datasets.Sequence(
            datasets.features.ClassLabel(names=label_list)
        )
    }
)

dataset = dataset.map(features.encode_example, features=features)

100%|██████████| 22672/22672 [00:05<00:00, 4371.90ex/s]
100%|██████████| 170/170 [00:00<00:00, 1967.27ex/s]
100%|██████████| 170/170 [00:00<00:00, 1960.38ex/s]


In [14]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(modelo, use_auth_token=use_auth_token)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
tokenizer.__class__

transformers.models.deberta_v2.tokenization_deberta_v2.DebertaV2Tokenizer

In [16]:
tokenized_inputs = tokenizer(dataset['train']["tokens"], truncation=True, is_split_into_words=True, max_length=512)

In [17]:
tokenized_inputs['input_ids'][0]

[1,
 260,
 265,
 2725,
 1881,
 14135,
 260,
 262,
 520,
 5551,
 260,
 263,
 29287,
 271,
 7186,
 376,
 12299,
 269,
 260,
 362,
 674,
 12688,
 58522,
 4473,
 6544,
 260,
 262,
 1635,
 427,
 266,
 81385,
 261,
 140703,
 23852,
 264,
 270,
 29924,
 264,
 530,
 47068,
 261,
 260,
 14338,
 260,
 362,
 14488,
 270,
 616,
 94878,
 260,
 262,
 260,
 362,
 260,
 269,
 5526,
 270,
 327,
 43814,
 261,
 323,
 299,
 585,
 43307,
 270,
 5471,
 7461,
 260,
 266,
 215299,
 2713,
 333,
 263,
 270,
 8013,
 11024,
 295,
 47603,
 270,
 60034,
 5236,
 260,
 362,
 1059,
 751,
 260,
 262,
 260,
 263,
 260,
 27063,
 260,
 362,
 1913,
 3886,
 15070,
 338,
 270,
 333,
 263,
 261,
 323,
 3678,
 787,
 18241,
 351,
 47603,
 270,
 60034,
 5236,
 260,
 3529,
 16250,
 1665,
 2352,
 338,
 260,
 262,
 674,
 260,
 164900,
 270,
 260,
 32746,
 263,
 725,
 2228,
 93197,
 652,
 1209,
 21487,
 450,
 479,
 260,
 262,
 376,
 2096,
 1356,
 124643,
 1519,
 270,
 9518,
 4931,
 652,
 1209,
 431,
 479,
 260,
 261,
 2]

In [18]:
tokenizer(dataset['train']["tokens"][0][0], add_special_tokens=False, truncation=True, is_split_into_words=True, max_length=512)

{'input_ids': [260, 265], 'token_type_ids': [0, 0], 'attention_mask': [1, 1]}

In [19]:
#função que ajusta os labels para o tamanho dos textos após tokenização do BERT
#necessário pois palavras podem ser subdivididas com ##
def tokenize_and_align_labels(dataset_unaligned, label_all_tokens = False):
    tokenized_inputs = tokenizer(dataset_unaligned["tokens"], truncation=True, is_split_into_words=True, max_length=512)
    #print(tokenized_inputs)
    labels = []
    for i, label in enumerate(dataset_unaligned[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # None é o valor para tokens especiais -> -100 para ignorar na função de custo
            if word_idx is None: #special tokens
                label_ids.append(-100)
            #palavra nova
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            #label_all_tokens = True -> mesma tag para todos os subtokens
            #label_all_tokens = False -> apenas primeiro subtoken ganha tag
            else: #subpalavra
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [20]:
#função que ajusta os labels para o tamanho dos textos após tokenização do BERT
#necessário pois palavras podem ser subdivididas com ##
def tokenize_and_align_labels_python_tokenizer(dataset_unaligned, label_all_tokens = False):    
    tokenized_inputs = tokenizer(dataset_unaligned["tokens"], truncation=True, is_split_into_words=True, max_length=512)
    special_tokens = [tokenized_inputs['input_ids'][0][0], tokenized_inputs['input_ids'][0][-1]]
    
    word_ids_batch = []
    for tokens in dataset_unaligned["tokens"]:
        word_ids_batch.append([None])
        tokens_len = 2
        for word_idx, token in enumerate(tokens):
            token_processed = tokenizer(token, add_special_tokens=False, truncation=True, is_split_into_words=True, max_length=512)
            for i in range(len(token_processed['input_ids'])):
                tokens_len += 1
                if tokens_len <= 512:
                    word_ids_batch[-1].append(word_idx)
        word_ids_batch[-1].append(None)
    #print(tokenized_inputs)
    labels = []
    for i, label in enumerate(dataset_unaligned[f"ner_tags"]):
        word_ids = word_ids_batch[i]
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # None é o valor para tokens especiais -> -100 para ignorar na função de custo
            if word_idx is None: #special tokens
                label_ids.append(-100)
            #palavra nova
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            #label_all_tokens = True -> mesma tag para todos os subtokens
            #label_all_tokens = False -> apenas primeiro subtoken ganha tag
            else: #subpalavra
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [21]:
tokenized_datasets = dataset.map(tokenize_and_align_labels_python_tokenizer, batched=True)
tokenized_datasets

100%|██████████| 23/23 [00:49<00:00,  2.14s/ba]
100%|██████████| 1/1 [00:01<00:00,  1.13s/ba]
100%|██████████| 1/1 [00:01<00:00,  1.12s/ba]


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 22672
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 170
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 170
    })
})

In [22]:
from datasets import load_metric
metric = load_metric("seqeval")

In [23]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [24]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [25]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(modelo, num_labels=len(label_list), use_auth_token=use_auth_token)

Some weights of the model checkpoint at microsoft/mdeberta-v3-base were not used when initializing DebertaV2ForTokenClassification: ['mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'deberta.embeddings.position_embeddings.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a B

In [26]:
per_device_batch_size = 8
gradient_accumulation_steps = 1

total_steps_epoch = len(dataset['train']) // (per_device_batch_size * gradient_accumulation_steps)

learning_rate = 4e-5 
num_train_epochs = round(50000/total_steps_epoch)  
weight_decay = 0.01
warmup_ratio=0.1                         #  primeiros 10% --> Artigo Souza 2019

save_total_limit = 3
logging_steps = total_steps_epoch#(num_train_epochs*total_steps_epoch) // 20 # 20x por treinamento
eval_steps = logging_steps
evaluation_strategy = 'steps'
logging_strategy = 'steps'
save_strategy = 'steps'
save_steps = logging_steps
load_best_model_at_end = True

fp16 = False

# folders

folder_model = 'e' + str(num_train_epochs) + '_lr' + str(learning_rate)
output_dir = model_dir + 'results'
logging_dir = model_dir + 'results'
# get best model through a metric
metric_for_best_model = 'eval_f1'
if metric_for_best_model == 'eval_f1':
    greater_is_better = True
elif metric_for_best_model == 'eval_loss':
    greater_is_better = False  

args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=learning_rate,
    per_device_train_batch_size=per_device_batch_size,
    per_device_eval_batch_size=per_device_batch_size*2,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    warmup_ratio=warmup_ratio,
    save_total_limit=save_total_limit,
    logging_steps = logging_steps,
    eval_steps = logging_steps,
    load_best_model_at_end = load_best_model_at_end,
    metric_for_best_model = metric_for_best_model,
    greater_is_better = greater_is_better,
    gradient_checkpointing = False,
    do_train = True,
    do_eval = True,
    do_predict = True,
    evaluation_strategy = evaluation_strategy,
    logging_dir=logging_dir, 
    logging_strategy = logging_strategy,
    save_strategy = save_strategy,
    save_steps = save_steps,
    fp16 = fp16,
    push_to_hub=False,
)

In [27]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [28]:
from transformers.trainer_callback import EarlyStoppingCallback

# wait early_stopping_patience x eval_steps before to stop the training in order to get a better model
early_stopping_patience = save_total_limit

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)],
)

In [29]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DebertaV2ForTokenClassification.forward` and have been ignored: ner_tags, id, tokens. If ner_tags, id, tokens are not expected by `DebertaV2ForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 22672
  Num Epochs = 18
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 51012
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
2834,0.2488,0.070152,0.7312,0.756623,0.743694,0.974129
5668,0.1468,0.04298,0.813772,0.860927,0.836685,0.983015
8502,0.1185,0.044817,0.758486,0.961921,0.848175,0.982543
11336,0.0914,0.049755,0.783265,0.945364,0.856714,0.98388
14170,0.0704,0.044886,0.781944,0.932119,0.850453,0.983251
17004,0.0537,0.052806,0.788515,0.932119,0.854325,0.983801
19838,0.0422,0.046988,0.791086,0.940397,0.859304,0.98443
22672,0.0327,0.054973,0.795164,0.925497,0.855394,0.984116
25506,0.025,0.062674,0.811437,0.869205,0.839329,0.983329
28340,0.0188,0.068446,0.785311,0.92053,0.847561,0.983172


The following columns in the evaluation set  don't have a corresponding argument in `DebertaV2ForTokenClassification.forward` and have been ignored: ner_tags, id, tokens. If ner_tags, id, tokens are not expected by `DebertaV2ForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 170
  Batch size = 16
Saving model checkpoint to ./ATE/expreriments/mdeberta-v3-base-huggingface-more-training/results/checkpoint-2834
Configuration saved in ./ATE/expreriments/mdeberta-v3-base-huggingface-more-training/results/checkpoint-2834/config.json
Model weights saved in ./ATE/expreriments/mdeberta-v3-base-huggingface-more-training/results/checkpoint-2834/pytorch_model.bin
tokenizer config file saved in ./ATE/expreriments/mdeberta-v3-base-huggingface-more-training/results/checkpoint-2834/tokenizer_config.json
Special tokens file saved in ./ATE/expreriments/mdeberta-v3-base-huggingface-more-training/results/checkpoint-2834/special_tokens_map.j

TrainOutput(global_step=28340, training_loss=0.0848295199677569, metrics={'train_runtime': 2187.7113, 'train_samples_per_second': 186.54, 'train_steps_per_second': 23.318, 'total_flos': 1.0609744219205472e+16, 'train_loss': 0.0848295199677569, 'epoch': 10.0})

In [30]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `DebertaV2ForTokenClassification.forward` and have been ignored: ner_tags, id, tokens. If ner_tags, id, tokens are not expected by `DebertaV2ForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 170
  Batch size = 16


{'eval_loss': 0.04698812589049339,
 'eval_precision': 0.7910863509749304,
 'eval_recall': 0.9403973509933775,
 'eval_f1': 0.8593040847201211,
 'eval_accuracy': 0.9844302901627743,
 'eval_runtime': 0.706,
 'eval_samples_per_second': 240.8,
 'eval_steps_per_second': 15.581,
 'epoch': 10.0}

In [31]:
predictions, labels, _ = trainer.predict(tokenized_datasets["validation"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set  don't have a corresponding argument in `DebertaV2ForTokenClassification.forward` and have been ignored: ner_tags, id, tokens. If ner_tags, id, tokens are not expected by `DebertaV2ForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 170
  Batch size = 16


{'ASPECT': {'precision': 0.7910863509749304,
  'recall': 0.9403973509933775,
  'f1': 0.8593040847201211,
  'number': 604},
 'overall_precision': 0.7910863509749304,
 'overall_recall': 0.9403973509933775,
 'overall_f1': 0.8593040847201211,
 'overall_accuracy': 0.9844302901627743}

In [32]:
trainer.save_model(model_dir)

Saving model checkpoint to ./ATE/expreriments/mdeberta-v3-base-huggingface-more-training/
Configuration saved in ./ATE/expreriments/mdeberta-v3-base-huggingface-more-training/config.json
Model weights saved in ./ATE/expreriments/mdeberta-v3-base-huggingface-more-training/pytorch_model.bin
tokenizer config file saved in ./ATE/expreriments/mdeberta-v3-base-huggingface-more-training/tokenizer_config.json
Special tokens file saved in ./ATE/expreriments/mdeberta-v3-base-huggingface-more-training/special_tokens_map.json
added tokens file saved in ./ATE/expreriments/mdeberta-v3-base-huggingface-more-training/added_tokens.json


In [33]:
import pandas as pd
resultado_eval = pd.DataFrame(results)
resultado_eval.to_csv(model_dir+'metricas_validation.csv')
resultado_eval

Unnamed: 0,ASPECT,overall_precision,overall_recall,overall_f1,overall_accuracy
f1,0.859304,0.791086,0.940397,0.859304,0.98443
number,604.0,0.791086,0.940397,0.859304,0.98443
precision,0.791086,0.791086,0.940397,0.859304,0.98443
recall,0.940397,0.791086,0.940397,0.859304,0.98443


In [34]:
raw_predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(raw_predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set  don't have a corresponding argument in `DebertaV2ForTokenClassification.forward` and have been ignored: ner_tags, id, tokens. If ner_tags, id, tokens are not expected by `DebertaV2ForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 170
  Batch size = 16


{'ASPECT': {'precision': 0.7910863509749304,
  'recall': 0.9403973509933775,
  'f1': 0.8593040847201211,
  'number': 604},
 'overall_precision': 0.7910863509749304,
 'overall_recall': 0.9403973509933775,
 'overall_f1': 0.8593040847201211,
 'overall_accuracy': 0.9844302901627743}

In [35]:
resultado_teste = pd.DataFrame(results)
resultado_teste.to_csv(model_dir+'metricas_test.csv')
resultado_teste

Unnamed: 0,ASPECT,overall_precision,overall_recall,overall_f1,overall_accuracy
f1,0.859304,0.791086,0.940397,0.859304,0.98443
number,604.0,0.791086,0.940397,0.859304,0.98443
precision,0.791086,0.791086,0.940397,0.859304,0.98443
recall,0.940397,0.791086,0.940397,0.859304,0.98443


In [36]:
o_index = label_list.index('O')
preds_index = np.asarray([i for i in range(len(label_list)) if i != o_index])

y_pred = []
y_true = []

for sentence_raw_prediction, sentence_labels in zip(raw_predictions, true_labels):
    for raw_prediction, true_label in zip(sentence_raw_prediction, sentence_labels):
        if true_label.startswith('B'):
            best_pred_idx = np.argmax(raw_prediction[preds_index])
            best_pred = preds_index[best_pred_idx]
            pred_label = label_list[best_pred][2:]
            true_label = true_label[2:]
            y_pred.append(pred_label)
            y_true.append(true_label)

In [37]:
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

      ASPECT       1.00      1.00      1.00       604

    accuracy                           1.00       604
   macro avg       1.00      1.00      1.00       604
weighted avg       1.00      1.00      1.00       604

