# Evaluation
- This notebook contains code for:
    1.   Generating and saving predictions from a saved model
    2.   Performing seqeval on predicted vs. true labels

- Models and Predictions: [/data/predictions/](https://drive.google.com/drive/folders/1qBLSgm2EdjuZ3FAFNRrPQ8_C0VrIjnls)

- Key Functions:
    - **select_data** - select original dataset for labels
    - **create_dataset** - create dataset for model predictions
    - **create_xyz_model** - create lf model to reload model to generate predictions
    - **convert_ids_to_labels** - converts integers to class labels and generates y_true, y_pred

In [1]:
!pip install -q datasets
!pip install -q evaluate
!pip install -q seqeval

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# generic
import numpy as np
from itertools import zip_longest

# ml
from transformers import LongformerForTokenClassification, DataCollatorForTokenClassification, LongformerTokenizerFast, Trainer, TrainingArguments
from datasets import load_from_disk, Dataset
import tensorflow as tf
from tensorflow import keras
import evaluate
from seqeval.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score

In [3]:
# use for vertex ai / google cloud
# from google.cloud import storage

# client = storage.Client()
# bucket_name = 'w266-project'
# bucket = client.get_bucket(bucket_name)
# path = f'gs://{bucket_name}'

# use for google colab
from google.colab import drive

drive.mount('/content/drive')
path = '/content/drive/MyDrive/Colab Notebooks/DATASCI 266/266 project'

Mounted at /content/drive


In [4]:
# general functions
def select_data(split, task, size):
    """
    Loads the appropriate dataset per folder structure here: https://drive.google.com/drive/folders/1C3h3rXdbr9nVAC3_G_I-72DfKNiDU_Pa
    Input:
        Split: ['train', 'val', 'test']
        Task: ['ner', 'mask', 'both']
        Size: ['testing', 'mini', 'full']
    Returns:
        Huggingface dataset
    """
    if split not in ['train', 'val', 'test']:
        raise ValueError("Split value must be in ['train', 'val', 'test']")
    if task not in ['ner', 'mask', 'both', 'binary']:
        raise ValueError("Task value must be in ['ner', 'mask', 'both']")
    if size not in ['testing', 'mini', 'full']:
        raise ValueError("Size value must be in ['testing', 'mini', 'full']")

    path_label = {'both': 'longformer', 'ner': 'longformer_ner', 'mask': 'longformer_mask', 'binary': 'longformer_binary'}
    # path_label = {'both': 'longformer', 'ner': 'longformer_ner', 'mask': 'longformer_4096'}

    if size == 'testing':
        ds = load_from_disk(f'{path}/data/tab/{path_label[task]}/lf_{split}_testing')
    if size == 'mini':
        if split == 'train':
            ds = load_from_disk(f'{path}/data/tab/{path_label[task]}/lf_{split}_400')
        else:
            ds = load_from_disk(f'{path}/data/tab/{path_label[task]}/lf_{split}_50')
    if size == 'full':
        ds = load_from_disk(f'{path}/data/tab/{path_label[task]}/lf_{split}')

    return ds

def create_dataset(split, task, size):
    """Creates appropriate dataset depending on training objective.
    Input:
        dataset = use load_from_disk(<path>)
    Output:
        returns dataset for training
    """

    if task == 'both':
        labels = ['ner_labels', 'mask_labels']
    else:
        labels = ['labels']

    ds = select_data(split=split, task=task, size=size)

    data_collator = DataCollatorForTokenClassification(tokenizer, max_length=4096, return_tensors='np')

    data_set = ds['train'].to_tf_dataset(
        columns=['input_ids', 'attention_mask'],
        label_cols=labels,
        shuffle=True,
        batch_size=16,
        collate_fn=data_collator
    )

    return data_set

# Model and Predictions
- use if predictions are not already saved from prior steps

## Functions

In [5]:
# def create_lf_single_class_model(task, model_checkpoint=model_checkpoint, max_sequence_length=4096, learning_rate=0.01):
#     if task =='ner':
#         classes = len(['O', 'B-PERSON', 'I-PERSON', 'B-CODE', 'I-CODE', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG',
#                 'B-DEM', 'I-DEM', 'B-DATETIME', 'I-DATETIME', 'B-QUANTITY', 'I-QUANTITY', 'B-MISC', 'I-MISC'])
#     if task == 'mask':
#         classes = len(['O', 'B-NO_MASK', 'I-NO_MASK', 'B-DIRECT', 'I-DIRECT', 'B-QUASI', 'I-QUASI'])

#     # begin longformer model
#     lf_model = TFLongformerModel.from_pretrained(model_checkpoint)
#     lf_model.trainable=False

#     input_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int32, name='input_ids')
#     attention_mask = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int32, name='attention_mask')
#     lf_inputs = {'input_ids': input_ids,
#                 'attention_mask': attention_mask}
#     lf_outputs = lf_model(lf_inputs)
#     embedding = lf_outputs['last_hidden_state']

#     # begin custom model
#     dropout1 = tf.keras.layers.Dropout(0.3)(embedding)
#     classification_layer = tf.keras.layers.Dense(classes, activation='softmax', name='classification')(dropout1)

#     model = tf.keras.Model(
#         inputs=[input_ids, attention_mask],
#         outputs=[classification_layer]
#         )
#     model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
#                   loss=tf.keras.losses.SparseCategoricalCrossentropy(ignore_class=-100),
#                   metrics='accuracy')

#     return model

# metrics
def compute_metrics(p):
    seqeval = evaluate.load('seqeval')
    # accuracy = evaluate.load('accuracy')

    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    label_list = ['O', 'B-NO_MASK', 'I-NO_MASK', 'B-DIRECT', 'I-DIRECT', 'B-QUASI', 'I-QUASI']
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # flat_predictions = [p for seq in true_predictions for p in seq]
    # flat_labels = [l for seq in true_labels for l in seq]

    # accuracy = accuracy.compute(prediction=flat_predictions, references=flat_labels)
    results = seqeval.compute(predictions=true_predictions, references=true_labels, zero_division=1)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "seqeval_acc": results["overall_accuracy"],
    }

def count_trainable_parameters(model):
    # Get the trainable parameters of the model
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return trainable_params

# all ner_tags have weight of 1; O has half weight
ner_attention_weights_equal = {'O': 1,
                               'OTHER': 1.5}

# high direct vs high quasi
out = .5
direct = 1.5
quasi = 1
ner_attention_weights_weighted_1 = {'O': out,
                                    'B-PERSON': direct,
                                    'I-PERSON': direct,
                                    'B-CODE': direct,
                                    'I-CODE': direct,
                                    'B-LOC': quasi,
                                    'I-LOC': quasi,
                                    'B-ORG': quasi,
                                    'I-ORG': quasi,
                                    'B-DEM': quasi,
                                    'I-DEM': quasi,
                                    'B-DATETIME': quasi,
                                    'I-DATETIME': quasi,
                                    'B-QUANTITY': quasi,
                                    'I-QUANTITY': quasi,
                                    'B-MISC': quasi,
                                    'I-MISC': quasi}

# order of direct + quasi
ner_attention_weights_weighted_2 = {'O': out,
                                    'B-PERSON': 1.4,
                                    'I-PERSON': 1.4,
                                    'B-CODE': 1.4,
                                    'I-CODE': 1.4,
                                    'B-LOC': 1.2,
                                    'I-LOC': 1.2,
                                    'B-ORG': 1,
                                    'I-ORG': 1,
                                    'B-DEM': 1,
                                    'I-DEM': 1,
                                    'B-DATETIME': 1,
                                    'I-DATETIME': 1,
                                    'B-QUANTITY': 1.4,
                                    'I-QUANTITY': 1.4,
                                    'B-MISC': 1,
                                    'I-MISC': 1}

def update_attention_weights(dataset, weights):
    ner_classes = ['O', 'B-PERSON', 'I-PERSON', 'B-CODE', 'I-CODE', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG',
                'B-DEM', 'I-DEM', 'B-DATETIME', 'I-DATETIME', 'B-QUANTITY', 'I-QUANTITY', 'B-MISC', 'I-MISC']

    ner_weights = {}
    for i, ner in enumerate(ner_classes):
        ner_weights[i] = weights.get(ner, weights.get('OTHER', 0))

    ner_labels = dataset['train']['ner_labels']
    attention_mask = dataset['train']['attention_mask']
    mask_adjustments = []
    new_attention_masks = []
    for i, sample in enumerate(ner_labels):
        sample_adjust = []
        cls_count = 0
        for s in sample:
            if cls_count < 2:
                sample_adjust.append(1)
                cls_count += 1
            elif s < 0:
                sample_adjust.append(0)
            else:
                sample_adjust.append(ner_weights[s])
        mask_adjustments.append(sample_adjust)
        new_mask = list(np.array(attention_mask[i]) * np.array(sample_adjust))
        new_attention_masks.append(new_mask)

    return mask_adjustments, new_attention_masks

def create_dataset_attention(ds, attention_masks):
    new_ds_dict = {'id': ds['train']['id'],
          'input_ids': ds['train']['input_ids'],
          'attention_mask': attention_masks,
          'labels': ds['train']['mask_labels']}
    new_ds = Dataset.from_dict(new_ds_dict)

    return new_ds

## Prediction

In [6]:
# global variables
model_name = 'baseline_final_2.5e-5_linear_warmup_11_25' # update to select the right path
path_pred = path + '/models/' + model_name

task = 'mask'
size = 'mini'

In [7]:
# load model
model = LongformerForTokenClassification.from_pretrained(f'{path}/models/{model_name}/model')


training_args = TrainingArguments(
    output_dir='./results',
    report_to='none'
)

trainer = Trainer(model=model,
                  args=training_args,
                  compute_metrics=compute_metrics)


In [None]:
ds_train = select_data(split='train', task=task, size=size)
trainer.evaluate(eval_dataset=ds_train['train'])
# trainer.evaluate(eval_dataset=ds_train) # for binary

{'eval_loss': 0.12253212183713913,
 'eval_model_preparation_time': 0.0062,
 'eval_precision': 0.75842718371454,
 'eval_recall': 0.7875974600112531,
 'eval_f1': 0.7727371305770785,
 'eval_seqeval_acc': 0.9584023745591689,
 'eval_runtime': 179.0414,
 'eval_samples_per_second': 2.234,
 'eval_steps_per_second': 0.279}

In [None]:
ds_test = select_data(split='test', task='both', size=size)

# test
mask_adjustments, new_attention_masks = update_attention_weights(ds_test, ner_attention_weights_equal)
new_ds_test = create_dataset_attention(ds_test, new_attention_masks)

In [None]:
ds_test = select_data(split='test', task=task, size=size)
# trainer.evaluate(eval_dataset=ds_test['train'])
trainer.evaluate(eval_dataset=ds_test)

{'eval_loss': 0.5057532787322998,
 'eval_model_preparation_time': 0.0062,
 'eval_precision': 0.3996296296296296,
 'eval_recall': 0.3547008547008547,
 'eval_f1': 0.3758272378962034,
 'eval_seqeval_acc': 0.858118165683861,
 'eval_runtime': 48.1402,
 'eval_samples_per_second': 1.039,
 'eval_steps_per_second': 0.145}

In [None]:
# predictions, labels, metrics = trainer.predict(ds_test)
# np.save(f'{path}/models/{model_name}/predictions.npy', predictions)
# np.save(f'{path}/models/{model_name}/labels.npy', labels)

# Evaluation

## Functions

In [None]:
def convert_ids_to_labels(pred, true, task):
    """
    Retrieves label prediction from raw predictions then generates y_pred, y_true for seqeval. Converts
    integers into class labels.

    Input:
        pred = raw predictions from model
        true = original labels from dataset
    Output:
        y_pred
        y_true
    """
    if task == 'ner':
        labels = ['O', 'B-PERSON', 'I-PERSON', 'B-CODE', 'I-CODE', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG',
        'B-DEM', 'I-DEM', 'B-DATETIME', 'I-DATETIME', 'B-QUANTITY', 'I-QUANTITY', 'B-MISC', 'I-MISC']
    if task == 'mask':
        labels = ['O', 'B-NO_MASK', 'I-NO_MASK', 'B-DIRECT', 'I-DIRECT', 'B-QUASI', 'I-QUASI']

    # create y_pred
    y_pred = [np.argmax(p, axis=1) for p in pred]
    y_pred = [[labels[x] for x in p] for p in y_pred]

    # create y_true
    y_true = [[0 if x == -100 else x for x in sample] for sample in true]
    y_true = [[labels[x] for x in p] for p in y_true]

    return y_pred, y_true

## Seqeval

In [None]:
predictions = np.load(f'{path_pred}/predictions.npy')
ds = select_data(split='test', task=task, size=size)
# true_labels = ds['train']['labels']
true_labels = ds['labels']

y_pred, y_true = convert_ids_to_labels(predictions, true_labels, task='mask')
print('y_pred', [len(i) for i in y_pred])
print('y_true', [len(i) for i in y_true])

y_pred [4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096]
y_true [4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096]


In [None]:
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=1)
recall = recall_score(y_true, y_pred, zero_division=1)
f1 = f1_score(y_true, y_pred)

print('accuracy:', precision)
print('precision:', precision)
print('recall:', recall)
print('f1 score:', f1)

report = classification_report(y_true, y_pred)
print(report)

accuracy: 0.7393155395447847
precision: 0.7393155395447847
recall: 0.7421104536489151
f1 score: 0.7407103601017143
              precision    recall  f1-score   support

      DIRECT       0.76      0.85      0.81      4464
     NO_MASK       0.63      0.44      0.52      1620

   micro avg       0.74      0.74      0.74      6084
   macro avg       0.70      0.64      0.66      6084
weighted avg       0.73      0.74      0.73      6084



## Visualizer

In [20]:
# def convert_ids_to_labels_and_adjust_length(pred, true, task):
#     """
#     Retrieves label prediction from raw predictions then generates y_pred, y_true for seqeval. Converts
#     integers into class labels. Seqeval requires inputs to be the same shape but automatically ignores
#     'O' labels.

#     Input:
#         pred = raw predictions from model
#         true = original labels from dataset
#     Output:
#         y_pred
#         y_true
#     """
#     if task == 'ner':
#         labels = ['O', 'B-PERSON', 'I-PERSON', 'B-CODE', 'I-CODE', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG',
#         'B-DEM', 'I-DEM', 'B-DATETIME', 'I-DATETIME', 'B-QUANTITY', 'I-QUANTITY', 'B-MISC', 'I-MISC']
#     if task == 'mask':
#         labels = ['O', 'B-NO_MASK', 'I-NO_MASK', 'B-DIRECT', 'I-DIRECT', 'B-QUASI', 'I-QUASI']

#     pred = [np.argmax(p, axis=1) for p in pred]
#     y_true = []
#     y_pred = []
#     for idx, sample in enumerate(true):
#         true_sample = []
#         pred_sample = []
#         for i, x in enumerate(sample):
#             if x >= 0:
#                 true_sample.append(labels[x])
#                 pred_sample.append(labels[pred[idx][i]])
#         y_true.append(true_sample)
#         y_pred.append(pred_sample)

#     return y_pred, y_true

# predictions = np.load(f'{path_pred}/predictions.npy')
# ds = select_data(split='test', task=task, size=size)
# true_labels = ds['train']['labels']

# y_pred_adjusted, y_true_adjusted = convert_ids_to_labels_and_adjust_length(predictions, true_labels, task='mask')
# print('y_pred', [len(i) for i in y_pred_adjusted])
# print('y_true', [len(i) for i in y_true_adjusted])

def eval_visualizer(ds, pred, sample_idx, tokenizer, task):
    """
    Retrieves label prediction from raw predictions then generates y_pred, y_true for seqeval. Converts
    integers into class labels. Seqeval requires inputs to be the same shape but automatically ignores
    'O' labels.

    Input:
        pred = raw predictions from model
        true = original labels from dataset
    Output:
        y_pred
        y_true
    """
    if task == 'ner':
        labels = ['O', 'B-PERSON', 'I-PERSON', 'B-CODE', 'I-CODE', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG',
        'B-DEM', 'I-DEM', 'B-DATETIME', 'I-DATETIME', 'B-QUANTITY', 'I-QUANTITY', 'B-MISC', 'I-MISC']
    if task == 'mask':
        labels = ['O', 'B-NO_MASK', 'I-NO_MASK', 'B-DIRECT', 'I-DIRECT', 'B-QUASI', 'I-QUASI']

    pred_sample = np.argmax(pred[sample_idx], axis=1)
    true_label = ds['train'][sample_idx]['labels']
    input_ids = ds['train'][sample_idx]['input_ids']
    y_true = []
    y_pred = []
    for i, x in enumerate(true_label):
        if x >=0:
            y_true.append(labels[x])
            y_pred.append(labels[pred_sample[i]])

    tokens = tokenizer.convert_ids_to_tokens(input_ids, skip_special_tokens=True)

    print("\nToken-wise Visualization:")
    print(f"{'Token':<15}{'Label':<15}{'Prediction':<15}")
    for i in range(len(tokens)):
        print(f'{tokens[i]:<15}{y_true[i]:<15}{y_pred[i]:<15}')

In [17]:
model_checkpoint = 'allenai/longformer-base-4096'
tokenizer = LongformerTokenizerFast.from_pretrained(model_checkpoint, add_prefix_space=True)
predictions = np.load(f'{path_pred}/predictions.npy')
ds = select_data(split='test', task=task, size=size)

In [48]:
sample_idx = 9
print(f"doc_id: {ds['train'][sample_idx]['id']}")
tokenizer.decode(ds['train'][sample_idx]['input_ids'], skip_special_tokens=True)

doc_id: 001-68512


' PROCEDURE \n\n The case originated in an application ( no . 47328/99 ) against the Republic of Turkey lodged with the Court under Article 34 of the Convention for the Protection of Human Rights and Fundamental Freedoms ( “ the Convention ” ) by a Turkish national , Mr Nurettin Şirin ( “ the applicant ” ) , on 15 March 1999 . \n\n The applicant was represented by Mr M. Arani , a lawyer practising in Middlesex ( United Kingdom ) . The Turkish Government ( “ the Government ” ) did not designate an Agent for the purpose of the proceedings before the Court . \n\n On 27 April 2004 the Court declared the application partly inadmissible and decided to communicate the complaints concerning the applicant ’s right to a fair trial by an independent and impartial tribunal and the failure to notify the applicant of the public prosecutor ’s submissions on his appeal to the Government . Under the provisions of Article 29 § 3 of the Convention , it decided to examine the merits of the application at 

In [49]:
eval_visualizer(ds=ds, pred=predictions, sample_idx=sample_idx, tokenizer=tokenizer, task=task)


Token-wise Visualization:
Token          Label          Prediction     
ĠPROC          O              O              
ED             O              O              
URE            O              O              
Ġ              O              O              
ĊĊ             O              O              
ĠThe           O              O              
Ġcase          O              O              
Ġoriginated    O              O              
Ġin            O              O              
Ġan            O              O              
Ġapplication   O              O              
Ġ(             O              O              
Ġno            O              O              
Ġ.             O              O              
Ġ4             B-DIRECT       B-DIRECT       
73             B-DIRECT       B-DIRECT       
28             B-DIRECT       B-DIRECT       
/              B-DIRECT       B-DIRECT       
99             B-DIRECT       B-DIRECT       
Ġ)             O              O              
Ġagains