In [None]:
try:
  from google import colab
except:
  ON_COLAB = False
else:
  ON_COLAB = True

if ON_COLAB:
  !wget -O annotations.jsonl -q https://www.dropbox.com/scl/fi/7hlxe68s6ge6qxpuihaod/annotations.jsonl?rlkey=6e8e9q7pb7gh4xla5j3gx4n6w&dl=1
  !wget -O utils.zip -q https://www.dropbox.com/scl/fi/0p9j82a13aaizrswrch4q/utils.zip?rlkey=6mm2cdsg7u0xzsrlgx307muz8&dl=1
  !unzip -qq -o utils.zip -d utils
  !rm -r utils.zip

In [None]:
if ON_COLAB:
    import os
    from google.colab import drive

    # make Google Drive available under /content/drive
    drive.mount('/content/drive')

    # continue inside the project folder on Drive, creating it on first use
    wd = '/content/drive/My Drive/papers/group_mentions_galtan/'
    os.makedirs(wd, exist_ok=True)
    os.chdir(wd)

Mounted at /content/drive


In [None]:
from types import SimpleNamespace

# Collect all run settings on one namespace object.
args = SimpleNamespace(
    # pre-trained model to fine-tune
    model_name='roberta-base',

    # input data: the downloaded copy on Colab, the repository copy otherwise
    data_file=(
        '/content/annotations.jsonl'
        if ON_COLAB else
        '../../data/annotations/group-mention-annotation-batch-01/reviewed_annotations.jsonl'
    ),
    # comma-separated type names, stored as a list of stripped strings
    types=[t.strip() for t in 'social group,organizational group'.split(',')],

    # where results are written
    experiment_name='group-mention-detection_batch-01',
    experiment_results_path='./results' if ON_COLAB else './../../results/classifiers',

    # data splitting and seeding
    test_size=0.15,
    dev_size=0.15,
    seed=1234,

    # model selection metric and training hyperparameters
    metric='seqeval-macro_f1',
    epochs=10,
    learning_rate=4e-5,
    train_batch_size=16,
    gradient_accumulation_steps=2,
    eval_batch_size=64,
    weight_decay=0.3,
)

In [None]:
if ON_COLAB:
  !pip -q install datasets==2.20.0 accelerate==0.31.0 seqeval==1.2.2
  from nltk import download as nltk_download; nltk_download('punkt')

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━[0m [32m430.1/547.8 kB[0m [31m12.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 k

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import os
import shutil
import json
import numpy as np
from utils.io import read_jsonlines

import torch
import transformers
from transformers import (
    AutoTokenizer,
    DataCollatorForTokenClassification,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    set_seed,
)
from transformers.utils import logging
# logging.set_verbosity_error()

from utils.corpus import format_doccano_annotations, LabeledText, LabeledSequence
from utils.dataset import create_token_classification_dataset, tokenize_and_align_sequence_labels, split_dataset
from utils.trainer import WriteValidationResultsCallback
from utils.metrics import compute_sequence_metrics

#### Device

When CUDA is available, we want to use it.
Alternatively, we use the GPU of Apple-silicon Macs (e.g., M1) if available (i.e., 'mps', see [here](https://sebastianraschka.com/blog/2022/pytorch-m1-gpu.html)).
The fallback option is CPU processing.

In [None]:
# Pick the compute device in order of preference: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
device

'cuda:0'

#### Reproducibility

In [None]:
# Seed the random number generators (via transformers' `set_seed`) with the configured seed.
set_seed(args.seed)

#### Tokenizer

In [None]:
# Load the fast tokenizer that belongs to the configured model.
# NOTE(review): `add_prefix_space=True` is presumably required because the texts are fed to the
# tokenizer pre-split into words -- confirm against `tokenize_and_align_sequence_labels`.
tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=True, add_prefix_space=True)
# fail right away if a slow tokenizer was returned instead of a fast one
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

### Load and prepare the data

In [None]:
# Load the annotation records from `args.data_file`.
data = read_jsonlines(args.data_file)

In [None]:
# Turn every record into a `LabeledText`, dropping annotations whose label (third element) is 'unsure'.
corpus = [
    LabeledText.from_dict(dict(
        id=record['id'],
        text=record['text'],
        annotations=format_doccano_annotations(
            [span for span in record['label'] if span[2] != 'unsure']
        ),
    ))
    for record in data
]

In [None]:
# Sanity check: print one labeled document.
print(corpus[12])

[1m11110_199109-390960[0m: "Therefore, we oppose the despolitisation of [43m[1msociety[0m[43m [social group][49m by [43m[1mmultinational corporations[0m[43m [organizational group][49m and large capital interests, as well as the centralization of the remaining political power and the transfer of it to [43m[1mparty leaders or officials in Brussels[0m[43m [social group][49m."


In [None]:
# Collect the entity types that occur in the corpus.
# They are sorted because a `set` of strings has no stable iteration order across interpreter
# runs; without sorting, the label ids below could change from one run to the next.
types = sorted({entity.type for doc in corpus for entity in doc.entities})
# Tag scheme: 'O' first, then one 'I-' tag per type, then one 'B-' tag per type.
scheme = ['O'] + ['I-'+t for t in types] + ['B-'+t for t in types]
# Mappings between tag strings and integer label ids (the id is the position in `scheme`).
label2id = {l: i for i, l in enumerate(scheme)}
id2label = {i: l for i, l in enumerate(scheme)}
NUM_LABELS = len(label2id)

In [None]:
# Convert each `LabeledText` with `LabeledSequence.from_labeled_text` (using the `label2id` mapping)
# and keep its `to_dict()` form.
# NOTE: this rebinds `corpus`, so the cell cannot be re-run on its own -- re-run the cell that
# builds `corpus` from `data` first.
corpus = [LabeledSequence.from_labeled_text(doc, label2id).to_dict() for doc in corpus]

In [None]:
# Build the dataset from the converted corpus; the labels are read from the 'annotations' field.
dataset = create_token_classification_dataset(corpus, labels_field='annotations')

In [None]:
# Tokenize in batches and align the labels with the resulting tokens;
# the tokenizer is handed to the mapped function as a keyword argument.
dataset = dataset.map(
    tokenize_and_align_sequence_labels,
    fn_kwargs={'tokenizer': tokenizer},
    batched=True,
)

Map:   0%|          | 0/4315 [00:00<?, ? examples/s]

In [None]:
# Sanity check: list label id and decoded token for one example, stopping at the first padding token.
example = dataset[12]
pad_id = tokenizer.pad_token_id
for token_id, label_id in zip(example['input_ids'], example['labels']):
    if token_id == pad_id:
        break
    print(label_id, '\t', tokenizer.decode(token_id))

-100 	 <s>
0 	  Therefore
0 	 ,
0 	  we
0 	  oppose
0 	  the
0 	  desp
-100 	 olit
-100 	 isation
0 	  of
4 	  society
0 	  by
3 	  multinational
1 	  corporations
0 	  and
0 	  large
0 	  capital
0 	  interests
0 	 ,
0 	  as
0 	  well
0 	  as
0 	  the
0 	  central
-100 	 ization
0 	  of
0 	  the
0 	  remaining
0 	  political
0 	  power
0 	  and
0 	  the
0 	  transfer
0 	  of
0 	  it
0 	  to
4 	  party
2 	  leaders
2 	  or
2 	  officials
2 	  in
2 	  Brussels
0 	 .
-100 	 </s>


In [None]:
# The 'tokens' column is not needed from here on, so there is no need to load it to the GPU.
dataset = dataset.remove_columns(['tokens']) # no need to load these to the GPU

In [None]:
# Split into train/dev/test using the configured test and dev sizes.
# NOTE(review): `args.seed` is not passed here -- presumably the split relies on the seed set
# earlier via `set_seed`; confirm against `split_dataset`.
dataset = split_dataset(dataset, test_size=args.test_size, dev_size=args.dev_size)
# show the number of rows per split
dataset.num_rows

{'train': 3021, 'dev': 647, 'test': 647}

In [None]:
from transformers import DataCollatorForTokenClassification, PreTrainedTokenizer
from typing import Union, Callable
from datasets import Dataset

def get_best_checkpoint(log_history, metric):
    """
    Return the folder name of the checkpoint with the highest evaluation score.

    Args:
        log_history: list of log records (dicts). Evaluation records are recognized
            by an 'eval_loss' key and must also carry 'step' and 'eval_<metric>'.
        metric: metric name without the 'eval_' prefix; higher values count as better.

    Returns:
        'checkpoint-<step>' for the best evaluation record (the earliest one on ties).

    Raises:
        ValueError: if `log_history` contains no evaluation record.
    """
    eval_records = [res for res in log_history if 'eval_loss' in res]
    if not eval_records:
        raise ValueError('log_history contains no evaluation records')
    # Take the step from the winning evaluation record itself. Indexing into the
    # non-evaluation records is only right when both kinds of records happen to line up one-to-one.
    best_ = max(eval_records, key=lambda res: res['eval_'+metric])
    return 'checkpoint-'+str(best_['step'])

def train_and_test(
    experiment_name: str,
    experiment_results_path: str,
    run_id: Union[None, str],
    model_init: Callable,
    tokenizer: PreTrainedTokenizer,
    data_collator: DataCollatorForTokenClassification,
    train_dat: Dataset,
    dev_dat: Union[None, Dataset],
    test_dat: Union[None, Dataset],
    compute_metrics: Callable,
    metric: str,
    epochs: int = TrainingArguments.num_train_epochs,
    learning_rate: float = TrainingArguments.learning_rate,
    train_batch_size: int = TrainingArguments.per_device_train_batch_size,
    gradient_accumulation_steps: int = TrainingArguments.gradient_accumulation_steps,
    eval_batch_size: int = TrainingArguments.per_device_eval_batch_size,
    weight_decay: float = TrainingArguments.weight_decay,
    early_stopping: bool = True,
    early_stopping_patience: int = 3,
    early_stopping_threshold: float = 0.03,
    seed: int = 42,
):
    """
    Train a token classification model and, optionally, evaluate it on a test set.

    The final model and the tokenizer are saved to
    `<experiment_results_path>/<experiment_name>/[<run_id>-]best_model`. The
    'checkpoints' and 'logs' folders created during training are deleted before returning.

    Args:
        experiment_name: name of the sub-folder (inside `experiment_results_path`) for all outputs.
        experiment_results_path: parent folder for experiment results.
        run_id: optional prefix for the names of the written files and the model folder.
        model_init: zero-argument callable returning a fresh model; passed on to `Trainer`.
        tokenizer: tokenizer passed to `Trainer` and saved next to the model.
        data_collator: collator passed to `Trainer`.
        train_dat: training data.
        dev_dat: optional dev data. Per-epoch evaluation, early stopping, limiting the number
            of kept checkpoints, loading the best model at the end and writing dev results
            are only enabled if it is given and non-empty.
        test_dat: optional test data; if given and non-empty, the final model is evaluated on it
            and the result is written to `[<run_id>-]test_results.json`.
        compute_metrics: metric function passed to `Trainer`.
        metric: name of the metric used to select the best model.
        epochs, learning_rate, train_batch_size, gradient_accumulation_steps, eval_batch_size,
            weight_decay: training hyperparameters forwarded to `TrainingArguments`
            (defaulting to the corresponding `TrainingArguments` class attributes).
        early_stopping: whether to add an `EarlyStoppingCallback`.
        early_stopping_patience: patience passed to the early stopping callback.
        early_stopping_threshold: threshold passed to the early stopping callback.
        seed: used as both `seed` and `data_seed` of `TrainingArguments`.

    Returns:
        Tuple `(trainer, dest, res)`: the `Trainer`, the folder the final model was saved to,
        and the test metrics as returned by `trainer.evaluate` (`None` without test data).

    Raises:
        ValueError: if `early_stopping` is requested without a (non-empty) dev set.
    """
    # An empty data set is treated like a missing one; all dev/test-dependent switches below
    # use these two flags so that they cannot disagree with each other.
    has_dev = dev_dat is not None and len(dev_dat) > 0
    has_test = test_dat is not None and len(test_dat) > 0

    # fail before anything is written to disk
    if early_stopping and not has_dev:
        raise ValueError('Early stopping requires a dev data set')

    results_path = os.path.join(experiment_results_path, experiment_name)
    os.makedirs(results_path, exist_ok=True)

    output_path = os.path.join(results_path, 'checkpoints')
    logs_path = os.path.join(results_path, 'logs')

    # note: the following training options depend on the availability of a dev set and are disabled if none is provided
    #  - evaluating after each epoch
    #  - early stopping
    #  - saving at most 2 models during training
    #  - loading the best model at the end
    #  - saving the dev results

    training_args = TrainingArguments(
        # hyperparameters
        num_train_epochs=epochs,
        learning_rate=learning_rate,
        per_device_train_batch_size=train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        per_device_eval_batch_size=eval_batch_size,
        weight_decay=weight_decay,
        optim='adamw_torch',
        # how to select "best" model (only meaningful with a dev set)
        do_eval=has_dev,
        metric_for_best_model=metric if has_dev else None,
        load_best_model_at_end=has_dev,
        # when to evaluate
        # NOTE(review): newer transformers releases call this argument `eval_strategy` -- confirm
        # against the installed version before upgrading.
        evaluation_strategy='epoch' if has_dev else 'no',
        # when to save
        save_strategy='epoch',
        save_total_limit=2 if has_dev else None, # don't save all model checkpoints
        # where to store results
        output_dir=output_path,
        overwrite_output_dir=True,
        # logging
        logging_dir=logs_path,
        logging_strategy='epoch',
        # reproducibility
        seed=seed,
        data_seed=seed,
        full_determinism=True
    )

    # build callbacks
    callbacks = []
    early_stopping_callback = None
    if early_stopping:
        early_stopping_callback = EarlyStoppingCallback(
            early_stopping_patience=early_stopping_patience,
            early_stopping_threshold=early_stopping_threshold,
        )
        callbacks.append(early_stopping_callback)
    if has_dev:
        fn = run_id+'-dev_results.jsonl' if run_id else 'dev_results.jsonl'
        fp = os.path.join(results_path, fn)
        callbacks.append(WriteValidationResultsCallback(path=fp))

    # train
    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train_dat,
        eval_dataset=dev_dat if has_dev else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=callbacks
    )
    print('Training ...')
    _ = trainer.train()

    # save the final model to the results folder
    # CAVEAT: this is not the "best" model if no dev_dat is provided
    dest = run_id+'-best_model' if run_id else 'best_model'
    dest = os.path.join(results_path, dest)
    if os.path.exists(dest):
        shutil.rmtree(dest)
    trainer.save_model(dest)
    # save tokenizer to best_model folder
    tokenizer.save_pretrained(dest)

    # evaluate
    if has_test:
        # The test metrics carry a 'test_' prefix, so the early stopping callback would not
        # find its 'eval_'-prefixed metric and would only emit a warning; detach it first.
        # NOTE(review): `WriteValidationResultsCallback` stays attached -- confirm that it
        # ignores evaluations that use the 'test' prefix.
        if early_stopping_callback is not None:
            trainer.remove_callback(early_stopping_callback)
        print('Evaluating ...')
        res = trainer.evaluate(test_dat, metric_key_prefix='test')
        print(res)
        fn = run_id+'-test_results.json' if run_id else 'test_results.json'
        fp = os.path.join(results_path, fn)
        with open(fp, 'w') as file:
            json.dump(res, file)
    else:
        res = None

    # finally: clean up
    if os.path.exists(output_path):
        # The final model has already been saved to `dest`, so the checkpoints can be deleted.
        # Without a dev set that model is the last training state, not a selected best one.
        shutil.rmtree(output_path)
    if os.path.exists(logs_path):
        shutil.rmtree(logs_path)

    return trainer, dest, res

In [None]:
def model_init():
    """Return a freshly loaded token classification model for the configured checkpoint and label set."""
    return AutoModelForTokenClassification.from_pretrained(
        args.model_name,
        num_labels=NUM_LABELS,
        id2label=id2label,
        label2id=label2id,
        device_map='auto',
    )

In [None]:
from utils.metrics import compute_sequence_metrics

def compute_metrics(p):
    """
    Compute the sequence labeling metrics for one evaluation pass.

    `p` unpacks into `(predictions, labels)`. The predictions are reduced to label ids
    with an argmax over axis 2 before being scored against the labels; the metrics are
    requested in flattened form.
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=2)
    return compute_sequence_metrics(
        y_true=gold,
        y_pred=predicted_ids,
        id2label=id2label,
        flatten_output=True,
    )

In [None]:
# Train on the train split, use the dev split during training, and score the test split.
# NOTE: `model` is bound to the returned `Trainer` and `checkpoint` to the folder the final model was saved to.
model, checkpoint, test_res = train_and_test(
    # where to write results
    experiment_name=args.experiment_name,
    experiment_results_path=args.experiment_results_path,
    run_id=None,
    # model and tokenization
    model_init=model_init,
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer),
    # data splits
    train_dat=dataset['train'],
    dev_dat=dataset['dev'],
    test_dat=dataset['test'],
    # evaluation
    compute_metrics=compute_metrics,
    metric=args.metric,
    # training hyperparameters
    epochs=args.epochs,
    learning_rate=args.learning_rate,
    train_batch_size=args.train_batch_size,
    gradient_accumulation_steps=args.gradient_accumulation_steps,
    eval_batch_size=args.eval_batch_size,
    weight_decay=args.weight_decay,
    seed=args.seed,
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training ...


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Seqeval-macro Precision,Seqeval-macro Recall,Seqeval-macro F1,Seqeval-micro Precision,Seqeval-micro Recall,Seqeval-micro F1,Seqeval-social group Precision,Seqeval-social group Recall,Seqeval-social group F1,Seqeval-organizational group Precision,Seqeval-organizational group Recall,Seqeval-organizational group F1,Spanlevel-macro Precision,Spanlevel-macro Recall,Spanlevel-macro F1-cross,Spanlevel-macro F1,Spanlevel-micro Precision,Spanlevel-micro Recall,Spanlevel-micro F1-cross,Spanlevel-micro F1,Spanlevel-social group Precision,Spanlevel-social group Recall,Spanlevel-social group F1-cross,Spanlevel-social group F1,Spanlevel-organizational group Precision,Spanlevel-organizational group Recall,Spanlevel-organizational group F1-cross,Spanlevel-organizational group F1,Doclevel-micro Precision,Doclevel-micro Recall,Doclevel-micro F1,Doclevel-social group Precision,Doclevel-social group Recall,Doclevel-social group F1,Doclevel-organizational group Precision,Doclevel-organizational group Recall,Doclevel-organizational group F1,Wordlevel-accuracy,Wordlevel-macro Precision,Wordlevel-macro Recall,Wordlevel-macro F1,Wordlevel-o Precision,Wordlevel-o Recall,Wordlevel-o F1,Wordlevel-social group Precision,Wordlevel-social group Recall,Wordlevel-social group F1,Wordlevel-organizational group Precision,Wordlevel-organizational group Recall,Wordlevel-organizational group F1
0,0.2363,0.110718,0.508921,0.672569,0.575644,0.60688,0.741742,0.667568,0.661677,0.767361,0.710611,0.356164,0.577778,0.440678,0.772913,0.898586,0.868681,0.831025,0.776206,0.898586,0.871248,0.832925,0.83815,0.926187,0.909073,0.879972,0.471577,0.744107,0.765327,0.577295,0.948995,0.948995,0.948995,0.964451,0.964451,0.964451,0.956723,0.956723,0.956723,0.970301,0.785172,0.868194,0.822156,0.991146,0.97851,0.984787,0.808122,0.902494,0.852705,0.55625,0.723577,0.628975
2,0.0456,0.100546,0.623907,0.714931,0.665998,0.678571,0.798799,0.733793,0.696793,0.829861,0.757528,0.55102,0.6,0.574468,0.790415,0.923731,0.881611,0.851889,0.792231,0.923731,0.8829,0.852942,0.805304,0.964305,0.912191,0.877662,0.702381,0.705471,0.852337,0.703923,0.955178,0.955178,0.955178,0.959815,0.959815,0.959815,0.972179,0.972179,0.972179,0.97272,0.847486,0.842334,0.840216,0.991978,0.979603,0.985752,0.792905,0.937642,0.859221,0.757576,0.609756,0.675676
4,0.0179,0.135521,0.700669,0.722569,0.71144,0.768116,0.795796,0.781711,0.792642,0.822917,0.807496,0.608696,0.622222,0.615385,0.868747,0.894556,0.906807,0.881463,0.869776,0.894556,0.907588,0.881992,0.889763,0.930558,0.937789,0.909703,0.7625,0.713284,0.859259,0.737071,0.964451,0.964451,0.964451,0.969088,0.969088,0.969088,0.978362,0.978362,0.978362,0.975557,0.870994,0.837748,0.852205,0.987874,0.986614,0.987244,0.8506,0.884354,0.867148,0.77451,0.642276,0.702222
6,0.0072,0.147801,0.706484,0.714931,0.710526,0.77551,0.798799,0.786982,0.799331,0.829861,0.81431,0.613636,0.6,0.606742,0.855579,0.894971,0.903027,0.874832,0.856686,0.894971,0.903871,0.87541,0.882691,0.933196,0.935635,0.907241,0.703509,0.703373,0.860527,0.703441,0.964451,0.964451,0.964451,0.969088,0.969088,0.969088,0.97527,0.97527,0.97527,0.975223,0.844622,0.850779,0.847345,0.989664,0.985249,0.987452,0.840812,0.89229,0.865787,0.70339,0.674797,0.688797
8,0.0052,0.151289,0.703677,0.724306,0.713836,0.773256,0.798799,0.78582,0.798658,0.826389,0.812287,0.608696,0.622222,0.615385,0.871568,0.895338,0.915499,0.883293,0.872661,0.895338,0.916339,0.883854,0.896618,0.928289,0.944245,0.912179,0.725203,0.728373,0.856062,0.726785,0.962906,0.962906,0.962906,0.969088,0.969088,0.969088,0.976816,0.976816,0.976816,0.977476,0.85626,0.861521,0.858872,0.988793,0.988162,0.988477,0.875986,0.880952,0.878462,0.704,0.715447,0.709677


Evaluating ...


early stopping required metric_for_best_model, but did not find eval_seqeval-macro_f1 so early stopping is disabled


{'test_loss': 0.1330958902835846, 'test_seqeval-macro_precision': 0.6945828390668248, 'test_seqeval-macro_recall': 0.6962064676616915, 'test_seqeval-macro_f1': 0.6948704389212057, 'test_seqeval-micro_precision': 0.7791411042944786, 'test_seqeval-micro_recall': 0.8037974683544303, 'test_seqeval-micro_f1': 0.7912772585669782, 'test_seqeval-social group_precision': 0.8113879003558719, 'test_seqeval-social group_recall': 0.8507462686567164, 'test_seqeval-social group_f1': 0.8306010928961749, 'test_seqeval-organizational group_precision': 0.5777777777777777, 'test_seqeval-organizational group_recall': 0.5416666666666666, 'test_seqeval-organizational group_f1': 0.5591397849462364, 'test_spanlevel-macro_precision': 0.899294889892716, 'test_spanlevel-macro_recall': 0.8880005159217432, 'test_spanlevel-macro_f1-cross': 0.9369073204498926, 'test_spanlevel-macro_f1': 0.8936120168972446, 'test_spanlevel-micro_precision': 0.8982079333709768, 'test_spanlevel-micro_recall': 0.888292574731501, 'test_sp

In [None]:
from utils.metrics import compute_sequence_metrics, parse_metrics

# `model` is the `Trainer` returned by `train_and_test`; run it on the test split.
preds = model.predict(dataset['test'])
predictions, labels = preds.predictions, preds.label_ids
# reduce the predictions to label ids with an argmax over axis 2
predictions = np.argmax(predictions, axis=2)

# Unlike in `compute_metrics`, `flatten_output` is not set here; the result is indexed by
# metric family ('seqeval', 'spanlevel', 'doclevel', 'wordlevel') in the following cells.
test_results = compute_sequence_metrics(y_true=labels, y_pred=predictions, id2label=id2label)

In [None]:
# Tabulate the 'seqeval' test metrics: macro and micro averages first, then one row per type.
parse_metrics(test_results['seqeval'], order=['macro', 'micro'] + args.types)

Unnamed: 0,f1,precision,recall
macro,0.69487,0.694583,0.696206
micro,0.791277,0.779141,0.803797
social group,0.830601,0.811388,0.850746
organizational group,0.55914,0.577778,0.541667


In [None]:
# Tabulate the 'spanlevel' test metrics: macro and micro averages first, then one row per type.
parse_metrics(test_results['spanlevel'], order=['macro', 'micro'] + args.types)

Unnamed: 0,f1,f1-cross,precision,recall
macro,0.893612,0.936907,0.899295,0.888001
micro,0.893223,0.936631,0.898208,0.888293
social group,0.9273,0.945106,0.923304,0.931331
organizational group,0.702032,0.897236,0.761261,0.651353


In [None]:
# Tabulate the 'doclevel' test metrics: micro average first, then one row per type.
parse_metrics(test_results['doclevel'], order=['micro'] + args.types)

Unnamed: 0,f1,precision,recall
micro,0.959815,0.959815,0.959815
social group,0.981453,0.981453,0.981453
organizational group,0.972179,0.972179,0.972179


In [None]:
# Tabulate the 'wordlevel' test metrics: macro average first, then one row per type.
# NOTE(review): the rendered table contains an all-NaN column -- presumably caused by the scalar
# word-level accuracy entry; confirm against `parse_metrics`.
parse_metrics(test_results['wordlevel'], order=['macro'] + args.types)

Unnamed: 0,NaN,f1,precision,recall
macro,,0.825833,0.872546,0.79144
social group,,0.885971,0.905367,0.867388
organizational group,,0.601626,0.72549,0.513889
