# Longformer
- Potential to-dos: Data_Iterator
- [Huggingface datasets](https://drive.google.com/drive/folders/1UQfl6oXyYt4Eepudmgi6A9xMAkqBuaHf)
- Key functions:
- HuggingFace dataset output: [/data/tab/longformer]
    - [/data/tab/longformer](https://drive.google.com/drive/folders/1UQfl6oXyYt4Eepudmgi6A9xMAkqBuaHf) - for classification over both label sets (task `both`); label columns: `ner_labels`, `mask_labels`
    - [/data/tab/longformer_mask](https://drive.google.com/drive/folders/1bgkTuZ428fLdnFrtq0BWJcTBT3lpNXbK) - single-task classification (task `mask`); label column: `labels`
    - [/data/tab/longformer_ner](https://drive.google.com/drive/folders/1M8KiTXhpdkiMzJRqcLX0X7KY0dCbqw3t) - single-task classification (task `ner`); label column: `labels`

In [1]:
!pip install -q transformers
!pip install -q datasets
!pip install -q evaluate
!pip install -q seqeval

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# generic
import numpy as np
# from pprint import pprint

# ml
from datasets import load_dataset, load_from_disk
from transformers import LongformerForTokenClassification, LongformerTokenizerFast, Trainer, TrainingArguments, EarlyStoppingCallback
import evaluate
from torch.utils.data import DataLoader
# from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
# import tensorflow as tf
# from tensorflow import keras

In [3]:
# use for vertex ai / google cloud
# from google.cloud import storage

# client = storage.Client()
# bucket_name = 'w266-project'
# bucket = client.get_bucket(bucket_name)
# path = f'gs://{bucket_name}'
# vertex_path = '/content'

# use for Google Colab: mount Google Drive so the project folder is reachable
from google.colab import drive

drive.mount('/content/drive')
# Project root; select_data() and the model save/load cells build their paths from it.
path = '/content/drive/MyDrive/Colab Notebooks/266 Project'

Mounted at /content/drive


In [4]:
def print_version(library_name):
    """Print the version of an importable library without raising.

    Args:
        library_name: Module name to import. Dotted names (e.g. ``'pkg.sub'``)
            resolve to the named submodule itself.

    Prints one of:
        ``"<name> version: <version>"`` where ``<version>`` is the module's
        ``__version__`` attribute, or ``'Version number not found'`` when the
        module has no such attribute;
        ``"<name> not installed."`` when the import raises ImportError;
        ``"An error occurred: <error>"`` for any other exception.

    Returns:
        None
    """
    # Local import keeps this helper self-contained within its cell.
    from importlib import import_module

    try:
        # import_module returns the module that was asked for. The built-in
        # __import__ returns the top-level package for dotted names, which
        # would report the version of the wrong object.
        lib = import_module(library_name)
        version = getattr(lib, '__version__', 'Version number not found')
        print(f"{library_name} version: {version}")
    except ImportError:
        print(f"{library_name} not installed.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Report the versions of the frameworks available in this runtime.
for library in ('transformers', 'tensorflow', 'keras'):
    print_version(library)

transformers version: 4.46.2
tensorflow version: 2.17.1
keras version: 3.5.0


In [5]:
# --- global configuration ---

# Experiment selection; both values are passed to select_data().
task = 'binary'  # one of: ner, mask, both, binary
size = 'mini'    # one of: testing, mini, full

# Pretrained checkpoint shared by the tokenizer and the model.
model_checkpoint = 'allenai/longformer-base-4096'
tokenizer = LongformerTokenizerFast.from_pretrained(
    model_checkpoint,
    add_prefix_space=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

# Longformer Model

## Functions

In [6]:
# general functions
def select_data(split, task, size):
    """
    Load the dataset stored on disk for one split/task/size combination.

    Folder structure: https://drive.google.com/drive/folders/1C3h3rXdbr9nVAC3_G_I-72DfKNiDU_Pa

    Input:
        split: one of ['train', 'val', 'test']
        task: one of ['ner', 'mask', 'both', 'binary']; selects the dataset folder
        size: one of ['testing', 'mini', 'full']; selects the file-name suffix
            ('_testing'; '_400' for train / '_50' otherwise; no suffix)
    Returns:
        The result of `load_from_disk` for
        f'{path}/data/tab/<task folder>/lf_<split><suffix>', where `path` is the
        notebook-level project root.
    Raises:
        ValueError: if any argument is outside its allowed values.
    """
    valid_splits = ['train', 'val', 'test']
    valid_tasks = ['ner', 'mask', 'both', 'binary']
    valid_sizes = ['testing', 'mini', 'full']

    if split not in valid_splits:
        raise ValueError("Split value must be in ['train', 'val', 'test']")
    if task not in valid_tasks:
        # 'binary' is accepted, so the message has to list it as well.
        raise ValueError("Task value must be in ['ner', 'mask', 'both', 'binary']")
    if size not in valid_sizes:
        raise ValueError("Size value must be in ['testing', 'mini', 'full']")

    # Folder holding the datasets of each task.
    path_label = {'both': 'longformer', 'ner': 'longformer_ner', 'mask': 'longformer_mask', 'binary': 'longformer_binary'}

    # File-name suffix for the requested size; the mini train split has 400
    # in its name while the mini val/test splits have 50.
    if size == 'testing':
        suffix = '_testing'
    elif size == 'mini':
        suffix = '_400' if split == 'train' else '_50'
    else:  # 'full'
        suffix = ''

    return load_from_disk(f'{path}/data/tab/{path_label[task]}/lf_{split}{suffix}')

# def create_dataset(split, task, size):
#     """Creates appropriate dataset depending on training objective.
#     Input:
#         dataset = use load_from_disk(<path>)
#     Output:
#         returns dataset for training
#     """

#     if task == 'both':
#         labels = ['ner_labels', 'mask_labels']
#     else:
#         labels = ['labels']

#     ds = select_data(split=split, task=task, size=size)

#     data_collator = DataCollatorForTokenClassification(tokenizer,
#                                                        padding='max_length',
#                                                        max_length=4096,
#                                                        return_tensors='np')

#     data_set = ds['train'].to_tf_dataset(
#         columns=['input_ids', 'attention_mask'],
#         label_cols=labels,
#         shuffle=True,
#         batch_size=16,
#         collate_fn=data_collator
#     )

#     return data_set

In [7]:
# metrics
# Cache so the seqeval metric is loaded once instead of on every evaluation pass.
_SEQEVAL_CACHE = {}


def _get_seqeval():
    """Return the seqeval metric, loading it through `evaluate` on first use only."""
    if 'seqeval' not in _SEQEVAL_CACHE:
        _SEQEVAL_CACHE['seqeval'] = evaluate.load('seqeval')
    return _SEQEVAL_CACHE['seqeval']


def compute_metrics(p, label_list=None):
    """Compute seqeval precision/recall/F1/accuracy for token classification.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores and is reduced with ``argmax`` over axis 2; ``labels`` holds
            integer label ids, where ``-100`` marks positions to ignore.
        label_list: Optional list mapping label id -> label name. Defaults to
            the binary-task labels. Other label sets noted in this notebook:
            mask: ['O', 'B-NO_MASK', 'I-NO_MASK', 'B-DIRECT', 'I-DIRECT', 'B-QUASI', 'I-QUASI']
            ner:  ['O', 'B-PERSON', 'I-PERSON', 'B-CODE', 'I-CODE', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG',
                   'B-DEM', 'I-DEM', 'B-DATETIME', 'I-DATETIME', 'B-QUANTITY', 'I-QUANTITY', 'B-MISC', 'I-MISC']

    Returns:
        dict with keys "precision", "recall", "f1" and "seqeval_acc", taken
        from seqeval's overall_* results.
    """
    if label_list is None:
        label_list = ['O', 'B-NO_MASK', 'I-NO_MASK', 'B-DIRECT', 'I-DIRECT']

    scores, labels = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, labels):
        # Keep only the positions whose gold label is not the ignore index.
        kept = [
            (predicted_id, label_id)
            for predicted_id, label_id in zip(predicted_row, label_row)
            if label_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept])
        true_labels.append([label_list[label_id] for _, label_id in kept])

    results = _get_seqeval().compute(
        predictions=true_predictions,
        references=true_labels,
        zero_division=1,
    )
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "seqeval_acc": results["overall_accuracy"],
    }

def count_trainable_parameters(model):
    """Return the total number of elements across the model's trainable parameters.

    A parameter counts as trainable when its ``requires_grad`` flag is truthy;
    its size is taken from ``numel()``.
    """
    total = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            total += parameter.numel()
    return total

## Model

### Training

In [8]:
# ds = select_data(split='train', task=task, size=size)
# train_set = create_dataset(split='train', task=task, size=size)
# val_set = create_dataset(split='val', task=task, size=size)

# Load the training and validation splits for the configured task and size.
dataset_options = dict(task=task, size=size)
ds_train = select_data(split='train', **dataset_options)
ds_val = select_data(split='val', **dataset_options)

KeyboardInterrupt: 

In [9]:
# Run identifier; it also names this run's folder under {path}/models/.
model_name = 'binary_2.5e-4_linear_warmup_12_04'

# Fixed batch size (accelerate.utils / auto_find_batch_size is a possible alternative).
batch_size = 16
num_epochs = 20

# Learning-rate values noted by the author: (2.5e-5, 5e-4, 1e-4)
training_args = TrainingArguments(
    # --- checkpoints ---
    output_dir=f'{path}/models/{model_name}/results',
    save_strategy='epoch',
    save_total_limit=2,
    save_only_model=True,
    # --- evaluation / logging: once per epoch; the best checkpoint is the one
    # with the lowest eval_loss and is reloaded when training ends ---
    eval_strategy='epoch',
    logging_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    # --- optimisation ---
    learning_rate=2.5e-4,
    num_train_epochs=num_epochs,
    warmup_ratio=0.1,  # only for linear warmup
    # Alternatives not in use:
    #   lr_scheduler_type='cosine',
    #   lr_scheduler_kwargs={'num_warmup_steps': 50, 'num_training_steps': 50},
    #   weight_decay=0.01,
    # --- batching / precision ---
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    report_to="none",
)

In [None]:
# total_steps = (400 / batch_size) * num_epochs
# warmup_proportion = 0.1
# warmup_steps = total_steps * warmup_proportion
# print(total_steps)
# print(warmup_steps)

500.0
50.0


In [None]:
# Build the token-classification model on top of the pretrained checkpoint.
# num_labels has to match the label set being trained:
#   5 -> binary labels (current), 7 -> mask labels, 17 -> ner labels
model = LongformerForTokenClassification.from_pretrained(model_checkpoint, num_labels=5)

# Enable gradient checkpointing through the model API. Passing
# `gradient_checkpointing=True` to from_pretrained only takes effect through a
# legacy config key, so the explicit call is used instead.
model.gradient_checkpointing_enable()

print('trainable parameters:', count_trainable_parameters(model))
# print(model)

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForTokenClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable parameters: 148072709


In [None]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=ds_train['train'],
#     eval_dataset=ds_val['train'],
#     compute_metrics=compute_metrics,
#     callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
# )

# Early stopping with a patience of 3, used together with the
# metric_for_best_model configured in training_args.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Trainer wired to the train/validation splits loaded above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Run fine-tuning; evaluation, logging and checkpointing follow training_args.
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Seqeval Acc
1,0.8778,0.343791,0.518176,0.503835,0.510904,0.891029
2,0.2151,0.287145,0.699933,0.696899,0.698413,0.9181
3,0.167,0.224686,0.709406,0.716739,0.713054,0.926927
4,0.1409,0.285539,0.70834,0.686729,0.697367,0.91675
5,0.12,0.240221,0.734091,0.72124,0.727609,0.921903
6,0.0995,0.24663,0.733165,0.727909,0.730528,0.923984


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

TrainOutput(global_step=150, training_loss=0.27004572868347165, metrics={'train_runtime': 2513.102, 'train_samples_per_second': 3.183, 'train_steps_per_second': 0.199, 'total_flos': 6271077010636800.0, 'train_loss': 0.27004572868347165, 'epoch': 6.0})

In [None]:
# Save the fine-tuned Hugging Face / PyTorch model under this run's folder.
trainer.save_model(f'{path}/models/{model_name}/model')
# The tokenizer is not saved: the datasets are already tokenized, so the
# default Longformer tokenizer can be loaded when needed.

### Evaluation

In [11]:
# Test split for the configured task and size.
ds_test = select_data(split='test', task=task, size=size)

# load hf/pytorch model saved by the training section
# (model_name is set again here — presumably so this section can run without
# re-running the training cells; confirm before removing)
model_name = 'binary_2.5e-4_linear_warmup_12_04'
model = LongformerForTokenClassification.from_pretrained(f'{path}/models/{model_name}/model')

# Display the dataset summary (features and row count).
ds_test

Dataset({
    features: ['id', 'ner_tags', 'mask_tags', 'text_spans', 'tokens', 'text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 50
})

In [21]:
# Evaluation-only configuration: just the output folder and reporting are set,
# so every other TrainingArguments option keeps its library default. The
# training-time options (strategies, learning rate, warmup, batch sizes, fp16)
# are deliberately not repeated here.
training_args = TrainingArguments(
    output_dir=f'{path}/models/{model_name}/results',
    report_to="none"
)

# Trainer used for evaluation/prediction only: no train or eval dataset is
# attached; the dataset is passed to evaluate()/predict() directly.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics
)

In [22]:
# Evaluate the loaded model on the test split; the returned metrics dict is displayed.
trainer.evaluate(eval_dataset=ds_test)

{'eval_loss': 0.19704759120941162,
 'eval_model_preparation_time': 0.0213,
 'eval_precision': 0.7394366197183099,
 'eval_recall': 0.7421104536489151,
 'eval_f1': 0.7407711238720261,
 'eval_seqeval_acc': 0.9371099788171982,
 'eval_runtime': 15.004,
 'eval_samples_per_second': 3.332,
 'eval_steps_per_second': 0.467}

In [24]:
# Run inference on the test split, keeping the raw outputs for later analysis.
predictions, labels, metrics = trainer.predict(ds_test)
print(f"Metrics: {metrics}")
# Inspect the first example: per-class scores and the matching label ids
# (-100 marks positions that compute_metrics ignores).
print(predictions[0])
print(labels[0])

Metrics: {'test_loss': 0.19704759120941162, 'test_model_preparation_time': 0.0213, 'test_precision': 0.7394366197183099, 'test_recall': 0.7421104536489151, 'test_f1': 0.7407711238720261, 'test_seqeval_acc': 0.9371099788171982, 'test_runtime': 14.6348, 'test_samples_per_second': 3.417, 'test_steps_per_second': 0.478}
[[ 2.8847656  -2.1210938  -0.91845703 -1.7558594   0.5209961 ]
 [ 6.0507812  -1.3173828  -1.6191406  -2.1738281  -2.1992188 ]
 [ 5.6484375  -1.3818359  -1.3291016  -2.0429688  -1.9365234 ]
 ...
 [ 0.9501953  -0.34814453 -0.66796875 -0.28076172 -0.04437256]
 [ 0.9501953  -0.34814453 -0.66796875 -0.28076172 -0.04437256]
 [ 0.9501953  -0.34814453 -0.66796875 -0.28076172 -0.04437256]]
[-100    0    0 ... -100 -100 -100]


In [25]:
# Persist the raw test-set outputs next to the model so they can be analysed
# without re-running inference.
for array_name, array in (('predictions', predictions), ('labels', labels)):
    np.save(f'{path}/models/{model_name}/{array_name}.npy', array)