# Longformer Concatenation
- To Dos:
    - extract remaining embeddings from MASK longformer; determine max size
    - train longformer on NER and extract embeddings
    - concatenate the vectors and train the model

In [None]:
!pip install -q datasets
!pip install -q evaluate
!pip install -q seqeval

In [None]:
import os
import math
import logging
import numpy as np
from transformers import LongformerForTokenClassification, Trainer, TrainingArguments
from datasets import load_from_disk, Dataset, DatasetDict
from datasets import Features, ClassLabel, Value
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import evaluate

# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)

In [None]:
# global variables
# `model_name` selects the model folder under `{path}/models/` that later cells read from and write to.
# NOTE: the training cell further down rebinds `model_name`, so any cell that builds a path from it
# gives a different result depending on which cells were executed before it.
model_name = 'baseline_final_2.5e-5_linear_warmup_11_25' # update to select the right path

# `task` and `size` are forwarded to select_data() when the test split is loaded in the Evaluation section.
task = 'ner'
size = 'mini'

In [None]:
# use for google colab
from google.colab import drive

# Mount Google Drive so the project folder is reachable from the Colab runtime.
drive.mount('/content/drive')
# `path` is the project root; the data-loading and file-saving code below builds every path from it.
# path = '/content/drive/MyDrive/Colab Notebooks/266 Project'
path = '/content/drive/MyDrive/Colab Notebooks/DATASCI 266/266 project'
# Checkpoint folder for `model_name` (built from the value `model_name` has when this cell runs).
path_model = f'{path}/models/{model_name}/model'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Functions

In [None]:
# general functions
def select_data(split, task, size):
    """
    Loads the appropriate dataset per folder structure here: https://drive.google.com/drive/folders/1C3h3rXdbr9nVAC3_G_I-72DfKNiDU_Pa

    The dataset is read with `load_from_disk` from
    `{path}/data/tab/<task folder>/<file name>`, where `path` is the module-level
    project root.

    Input:
        Split: ['train', 'val', 'test']
        Task: ['ner', 'mask', 'both', 'binary']
        Size: ['testing', 'mini', 'full']
    Returns:
        Huggingface dataset
    Raises:
        ValueError: if any argument is not one of the values listed above.
    """
    if split not in ['train', 'val', 'test']:
        raise ValueError("Split value must be in ['train', 'val', 'test']")
    if task not in ['ner', 'mask', 'both', 'binary']:
        # The message now lists 'binary' too, matching what the check actually accepts.
        raise ValueError("Task value must be in ['ner', 'mask', 'both', 'binary']")
    if size not in ['testing', 'mini', 'full']:
        raise ValueError("Size value must be in ['testing', 'mini', 'full']")

    # Folder name used on disk for each task.
    path_label = {'both': 'longformer', 'ner': 'longformer_ner', 'mask': 'longformer_mask', 'binary': 'longformer_binary'}
    # path_label = {'both': 'longformer', 'ner': 'longformer_ner', 'mask': 'longformer_4096'}

    # File name depends on the requested size; the 'mini' train split uses the
    # 400-row file, the other 'mini' splits use the 50-row file.
    if size == 'testing':
        file_name = f'lf_{split}_testing'
    elif size == 'mini':
        file_name = f'lf_{split}_400' if split == 'train' else f'lf_{split}_50'
    else:  # size == 'full'
        file_name = f'lf_{split}'

    return load_from_disk(f'{path}/data/tab/{path_label[task]}/{file_name}')

# Load/Save Embeddings

In [None]:
# Load the tokenized training split ('testing' size) that the hidden states are extracted from.
ds_train = select_data(split='train', task='ner', size='testing')

# step not needed as can specify keys in batch below
# Keep only the two model inputs and have the dataset return torch tensors.
dataset = Dataset.from_dict({
    'input_ids': ds_train['train']['input_ids'],
    'attention_mask': ds_train['train']['attention_mask']
}).with_format('torch')

# larger batch sizes crash on 12.7 GB CPU
dataloader = DataLoader(dataset, batch_size=16)
# Folder the extraction cell below writes its per-batch .npy files into.
output_dir = f'{path}/models/{model_name}/'

KeyboardInterrupt: 

In [None]:
# Length of the first example's `input_ids` (quick check of the sequence length).
len(ds_train['train']['input_ids'][0])

In [None]:
# test two; success
# with torch.no_grad():
#     model = LongformerForTokenClassification.from_pretrained(f'{path}/models/{model_name}/model')
#     model.config.output_hidden_states=True
#     outputs = model(input_ids=input_ids, attention_mask=attention_mask)
# last_hidden_state = np.array(outputs.hidden_states[-1])
# print(last_hidden_state.shape)
# np.save(f'{path}/models/{model_name}/last_hidden_state_mini.npy', last_hidden_state)

In [None]:
# stream and save last_hidden_state to avoid memory issues; can optimize further by sending to GPU?
# Load the fine-tuned checkpoint once; only the forward passes need to run without gradients.
model = LongformerForTokenClassification.from_pretrained(f'{path}/models/{model_name}/model')
model.eval()  # make inference mode explicit (dropout disabled)

# np.save does not create missing folders.
os.makedirs(output_dir, exist_ok=True)

with torch.no_grad():
    for i, batch in enumerate(dataloader):
        # Request the hidden states on the call itself instead of mutating model.config.
        outputs = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            output_hidden_states=True,
        )
        # hidden_states[-1] is the output of the last layer; move to CPU before converting
        # so the conversion also works if the model is later placed on a GPU.
        last_hidden_state = outputs.hidden_states[-1].cpu().numpy()
        # One file per dataloader batch, numbered by batch index.
        file_path = os.path.join(output_dir, f'last_hidden_state_batch_{i}.npy')
        np.save(file_path, last_hidden_state)

# Create Concatenated / Additive Dataset

In [None]:
def create_concat_dataset_full_and_save(model_mask, model_ner, split):
    """Concatenate the hidden states of two models for a whole split and save ONE file.

    For every `.npy` file in `{path}/models/{model_mask}/hidden_states/{split}/` the
    file with the same name is loaded from the `model_ner` folder, the two arrays are
    joined along the last (feature) axis, and all per-file results are stacked along
    axis 0 in batch order.

    Args:
        model_mask: folder name of the first model under `{path}/models/`.
        model_ner: folder name of the second model under `{path}/models/`.
        split: sub-folder of `hidden_states/` to read, also used in the output path.

    Side effects:
        Writes `{path}/data/tab/concatenated_mini/{split}/concat_{k}.npy`, where `k`
        is the index of the last input file (kept from the original naming), and
        prints the final shape and the save path.

    Raises:
        FileNotFoundError: if the mask folder contains no `.npy` files.
    """
    def _batch_order(name):
        # os.listdir returns names in arbitrary order. Sort by the trailing "_<n>"
        # number so rows follow batch order (batch_2 before batch_10).
        suffix = os.path.splitext(name)[0].rsplit('_', 1)[-1]
        if suffix.isascii() and suffix.isdigit():
            return (0, int(suffix), name)
        return (1, 0, name)

    mask_path = f'{path}/models/{model_mask}/hidden_states/{split}/'
    ner_path = f'{path}/models/{model_ner}/hidden_states/{split}/'
    files = sorted((f for f in os.listdir(mask_path) if f.endswith('.npy')), key=_batch_order)
    if not files:
        raise FileNotFoundError(f'No .npy hidden-state files found in {mask_path}')

    # Build every per-file array first and stack once; concatenating inside the
    # loop re-copies the growing array on every iteration.
    per_file = [
        np.concatenate(
            (np.load(os.path.join(mask_path, f)), np.load(os.path.join(ner_path, f))),
            axis=-1,
        )
        for f in files
    ]
    final_hs = np.concatenate(per_file, axis=0)

    print(final_hs.shape)
    out_dir = f'{path}/data/tab/concatenated_mini/{split}'
    os.makedirs(out_dir, exist_ok=True)  # np.save does not create folders
    save_path = f'{out_dir}/concat_{len(files) - 1}.npy'
    np.save(save_path, final_hs)
    print(f'File saved at {save_path}')

def create_concat_dataset_and_save(model_mask, model_ner, split):
    """Concatenate the hidden states of two models file by file and save one file per batch.

    For every `.npy` file in `{path}/models/{model_mask}/hidden_states/{split}/` the
    file with the same name is loaded from the `model_ner` folder and the two arrays
    are joined along the last (feature) axis.

    Args:
        model_mask: folder name of the first model under `{path}/models/`.
        model_ner: folder name of the second model under `{path}/models/`.
        split: sub-folder of `hidden_states/` to read, also used in the output path.

    Side effects:
        Writes `{path}/data/tab/concatenated_mini/{split}/concat_{i}.npy` for each
        input file and prints each save path. `i` is the position of the file in
        batch order, so `concat_{i}` lines up with the i-th batch of labels.
    """
    def _batch_order(name):
        # os.listdir returns names in arbitrary order. Sort by the trailing "_<n>"
        # number so output index i matches batch i (batch_2 before batch_10).
        suffix = os.path.splitext(name)[0].rsplit('_', 1)[-1]
        if suffix.isascii() and suffix.isdigit():
            return (0, int(suffix), name)
        return (1, 0, name)

    mask_path = f'{path}/models/{model_mask}/hidden_states/{split}/'
    ner_path = f'{path}/models/{model_ner}/hidden_states/{split}/'
    files = sorted((f for f in os.listdir(mask_path) if f.endswith('.npy')), key=_batch_order)

    out_dir = f'{path}/data/tab/concatenated_mini/{split}'
    os.makedirs(out_dir, exist_ok=True)  # np.save does not create folders

    for i, f in enumerate(files):
        m_hidden_state = np.load(os.path.join(mask_path, f))
        n_hidden_state = np.load(os.path.join(ner_path, f))
        final_hs = np.concatenate((m_hidden_state, n_hidden_state), axis=-1)

        save_path = f'{out_dir}/concat_{i}.npy'
        np.save(save_path, final_hs)
        print(f'File saved at {save_path}')

In [None]:
# create batched train concatenated vectors
# Writes one concat_{i}.npy per hidden-state file found for the train split,
# pairing the two models named below.
split = 'train'
create_concat_dataset_and_save(
    model_mask='baseline_final_2.5e-5_linear_warmup_11_25',
    model_ner='ner_2.5e-5_cosine_warmup_12_02_II',
    split=split
)

In [None]:
# create test and val data in one file
# create_concat_dataset_full_and_save(model_mask='baseline_final_2.5e-5_linear_warmup_11_25',
#                                     model_ner='ner_2.5e-5_cosine_warmup_12_02_II',
#                                     split='val')

# create_concat_dataset_full_and_save(model_mask='baseline_final_2.5e-5_linear_warmup_11_25',
#                                     model_ner='ner_2.5e-5_cosine_warmup_12_02_II',
#                                     split='test')

# split = 'val'
# create_concat_dataset_and_save(
#     model_mask='baseline_final_2.5e-5_linear_warmup_11_25',
#     model_ner='ner_2.5e-5_cosine_warmup_12_02_II',
#     split=split
# )

# split = 'test'
# create_concat_dataset_and_save(
#     model_mask='baseline_final_2.5e-5_linear_warmup_11_25',
#     model_ner='ner_2.5e-5_cosine_warmup_12_02_II',
#     split=split
# )

File saved at /content/drive/MyDrive/Colab Notebooks/DATASCI 266/266 project/data/tab/concatenated_mini/val/concat_0.npy
File saved at /content/drive/MyDrive/Colab Notebooks/DATASCI 266/266 project/data/tab/concatenated_mini/val/concat_1.npy


File saved at /content/drive/MyDrive/Colab Notebooks/DATASCI 266/266 project/data/tab/concatenated_mini/test/concat_0.npy
File saved at /content/drive/MyDrive/Colab Notebooks/DATASCI 266/266 project/data/tab/concatenated_mini/test/concat_1.npy


In [None]:
def create_concat_dataset_full_and_save(model_mask, model_ner, split):
    """Concatenate the hidden states of two models for a whole split and save ONE file.

    NOTE: a function with this name is also defined earlier in the notebook; this
    later definition is the one in effect once the cell runs. Keep only one copy.

    For every `.npy` file in `{path}/models/{model_mask}/hidden_states/{split}/` the
    file with the same name is loaded from the `model_ner` folder, the two arrays are
    joined along the last (feature) axis, and all per-file results are stacked along
    axis 0 in batch order.

    Args:
        model_mask: folder name of the first model under `{path}/models/`.
        model_ner: folder name of the second model under `{path}/models/`.
        split: sub-folder of `hidden_states/` to read, also used in the output path.

    Side effects:
        Writes `{path}/data/tab/concatenated_mini/{split}/concat_{k}.npy`, where `k`
        is the index of the last input file (kept from the original naming), and
        prints the final shape and the save path.

    Raises:
        FileNotFoundError: if the mask folder contains no `.npy` files.
    """
    def _batch_order(name):
        # os.listdir returns names in arbitrary order. Sort by the trailing "_<n>"
        # number so rows follow batch order (batch_2 before batch_10).
        suffix = os.path.splitext(name)[0].rsplit('_', 1)[-1]
        if suffix.isascii() and suffix.isdigit():
            return (0, int(suffix), name)
        return (1, 0, name)

    mask_path = f'{path}/models/{model_mask}/hidden_states/{split}/'
    ner_path = f'{path}/models/{model_ner}/hidden_states/{split}/'
    files = sorted((f for f in os.listdir(mask_path) if f.endswith('.npy')), key=_batch_order)
    if not files:
        raise FileNotFoundError(f'No .npy hidden-state files found in {mask_path}')

    # Build every per-file array first and stack once; concatenating inside the
    # loop re-copies the growing array on every iteration.
    per_file = [
        np.concatenate(
            (np.load(os.path.join(mask_path, f)), np.load(os.path.join(ner_path, f))),
            axis=-1,
        )
        for f in files
    ]
    final_hs = np.concatenate(per_file, axis=0)

    print(final_hs.shape)
    out_dir = f'{path}/data/tab/concatenated_mini/{split}'
    os.makedirs(out_dir, exist_ok=True)  # np.save does not create folders
    save_path = f'{out_dir}/concat_{len(files) - 1}.npy'
    np.save(save_path, final_hs)
    print(f'File saved at {save_path}')

In [None]:
# one-off
# model_mask='baseline_final_2.5e-5_linear_warmup_11_25'
# model_ner='ner_2.5e-5_cosine_warmup_12_02_II'
# mask_path = f'{path}/models/{model_mask}/hidden_states/train/train_last_hidden_state_batch_4.npy'
# ner_path = f'{path}/models/{model_ner}/hidden_states/train/train_last_hidden_state_batch_4.npy'
# m_hidden_state = np.load(mask_path)
# n_hidden_state = np.load(ner_path)
# final_hs = np.concatenate((m_hidden_state, n_hidden_state), axis=-1)
# np.save(f'{path}/data/tab/concatenated_mini/train/concat_4.npy', final_hs)

In [None]:
# add_hidden_state = hidden_state_1 + hidden_state_2
# print(add_hidden_state.shape)
# print(hidden_state_1[0][0][:20])
# print(hidden_state_2[0][0][:20])
# print(add_hidden_state[0][0][:20])

In [None]:
# grab example labels
# Reload the training split for the 'mask' task (mini size); the next cell reads its `labels` column.
ds_train = select_data(split='train', task='mask', size='mini')
# example_labels = torch.tensor(ds_train['train']['labels'][:32])
ds_train

DatasetDict({
    train: Dataset({
        features: ['id', 'ner_tags', 'mask_tags', 'text_spans', 'tokens', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 400
    })
})

In [None]:
# Pull the whole `labels` column of the training data; it is chunked into batches below.
labels = ds_train['train']['labels']

In [None]:
def get_labels_for_batch(labels, batch_size=32):
    """Split `labels` into consecutive chunks of `batch_size` items.

    Args:
        labels: sliceable sequence of per-example labels.
        batch_size: number of examples per chunk (default 32).

    Returns:
        dict mapping batch index (0, 1, ...) to `labels[start:end]` for that batch.
        The last batch is shorter when `len(labels)` is not a multiple of
        `batch_size`; an empty `labels` gives an empty dict.

    Raises:
        ValueError: if `batch_size` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # The number of batches follows `batch_size`; it was previously derived from a
    # hard-coded 32, which gave wrong or empty batches for any other size.
    num_batches = math.ceil(len(labels) / batch_size)
    return {
        batch_idx: labels[batch_idx * batch_size: (batch_idx + 1) * batch_size]
        for batch_idx in range(num_batches)
    }

In [None]:
# Chunk the training labels into batches of 32 and list the resulting batch indices.
batch_labels = get_labels_for_batch(labels, batch_size=32)
batch_labels.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])

In [None]:
# Size of the last batch (index 12), which holds the remainder of the split.
len(batch_labels[12])

16

# Train Model

In [None]:
# functions
# metrics
def compute_metrics(p):
    """Compute seqeval precision / recall / F1 / accuracy for token classification.

    Args:
        p: pair `(predictions, labels)`. `predictions` holds per-token class scores
            with shape `(..., seq_len, num_classes)` and `labels` the matching class
            ids with shape `(..., seq_len)`. Any number of leading dimensions is
            accepted, e.g. `(1, 32, 4096, 7)` when pre-batched items are stacked.

    Returns:
        dict with keys "precision", "recall", "f1" and "seqeval_acc".
    """
    # Load the metric once and cache it on the function; loading it on every
    # evaluation call repeats the lookup each time.
    seqeval = getattr(compute_metrics, '_seqeval', None)
    if seqeval is None:
        seqeval = evaluate.load('seqeval')
        compute_metrics._seqeval = seqeval

    predictions, labels = p
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Flatten all leading dimensions into one sequence axis. Indexing with [0]
    # scored only the first stacked batch and silently dropped the rest.
    predictions = predictions.reshape(-1, predictions.shape[-2], predictions.shape[-1])
    labels = labels.reshape(-1, labels.shape[-1])
    predicted_ids = np.argmax(predictions, axis=-1)

    label_list = ['O', 'B-NO_MASK', 'I-NO_MASK', 'B-DIRECT', 'I-DIRECT', 'B-QUASI', 'I-QUASI']

    # Positions labelled -100 are excluded from both sequences.
    true_predictions = [
        [label_list[pred_id] for pred_id, label_id in zip(pred_row, label_row) if label_id != -100]
        for pred_row, label_row in zip(predicted_ids, labels)
    ]
    true_labels = [
        [label_list[label_id] for label_id in label_row if label_id != -100]
        for label_row in labels
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels, zero_division=1)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "seqeval_acc": results["overall_accuracy"],
    }

def count_trainable_parameters(model):
    """Return the total number of elements across the model's trainable parameters.

    A parameter counts as trainable when its `requires_grad` flag is set.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [None]:
# classes
class ConcatTokenClassificationModel(nn.Module):
    """Feed-forward token-classification head applied to pre-computed feature vectors.

    The network is Linear -> ReLU -> Dropout -> Linear and acts on the last
    dimension of its input, so any leading shape is accepted.

    Args:
        input_dim: size of the last dimension of the input features.
        hidden_dim: width of the hidden layer.
        num_classes: number of output classes per token.
        dropout: dropout probability after the ReLU (default 0.1, the value that
            was previously hard-coded).
    """
    def __init__(self, input_dim, hidden_dim, num_classes, dropout=0.1):
        super().__init__()
        # Layer positions are kept as-is so the state_dict keys
        # (linear_relu_stack.0.*, linear_relu_stack.3.*) stay compatible with saved weights.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            # nn.Linear(hidden_dim, hidden_dim),
            # nn.ReLU(),
            nn.Linear(hidden_dim, num_classes)
        )
        # Default CrossEntropyLoss settings, i.e. targets equal to -100 are ignored.
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, labels=None):
        """Compute logits and, when `labels` is given, the cross-entropy loss.

        Args:
            input_ids: float feature tensor whose last dimension is `input_dim`.
                The name is kept because callers pass the features under this key.
            labels: optional integer class ids with the shape of `input_ids` minus
                its last dimension.

        Returns:
            {"logits": ...} or, with labels, {"logits": ..., "loss": ...}.
        """
        logits = self.linear_relu_stack(input_ids)

        if labels is None:
            return {"logits": logits}

        # Flatten to (tokens, classes) and (tokens,). reshape is used instead of
        # view so non-contiguous tensors are handled as well.
        loss = self.loss_fn(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
        return {"logits": logits, "loss": loss}

In [None]:
def get_labels_for_batch(labels, batch_size=32):
    """Split `labels` into consecutive chunks of `batch_size` items.

    NOTE: a function with this name is also defined earlier in the notebook; this
    later definition is the one in effect once the cell runs. Keep only one copy.

    Args:
        labels: sliceable sequence of per-example labels.
        batch_size: number of examples per chunk (default 32).

    Returns:
        dict mapping batch index (0, 1, ...) to `labels[start:end]` for that batch.
        The last batch is shorter when `len(labels)` is not a multiple of
        `batch_size`; an empty `labels` gives an empty dict.

    Raises:
        ValueError: if `batch_size` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # The number of batches follows `batch_size`; it was previously derived from a
    # hard-coded 32, which gave wrong or empty batches for any other size.
    num_batches = math.ceil(len(labels) / batch_size)
    return {
        batch_idx: labels[batch_idx * batch_size: (batch_idx + 1) * batch_size]
        for batch_idx in range(num_batches)
    }

In [None]:
from torch.utils.data import IterableDataset
from transformers import Trainer, TrainingArguments

class FileStreamingDataset(IterableDataset):
    """Stream pre-computed feature files from disk, one chunk at a time.

    Files are read from `{path}/data/tab/concatenated_mini/{split}/concat_{x}.npy`
    for `x` in `range(num_files)`. Each file is cut along axis 0 into chunks of
    `batch_size` rows; every chunk is passed with the matching slice of labels to
    `process_function`, and the result is yielded.

    Note that each yielded item is already a whole chunk of samples, so a
    DataLoader that batches this dataset stacks chunks rather than single samples.

    Args:
        split: sub-folder of `concatenated_mini/` to read.
        labels: mapping from file index `x` to the labels of the samples stored in
            `concat_{x}.npy`, in the same order as the rows of that file.
        process_function: callable `(hidden_states_chunk, labels_chunk) -> item`.
        batch_size: maximum number of rows per yielded chunk (default 32).
        num_files: number of files to stream; when None, the number of
            `concat_*.npy` files found in the folder.
    """
    def __init__(self, split, labels, process_function, batch_size=32, num_files=None):
        # Zero-argument super() initialises the base class; the previous
        # `super(FileStreamingDataset).__init__()` created an unbound super object
        # and initialised that instead.
        super().__init__()
        self.folder = f'{path}/data/tab/concatenated_mini/{split}'
        if num_files is None:
            # Count the files in this dataset's own folder. The previous code read
            # a `file_path` name that is not defined in this scope.
            num_files = len([
                name for name in os.listdir(self.folder)
                if name.startswith('concat_') and name.endswith('.npy')
            ])
        self.num_files = num_files
        self.labels = labels
        self.process_function = process_function
        self.batch_size = batch_size

    def __iter__(self):
        for x in range(self.num_files):
            file_path = os.path.join(self.folder, f'concat_{x}.npy')
            hidden_states = np.load(file_path)
            file_labels = self.labels[x]

            # Process the data and yield the batches of individual samples
            num_samples = hidden_states.shape[0]
            for i in range(0, num_samples, self.batch_size):
                batch_hidden_states = hidden_states[i:i+self.batch_size]
                # Slice the labels with the same window so both stay aligned when a
                # file holds more than `batch_size` samples. With at most
                # `batch_size` samples per file this is the whole label list.
                batch_labels = file_labels[i:i+self.batch_size]

                # Process the batch into the desired format
                yield self.process_function(batch_hidden_states, batch_labels)

def process_line(hidden_states, labels):
    """Convert one chunk of features and labels into the dict of tensors fed to the model.

    Args:
        hidden_states: np.ndarray of features; copied into a float32 tensor.
        labels: list of labels; converted to an int64 tensor.

    Returns:
        {'input_ids': float32 tensor, 'labels': int64 tensor}

    Raises:
        ValueError: if `hidden_states` is not an np.ndarray or `labels` is not a list
            (checked in that order).
    """
    # (value, required type, message template) checked in order.
    type_checks = (
        (hidden_states, np.ndarray, "Expected hidden_states as np.ndarray, got {}"),
        (labels, list, "Expected labels as list, got {}"),
    )
    for value, required_type, template in type_checks:
        if not isinstance(value, required_type):
            raise ValueError(template.format(type(value)))

    # CrossEntropyLoss expects int64 targets, hence the explicit label dtype.
    return dict(
        input_ids=torch.tensor(hidden_states, dtype=torch.float32),
        labels=torch.tensor(labels, dtype=torch.int64),
    )

In [None]:
# def pad_collate(batch):
#     max_batch_size = max(item['input_ids'].size(0) for item in batch)
#     padded_batch = {
#         'input_ids': torch.stack([torch.cat([item['input_ids'],
#                                              torch.zeros(max_batch_size - item['input_ids'].size(0),
#                                                          item['input_ids'].size(1),
#                                                          item['input_ids'].size(2)])
#                                   for item in batch]),
#         'labels': torch.cat([item['labels'],
#                              torch.full((max_batch_size - item['labels'].size(0),),
#                                         fill_value=-100)])  # Use -100 for ignored labels
#     }
#     return padded_batch

SyntaxError: closing parenthesis '}' does not match opening parenthesis '[' on line 4 (<ipython-input-60-13e4f2fb7778>, line 12)

In [None]:
# Initialize train dataset
# Labels come from the 'mask' task data and are chunked into batches of 32 keyed by batch index.
ds_train = select_data(split='train', task='mask', size='mini')
labels = ds_train['train']['labels']
batch_labels = get_labels_for_batch(labels, batch_size=32)
print(batch_labels.keys())

# split = 'train'
# file_path = f'{path}/data/tab/concatenated_mini/{split}'
# files = [f'{file_path}/{f}' for f in os.listdir(file_path)]

# num_files=4: only concat_0.npy .. concat_3.npy are streamed, paired with batch_labels[0..3],
# although batch_labels holds more batches than that (see the printed keys).
streaming_dataset = FileStreamingDataset(split='train',
                                         labels=batch_labels,
                                         process_function=process_line,
                                         num_files=4)

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])


In [None]:
# eval_hidden_states = np.load(f'{path}/data/tab/concatenated_mini/val/concat_1.npy')
# Validation labels, chunked into batches of 32 keyed by batch index.
# select_data returns the split under the 'train' key, hence ds_val['train'].
# NOTE: this rebinds `labels`, which previously held the training labels.
ds_val = select_data(split='val', task='mask', size='mini')
labels = ds_val['train']['labels']
eval_batch_labels = get_labels_for_batch(labels, batch_size=32)
print(eval_batch_labels.keys())

# # val
# split = 'val'
# file_path = f'{path}/data/tab/concatenated_mini/{split}'
# val_files = [f'{file_path}/{f}' for f in os.listdir(file_path)]

# num_files=1: only val/concat_0.npy is streamed, paired with eval_batch_labels[0].
eval_streaming_dataset = FileStreamingDataset(split='val',
                                              labels=eval_batch_labels,
                                              process_function=process_line,
                                              num_files=1)

dict_keys([0, 1])


In [None]:
# eval_hidden_states = np.load(f'{path}/data/tab/concatenated_mini/val/concat_1.npy')
# ds_val = select_data(split='val', task='mask', size='mini')
# eval_labels = ds_val['train']['labels']

In [None]:
# eval_dataset = Dataset.from_dict({
#     'input_ids': torch.tensor(np.load(f'{path}/data/tab/concatenated_mini/val/concat_1.npy'), dtype=torch.float32),
#     'labels': torch.tensor(eval_labels, dtype=torch.int64)
# })

In [None]:
# Dimensions of the classification head.
# NOTE: `seq_length` is not passed to the model; it is not referenced elsewhere in the visible notebook.
seq_length = 4096
# Size of the concatenated feature vector - presumably 2 x 768 from the two Longformer
# hidden states; TODO confirm against the saved concat_*.npy shapes.
input_dim = 1536
hidden_dim = 512
# Matches the 7 entries of `label_list` in compute_metrics.
num_classes = 7

model = ConcatTokenClassificationModel(input_dim=input_dim,
                                       hidden_dim=hidden_dim,
                                       num_classes=num_classes)

In [None]:
# dataset = Dataset.from_dict({
#     'input_ids': torch.tensor(np.load('/content/drive/MyDrive/Colab Notebooks/DATASCI 266/266 project/data/tab/concatenated_mini/train/concat_0.npy')),
#     'labels': torch.tensor(ds_train['train']['labels'][:32])
# })

In [None]:
# type(dataset['input_ids'])

In [None]:
# Show the layer structure of the classification head.
print(model)

ConcatTokenClassificationModel(
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=1536, out_features=512, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=512, out_features=7, bias=True)
  )
  (loss_fn): CrossEntropyLoss()
)


In [None]:
# # TrainingArguments with eval; oom issues
# model_name = 'concat_base_1e-4_test'

# batch_size = 32
# num_train_epochs = 5
# max_steps = (128 // batch_size) * num_train_epochs

# training_args = TrainingArguments(
#     output_dir=f'{path}/models/{model_name}/results',
#     eval_strategy='epoch',
#     save_strategy='epoch',
#     logging_strategy='epoch',
#     save_total_limit=2,
#     load_best_model_at_end=True,
#     save_only_model=True,
#     metric_for_best_model='eval_loss',
#     per_device_train_batch_size=32,
#     per_device_eval_batch_size=32,
#     greater_is_better=False,
#     learning_rate=1e-4,
#     max_steps=max_steps, # overrides num_train_epochs
#     # num_train_epochs=num_train_epochs,
#     report_to='none')

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=streaming_dataset,
#     eval_dataset=eval_streaming_dataset,
#     compute_metrics=compute_metrics
# )

# trainer.train()

max_steps is given, it will override any value given in num_train_epochs


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Seqeval Acc
0,1.9105,1.550794,0.048025,0.07522,0.058622,0.776915


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


(32, 4096, 7)
(32, 4096)


In [None]:
# TrainingArguments w/o eval
# NOTE: this rebinds the global `model_name`; paths built from it after this cell point at
# the concat model folder, not at the Longformer checkpoint selected at the top.
model_name = 'concat_base_0.01_test'

batch_size = 32
num_train_epochs = 5
# Number of training samples assumed when deriving max_steps - presumably
# 4 streamed files x 32 samples each; TODO confirm.
num_train_samples = 128
max_steps = (num_train_samples // batch_size) * num_train_epochs

# NOTE(review): FileStreamingDataset yields one already-batched chunk per item, so the
# per-device batch size below counts chunks (files), not samples. The logged
# `epoch: 19.05` after 20 steps suggests roughly one optimizer step per pass over the
# data instead of the 4 assumed above - confirm whether a per-device batch size of 1
# was intended.
# NOTE(review): the logged loss rises from 1.85 to above 20 at learning_rate=0.01,
# which suggests this rate is too high for this head.
training_args = TrainingArguments(
    output_dir=f'{path}/models/{model_name}/results',
    # eval_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
    save_total_limit=2,
    # load_best_model_at_end=True,
    # save_only_model=True,
    # metric_for_best_model='eval_loss',
    # Use the variable defined above instead of repeating the literal 32.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    greater_is_better=False,
    learning_rate=0.01,
    max_steps=max_steps, # overrides num_train_epochs
    # num_train_epochs=num_train_epochs,
    report_to='none')

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=streaming_dataset,
    # eval_dataset=eval_streaming_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,1.8513
2,9.5032
3,8.3707
4,5.6644
5,2.4639
6,14.0991
7,20.343
8,23.8405
9,20.829
10,17.3858


TrainOutput(global_step=20, training_loss=8.245213794708253, metrics={'train_runtime': 830.534, 'train_samples_per_second': 0.771, 'train_steps_per_second': 0.024, 'total_flos': 0.0, 'train_loss': 8.245213794708253, 'epoch': 19.05})

In [None]:
# Evaluate on the training stream (the same data the model was fit on) as a sanity check.
trainer.evaluate(eval_dataset=streaming_dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'eval_loss': 2.189661741256714,
 'eval_precision': 0.6052631578947368,
 'eval_recall': 0.020063972084908403,
 'eval_f1': 0.03884041654939488,
 'eval_seqeval_acc': 0.793350928369681,
 'eval_runtime': 39.3866,
 'eval_samples_per_second': 0.102,
 'eval_steps_per_second': 0.025,
 'epoch': 19.05}

In [None]:
# Evaluate on the validation stream (val/concat_0.npy only, since it was built with num_files=1).
trainer.evaluate(eval_dataset=eval_streaming_dataset)

{'eval_loss': 1.1053968667984009,
 'eval_precision': 0.9310344827586207,
 'eval_recall': 0.03275705186533212,
 'eval_f1': 0.06328743041312629,
 'eval_seqeval_acc': 0.7734955293326232,
 'eval_runtime': 8.5946,
 'eval_samples_per_second': 0.116,
 'eval_steps_per_second': 0.116,
 'epoch': 19.05}

In [None]:
# test pipeline
# predictions, labels, metrics = trainer.predict(dataset)

In [None]:
# predictions.shape

(32, 4096, 7)

In [None]:
# save hf/pytorch model
# `model_name` here is the value assigned in the training cell, so the weights are written
# to that model's own folder under `{path}/models/`.
trainer.save_model(f'{path}/models/{model_name}/model')
# did not save tokenizer as already tokenized; load default longformer

# Evaluation

In [None]:
# Load the test split using the global `task` / `size` set at the top of the notebook.
# NOTE(review): `task` is 'ner' there, while the train/val labels above were loaded with
# task='mask' - confirm the test labels come from the intended task.
ds_test = select_data(split='test', task=task, size=size)

In [None]:
# NOTE(review): `ds_test['train']` is the dataset returned by select_data, not a
# FileStreamingDataset over concat_*.npy files like the train/val evaluations above.
# The model's first layer is a Linear over `input_dim` float features, so this call
# presumably needs a streaming dataset built for split='test' - confirm before relying on it.
trainer.evaluate(eval_dataset=ds_test['train'])

In [None]:
# Run prediction on the test data and show the metrics plus the first prediction / label entry.
# NOTE(review): like the evaluate call above, this passes the dataset returned by
# select_data rather than a FileStreamingDataset of concatenated hidden states - confirm.
# NOTE: this rebinds the global `labels`.
predictions, labels, metrics = trainer.predict(ds_test['train'])
print(f"Metrics: {metrics}")
print(predictions[0])
print(labels[0])