In [1]:
from transformers import BartForSequenceClassification, BartForConditionalGeneration
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers.utils import PaddingStrategy
import evaluate
from nltk import sent_tokenize
import numpy as np
from transformers import DataCollatorForSeq2Seq
from torch.utils.data import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator
from transformers import get_scheduler
import nltk
from tqdm.auto import tqdm
import torch
from datasets import ClassLabel
from transformers import DataCollatorWithPadding
from itertools import tee, chain
from functools import partial
from pprint import pformat

PRETRAINED_MODEL_NAME_OR_PATH="ainize/bart-base-cnn"

In [2]:


def setup_models():
    # initialize models
    classification_model = BartForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH, num_labels=59)
    summarization_model = BartForConditionalGeneration.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH)

    # share parameters
    summarization_model.model.shared = classification_model.model.shared
    summarization_model.model.encoder = classification_model.model.encoder
    summarization_model.model.decoder = classification_model.model.decoder

    return {
        "summarization": summarization_model,
        "classification": classification_model
    }

def preprocess_docee(examples, tokenizer):
    batch_encoding = tokenizer(
        examples["text"],
        truncation=True,
        max_length=1024,
    )
    batch_encoding["labels"] = examples["event_type"]
    return batch_encoding

def print_first_param(model):
    for name, param in model.named_parameters():
        print(f"{name} is {param}")
        break

def compose2(f, g):
    def composition(*args, **kwargs):
        g_output = g(*args, **kwargs)
        return f(g_output)
    return composition

def c(*fs):
    def composition(*args, **kwargs):
        output = fs[-1](*args, **kwargs)
        for f in reversed(fs[:-1]):
            output = f(output)
        return output
    return composition

def process_summary_example(
        examples,
        tokenizer,
        max_input_length=1024,
        max_target_length=100
):
    # tokenize the article
    batch_encoding = tokenizer(
        examples["article"],
        max_length=max_input_length,
        truncation=True
    )

    # tokenize the labels
    tokenized_highlights = tokenizer(
        examples["highlights"],
        max_length=max_target_length,
        truncation=True
    )

    batch_encoding["labels"] = tokenized_highlights["input_ids"]
    return batch_encoding

def setup_tasks(task_templates, models, num_epochs=1):

    tasks = {
        task: {
            "model": models[task],
            "optimizer": None,
            "train_dataloader": DataLoader(
                stuff["train_dataset"] ,
                batch_size=stuff["batch_size"],
                shuffle=True,
                collate_fn=stuff["collate_fn"],
                pin_memory=True
            ),
            "eval_dataloader": DataLoader(
                stuff["eval_dataset"],
                batch_size=stuff["batch_size"],
                collate_fn=stuff["collate_fn"],
                pin_memory=True
            )

        }
        for task, stuff in task_templates.items()
    }

    setup_optimizers(tasks)
    accelerate(tasks)
    summ_cls_ratio = len(tasks["summarization"]["train_dataloader"]) // len(tasks["classification"]["train_dataloader"]) + 1
    setup_schedulers(tasks, num_epochs=num_epochs, summ_cls_ratio=summ_cls_ratio)
    return tasks


def compute_rouge(eval_pred):
    predictions, labels = eval_pred
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract the median scores
    result = {key: value * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # ROUGE expects a newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels


def get_param_list(model):
    return [
               param for param in model.shared.parameters()
           ] + [
               param for param in model.encoder.parameters()
           ] + [
               param for param in model.decoder.parameters()
           ]


def setup_optimizers(tasks):
    for task_name, task_objects in tasks.items():
        print(f"Setting up {task_name}")
        task_objects["optimizer"] = AdamW(task_objects["model"].parameters(), lr=2e-5)


def accelerate(tasks):
    for task in tasks:
        accelerator = Accelerator()
        # tasks[task]["accelerator"] = Accelerator()
        for component in ["model", "optimizer", "train_dataloader", "eval_dataloader"]:
            tasks[task][component] = accelerator.prepare(tasks[task][component])
        tasks[task]["accelerator"] = accelerator


def setup_schedulers(tasks, num_epochs, summ_cls_ratio):
    tasks["summarization"]["lr_scheduler"] = get_scheduler(
        "linear",
        tasks["summarization"]["optimizer"],
        num_warmup_steps=0,
        num_training_steps=num_epochs * len(tasks["summarization"]["train_dataloader"])
    )
    tasks["classification"]["lr_scheduler"] = get_scheduler(
        "linear",
        tasks["classification"]["optimizer"],
        num_warmup_steps=0,
        num_training_steps=num_epochs * len(tasks["classification"]["train_dataloader"]) * summ_cls_ratio
    )


def train(tasks, num_epochs, tokenizer):
    rouge_score = evaluate.load("rouge")
    classification_f1 = evaluate.load("f1")

    for epoch in tqdm(range(num_epochs), desc="Epoch", total=num_epochs):

        summ_cls_ratio = len(tasks["summarization"]["train_dataloader"]) // len(tasks["classification"]["train_dataloader"]) + 1
        print(f"{summ_cls_ratio = }")

        print(f"Dataloader for classification will be replicated {summ_cls_ratio} times.")
        cls_tdl_len = len(tasks["classification"]["train_dataloader"])
        print(f"Instead of {cls_tdl_len}, classification dataset iterator will yield {cls_tdl_len * summ_cls_ratio} examples.")

        # load training data, step by step
        num_epoch_steps = len(tasks["summarization"]["train_dataloader"]) * 2
        iters = {task: iter(tasks[task]["train_dataloader"]) for task in tasks}
        iters["classification"] = chain(*tee(iter(tasks["classification"]["train_dataloader"]), summ_cls_ratio))

        tasks["classification"]["train_len"] = cls_tdl_len * summ_cls_ratio
        tasks["summarization"]["train_len"] = len(tasks["summarization"]["train_dataloader"])

        progress_bars = {
            task: tqdm(range(tasks[task]["train_len"]), desc=f"{task} progress in epoch {epoch+1}", total=tasks[task]["train_len"], leave=False)
            for task in tasks
        }

        set_train(tasks)
        for step in range(num_epoch_steps):
            if step % 2 == 0: # train summarization
                task = "summarization"
            else:
                task = "classification"
            batch = next(iters[task])
            with tasks[task]["accelerator"].accumulate(tasks[task]["model"]):
                outputs = tasks[task]["model"](**batch)
                loss = outputs.loss
                tasks[task]["accelerator"].backward(loss)
                tasks[task]["optimizer"].step()
                tasks[task]["lr_scheduler"].step()
                tasks[task]["optimizer"].zero_grad()
                progress_bars[task].update(1)

        # TODO - gradient accumulation
        # TODO - loss weighing

        # idea -> instead of alternating batches, we could scale losses
        # idea2 -> GAN setup?
        #   -> generator tries to generate summaries
        #   -> discriminator predicts event types base on those summaries
        #   -> generator wants to generate such that discriminator is able to predict labels easier
        #
        # this would also be expensive AS FUCK to train
        #
        # would this work, and why not?
        #   where are the real/fake examples?
        #

        # evaluation at the end of epoch
        set_eval(tasks)

        # evaluate summarization
        accelerator = tasks["summarization"]["accelerator"]
        for step, batch in enumerate(tasks["summarization"]["eval_dataloader"]):
            with torch.no_grad():
                generated_tokens = accelerator.unwrap_model(tasks["summarization"]["model"]).generate(
                    batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                )  # aha! we can plug the generation parameters here

                generated_tokens = accelerator.pad_across_processes(
                    generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
                )
                labels = batch["labels"]

                # If we did not pad to max length, we need to pad the labels too
                labels = accelerator.pad_across_processes(
                    labels, dim=1, pad_index=tokenizer.pad_token_id
                )

                generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
                labels = accelerator.gather(labels).cpu().numpy()

                # Replace -100 in the labels as we can't decode them
                labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
                if isinstance(generated_tokens, tuple):
                    generated_tokens = generated_tokens[0]
                decoded_preds = tokenizer.batch_decode(
                    generated_tokens, skip_special_tokens=True
                )
                decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

                decoded_preds, decoded_labels = postprocess_text(
                    decoded_preds, decoded_labels
                )

                rouge_score.add_batch(predictions=decoded_preds, references=decoded_labels)

                # evaluation loop is fine for summarization but we need it for classification as well

        # Compute metrics
        result = rouge_score.compute()
        # Extract the median ROUGE scores
        result = {key: value * 100 for key, value in result.items()}
        result = {k: round(v, 4) for k, v in result.items()}
        print(f"[SUMM] Epoch {epoch+1}:", pformat(result))

        output_dir = f"./test_mtl/summ_epoch_{epoch+1}"
        # Save and upload
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(tasks["summarization"]["model"])
        unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
        if accelerator.is_main_process:
            tokenizer.save_pretrained(output_dir)  # ovo treba samo jednom realno


        # evaluate classification
        eval_dataloader = tasks["classification"]["eval_dataloader"]
        accelerator = tasks["classification"]["accelerator"]
        model = tasks["classification"]["model"]
        for batch in tqdm(eval_dataloader, total=len(eval_dataloader), desc="[CLS] Evaluation"):
            # extract outputs
            outputs = accelerator.unwrap_model(model)(**batch)
            # print(outputs.keys())  # loss, logits, encoder_last_hidden_state

            # outputs["logits"] = (BS, 59)
            # we need argmax by dimension 1

            # decode logits into labels
            predictions = torch.argmax(outputs["logits"], dim=1)
            predictions = accelerator.gather(predictions).cpu().numpy()
            # print(labels)
            classification_f1.add_batch(
                predictions=predictions,
                references=batch["labels"].cpu().numpy(),
            )
            # break
            # f1.add_batch(predictions=outputs["labels"])
        result = classification_f1.compute(average="macro")
        print(f"[CLS] Epoch {epoch+1}: {pformat(result)}")

        output_dir = f"./test_mtl/cls_epoch_{epoch+1}"
        # Save and upload
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)

        # pa ovo radi, bruhus maximus


def setup_dummy_dataset(cls_train_size, cls_eval_size, summ_train_size, summ_eval_size, tokenizer):
    summ = load_dataset("cnn_dailymail", "3.0.0")
    cls = load_dataset("csv", data_files={
        "train": "../data/docee/18091999/train.csv",
        "validation": "../data/docee/18091999/early_stopping.csv"
    })
    event_names = cls["train"].unique("event_type")
    cls = cls.cast_column("event_type", ClassLabel(num_classes=len(event_names), names=sorted(event_names)))

    def setup_dataset_split(dataset, split, n_examples, preprocessing):
        return dataset[split].shuffle().select(range(n_examples)).map(preprocessing, batched=True, remove_columns=dataset["train"].column_names)

    setup_cls = partial(setup_dataset_split, dataset=cls, preprocessing=partial(preprocess_docee, tokenizer=tokenizer))
    setup_summ = partial(setup_dataset_split, dataset=summ, preprocessing=partial(process_summary_example, tokenizer=tokenizer))

    cls_train = setup_cls(split="train", n_examples=cls_train_size)
    cls_eval = setup_cls(split="validation", n_examples=cls_eval_size)
    summ_train = setup_summ(split="train", n_examples=summ_train_size)
    summ_eval = setup_summ(split="validation", n_examples=summ_eval_size)

    print(f"{len(cls_train) = }")
    print(f"{len(cls_eval) = }")
    print(f"{len(summ_train) = }")
    print(f"{len(summ_eval) = }")

    return cls_train, cls_eval, summ_train, summ_eval


def set_train(tasks):
    for task in tasks:
        print(f"setting {task} to train")
        tasks[task]["model"].train()

def set_eval(tasks):
    for task in tasks:
        print(f"setting {task} to eval")
        tasks[task]["model"].eval()



In [3]:
from transformers import BartTokenizer


def pipeline(num_epochs=3):
    models = setup_models()
    tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH)
    cls_train, cls_eval, summ_train, summ_eval = setup_dummy_dataset(
        cls_train_size=5,
        cls_eval_size=5,
        summ_train_size=12,
        summ_eval_size=8,
        tokenizer=tokenizer
    )
    task_templates = {
        "classification": {
            "train_dataset": cls_train,
            "eval_dataset": cls_eval,
            "batch_size": 1,
            "collate_fn": DataCollatorWithPadding(
                tokenizer=tokenizer,
                padding=PaddingStrategy.MAX_LENGTH,
                return_tensors="pt"
            )
        },
        "summarization": {
            "train_dataset": summ_train,
            "eval_dataset": summ_eval,
            "batch_size": 1,
            "collate_fn": DataCollatorForSeq2Seq(
                tokenizer=tokenizer,
                padding=PaddingStrategy.MAX_LENGTH,
                return_tensors="pt"
            )
        },
    }
    tasks = setup_tasks(task_templates, models, num_epochs=num_epochs)
    train(tasks, num_epochs=num_epochs, tokenizer=tokenizer)


In [4]:
pipeline(num_epochs=3)

Some weights of the model checkpoint at ainize/bart-base-cnn were not used when initializing BartForSequenceClassification: ['final_logits_bias', 'lm_head.weight']
- This IS expected if you are initializing BartForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BartForSequenceClassification were not initialized from the model checkpoint at ainize/bart-base-cnn and are newly initialized: ['classification_head.out_proj.weight', 'classification_head.out_proj.bias', 'classification_head.dense.weight', 'classification_head.dense.bias']
You should probably TRAIN this model on a down-strea

  0%|          | 0/3 [00:00<?, ?it/s]

Using custom data configuration default-0720af0f377253e9
Found cached dataset csv (/home/jvidakovic/.cache/huggingface/datasets/csv/default-0720af0f377253e9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/jvidakovic/.cache/huggingface/datasets/csv/default-0720af0f377253e9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-5f6bf4a5560153b2.arrow
Loading cached processed dataset at /home/jvidakovic/.cache/huggingface/datasets/csv/default-0720af0f377253e9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-d42b77ea8f3482ef.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

len(cls_train) = 5
len(cls_eval) = 5
len(summ_train) = 12
len(summ_eval) = 8
Setting up classification
Setting up summarization


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

summ_cls_ratio = 3
Dataloader for classification will be replicated 3 times.
Instead of 5, classification dataset iterator will yield 15 examples.


classification progress in epoch 1:   0%|          | 0/15 [00:00<?, ?it/s]

summarization progress in epoch 1:   0%|          | 0/12 [00:00<?, ?it/s]

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.


setting classification to train
setting summarization to train


In [None]:
# models = setup_models()

In [6]:
tokenizer = AutoTokenizer.from_pretrained("ainize/bart-base-cnn")
tokenizer.model_max_length

1000000000000000019884624838656

In [None]:

assert id(models["summarization"].model.shared) == id(models["classification"].model.shared)
assert id(models["summarization"].model.encoder) == id(models["classification"].model.encoder)
assert id(models["summarization"].model.decoder) == id(models["classification"].model.decoder)


In [5]:

print_first_param(models["classification"].model.encoder)
print_first_param(models["summarization"].model.encoder)


In [None]:

# we need a:
#   -> dataframe loaded with docee examples
#   -> tokenizer (bart tokenizer)

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH)

In [None]:
cls_train, cls_eval, summ_train, summ_eval = setup_dummy_dataset(
    cls_train_size=10, cls_eval_size=4, summ_train_size=55, summ_eval_size=17
)

In [None]:
task_templates = {
    "classification": {
        "train_dataset": cls_train,
        "eval_dataset": cls_eval,
        "batch_size": 1,
        "collate_fn": DataCollatorWithPadding(
            tokenizer=tokenizer,
            padding=PaddingStrategy.MAX_LENGTH,
            return_tensors="pt"
        )
    },
    "summarization": {
        "train_dataset": summ_train,
        "eval_dataset": summ_eval,
        "batch_size": 1,
        "collate_fn": DataCollatorForSeq2Seq(
            tokenizer=tokenizer,
            padding=PaddingStrategy.MAX_LENGTH,
            return_tensors="pt"
        )
    },
}

tasks = setup_tasks(task_templates, num_epochs=1)

In [None]:
train(tasks, num_epochs=3)

In [None]:

# bruhus
summ_dataset = load_dataset("cnn_dailymail", name="3.0.0")
print({split: len(summ_dataset[split]) for split in summ_dataset})

In [None]:
# okay, we got this
# cls_dataset = load_dataset("csv", data_files="../data/docee/train_all.csv")
# data_files can be a dictionary, where key is the name of the split, and value is path to the split
cls_dataset = load_dataset("csv", data_files={
    "train": "../data/docee/18091999/train.csv",
    "validation": "../data/docee/18091999/early_stopping.csv"
})
cls_dataset

In [None]:
cls_dataset["train"].shuffle(42).select(range(100))[:3]

In [None]:

max_input_length = 512
max_target_length = 100


In [None]:
tokenized_cnn = summ_dataset.map(process_summary_example, batched=True, remove_columns=["id", "article", "highlights"])


In [None]:
tokenized_cnn["train"].features

In [None]:

#import evaluate
rouge_score = evaluate.load("rouge")

generated_summary = "I absolutely loved reading the Hunger Games"
reference_summary = "I loved reading the Hunger Games"

scores = rouge_score.compute(
    predictions=[generated_summary],
    references=[reference_summary]
)
scores

In [None]:

summ_data_collator = DataCollatorForSeq2Seq(tokenizer, model=models["summarization"])

In [None]:
features = [tokenized_cnn[i] for i in range(2)]
features

In [None]:
summ_data_collator(features)

In [None]:
tokenized_cnn.set_format("torch")

In [None]:

batch_size = 4
train_dataloader = DataLoader(
    tokenized_cnn,
    shuffle=True,
    collate_fn=summ_data_collator,
    batch_size=batch_size
)

eval_dataloader = DataLoader(
    tokenized_cnn,
    collate_fn=summ_data_collator,
    batch_size=batch_size
)


In [None]:

optimizer = AdamW(models["summarization"].parameters(), lr=2e-5)

In [None]:
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    models["summarization"], optimizer, train_dataloader, eval_dataloader
)


In [None]:

num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)


In [None]:

# progress_bar = tqdm(range(num_training_steps))


for epoch in tqdm(range(num_train_epochs), total=num_train_epochs, desc="Epoch progress"):
    # Training
    model.train()
    for step, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader), desc="Epoch step", leave=False):
        # pass through model
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # okay, this works
        # but we cannot iterate over the two dataloaders with knowing which batch we got
        #   which means -> we got to do the iteration manually

    # Evaluation
    model.eval()
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
            )  # aha! we can plug the generation parameters here

            generated_tokens = accelerator.pad_across_processes(
                generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
            )
            labels = batch["labels"]

            # If we did not pad to max length, we need to pad the labels too
            labels = accelerator.pad_across_processes(
                batch["labels"], dim=1, pad_index=tokenizer.pad_token_id
            )

            generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
            labels = accelerator.gather(labels).cpu().numpy()

            # Replace -100 in the labels as we can't decode them
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            if isinstance(generated_tokens, tuple):
                generated_tokens = generated_tokens[0]
            decoded_preds = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            decoded_preds, decoded_labels = postprocess_text(
                decoded_preds, decoded_labels
            )

            rouge_score.add_batch(predictions=decoded_preds, references=decoded_labels)

            # evaluation loop is fine for summarization but we need it for classification as well

    # Compute metrics
    result = rouge_score.compute()
    # Extract the median ROUGE scores
    result = {key: value * 100 for key, value in result.items()}
    result = {k: round(v, 4) for k, v in result.items()}
    print(f"Epoch {epoch}:", result)

    output_dir = "./test_summ_train"
    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)


In [None]:
print_first_param(models["classification"].model.encoder)
print_first_param(models["summarization"].model.encoder)


In [None]:

cls_param_list = get_param_list(models["classification"].model)
summ_param_list = get_param_list(models["summarization"].model)
for cls_param, summ_param in zip(cls_param_list, summ_param_list):
    if not torch.all(torch.eq(cls_param, summ_param)):
        raise RuntimeError("Shared parameters are not equal!")

In [None]:
for name, param in models["classification"].named_parameters():
    print(f"{name} is {param}")
    break

In [None]:
event_names = cls_dataset.unique("event_type")
event_names

In [None]:
cls_dataset = cls_dataset.cast_column("event_type", ClassLabel(num_classes=len(event_names["train"]), names=sorted(event_names["train"])))

In [None]:
cls_dataset["train"].features

In [None]:
cls_dataset["train"][0]["text"]

In [None]:

docee = cls_dataset.map(preprocess_docee, batched=True, remove_columns=cls_dataset["train"].column_names)

In [None]:
docee["train"].features

In [None]:

batch_size = 1

data_collator = DataCollatorWithPadding(
    tokenizer,
    padding=PaddingStrategy.LONGEST,
    return_tensors="pt"
)

train_dataloader = DataLoader(
    docee["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator
)

eval_dataloader = DataLoader(
    docee["validation"],
    batch_size=batch_size,
    collate_fn=data_collator
)


In [None]:
for batch in train_dataloader:
    break

{k: v.shape for k, v in batch.items()}

In [None]:
# test run
outputs = models["classification"](**batch)
print(f"{outputs.loss = }")
print(f"{outputs.logits.shape = }")
# moze

In [None]:
cls_optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=cls_optimizer,
    num_warmup_steps=500,
    num_training_steps=num_training_steps
)
num_training_steps

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

In [None]:
cls_accelerator = Accelerator()
train_dataloader, eval_dataloader, model, optimizer = cls_accelerator.prepare(
    train_dataloader, eval_dataloader, models["classification"], cls_optimizer
)

In [None]:
f1 = evaluate.load("f1")


In [None]:
model.eval()
for batch in tqdm(eval_dataloader, total=len(eval_dataloader), desc="Evaluation"):
    # extract outputs
    outputs = model(**batch)
    # print(outputs.keys())  # loss, logits, encoder_last_hidden_state

    # outputs["logits"] = (BS, 59)
    # we need argmax by dimension 1

    # decode logits into labels
    predictions = torch.argmax(outputs["logits"], dim=1)
    # print(labels)
    f1.add_batch(
        predictions=predictions.cpu().numpy(),
        references=batch["labels"].cpu().numpy(),
    )
    # break
    # f1.add_batch(predictions=outputs["labels"])
result = f1.compute(average="macro")
print(result)


In [None]:
f1_micro = f1.compute(average="micro")
print(f1_micro)
# okay, so we cannot call compute multiple times

In [None]:
model.train()
for epoch in tqdm(range(num_epochs), total=num_epochs, desc="Epoch progress"):
    for batch in tqdm(train_dataloader, total=len(train_dataloader), desc=f"Epoch {epoch+1}", leave=False):
        outputs = model(**batch)
        loss = outputs.loss
        cls_accelerator.backward(loss)

        cls_optimizer.step()
        lr_scheduler.step()
        cls_optimizer.zero_grad()

    # eval loop
    # model.eval()
    # we need metrics
    # for batch in tqdm(eval_dataloader, total=len(eval_dataloader), desc=f"Evaluation after epoch {epoch+1}", leave=False):
    # what about the evaluation loop? -> stick it somewhere here

In [None]:
print_first_param(models["classification"].model.encoder)
print_first_param(models["summarization"].model.encoder)

In [None]:
num_train_epochs = 1
cls_steps = 1  # what does this mean?
summ_steps = 2   # what does this mean?

cls_batch_size=1
summ_batch_size=1

# probably doesnt make much sense to train summarization more often than classification, right?
# the thing is:
#   we are actually learning SUMMARIZATION!!
#   -> but we want to accomplish learning this summarization by utilizing classification as well

cls_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding=PaddingStrategy.MAX_LENGTH,
    return_tensors="pt"
)

summ_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=PaddingStrategy.MAX_LENGTH,
    return_tensors="pt"
)

tasks = {
    "classification": {
        "model": models["classification"],
        "optimizer": None,
        "train_dataloader": DataLoader(
            docee["train"],
            batch_size=cls_batch_size,
            shuffle=True,
            collate_fn=cls_collator
        ),
        "eval_dataloader": DataLoader(
            docee["validation"],
            batch_size=cls_batch_size,
            collate_fn=cls_collator
        )
    },
    "summarization": {
        "model": models["summarization"],
        "optimizer": None,
        "train_dataloader": DataLoader(
            tokenized_cnn["train"],
            batch_size=summ_batch_size,
            shuffle=True,
            collate_fn=summ_collator
        ),
        "eval_dataloader": DataLoader(
            tokenized_cnn["validation"],
            batch_size=summ_batch_size,
            collate_fn=summ_collator
        )
    }
}

tasks

In [None]:

setup_optimizers(tasks)

In [None]:
tasks

In [None]:
# the thing is, classification dataloader contains less examples than summarization dataloader
# we can solve this by oversampling the classification dataloader (by using itertools.tee)
summ_cls_ratio = len(tasks["summarization"]["train_dataloader"]) // len(tasks["classification"]["train_dataloader"]) + 1
summ_cls_ratio
# tasks["classification"]["train_dataloader"] = tee(tasks)


In [None]:

tasks["classification"]["train_dataloader"] = DataLoader(
    docee["train"],
    batch_size=cls_batch_size,
    shuffle=True,
    collate_fn=cls_collator
)

In [None]:

# set_train(tasks)

In [None]:
accelerate(tasks)

In [None]:
tasks["classification"]["train_dataloader"] = chain(*tee(tasks["classification"]["train_dataloader"], summ_cls_ratio))

In [None]:
test_iter = iter(tasks["classification"]["train_dataloader"])
batch = next(test_iter)
batch

In [None]:

fix_cls_dataloader(tasks)

In [None]:
num_epochs = 1

In [None]:
setup_schedulers(tasks)
tasks

In [None]:
set_train(tasks)

In [None]:

# pa ovo radi buraz


In [None]:


# okay, lets try this, but with an extremely small size, just to test the loop


In [None]:
cls_train, cls_eval, summ_train, summ_eval = setup_dummy_dataset(
    cls_train_size=10,
    cls_eval_size=5,
    summ_train_size=55,
    summ_eval_size=17
)