In [1]:
from transformers import BartForSequenceClassification, BartForConditionalGeneration
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers.utils import PaddingStrategy
import evaluate
from nltk import sent_tokenize
import numpy as np
from transformers import DataCollatorForSeq2Seq
from torch.utils.data import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator
from transformers import get_scheduler
import nltk
from tqdm.auto import tqdm
import torch
from datasets import ClassLabel
from transformers import DataCollatorWithPadding

PRETRAINED_MODEL_NAME_OR_PATH="ainize/bart-base-cnn"

In [2]:
def setup_models():
    # initialize models
    classification_model = BartForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH, num_labels=59)
    summarization_model = BartForConditionalGeneration.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH)

    # share parameters
    summarization_model.model.shared = classification_model.model.shared
    summarization_model.model.encoder = classification_model.model.encoder
    summarization_model.model.decoder = classification_model.model.decoder

    return {
        "summarization": summarization_model,
        "classification": classification_model
    }


In [3]:
models = setup_models()
assert id(models["summarization"].model.shared) == id(models["classification"].model.shared)
assert id(models["summarization"].model.encoder) == id(models["classification"].model.encoder)
assert id(models["summarization"].model.decoder) == id(models["classification"].model.decoder)


Some weights of the model checkpoint at ainize/bart-base-cnn were not used when initializing BartForSequenceClassification: ['final_logits_bias', 'lm_head.weight']
- This IS expected if you are initializing BartForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BartForSequenceClassification were not initialized from the model checkpoint at ainize/bart-base-cnn and are newly initialized: ['classification_head.out_proj.weight', 'classification_head.out_proj.bias', 'classification_head.dense.bias', 'classification_head.dense.weight']
You should probably TRAIN this model on a down-strea

In [4]:
def print_first_param(model):
    for name, param in model.named_parameters():
        print(f"{name} is {param}")
        break

print_first_param(models["classification"].model.encoder)
print_first_param(models["summarization"].model.encoder)


embed_tokens.weight is Parameter containing:
tensor([[ 0.0113,  0.0083, -0.0115,  ...,  0.0084,  0.1087,  0.0126],
        [ 0.0123, -0.0161,  0.0099,  ..., -0.0460, -0.0303,  0.0128],
        [ 0.0790, -0.0347,  0.0084,  ...,  0.0486,  0.0094,  0.0307],
        ...,
        [ 0.0131, -0.0308, -0.0453,  ..., -0.0032,  0.0246, -0.0191],
        [ 0.0053, -0.0446, -0.0519,  ...,  0.0054,  0.0143, -0.0166],
        [ 0.0102, -0.0272, -0.0528,  ...,  0.0231,  0.0057, -0.0126]],
       requires_grad=True)
embed_tokens.weight is Parameter containing:
tensor([[ 0.0113,  0.0083, -0.0115,  ...,  0.0084,  0.1087,  0.0126],
        [ 0.0123, -0.0161,  0.0099,  ..., -0.0460, -0.0303,  0.0128],
        [ 0.0790, -0.0347,  0.0084,  ...,  0.0486,  0.0094,  0.0307],
        ...,
        [ 0.0131, -0.0308, -0.0453,  ..., -0.0032,  0.0246, -0.0191],
        [ 0.0053, -0.0446, -0.0519,  ...,  0.0054,  0.0143, -0.0166],
        [ 0.0102, -0.0272, -0.0528,  ...,  0.0231,  0.0057, -0.0126]],
       requires

In [5]:

# we need a:
#   -> dataframe loaded with docee examples
#   -> tokenizer (bart tokenizer)

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH)

In [6]:

# bruhus
summ_dataset = load_dataset("cnn_dailymail", name="3.0.0")
print({split: len(summ_dataset[split]) for split in summ_dataset})

Found cached dataset cnn_dailymail (/home/jvidakovic/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


  0%|          | 0/3 [00:00<?, ?it/s]

{'train': 287113, 'validation': 13368, 'test': 11490}


In [35]:
def compose2(f, g):
    def composition(*args, **kwargs):
        g_output = g(*args, **kwargs)
        return f(g_output)
    return composition

def c(*fs):
    def composition(*args, **kwargs):
        output = fs[-1](*args, **kwargs)
        for f in reversed(fs[:-1]):
            output = f(output)
        return output
    return composition


In [7]:
# okay, we got this
# cls_dataset = load_dataset("csv", data_files="../data/docee/train_all.csv")
# data_files can be a dictionary, where key is the name of the split, and value is path to the split
cls_dataset = load_dataset("csv", data_files={
    "train": "../data/docee/18091999/train.csv",
    "validation": "../data/docee/18091999/early_stopping.csv"
})
cls_dataset

Using custom data configuration default-0720af0f377253e9
Found cached dataset csv (/home/jvidakovic/.cache/huggingface/datasets/csv/default-0720af0f377253e9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'title', 'text', 'event_type', 'arguments', 'date', 'metadata'],
        num_rows: 17559
    })
    validation: Dataset({
        features: ['index', 'title', 'text', 'event_type', 'arguments', 'date', 'metadata'],
        num_rows: 2195
    })
})

In [10]:
cls_dataset["train"].shuffle(42).select(range(100))[:3]

Loading cached shuffled indices for dataset at /home/jvidakovic/.cache/huggingface/datasets/csv/default-0720af0f377253e9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-1b432e07d6b975f8.arrow


{'index': [8677, 13606, 10423],
 'title': ['Simulation of glacial calving and tsunami waves predicts climate change consequences',
  'Four protesters and two police officers are killed during clashes in Baghdad.',
  'Italian firm Fiat Chrysler  proposes a merger with French carmaker Renault. The new company will be based in the Netherlands and will be listed on the Milan, Paris and New York stock exchanges.'],
 'text': ['As natural disasters intensify due to climate change, accurate predictions of weather patterns and mechanisms are greatly needed to mitigate damage. Coastal regions will be the most affected by changing weather, with events such as tsunamis and hurricanes becoming more frequent and life-threatening. While most tsunamis are caused by earthquakes and tectonic activity, the warming of the planet is now increasing the occurrence of tsunamis caused by glacier calving, when chunks of glacier break off and become icebergs. Additionally, glacier calving is predicted to be the 

In [8]:

max_input_length = 512
max_target_length = 100

def process_summary_example(examples):
    # tokenize the article
    batch_encoding = tokenizer(
        examples["article"],
        max_length=max_input_length,
        truncation=True
    )

    # tokenize the labels
    tokenized_highlights = tokenizer(
        examples["highlights"],
        max_length=max_target_length,
        truncation=True
    )

    batch_encoding["labels"] = tokenized_highlights["input_ids"]
    return batch_encoding

In [9]:
tokenized_cnn = summ_dataset.map(process_summary_example, batched=True, remove_columns=["id", "article", "highlights"])


  0%|          | 0/288 [00:00<?, ?ba/s]

Loading cached processed dataset at /home/jvidakovic/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de/cache-b0bb298f8937ca1c.arrow


  0%|          | 0/12 [00:00<?, ?ba/s]

In [39]:
tokenized_cnn["train"].features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

In [40]:

#import evaluate
rouge_score = evaluate.load("rouge")

generated_summary = "I absolutely loved reading the Hunger Games"
reference_summary = "I loved reading the Hunger Games"

scores = rouge_score.compute(
    predictions=[generated_summary],
    references=[reference_summary]
)
scores

{'rouge1': 0.923076923076923,
 'rouge2': 0.7272727272727272,
 'rougeL': 0.923076923076923,
 'rougeLsum': 0.923076923076923}

In [41]:
def compute_rouge(eval_pred):
    predictions, labels = eval_pred
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract the median scores
    result = {key: value * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}


In [16]:

summ_data_collator = DataCollatorForSeq2Seq(tokenizer, model=models["summarization"])

In [17]:
features = [tokenized_cnn[i] for i in range(2)]
features

[{'input_ids': [0,
   1640,
   16256,
   43,
   11957,
   6,
   8,
   110,
   4085,
   40,
   28,
   39582,
   4,
   280,
   189,
   2369,
   101,
   41,
   43962,
   2329,
   1580,
   6,
   53,
   77,
   525,
   19678,
   163,
   8508,
   7485,
   1120,
   1403,
   12445,
   1276,
   7,
   492,
   65,
   9,
   69,
   33473,
   7,
   10,
   12443,
   6,
   69,
   19501,
   11153,
   62,
   19,
   380,
   414,
   4,
   85,
   4596,
   11,
   411,
   1484,
   2806,
   28748,
   3277,
   4,
   280,
   3911,
   8,
   885,
   9725,
   69,
   4,
   22,
   100,
   802,
   38,
   21,
   164,
   7,
   244,
   42,
   65,
   621,
   54,
   38,
   218,
   75,
   216,
   6,
   53,
   5,
   754,
   14,
   98,
   171,
   82,
   64,
   33,
   10,
   301,
   5064,
   6,
   14,
   18,
   1256,
   380,
   60,
   163,
   8508,
   7485,
   1120,
   174,
   3480,
   10515,
   229,
   14740,
   4,
   264,
   189,
   619,
   10346,
   11,
   69,
   19501,
   30,
   10,
   723,
   476,
   4,
   22,
   22086,
 

In [18]:
summ_data_collator(features)

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[    0,  1640, 16256,  ...,    39,  2761,     2],
        [    0,  1640, 16256,  ...,    95,    15,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[    0,  1301, 19678,   163,  8508,  7485,  1120,  1276,     7,   492,
            10, 12855,     7,    10, 12443,   479, 50118,   250,    92,  3034,
           586,  1147,    69,  7096, 15220, 28748,  3277,    13,   411, 12855,
          1484,   479,     2,  -100,  -100,  -100],
        [    0,   133,   291,   212, 13989,   191,  3772,    42,   983,   479,
         50118, 17608,    34,  1714,  8617,   187,    63, 17692,    11,  8008,
           479, 50118,  6323,   864,   549,  1492,  2624,  5391,  9686,     8,
         12291,   240,     7,   464,   479,     2]]), 'decoder_input_ids': tensor([[    2,     0,  1301, 19678,   163,  8508,  7485,  1120,  1276,     7,
           492,    10, 12855,     7,    10, 12443,   479, 50118,   250,    92,
          3

In [10]:
tokenized_cnn.set_format("torch")

In [20]:

batch_size = 4
train_dataloader = DataLoader(
    tokenized_cnn,
    shuffle=True,
    collate_fn=summ_data_collator,
    batch_size=batch_size
)

eval_dataloader = DataLoader(
    tokenized_cnn,
    collate_fn=summ_data_collator,
    batch_size=batch_size
)


In [21]:

optimizer = AdamW(models["summarization"].parameters(), lr=2e-5)

In [22]:
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    models["summarization"], optimizer, train_dataloader, eval_dataloader
)


In [23]:

num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)


In [43]:

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # ROUGE expects a newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

In [25]:

# progress_bar = tqdm(range(num_training_steps))


for epoch in tqdm(range(num_train_epochs), total=num_train_epochs, desc="Epoch progress"):
    # Training
    model.train()
    for step, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader), desc="Epoch step", leave=False):
        # pass through model
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # okay, this works
        # but we cannot iterate over the two dataloaders with knowing which batch we got
        #   which means -> we got to do the iteration manually

    # Evaluation
    model.eval()
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
            )  # aha! we can plug the generation parameters here

            generated_tokens = accelerator.pad_across_processes(
                generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
            )
            labels = batch["labels"]

            # If we did not pad to max length, we need to pad the labels too
            labels = accelerator.pad_across_processes(
                batch["labels"], dim=1, pad_index=tokenizer.pad_token_id
            )

            generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
            labels = accelerator.gather(labels).cpu().numpy()

            # Replace -100 in the labels as we can't decode them
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            if isinstance(generated_tokens, tuple):
                generated_tokens = generated_tokens[0]
            decoded_preds = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            decoded_preds, decoded_labels = postprocess_text(
                decoded_preds, decoded_labels
            )

            rouge_score.add_batch(predictions=decoded_preds, references=decoded_labels)

            # evaluation loop is fine for summarization but we need it for classification as well

    # Compute metrics
    result = rouge_score.compute()
    # Extract the median ROUGE scores
    result = {key: value * 100 for key, value in result.items()}
    result = {k: round(v, 4) for k, v in result.items()}
    print(f"Epoch {epoch}:", result)

    output_dir = "./test_summ_train"
    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)


Epoch progress:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch step:   0%|          | 0/3342 [00:00<?, ?it/s]



Epoch 0: {'rouge1': 25.981, 'rouge2': 14.579, 'rougeL': 22.3367, 'rougeLsum': 24.8108}


In [26]:
print_first_param(models["classification"].model.encoder)
print_first_param(models["summarization"].model.encoder)


embed_tokens.weight is Parameter containing:
tensor([[ 0.0110,  0.0082, -0.0107,  ...,  0.0088,  0.1086,  0.0112],
        [ 0.0123, -0.0161,  0.0099,  ..., -0.0460, -0.0303,  0.0128],
        [ 0.0785, -0.0346,  0.0086,  ...,  0.0478,  0.0098,  0.0302],
        ...,
        [ 0.0131, -0.0308, -0.0453,  ..., -0.0032,  0.0246, -0.0191],
        [ 0.0053, -0.0446, -0.0519,  ...,  0.0054,  0.0143, -0.0166],
        [ 0.0102, -0.0272, -0.0527,  ...,  0.0231,  0.0057, -0.0126]],
       device='cuda:0', requires_grad=True)
embed_tokens.weight is Parameter containing:
tensor([[ 0.0110,  0.0082, -0.0107,  ...,  0.0088,  0.1086,  0.0112],
        [ 0.0123, -0.0161,  0.0099,  ..., -0.0460, -0.0303,  0.0128],
        [ 0.0785, -0.0346,  0.0086,  ...,  0.0478,  0.0098,  0.0302],
        ...,
        [ 0.0131, -0.0308, -0.0453,  ..., -0.0032,  0.0246, -0.0191],
        [ 0.0053, -0.0446, -0.0519,  ...,  0.0054,  0.0143, -0.0166],
        [ 0.0102, -0.0272, -0.0527,  ...,  0.0231,  0.0057, -0.0126]]

In [27]:
def get_param_list(model):
    return [
        param for param in model.shared.parameters()
    ] + [
        param for param in model.encoder.parameters()
    ] + [
        param for param in model.decoder.parameters()
    ]

cls_param_list = get_param_list(models["classification"].model)
summ_param_list = get_param_list(models["summarization"].model)
for cls_param, summ_param in zip(cls_param_list, summ_param_list):
    if not torch.all(torch.eq(cls_param, summ_param)):
        raise RuntimeError("Shared parameters are not equal!")

In [28]:
for name, param in models["classification"].named_parameters():
    print(f"{name} is {param}")
    break

model.shared.weight is Parameter containing:
tensor([[ 0.0110,  0.0082, -0.0107,  ...,  0.0088,  0.1086,  0.0112],
        [ 0.0123, -0.0161,  0.0099,  ..., -0.0460, -0.0303,  0.0128],
        [ 0.0785, -0.0346,  0.0086,  ...,  0.0478,  0.0098,  0.0302],
        ...,
        [ 0.0131, -0.0308, -0.0453,  ..., -0.0032,  0.0246, -0.0191],
        [ 0.0053, -0.0446, -0.0519,  ...,  0.0054,  0.0143, -0.0166],
        [ 0.0102, -0.0272, -0.0527,  ...,  0.0231,  0.0057, -0.0126]],
       device='cuda:0', requires_grad=True)


In [11]:
event_names = cls_dataset.unique("event_type")
event_names

{'train': ['Military Exercise',
  'Fire',
  'Air crash',
  'Droughts',
  'Awards ceremony',
  'Diplomatic Talks _ Diplomatic_Negotiation_ Summit Meeting',
  'Road Crash',
  'Riot',
  'Armed Conflict',
  'Government Policy Changes',
  'Withdraw from an Organization',
  'Famous Person - Sick',
  'Strike',
  'Government Job change - Election',
  'New achievements in aerospace',
  'Organization Closed',
  'Protest_Online Condemnation',
  'Hurricanes_Tornado_Storm_Blizzard',
  'Famous Person - Commit Crime - Release',
  'Earthquakes',
  'Famous Person - Commit Crime - Accuse',
  'Famous Person - Commit Crime - Arrest',
  'Diplomatic Visit',
  'Bank Robbery',
  'Financial Aid',
  'Famous Person - Marriage',
  'Mine Collapses',
  'Government Job change - Appoint_Inauguration',
  'Famous Person - Commit Crime - Sentence',
  'Volcano Eruption',
  'Disease Outbreaks',
  'Famous Person - Death',
  'Government Job change - Resignation_Dismissal',
  'Mass Poisoning',
  'Train collisions',
  'Gas ex

In [12]:
cls_dataset = cls_dataset.cast_column("event_type", ClassLabel(num_classes=len(event_names["train"]), names=sorted(event_names["train"])))

Loading cached processed dataset at /home/jvidakovic/.cache/huggingface/datasets/csv/default-0720af0f377253e9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-5f6bf4a5560153b2.arrow
Loading cached processed dataset at /home/jvidakovic/.cache/huggingface/datasets/csv/default-0720af0f377253e9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-d42b77ea8f3482ef.arrow


In [45]:
cls_dataset["train"].features

{'index': Value(dtype='int64', id=None),
 'title': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None),
 'event_type': ClassLabel(names=['Air crash', 'Armed Conflict', 'Awards ceremony', 'Bank Robbery', 'Break historical records', 'Diplomatic Talks _ Diplomatic_Negotiation_ Summit Meeting', 'Diplomatic Visit', 'Disease Outbreaks', 'Droughts', 'Earthquakes', 'Environment Pollution', 'Famine', 'Famous Person - Commit Crime - Accuse', 'Famous Person - Commit Crime - Arrest', 'Famous Person - Commit Crime - Investigate', 'Famous Person - Commit Crime - Release', 'Famous Person - Commit Crime - Sentence', 'Famous Person - Death', 'Famous Person - Divorce', 'Famous Person - Give a speech', 'Famous Person - Marriage', 'Famous Person - Recovered', 'Famous Person - Sick', 'Financial Aid', 'Financial Crisis', 'Fire', 'Floods', 'Gas explosion', 'Government Job change - Appoint_Inauguration', 'Government Job change - Election', 'Government Job change - Resignation_Dismissal', '

In [46]:
cls_dataset["train"][0]["text"]



In [13]:
def preprocess_docee(examples):
    batch_encoding = tokenizer(
        examples["text"],
        truncation=True,
        max_length=512
    )
    batch_encoding["labels"] = examples["event_type"]
    return batch_encoding

docee = cls_dataset.map(preprocess_docee, batched=True, remove_columns=cls_dataset["train"].column_names)

  0%|          | 0/18 [00:00<?, ?ba/s]

Loading cached processed dataset at /home/jvidakovic/.cache/huggingface/datasets/csv/default-0720af0f377253e9/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-a80ff54c5a9ded56.arrow


In [12]:
docee["train"].features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'labels': Value(dtype='int64', id=None)}

In [13]:

batch_size = 1

data_collator = DataCollatorWithPadding(
    tokenizer,
    padding=PaddingStrategy.LONGEST,
    return_tensors="pt"
)

train_dataloader = DataLoader(
    docee["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator
)

eval_dataloader = DataLoader(
    docee["validation"],
    batch_size=batch_size,
    collate_fn=data_collator
)


In [14]:
for batch in train_dataloader:
    break

{k: v.shape for k, v in batch.items()}

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': torch.Size([1, 512]),
 'attention_mask': torch.Size([1, 512]),
 'labels': torch.Size([1])}

In [19]:
# test run
outputs = models["classification"](**batch)
print(f"{outputs.loss = }")
print(f"{outputs.logits.shape = }")
# moze

outputs.loss = tensor(3.8328, grad_fn=<NllLossBackward0>)
outputs.logits.shape = torch.Size([1, 59])


In [20]:
cls_optimizer = AdamW(model.parameters(), lr=5e-5)

In [21]:
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=cls_optimizer,
    num_warmup_steps=500,
    num_training_steps=num_training_steps
)
num_training_steps

17559

In [22]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [23]:
cls_accelerator = Accelerator()
train_dataloader, eval_dataloader, model, optimizer = cls_accelerator.prepare(
    train_dataloader, eval_dataloader, models["classification"], cls_optimizer
)

In [25]:
f1 = evaluate.load("f1")


Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [31]:
model.eval()
for batch in tqdm(eval_dataloader, total=len(eval_dataloader), desc="Evaluation"):
    # extract outputs
    outputs = model(**batch)
    # print(outputs.keys())  # loss, logits, encoder_last_hidden_state

    # outputs["logits"] = (BS, 59)
    # we need argmax by dimension 1

    # decode logits into labels
    predictions = torch.argmax(outputs["logits"], dim=1)
    # print(labels)
    f1.add_batch(
        predictions=predictions.cpu().numpy(),
        references=batch["labels"].cpu().numpy(),
    )
    # break
    # f1.add_batch(predictions=outputs["labels"])
result = f1.compute(average="macro")
print(result)


Evaluation:   0%|          | 0/2195 [00:00<?, ?it/s]

{'f1': 0.008521995382692497}


In [32]:
f1_micro = f1.compute(average="micro")
print(f1_micro)
# okay, so we cannot call compute multiple times

In [19]:
model.train()
for epoch in tqdm(range(num_epochs), total=num_epochs, desc="Epoch progress"):
    for batch in tqdm(train_dataloader, total=len(train_dataloader), desc=f"Epoch {epoch+1}", leave=False):
        outputs = model(**batch)
        loss = outputs.loss
        cls_accelerator.backward(loss)

        cls_optimizer.step()
        lr_scheduler.step()
        cls_optimizer.zero_grad()

    # eval loop
    # model.eval()
    # we need metrics
    # for batch in tqdm(eval_dataloader, total=len(eval_dataloader), desc=f"Evaluation after epoch {epoch+1}", leave=False):
    # what about the evaluation loop? -> stick it somewhere here

Epoch progress:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/17559 [00:00<?, ?it/s]

In [20]:
print_first_param(models["classification"].model.encoder)
print_first_param(models["summarization"].model.encoder)

In [14]:
num_train_epochs = 1
cls_steps = 1  # what does this mean?
summ_steps = 2   # what does this mean?

cls_batch_size=1
summ_batch_size=1

# probably doesnt make much sense to train summarization more often than classification, right?
# the thing is:
#   we are actually learning SUMMARIZATION!!
#   -> but we want to accomplish learning this summarization by utilizing classification as well

cls_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding=PaddingStrategy.MAX_LENGTH,
    return_tensors="pt"
)

summ_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=PaddingStrategy.MAX_LENGTH,
    return_tensors="pt"
)

tasks = {
    "classification": {
        "model": models["classification"],
        "optimizer": None,
        "train_dataloader": DataLoader(
            docee["train"],
            batch_size=cls_batch_size,
            shuffle=True,
            collate_fn=cls_collator
        ),
        "eval_dataloader": DataLoader(
            docee["validation"],
            batch_size=cls_batch_size,
            collate_fn=cls_collator
        )
    },
    "summarization": {
        "model": models["summarization"],
        "optimizer": None,
        "train_dataloader": DataLoader(
            tokenized_cnn["train"],
            batch_size=summ_batch_size,
            shuffle=True,
            collate_fn=summ_collator
        ),
        "eval_dataloader": DataLoader(
            tokenized_cnn["validation"],
            batch_size=summ_batch_size,
            collate_fn=summ_collator
        )
    }
}

tasks

{'classification': {'model': BartForSequenceClassification(
    (model): BartModel(
      (shared): Embedding(50265, 768, padding_idx=1)
      (encoder): BartEncoder(
        (embed_tokens): Embedding(50265, 768, padding_idx=1)
        (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
        (layers): ModuleList(
          (0): BartEncoderLayer(
            (self_attn): BartAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=

In [15]:
def setup_optimizers(tasks):
    for task_name, task_objects in tasks.items():
        print(f"Setting up {task_name}")
        task_objects["optimizer"] = AdamW(task_objects["model"].parameters(), lr=2e-5)

setup_optimizers(tasks)

Setting up classification
Setting up summarization


In [52]:
tasks

{'classification': {'model': BartForSequenceClassification(
    (model): BartModel(
      (shared): Embedding(50265, 768, padding_idx=1)
      (encoder): BartEncoder(
        (embed_tokens): Embedding(50265, 768, padding_idx=1)
        (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
        (layers): ModuleList(
          (0): BartEncoderLayer(
            (self_attn): BartAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=

In [16]:
# the thing is, classification dataloader contains less examples than summarization dataloader
# we can solve this by oversampling the classification dataloader (by using itertools.tee)
summ_cls_ratio = len(tasks["summarization"]["train_dataloader"]) // len(tasks["classification"]["train_dataloader"]) + 1
summ_cls_ratio
# tasks["classification"]["train_dataloader"] = tee(tasks)


17

In [None]:
from itertools import tee, chain

tasks["classification"]["train_dataloader"] = DataLoader(
    docee["train"],
    batch_size=cls_batch_size,
    shuffle=True,
    collate_fn=cls_collator
)

In [18]:
def set_train(tasks):
    tasks["summarization"]["model"].train()
    tasks["classification"]["model"].train()

# set_train(tasks)

In [33]:
def accelerate(tasks):
    for task in tasks:
        accelerator = Accelerator()
        # tasks[task]["accelerator"] = Accelerator()
        for component in ["model", "optimizer", "train_dataloader", "eval_dataloader"]:
            tasks[task][component] = accelerator.prepare(tasks[task][component])
        tasks[task]["accelerator"] = accelerator

In [34]:
accelerate(tasks)

In [35]:
tasks["classification"]["train_dataloader"] = chain(*tee(tasks["classification"]["train_dataloader"], summ_cls_ratio))

In [38]:
test_iter = iter(tasks["classification"]["train_dataloader"])
batch = next(test_iter)
batch

{'input_ids': tensor([[    0,  3762,     9,     5,  8260,  7749,    11,  2673,   750,     7,
           478,     5,   382,    34, 18432,  1926,    12, 10823,  1261,     6,
          5681,  1611,     6, 14784,    66, 11487,     8, 23285,  3980,     4,
         50118,   500, 10338,  1780,   518,    32,  1786,     7,  7118,     5,
           455,   913,     9,  4370,   988,     6,    61,   156, 19504,    15,
           307,  1390,    25,    10,  4120,   237,  2130,    19, 18918, 17055,
            36,  5714,  7203,    73,   298,    43,  2372,     4, 50118,  3750,
           513,   411,    82,    33,    57,   848,     6,   144,     9,   106,
            11,  1261,     4, 50118, 15852, 12434,     7,    10, 10602,  2130,
             6,   988,    34,    57,  3022,     5,  9347, 11275,     4,  5809,
            12, 12557,  1899,  8383,    32,   202,    11,   317,     6,     5,
           382,   496,  4370,   824,   161,     6,     8,  1196,   420,     5,
          3174,   382,    33,    57,  

In [39]:
def setup_dataloader_lengths(tasks):
    tasks["classification"]["train_len"] = len(docee["train"]) // cls_batch_size
    tasks["summarization"]["train_len"] = len(tasks["summarization"]["train_dataloader"])

setup_dataloader_lengths(tasks)

In [40]:
num_epochs = 1
def setup_schedulers(tasks):
    for task in tasks:
        tasks[task]["lr_scheduler"] = get_scheduler(
            "linear",
            tasks[task]["optimizer"],
            num_warmup_steps=0,
            num_training_steps=num_epochs * tasks[task]["train_len"]
        )

In [41]:
setup_schedulers(tasks)
tasks

{'classification': {'model': BartForSequenceClassification(
    (model): BartModel(
      (shared): Embedding(50265, 768, padding_idx=1)
      (encoder): BartEncoder(
        (embed_tokens): Embedding(50265, 768, padding_idx=1)
        (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
        (layers): ModuleList(
          (0): BartEncoderLayer(
            (self_attn): BartAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=

In [42]:
set_train(tasks)

In [44]:
for epoch in tqdm(range(num_epochs), desc="Epoch", total=num_epochs):
    # load training data, step by step
    num_epoch_steps = len(tasks["summarization"]["train_dataloader"]) * 2
    iters = {task: iter(tasks[task]["train_dataloader"]) for task in tasks}
    progress_bars = {
        task: tqdm(range(tasks[task]["train_len"]), desc=f"{task} progress", total=tasks[task]["train_len"], leave=False)
        for task in tasks
    }
    for step in range(num_epoch_steps):
        if step % 2 == 0: # train summarization
            task = "summarization"
        else:
            task = "classification"
        batch = next(iters[task])
        outputs = tasks[task]["model"](**batch)
        loss = outputs.loss
        tasks[task]["accelerator"].backward(loss)
        tasks[task]["optimizer"].step()
        tasks[task]["lr_scheduler"].step()
        tasks[task]["optimizer"].zero_grad()
        progress_bars[task].update(1)

# pa ovo radi buraz


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

classification progress:   0%|          | 0/17559 [00:00<?, ?it/s]

summarization progress:   0%|          | 0/287113 [00:00<?, ?it/s]