In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset, load_metric

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
%pip install accelerate -U

********************

**************

last

************

In [1]:
from tqdm.notebook import tqdm
from IPython import display

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn

from datasets import load_dataset
from transformers import T5TokenizerFast, T5ForConditionalGeneration, DataCollatorForSeq2Seq

In [2]:
# Hugging Face checkpoint that is fine-tuned in this notebook.
BASE_MODEL_NAME = "t5-large"

# Optimisation hyper-parameters.
BATCH_SIZE = 16
LEARNING_RATE = 5e-5
EPOCHS = 3

# Fixed seed so that torch-driven randomness (for example the shuffled
# training order of a DataLoader created with shuffle=True) is repeatable
# after a kernel restart.
SEED = 42
torch.manual_seed(SEED)

# Use the first CUDA device when one is available, otherwise the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
# Fetch the RTE configuration of the GLUE benchmark and show its splits.
dataset = load_dataset(path="glue", name="rte")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 2490
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 277
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3000
    })
})


In [4]:
# Raw integer label of the first validation example.
dataset['validation'][0]['label']

1

In [5]:
def id2label(ids):
    """Translate integer class ids into their textual label names.

    Args:
        ids: Iterable of integer class ids; 0 and 1 are the known classes.

    Returns:
        list[str]: One label name per id, in input order. Any id outside the
        known range yields ``'unknown'``, a string that ``label2id`` in this
        notebook maps to its fallback id 2.
    """
    label_names = ['entailment', 'not_entailment']
    # Guard the lookup explicitly: plain list indexing would let a negative id
    # wrap around (-1 -> 'not_entailment') and silently invent a label.
    # NOTE(review): GLUE reportedly marks unlabelled test rows with -1 --
    # confirm against the dataset card.
    return [
        label_names[class_id] if 0 <= class_id < len(label_names) else 'unknown'
        for class_id in ids
    ]

def label2id(labels):
    """Convert textual label names into integer class ids.

    Args:
        labels: Iterable of label strings.

    Returns:
        list[int]: 0 for 'entailment', 1 for 'not_entailment' and 2 for any
        other string, in input order.
    """
    name_to_id = {'entailment': 0, 'not_entailment': 1}
    fallback_id = 2  # anything that is not an exact label name

    class_ids = []
    for name in labels:
        class_ids.append(name_to_id.get(name, fallback_id))
    return class_ids

In [6]:
# Load the fast tokenizer published with the BASE_MODEL_NAME checkpoint.
tokenizer = T5TokenizerFast.from_pretrained(BASE_MODEL_NAME)

In [7]:
# NOTE(review): leftover smoke-test cell -- it only prints a fixed string and
# nothing else depends on it, so it can be deleted.
print('hello')

hello


In [8]:
def preprocess_input(text):
    """Lower-case ``text`` and replace each ``<br />`` tag with one space.

    Lower-casing happens first, so upper-case variants such as ``<BR />`` are
    replaced as well.

    NOTE(review): no active code visible in this notebook calls this helper.
    """
    return text.lower().replace('<br />', ' ')

def map_function(row):
    """Tokenise one batch of RTE rows into model inputs and target labels.

    Written for ``dataset.map(..., batched=True)``: every column in ``row``
    is read as a sequence with one entry per example.

    Args:
        row: Mapping with the columns 'sentence1', 'sentence2' and 'label'.

    Returns:
        dict: Every field of the tokenizer output for the prompts, plus
        'labels' holding the token ids of the textual label names.
    """
    prompts = [
        f"premise: {premise} hypothesis: {hypothesis}"
        for premise, hypothesis in zip(row['sentence1'], row['sentence2'])
    ]
    # Inputs are truncated to 256 tokens but left unpadded here.
    input_info = tokenizer(prompts, truncation=True, max_length=256)
    # Targets are padded to the longest label name in this batch.
    # NOTE(review): padded positions keep the tokenizer's pad id rather than
    # -100, so they presumably count toward the training loss -- confirm that
    # this is intended. valid_loop decodes these ids directly.
    output_info = tokenizer(id2label(row['label']), padding=True)
    return {
        **input_info,
        'labels': output_info.input_ids
    }


# Tokenise every split batch-wise; the new columns are added alongside the
# original ones.
dataset = dataset.map(map_function, batched=True)
# Return only the three model-facing columns, as torch tensors, when rows are read.
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

hello
hello
hello


In [9]:
# Show the splits again to confirm the tokenised columns are present.
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2490
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 277
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
})

In [10]:
# Inspect the encoded validation targets (token ids of the label names).
dataset['validation']['labels']

tensor([[  59,  834,   35, 5756,  297,    1],
        [   3,   35, 5756,  297,    1,    0],
        [  59,  834,   35, 5756,  297,    1],
        ...,
        [   3,   35, 5756,  297,    1,    0],
        [   3,   35, 5756,  297,    1,    0],
        [  59,  834,   35, 5756,  297,    1]])

In [11]:
# Load the pretrained BASE_MODEL_NAME checkpoint that is fine-tuned below.
model = T5ForConditionalGeneration.from_pretrained(BASE_MODEL_NAME)

In [12]:
# Collator that pads each batch to its longest sequence and returns torch tensors.
col_fn = DataCollatorForSeq2Seq(
    tokenizer, return_tensors='pt', padding='longest',
)

# Training batches are reshuffled on every pass over the loader.
train_loader = torch.utils.data.DataLoader(
    dataset['train'],
    batch_size=BATCH_SIZE,
    collate_fn=col_fn,
    shuffle=True
)

# NOTE: despite its name, this loader iterates the *validation* split, in
# dataset order.
test_loader = torch.utils.data.DataLoader(
    dataset['validation'],
    batch_size=BATCH_SIZE,
    collate_fn=col_fn,
)

In [13]:
# import torch
# import numpy as np
# from tqdm import tqdm

# def train_loop(model, loader, optimizer, accumulation_steps=32):
#     model.train()

#     batch_losses = []
#     optimizer.zero_grad()

#     for i, row in enumerate(tqdm(loader, desc='Training:')):
#         out = model(**{k: v.to(model.device) for k, v in row.items()})
#         loss = out.loss / accumulation_steps  # normalize loss

#         batch_loss_value = loss.item() * accumulation_steps  # convert to original loss value for logging
#         loss.backward()

#         if (i + 1) % accumulation_steps == 0:  # update weights every accumulation_steps mini-batches
#             optimizer.step()
#             optimizer.zero_grad()  # reset gradients

#         batch_losses.append(batch_loss_value)

#     # Update remaining gradients if the number of batches is not a multiple of accumulation_steps
#     if len(loader) % accumulation_steps != 0:
#         optimizer.step()
#         optimizer.zero_grad()

#     loss_value = np.mean(batch_losses)
#     return {'train_loss': loss_value}


In [14]:
def train_loop(model, loader, optimizer):
    """Run one optimisation pass over ``loader`` and report the mean loss.

    Args:
        model: Module whose forward call returns an object with a ``loss``
            attribute and which exposes a ``device`` attribute.
        loader: Iterable of batches; each batch must support ``.to(device)``
            and ``**`` unpacking into the model call.
        optimizer: Optimizer that is stepped once per batch.

    Returns:
        dict: ``{'train_loss': mean of the per-batch loss values}``.
    """
    model.train()
    losses = []

    for batch in tqdm(loader, desc='Training:'):
        # Drop gradients left over from the previous step.
        optimizer.zero_grad()

        outputs = model(**batch.to(model.device))
        batch_loss = outputs.loss
        # Keep the scalar as a plain Python number for the epoch average.
        losses.append(batch_loss.item())

        batch_loss.backward()
        optimizer.step()

    return {'train_loss': np.mean(losses)}

def _predict(model, row):
    return model.generate(
        input_ids=row.input_ids,
        attention_mask=row.attention_mask,
        max_length=5
    )

def tokenizer_ids_to_label(all_input_ids):
    """Decode sequences of token ids to strings, omitting special tokens.

    Args:
        all_input_ids: Batch of token-id sequences.

    Returns:
        The decoded strings produced by the notebook-level ``tokenizer``.
    """
    decoded_texts = tokenizer.batch_decode(
        all_input_ids,
        skip_special_tokens=True,
    )
    return decoded_texts

def valid_loop(model, loader, compute_metrics):
    """Generate predictions for every batch in ``loader`` and score them.

    Args:
        model: Model handed to ``_predict``; must expose ``eval()`` and a
            ``device`` attribute.
        loader: Iterable of batches exposing ``labels`` and supporting
            ``.to(device)``.
        compute_metrics: Callable invoked as
            ``compute_metrics(y_true=..., y_pred=...)`` with lists of class ids.

    Returns:
        dict: ``{'valid_acc': value returned by compute_metrics}``.
    """
    model.eval()

    all_true = []
    all_pred = []

    with torch.no_grad():
        for row in tqdm(loader, desc='Validating:'):
            row = row.to(model.device)
            pred = _predict(model, row)

            # A seq2seq collator pads label tensors with -100 by default, and
            # -100 is not a decodable token id. Map it back to the pad id,
            # which decoding drops as a special token. This is a no-op when
            # the labels contain no -100.
            labels = row.labels.masked_fill(
                row.labels == -100, tokenizer.pad_token_id
            )

            all_true += labels.detach().cpu().tolist()
            all_pred += pred.detach().cpu().tolist()

    # Compare at the class-id level: decode ids to text, then text to ids.
    # Text that is not an exact label name becomes the fallback id.
    all_true = label2id(tokenizer_ids_to_label(all_true))
    all_pred = label2id(tokenizer_ids_to_label(all_pred))

    return {'valid_acc': compute_metrics(y_true=all_true, y_pred=all_pred)}

In [15]:
# AdamW over all model parameters, using the learning rate from the config cell.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
# Validation metric: called as compute_metrics(y_true=..., y_pred=...).
compute_metrics = accuracy_score

In [16]:
# Move the weights to the selected device before the first batch is processed.
model.to(DEVICE)

all_results = []
for epoch in range(EPOCHS):
    # Dict displays evaluate left to right, so training still runs before
    # validation within each epoch.
    epoch_results = {
        'epoch': epoch,
        **train_loop(model=model, loader=train_loader, optimizer=optimizer),
        **valid_loop(
            model=model,
            loader=test_loader,
            compute_metrics=compute_metrics,
        ),
    }
    all_results.append(epoch_results)

    # Redraw the running results table in place of the progress output.
    display.clear_output()
    display.display(pd.DataFrame(all_results).set_index('epoch'))

Unnamed: 0_level_0,train_loss,valid_acc
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.182129,0.476534
1,0.043604,0.476534
2,0.027298,0.487365
