<a href="https://colab.research.google.com/github/hillelda/ANLP/blob/main/rec_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# @title pip install
! pip install datasets
! pip install evaluate
! pip install accelerate -U
! pip install transformers[torch]
! pip install torch

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1

In [3]:
# @title Imports
import evaluate
import numpy as np
from datasets import load_dataset
import transformers
from transformers import (AutoModelForSequenceClassification, AutoTokenizer)
import torch
from tqdm import tqdm

In [11]:
# @title load model
# Load the tokenizer and a sequence-classification model from the same
# 'microsoft/deberta-v3-base' checkpoint. The load warning reports that the
# classifier/pooler weights are newly initialized, so the model has to be
# fine-tuned before its predictions mean anything.
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')
# .cuda() moves the model to the GPU, so this cell needs a CUDA runtime.
model = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-base').cuda()

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# @title data
def preprocess_function(examples):
    """Tokenize a batch of sentence pairs with the notebook-level ``tokenizer``.

    ``examples['sentence1']`` and ``examples['sentence2']`` are passed to the
    tokenizer as the two segments of each pair. Every encoded pair is truncated
    and padded to exactly 256 tokens, and the tokenizer's output is returned
    unchanged.
    """
    tokenizer_options = dict(max_length=256, truncation=True, padding='max_length')
    return tokenizer(examples['sentence1'], examples['sentence2'], **tokenizer_options)

# Download the MRPC task of GLUE and tokenize every split in batches.
raw_datasets = load_dataset("nyu-mll/glue", 'mrpc')
raw_datasets = raw_datasets.map(preprocess_function,batched=True)

train_dataset = raw_datasets["train"]
eval_dataset = raw_datasets["validation"]

train_dataset = train_dataset.select(range(300)) # keep only the first 300 training examples

# Expose only the columns the training loop reads, as PyTorch tensors
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

In [14]:
# @title Metric
# Accuracy metric loaded through the `evaluate` library; compute_metrics below uses it.
metric = evaluate.load("accuracy",)

# def compute_metrics(p):
#     preds = np.argmax(p.predictions, axis=1)
#     return metric.compute(predictions=preds, references=p.label_ids)

def compute_metrics(preds, labels):
    """Score per-class predictions against reference labels.

    The predicted class of each row of ``preds`` is its arg-max along axis 1;
    those classes and ``labels`` are handed to the notebook-level ``metric``,
    whose ``compute`` result is returned as-is.
    """
    return metric.compute(
        predictions=np.argmax(preds, axis=1),
        references=labels,
    )

In [15]:
# @title Imports for Trainer alternative
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import DataCollatorWithPadding
from transformers import get_scheduler


In [20]:
from transformers import get_scheduler


# @title Init trainer
# training_args = TrainingArguments(output_dir='/tmp/', do_eval=True, do_train=True, num_train_epochs=3, per_device_train_batch_size=8, learning_rate =5e-5)
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     compute_metrics=compute_metrics,
#     tokenizer=tokenizer,
# )


def train(model, train_dataset, eval_dataset, tokenizer, num_epochs=3, learning_rate=5e-5, batch_size=8):
    """Fine-tune ``model`` on ``train_dataset`` and evaluate it after every epoch.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``. It is updated in place.
        train_dataset: Dataset whose batches provide ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        eval_dataset: Dataset with the same three fields, used for the
            per-epoch accuracy.
        tokenizer: Unused; kept so existing callers keep working.
        num_epochs: Number of passes over ``train_dataset``.
        learning_rate: Initial learning rate, decayed linearly to zero over
            all training steps (no warm-up).
        batch_size: Batch size for both the training and the evaluation loader.

    Returns:
        ``(model, {'accuracy': accuracy})`` where ``accuracy`` is the accuracy
        on ``eval_dataset`` after the last epoch (NaN if ``num_epochs`` is 0).
    """
    # Prefer the GPU, but fall back to the CPU instead of crashing when CUDA is absent.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    # The evaluation loader does not change between epochs, so build it once.
    eval_loader = DataLoader(eval_dataset, batch_size=batch_size)

    # NOTE(review): transformers' AdamW is deprecated upstream in favour of
    # torch.optim.AdamW -- consider migrating it together with the import cell.
    optim = AdamW(model.parameters(), lr=learning_rate)
    scheduler = get_scheduler(
        "linear",
        optim,
        num_warmup_steps=0,
        num_training_steps=num_epochs * len(train_loader)
    )

    accuracy = float("nan")  # stays NaN only if no epoch is run
    for epoch in range(num_epochs):
        # Re-enter training mode every epoch: the evaluation pass below switches
        # the model to eval mode, which would otherwise leave dropout disabled
        # for every epoch after the first one.
        model.train()
        loss = None
        for batch in tqdm(train_loader):
            optim.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optim.step()
            scheduler.step()
        # Loss of the last training batch of the epoch (NaN if the loader was empty).
        last_loss = loss.item() if loss is not None else float("nan")
        print("Epoch: " + str(epoch) + " - Loss: " + str(last_loss))

        model.eval()
        all_preds = []
        all_labels = []
        for batch in tqdm(eval_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            with torch.no_grad():
                outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            # Labels are only compared on the host, so they never need to visit the GPU.
            all_labels.extend(batch['label'].cpu().numpy())
        accuracy = (np.array(all_preds) == np.array(all_labels)).mean()
        print("Epochs: " + str(epoch + 1) + " - Learning Rate: " + str(learning_rate) + " - Batch Size: " + str(batch_size) + " - Accuracy: " + str(accuracy))
    return model, {'accuracy': accuracy}

In [21]:
# @title Train!
# Fine-tune with train()'s default hyperparameters. train() returns the model it
# was given, so `trained_model` refers to the same object as `model`.
trained_model, metrics = train(model, train_dataset, eval_dataset, tokenizer)
metrics

100%|██████████| 38/38 [00:18<00:00,  2.07it/s]


Epoch: 0 - Loss: 0.0336349718272686


100%|██████████| 51/51 [00:08<00:00,  6.06it/s]


Epochs: 3 - Learning Rate: 5e-05 - Batch Size: 8 - Accuracy: 0.7769607843137255


100%|██████████| 38/38 [00:17<00:00,  2.21it/s]


Epoch: 1 - Loss: 0.01290391106158495


100%|██████████| 51/51 [00:08<00:00,  6.27it/s]


Epochs: 3 - Learning Rate: 5e-05 - Batch Size: 8 - Accuracy: 0.7450980392156863


100%|██████████| 38/38 [00:16<00:00,  2.27it/s]


Epoch: 2 - Loss: 0.029584228992462158


100%|██████████| 51/51 [00:07<00:00,  6.40it/s]

Epochs: 3 - Learning Rate: 5e-05 - Batch Size: 8 - Accuracy: 0.7573529411764706





{'accuracy': 0.7573529411764706}

In [None]:
# @title Evaluate
# metrics = trainer.evaluate(eval_dataset=eval_dataset)
# metrics

{'eval_loss': 0.33856436610221863,
 'eval_accuracy': 0.8853211009174312,
 'eval_runtime': 2.6311,
 'eval_samples_per_second': 331.425,
 'eval_steps_per_second': 41.428,
 'epoch': 1.0}