<a href="https://colab.research.google.com/github/hillelda/ANLP/blob/main/rec_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
# @title pip install
! pip install datasets
! pip install evaluate
! pip install accelerate -U
! pip install transformers[torch]
! pip install torch



In [31]:
# @title Imports
import evaluate
import numpy as np
from datasets import load_dataset
import transformers
from transformers import (AutoModelForSequenceClassification, AutoTokenizer)
import torch
from tqdm import tqdm

In [25]:
# @title load model
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')
model = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-base').cuda()

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
# @title data
def preprocess_function(examples):
    result = tokenizer(examples['sentence'], max_length=128, truncation=True, padding='max_length')
    return result

raw_datasets = load_dataset("nyu-mll/glue", 'sst2')
raw_datasets = raw_datasets.map(preprocess_function,batched=True)

train_dataset = raw_datasets["train"]
eval_dataset = raw_datasets["validation"]

# Set format for PyTorch tensors
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

#train_dataset = train_dataset.select(range(5000)) #training on 5k samples

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [46]:
# @title Metric
metric = evaluate.load("accuracy",)

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return metric.compute(predictions=preds, references=p.label_ids)

In [47]:
# @title Imports for Trainer alternative
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import DataCollatorWithPadding


In [48]:
# @title Init trainer
# training_args = TrainingArguments(output_dir='/tmp/', do_eval=True, do_train=True, num_train_epochs=3, per_device_train_batch_size=8, learning_rate =5e-5)
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     compute_metrics=compute_metrics,
#     tokenizer=tokenizer,
# )


def train(model, train_dataset, eval_dataset, tokenizer, num_epochs=3, learning_rate=5e-5, batch_size=8):
    # Data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)
    eval_loader = DataLoader(eval_dataset, batch_size=batch_size, collate_fn=data_collator)


    # Optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Learning rate scheduler
    num_training_steps = num_epochs * len(train_loader)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    # Move model to device (GPU or CPU)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
        for batch in progress_bar:
            # Move batch to device
            batch = {k: v.to(device) for k, v in batch.items()}

            # Forward pass
            outputs = model(**batch)
            loss = outputs.loss

            # Backward pass
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            progress_bar.set_postfix(loss=loss.item())

        # Evaluation
        model.eval()
        all_preds = []
        all_labels = []
        for batch in eval_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            logits = outputs.logits
            all_preds.append(logits.cpu().numpy())
            all_labels.append(batch["labels"].cpu().numpy())

        all_preds = np.concatenate(all_preds, axis=0)
        all_labels = np.concatenate(all_labels, axis=0)
        metrics = compute_metrics(all_preds, all_labels)
        print(f"Epoch {epoch+1} - Evaluation metrics: {metrics}")

    return model, metrics

In [49]:
# @title Train!
# Train the model
trained_model, metrics = train(model, train_dataset, eval_dataset, tokenizer)
metrics

Epoch 1:   2%|▏         | 149/8419 [00:42<39:45,  3.47it/s, loss=0.311]


KeyboardInterrupt: 

In [None]:
# @title Evaluate
metrics = trainer.evaluate(eval_dataset=eval_dataset)
metrics

{'eval_loss': 0.33856436610221863,
 'eval_accuracy': 0.8853211009174312,
 'eval_runtime': 2.6311,
 'eval_samples_per_second': 331.425,
 'eval_steps_per_second': 41.428,
 'epoch': 1.0}