<a href="https://colab.research.google.com/github/hillelda/ANLP/blob/main/rec_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title pip install
# Install the third-party packages the notebook imports below (datasets, evaluate,
# transformers, torch, peft, tqdm), plus accelerate, which is upgraded with -U.
! pip install datasets
! pip install evaluate
! pip install accelerate -U
! pip install transformers[torch]
! pip install torch
! pip install peft
! pip install tqdm



In [None]:
# @title Imports
import evaluate
import numpy as np
from datasets import load_dataset
import transformers
from transformers import (AutoModelForSequenceClassification, AutoTokenizer)
import torch
from tqdm import tqdm

In [None]:
# @title Globals
# Selects which experiment the cells below run:
#   'regular' - fine-tune microsoft/deberta-v3-base directly with train()
#   'lora'    - wrap microsoft/deberta-v3-base in a LoRA adapter, then train
#   'large'   - wrap microsoft/deberta-v3-large in a LoRA adapter, then train
#   'gemma'   - wrap google/gemma-2b in a LoRA adapter, then train
MODE = 'gemma' #@param ["regular", "lora", "large", "gemma"]

In [None]:
# @title load model
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')
if MODE == 'regular':
  model = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-base').cuda()
elif MODE == 'lora':
  model2 = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-base').cuda()
elif MODE == 'large':
  deberta = AutoModelForSequenceClassification.from_pretrained('microsoft/deberta-v3-large').cuda()
elif MODE == 'gemma':
  !huggingface-cli login # ask for token for gemma
  gemma = AutoModelForSequenceClassification.from_pretrained('google/gemma-2b').cuda()
  gemma_tokenizer = AutoTokenizer.from_pretrained('google/gemma-2b')



In [None]:
# @title data
def preprocess_function(examples):
    """Tokenize the sentence pairs of a batch with the module-level ``tokenizer``.

    Each ``sentence1``/``sentence2`` pair is encoded together, truncated to at
    most 256 tokens and padded up to that same length.
    """
    return tokenizer(
        examples['sentence1'],
        examples['sentence2'],
        truncation=True,
        padding='max_length',
        max_length=256,
    )

# Load GLUE/MRPC and tokenize every split in batches.
raw_datasets = load_dataset("nyu-mll/glue", 'mrpc').map(preprocess_function, batched=True)

train_dataset, eval_dataset = raw_datasets["train"], raw_datasets["validation"]

# Optional: uncomment to train on only the first 300 examples.
# train_dataset = train_dataset.select(range(300))

# Expose only the model inputs and the label, as PyTorch tensors.
for _split in (train_dataset, eval_dataset):
    _split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


In [None]:
# @title Metric
# Accuracy metric object used by compute_metrics below.
metric = evaluate.load("accuracy")

# def compute_metrics(p):
#     preds = np.argmax(p.predictions, axis=1)
#     return metric.compute(predictions=preds, references=p.label_ids)

def compute_metrics(preds, labels):
    """Score ``preds`` against ``labels`` with the module-level ``metric``.

    The predicted class of each row is the index of its largest value along
    axis 1; the return value is whatever ``metric.compute`` returns.
    """
    return metric.compute(references=labels, predictions=np.argmax(preds, axis=1))

In [None]:
# @title Imports for Trainer alternative
from torch.utils.data import DataLoader
from torch.optim import Adam
from transformers import DataCollatorWithPadding
from transformers import get_scheduler


In [None]:
# from transformers import get_scheduler


# # @title Init trainer
# # training_args = TrainingArguments(output_dir='/tmp/', do_eval=True, do_train=True, num_train_epochs=3, per_device_train_batch_size=8, learning_rate =5e-5)
# # trainer = Trainer(
# #     model=model,
# #     args=training_args,
# #     train_dataset=train_dataset,
# #     eval_dataset=eval_dataset,
# #     compute_metrics=compute_metrics,
# #     tokenizer=tokenizer,
# # )


# def train(model, train_dataset, eval_dataset, tokenizer, num_epochs=5, learning_rate=5e-5, batch_size=16):
#     model.train()
#     model.cuda()
#     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
#     optim = Adam(model.parameters(), lr=learning_rate)
#     scheduler = get_scheduler(
#         "linear",
#         optim,
#         # num_warmup_steps=0,
#         num_training_steps=num_epochs * len(train_loader),
#         num_warmup_steps = int(0.1 * num_epochs * len(train_loader))
#     )
#     for epoch in range(num_epochs):
#         torch.cuda.empty_cache()
#         for batch in tqdm(train_loader):
#             optim.zero_grad()
#             input_ids = batch['input_ids'].cuda()
#             attention_mask = batch['attention_mask'].cuda()
#             labels = batch['label'].cuda()
#             outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#             loss = outputs.loss
#             loss.backward()
#             optim.step()
#             scheduler.step()
#         print("Epoch: " + str(epoch) + " - Loss: " + str(loss.item()))
#         model.eval()
#         eval_loader = DataLoader(eval_dataset, batch_size=batch_size)
#         all_preds = []
#         all_labels = []
#         for batch in tqdm(eval_loader):
#             input_ids = batch['input_ids'].cuda()
#             attention_mask = batch['attention_mask'].cuda()
#             labels = batch['label'].cuda()
#             with torch.no_grad():
#                 outputs = model(input_ids, attention_mask=attention_mask)
#             preds = torch.argmax(outputs.logits, dim=1)
#             all_preds.extend(preds.cpu().numpy())
#             all_labels.extend(labels.cpu().numpy())
#         all_preds = np.array(all_preds)
#         all_labels = np.array(all_labels)
#         accuracy = (all_preds == all_labels).mean()
#         print("Epochs: " + str(epoch + 1) + " - Learning Rate: " + str(learning_rate) + " - Batch Size: " + str(batch_size) + " - Accuracy: " + str(accuracy))
#     return model, {'accuracy': accuracy}

In [None]:
def train(model, train_dataset, eval_dataset, tokenizer, num_epochs=3, learning_rate=1e-5, batch_size=8):
    """Fine-tune ``model`` with Adam and evaluate it after every epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        train_dataset: Dataset whose batches are dicts holding ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        eval_dataset: Same format as ``train_dataset``; scored after each epoch.
        tokenizer: Unused; kept so existing callers keep working.
        num_epochs: Number of passes over ``train_dataset``.
        learning_rate: Learning rate handed to Adam.
        batch_size: Batch size used for both data loaders.

    Returns:
        ``(model, {'accuracy': acc})`` where ``acc`` is the evaluation accuracy
        measured after the last epoch, or ``None`` when ``num_epochs`` is 0.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    on_cuda = device.type == 'cuda'
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    eval_loader = torch.utils.data.DataLoader(eval_dataset, batch_size=batch_size)

    # Stays None when no epoch runs, instead of failing on an unbound local.
    final_accuracy = None

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            if on_cuda:
                torch.cuda.empty_cache() # helps run on the collab GPU without OOM errors.
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            loss = outputs.loss
            loss.backward() # Backpropagation
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        correct = 0
        eval_loss = 0.0
        with torch.no_grad(): # evaluation only: no gradients are tracked.
            for batch in eval_loader:
                if on_cuda:
                    torch.cuda.empty_cache()
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                eval_loss += outputs.loss.item()
                pred = torch.argmax(outputs.logits, dim=1)
                correct += (pred == labels).sum().item()

        # Accuracy counts samples, so it is normalised by the dataset size.
        eval_accuracy = correct / len(eval_dataset)
        # Both losses add one value per batch, so both are averaged over the
        # number of batches (eval loss used to be divided by the sample count).
        train_loss = train_loss / len(train_loader)
        eval_loss = eval_loss / len(eval_loader)

        print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f}, Eval Accuracy: {eval_accuracy:.4f}, Eval Loss: {eval_loss:.4f}, LR: {learning_rate}, Batch Size: {batch_size}")
        final_accuracy = eval_accuracy

    return model, {'accuracy': final_accuracy}

In [None]:
# @title Train!
# Train the model
if MODE == 'regular':
  # Full fine-tuning of the base model with train()'s default hyper-parameters.
  trained_model, metrics = train(model, train_dataset, eval_dataset, tokenizer)
  # A bare `metrics` expression nested inside `if` is not echoed by the notebook,
  # so print it explicitly.
  print(metrics)

In [None]:
torch.cuda.empty_cache() # clear the cache before the next step

In [None]:
#@title Lora imports
import peft
from peft import LoraModel, LoraConfig

In [None]:
# print(model)

In [None]:
# @title Lora model config

# LoRA settings for the DeBERTa-v3 sequence classifier.
lora_config_lora = LoraConfig(
    r=16,
    lora_alpha=32, #should be about r*2.
    lora_dropout=0.05,
    # PEFT's task type for sequence classification is "SEQ_CLS";
    # "classification" is not a TaskType value.
    task_type="SEQ_CLS",
    target_modules=['query_proj', 'value_proj'], # learn on target modules with LoRA
    # Also train the sequence classification head and the pooler. modules_to_save
    # takes module names; parameter names such as 'classifier.weight' name no module.
    # NOTE(review): confirm via requires_grad that these modules end up trainable.
    modules_to_save=['classifier', 'pooler.dense']
)


In [None]:
# print(lora_model)

In [None]:
# @title Train LORA
if MODE == 'lora':
  # Wrap the base model with the LoRA adapter before training it.
  lora_model = LoraModel(model2, lora_config_lora, adapter_name="default")
  trained_model, metrics = train(lora_model, train_dataset, eval_dataset, tokenizer, learning_rate=1e-4)
  # A bare `metrics` expression nested inside `if` is not echoed by the notebook,
  # so print it explicitly.
  print(metrics)

In [None]:
# @title define deberta
if MODE == 'large':
  # LoRA settings for deberta-v3-large.
  lora_config_deberta = LoraConfig(
    r=16,
    lora_alpha=32, #should be about r*2.
    lora_dropout=0.05,
    # PEFT's task type for sequence classification is "SEQ_CLS";
    # "classification" is not a TaskType value.
    task_type="SEQ_CLS",
    target_modules=['query_proj', 'value_proj'],
    # modules_to_save takes module names; parameter names such as
    # 'classifier.weight' name no module, which leaves the head unmatched.
    # NOTE(review): confirm via requires_grad that these modules end up trainable.
    modules_to_save=['classifier', 'pooler.dense']
  )
  lora_model_large = LoraModel(deberta, lora_config_deberta, adapter_name="default")

In [None]:
# @title Train deberta large
if MODE == 'large':
  trained_model, metrics = train(lora_model_large, train_dataset, eval_dataset, tokenizer, learning_rate=1e-4)
  # A bare `metrics` expression nested inside `if` is not echoed by the notebook,
  # so print it explicitly.
  print(metrics)

In [None]:
# `gemma` only exists in 'gemma' mode; guard the print so Run All works in every mode.
if MODE == 'gemma':
  print(gemma) # needed so we can find the layer names in the new model.

In [None]:
# @title Deifne gemma
if MODE == 'gemma':
  # LoRA settings for Gemma, whose attention projections are named q_proj / v_proj.
  lora_config_gemma = LoraConfig(
    r=16,
    lora_alpha=32, #should be about r*2.
    lora_dropout=0.05,
    # PEFT's task type for sequence classification is "SEQ_CLS";
    # "classification" is not a TaskType value.
    task_type="SEQ_CLS",
    target_modules=['q_proj', 'v_proj'], # other layer names in this model
    # modules_to_save takes module names: the head is the `score` module, whereas
    # 'score.weight' is a parameter name and matches no module.
    # NOTE(review): check the printed model to confirm the head is named `score`.
    modules_to_save=['score']
    #'gate_proj', 'up_proj', 'down_proj'
  )
  lora_model_gemma = LoraModel(gemma, lora_config_gemma, adapter_name="default")

In [None]:
# @title Train gemma
if MODE == 'gemma':
  # Batch size 2 is passed here instead of train()'s default of 8.
  trained_model, metrics = train(lora_model_gemma, train_dataset, eval_dataset, gemma_tokenizer, learning_rate=0.0001, batch_size=2)
  # A bare `metrics` expression nested inside `if` is not echoed by the notebook,
  # so print it explicitly.
  print(metrics)

In [None]:
# @title Evaluate
# metrics = trainer.evaluate(eval_dataset=eval_dataset)
# metrics