In [None]:
!pip install -q transformers datasets evaluate

In [None]:
 from transformers import (
     AutoModelForSequenceClassification,
     AutoTokenizer,
     DataCollatorWithPadding,
     TrainingArguments,
     Trainer,
 )
 from peft import (
     get_peft_config,
     get_peft_model,
     get_peft_model_state_dict,
     set_peft_model_state_dict,
     PeftType,
     PromptEncoderConfig,
 )
 from datasets import load_dataset
 import evaluate
 import torch

 # Experiment configuration: base checkpoint name and GLUE subset.
 model_name_or_path, task = "roberta-large", "mrpc"
 # Training hyperparameters: epoch count, learning rate, batch size.
 num_epochs, lr, batch_size = 20, 1e-3, 32

In [None]:
# Load the GLUE subset selected by `task`, then display the first training example.
dataset = load_dataset(path='glue', name=task)
dataset['train'][0]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [None]:
 # Display the train split summary (feature names and row count).
 dataset['train']

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [None]:
# Load the GLUE metric configured for `task`; compute_metrics calls it later.
metric = evaluate.load(path='glue', config_name=task)

In [None]:
# Display the loaded metric's repr (its expected features and usage text).
metric

EvaluationModule(name: "glue", module_type: "metric", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
Examples:

    >>> glue_metric = evaluate.load('glue', 'sst2')  # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
    >>> references = [0, 1]
    >>> predictions = [0, 1]
    >>> results = glue_metric.compute(predictions=predictions, references=ref

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level `metric`.

    Args:
        eval_pred: A two-item sequence that unpacks into (predictions, labels).
            The predictions are reduced to class ids with an argmax over axis 1.

    Returns:
        Whatever `metric.compute` returns for the predicted class ids and labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

In [None]:
# Checkpoints whose name contains one of these substrings are padded on the left;
# every other checkpoint is padded on the right.
if any(k in model_name_or_path for k in ("gpt", "opt", "bloom")):
    padding_side = 'left'
else:
    padding_side = 'right'

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
# Fall back to the EOS id when the tokenizer has no pad token id of its own.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id


def tokenize_function(examples):
    """Tokenize the 'sentence1' / 'sentence2' entries of `examples` as pairs.

    Args:
        examples: Mapping with 'sentence1' and 'sentence2' entries, as passed
            by `dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the sentence pairs. Truncation is enabled with
        no explicit `max_length`, and no padding argument is passed here.
    """
    return tokenizer(
        examples['sentence1'],
        examples['sentence2'],
        truncation=True,
        max_length=None,
    )

In [None]:
# Tokenize in batches, drop the raw text and index columns, and expose the target
# column under the name 'labels'.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['idx', 'sentence1', 'sentence2'],
).rename_column('label', 'labels')

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [None]:
 # Batch collator built around the tokenizer, configured with padding='longest'.
 data_collator = DataCollatorWithPadding(
     tokenizer=tokenizer,
     padding='longest',
 )

In [None]:
# Prompt-encoder PEFT configuration for the sequence-classification task type.
peft_config = PromptEncoderConfig(
    task_type='SEQ_CLS',
    num_virtual_tokens=20,
    encoder_hidden_size=128,
)

In [None]:
# Upgrade bitsandbytes and install transformers, peft and accelerate from their
# GitHub repositories.
# NOTE(review): transformers and peft are already imported by an earlier cell, so
# these upgrades presumably only take effect after a runtime restart — confirm.
# NOTE(review): the git installs are unpinned, so results depend on whatever each
# repository contains at run time; consider pinning versions.
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
# Load the base sequence classifier and wrap it with the PEFT configuration in one
# step, then report how many parameters remain trainable.
model = get_peft_model(
    AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True),
    peft_config,
)
model.print_trainable_parameters()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,351,938 || all params: 356,713,732 || trainable%: 0.3790


In [None]:
# Training configuration. The learning rate and batch sizes come from the shared
# config constants (`lr`, `batch_size`) rather than repeating their literals.
training_args = TrainingArguments(
    output_dir='hang1n/roberta-large-p_tuning',
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # NOTE(review): the config cell sets num_epochs = 20 but this run trains for a
    # single epoch; confirm which is intended before wiring `num_epochs` in here.
    num_train_epochs=1,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch.
    eval_strategy='epoch',
    save_strategy='epoch',
    # NOTE(review): no metric_for_best_model / load_best_model_at_end is set, so
    # this flag presumably has no effect on its own — confirm.
    greater_is_better=True,
)

In [None]:
# Build the Trainer from the PEFT model, tokenized splits, collator and metric hook.
# NOTE(review): per-epoch evaluation runs on the 'test' split; confirm this is
# intended rather than a 'validation' split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.646039,0.665507,0.799025


TrainOutput(global_step=115, training_loss=0.6496184639308764, metrics={'train_runtime': 165.3604, 'train_samples_per_second': 22.182, 'train_steps_per_second': 0.695, 'total_flos': 536899901695488.0, 'train_loss': 0.6496184639308764, 'epoch': 1.0})

In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget; the push_to_hub call further down relies on
# being authenticated.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Print the account name the Hugging Face CLI is currently logged in as.
!huggingface-cli whoami

hang1n


In [None]:
# Upload the trained adapter to the Hub. `token=True` replaces the deprecated
# `use_auth_token=True` and uses the credentials stored by the login step.
model.push_to_hub("hang1n/roberta-large-peft-p-tuning", token=True)

adapter_model.safetensors:   0%|          | 0.00/4.29M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/hang1n/roberta-large-peft-p-tuning/commit/026a2e677acc04bc55cb687d5648589624a63d59', commit_message='Upload model', commit_description='', oid='026a2e677acc04bc55cb687d5648589624a63d59', pr_url=None, repo_url=RepoUrl('https://huggingface.co/hang1n/roberta-large-peft-p-tuning', endpoint='https://huggingface.co', repo_type='model', repo_id='hang1n/roberta-large-peft-p-tuning'), pr_revision=None, pr_num=None)

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Rebuild the classifier for inference: read the adapter config from the Hub, load
# the tokenizer and base checkpoint it names, then attach the adapter weights.
peft_model_id = "hang1n/roberta-large-peft-p-tuning"
config = PeftConfig.from_pretrained(peft_model_id)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
inference_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path
)
model = PeftModel.from_pretrained(model=inference_model, model_id=peft_model_id)

adapter_config.json:   0%|          | 0.00/431 [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


adapter_model.safetensors:   0%|          | 0.00/4.29M [00:00<?, ?B/s]

In [None]:
# Label names, looked up by class index when the probabilities are printed.
classes = ['not equivalent', 'equivalent']

# One sentence pair to classify.
sentence1 = "Coast redwood trees are the tallest trees on the planet and can grow over 300 feet tall."
sentence2 = "The coast redwood trees, which can attain a height of over 300 feet, are the tallest trees on earth."

# Encode the pair as PyTorch tensors.
inputs = tokenizer(
    sentence1,
    sentence2,
    truncation=True,
    padding='longest',
    return_tensors='pt',
)

In [None]:
# Forward pass without gradient tracking; print the raw logits.
with torch.no_grad():
    outputs = model(**inputs).logits
    print(outputs)

# Softmax over the class dimension, take the first row, and report each class as a
# whole-number percentage.
paraphrased_text = torch.softmax(outputs, dim=1).tolist()[0]
for idx, label in enumerate(classes):
    pct = int(round(paraphrased_text[idx] * 100))
    print(f"{label}: {pct}%")

tensor([[-0.8028,  1.0945]])
not equivalent: 13%
equivalent: 87%
