In [1]:
!pip install transformers==4.38.0
!pip install datasets
!pip install torch==2.0.1+cu110
!pip install peft


Collecting transformers==4.38.0
  Downloading transformers-4.38.0-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers==4.38.0)
  Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-0.19.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.40.2
    Uninstalling transformers-4.40.2:
      Successfully uninstalled transformers-4.40.2
Successfully installed tokenizers-0.15.2 transformers-4.38.0
Collecting datasets
  Downloading dataset

lora

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
from peft import get_peft_model, LoraConfig
import torch
import numpy as np

# Tokenize data
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

# Define evaluation metric
def compute_metrics(eval_preds):
    metric = load_metric("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Define tokenizer and model and data collator
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).cuda() if torch.cuda.is_available() else AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load dataset
data = load_dataset("glue", "mrpc")
train_data, valid_data, test_data = data["train"], data["validation"], data["test"]

# Preprocess data
train_dataset = train_data.map(tokenize_function, batched=True, remove_columns=['idx','sentence1','sentence2'])
valid_dataset = valid_data.map(tokenize_function, batched=True, remove_columns=['idx','sentence1','sentence2'])
test_dataset = test_data.map(tokenize_function, batched=True, remove_columns=['idx','sentence1','sentence2'])

# Define PEFT configuration
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['query'])

# Apply PEFT to the model
model = get_peft_model(model, peft_config)

# define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=1e-3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    seed=42
)

# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

# test model
trainer.predict(test_dataset)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/649k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5818,0.445014,0.794118,0.856655
2,0.4562,0.418157,0.823529,0.875862
3,0.3651,0.451905,0.791667,0.839925
4,0.295,0.446213,0.813725,0.866197
5,0.2307,0.484915,0.813725,0.864286


  metric = load_metric("glue", "mrpc")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


PredictionOutput(predictions=array([[-1.4706208 ,  1.8487391 ],
       [-2.1946828 ,  2.5006037 ],
       [-1.4935042 ,  1.8378971 ],
       ...,
       [-0.22088751,  0.6208285 ],
       [-1.1958159 ,  1.5719521 ],
       [-1.9279646 ,  2.348506  ]], dtype=float32), label_ids=array([1, 1, 1, ..., 0, 1, 1]), metrics={'test_loss': 0.45459237694740295, 'test_accuracy': 0.7982608695652174, 'test_f1': 0.8556016597510374, 'test_runtime': 8.3826, 'test_samples_per_second': 205.783, 'test_steps_per_second': 12.884})

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
from peft import get_peft_model, LoraConfig
import torch
import numpy as np

# Tokenize data
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

# Define evaluation metric
def compute_metrics(eval_preds):
    metric = load_metric("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Define tokenizer and model and data collator
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).cuda() if torch.cuda.is_available() else AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load dataset
data = load_dataset("glue", "mrpc")
train_data, valid_data, test_data = data["train"], data["validation"], data["test"]

# Preprocess data
train_dataset = train_data.map(tokenize_function, batched=True, remove_columns=['idx','sentence1','sentence2'])
valid_dataset = valid_data.map(tokenize_function, batched=True, remove_columns=['idx','sentence1','sentence2'])
test_dataset = test_data.map(tokenize_function, batched=True, remove_columns=['idx','sentence1','sentence2'])

# Define PEFT configuration
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=1,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['query'])

# Apply PEFT to the model
model = get_peft_model(model, peft_config)

# define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=1e-3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    seed=42
)

# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

# test model
trainer.predict(test_dataset)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5877,0.574838,0.737745,0.837139
2,0.4787,0.48989,0.784314,0.853333
3,0.4017,0.483105,0.791667,0.847397
4,0.3441,0.473211,0.79902,0.860544
5,0.2962,0.512279,0.794118,0.85567


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Checkpoint destination directory ./results/checkpoint-230 already exists and is non-empty. Saving will proceed but saved results may be invalid.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Checkpoint destination directory ./results/checkpoint-460 already exists and is non-empty. Saving will proceed but saved results may be invalid.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Checkpoint destination directory ./results/checkpoint-690 already exists and is non-empty. Saving will proc

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


PredictionOutput(predictions=array([[-1.1552765,  2.0703356],
       [-1.3743078,  2.6123998],
       [-1.3719616,  2.4705675],
       ...,
       [-0.528451 ,  1.1064227],
       [-1.4771526,  2.6125555],
       [-0.8292141,  1.3849913]], dtype=float32), label_ids=array([1, 1, 1, ..., 0, 1, 1]), metrics={'test_loss': 0.47299104928970337, 'test_accuracy': 0.7913043478260869, 'test_f1': 0.8491198658843252, 'test_runtime': 8.6015, 'test_samples_per_second': 200.546, 'test_steps_per_second': 12.556})

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
from peft import get_peft_model, LoraConfig
import torch
import numpy as np

# Tokenize data
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

# Define evaluation metric
def compute_metrics(eval_preds):
    metric = load_metric("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Define tokenizer and model and data collator
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).cuda() if torch.cuda.is_available() else AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load dataset
data = load_dataset("glue", "mrpc")
train_data, valid_data, test_data = data["train"], data["validation"], data["test"]

# Preprocess data
train_dataset = train_data.map(tokenize_function, batched=True, remove_columns=['idx','sentence1','sentence2'])
valid_dataset = valid_data.map(tokenize_function, batched=True, remove_columns=['idx','sentence1','sentence2'])
test_dataset = test_data.map(tokenize_function, batched=True, remove_columns=['idx','sentence1','sentence2'])

# Define PEFT configuration
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=16,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['query'])

# Apply PEFT to the model
model = get_peft_model(model, peft_config)

# define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=1e-3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    seed=42
)

# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

# test model
trainer.predict(test_dataset)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5725,0.458926,0.806373,0.865417
2,0.4383,0.439847,0.821078,0.873484
3,0.3296,0.459596,0.816176,0.86535
4,0.2503,0.502741,0.816176,0.868651
5,0.1778,0.582295,0.816176,0.86819


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Checkpoint destination directory ./results/checkpoint-230 already exists and is non-empty. Saving will proceed but saved results may be invalid.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Checkpoint destination directory ./results/checkpoint-460 already exists and is non-empty. Saving will proceed but saved results may be invalid.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Checkpoint destination directory ./results/checkpoint-690 already exists and is non-empty. Saving will proc

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


PredictionOutput(predictions=array([[-1.2456176,  2.3050508],
       [-0.6305416,  1.622322 ],
       [-1.5832111,  2.832495 ],
       ...,
       [-0.8647527,  1.6541317],
       [-1.1738269,  2.124179 ],
       [-0.9691645,  1.8867148]], dtype=float32), label_ids=array([1, 1, 1, ..., 0, 1, 1]), metrics={'test_loss': 0.43008407950401306, 'test_accuracy': 0.8057971014492754, 'test_f1': 0.8588284871470713, 'test_runtime': 8.5231, 'test_samples_per_second': 202.392, 'test_steps_per_second': 12.671})

bitfit

In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
from peft import get_peft_model, LoraConfig
import torch
import numpy as np

# Tokenize data
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

# Define evaluation metric
def compute_metrics(eval_preds):
    metric = load_metric("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Define tokenizer and model and data collator
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).cuda() if torch.cuda.is_available() else AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load dataset
data = load_dataset("glue", "mrpc")
train_data, valid_data, test_data = data["train"], data["validation"], data["test"]

# Preprocess data
train_dataset = train_data.map(tokenize_function, batched=True, remove_columns=['idx','sentence1','sentence2'])
valid_dataset = valid_data.map(tokenize_function, batched=True, remove_columns=['idx','sentence1','sentence2'])
test_dataset = test_data.map(tokenize_function, batched=True, remove_columns=['idx','sentence1','sentence2'])

#Bitfit
num_param = 0
for name, param in model.named_parameters():
    if "bias" not in name:
        param.requires_grad = False
    else:
        num_param += param.numel()

print("trainable params: {} || all params: {} || trainable%: {}".format(num_param, sum(param.numel() for param in model.parameters()), num_param / sum(param.numel() for param in model.parameters())))

# define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=1e-3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    seed=42
)

# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

# test model
trainer.predict(test_dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/649k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

trainable params: 102914 || all params: 109483778 || trainable%: 0.0009399931376135011


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6055,0.503456,0.740196,0.835913
2,0.5162,0.44299,0.808824,0.863636
3,0.4703,0.421365,0.803922,0.848485
4,0.4416,0.397526,0.830882,0.877442
5,0.4098,0.39547,0.82598,0.87478


  metric = load_metric("glue", "mrpc")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


PredictionOutput(predictions=array([[-0.82849497,  0.83913445],
       [ 0.05809403, -0.10933584],
       [-1.6932485 ,  1.4456315 ],
       ...,
       [-1.5826427 ,  1.4211832 ],
       [-1.630496  ,  1.4264804 ],
       [-1.4698936 ,  1.4734664 ]], dtype=float32), label_ids=array([1, 1, 1, ..., 0, 1, 1]), metrics={'test_loss': 0.4299057722091675, 'test_accuracy': 0.8081159420289855, 'test_f1': 0.8593285167870803, 'test_runtime': 8.1103, 'test_samples_per_second': 212.691, 'test_steps_per_second': 13.316})