In [1]:
import os

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from dotenv import load_dotenv
from peft import LoraConfig, TaskType, get_peft_model
from transformers import TrainingArguments, Trainer, \
    DataCollatorWithPadding, BertTokenizer, AutoModelForSequenceClassification

In [2]:
# Load settings from a local .env file; later cells read the Hugging Face
# tokens (HUGGINGFACE_TOKEN, HUGGINGFACE_WRITE_TOKEN) through os.getenv.
load_dotenv()

True

## Load the model. I will use the `bert-base-uncased` BERT model from Hugging Face.

In [3]:
# The tokenizer and the classifier come from the same checkpoint, so the
# checkpoint name and the access token are spelled out once and reused.
checkpoint = 'bert-base-uncased'
hf_token = os.getenv('HUGGINGFACE_TOKEN')

tokenizer = BertTokenizer.from_pretrained(checkpoint, token=hf_token)
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    token=hf_token,
    trust_remote_code=False,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Use PEFT and LoRA for efficiency.

In [4]:
# LoRA adapter settings for a sequence-classification model in training mode:
# rank 8, alpha 32, 10% dropout on the adapter path, no bias parameters trained.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias='none',
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
)

In [5]:
# Wrap the base classifier with the LoRA adapters described by lora_config.
# NOTE(review): this rebinds `model`, so re-running the cell passes the already
# wrapped model back in — reload the base model first if you need to re-run.
model = get_peft_model(model, lora_config)

In [6]:
# Show the wrapped module tree to check where the LoRA layers were injected.
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): L

### EDA

In [7]:
# Load the IMDB reviews dataset and show which splits it provides.
imdb = load_dataset("imdb")
imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [8]:
def preprocess_function(examples):
    """Tokenize a batch of reviews for BERT.

    Args:
        examples: A batch handed over by ``Dataset.map(..., batched=True)``;
            only its ``'text'`` entry is read.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length and left unpadded.
    """
    # No padding here: the DataCollatorWithPadding given to the Trainer pads
    # each batch when it is assembled, so padding every review up front to the
    # maximum length only inflates the stored dataset and the work per step.
    return tokenizer(examples['text'], truncation=True)


# Tokenize the three splits with the same batched mapping, in the order
# train -> test -> unsupervised.
tokenized_train, tokenized_test, tokenized_unsupervised = (
    imdb[split].map(preprocess_function, batched=True)
    for split in ('train', 'test', 'unsupervised')
)

In [9]:
# Inspect the training split to confirm which columns tokenization added.
tokenized_train

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 25000
})

In [10]:
# Same check for the test split.
tokenized_test

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 25000
})

In [11]:
# Loaded on first use and kept for the rest of the session: calling
# evaluate.load() inside every evaluation pass repeats the metric lookup for
# no benefit, because the Trainer invokes compute_metrics after each epoch.
_accuracy_metric = None


def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            row is the argmax of its logits over the last axis.

    Returns:
        dict: ``{"accuracy": value}`` as reported by the "accuracy" metric.
    """
    global _accuracy_metric
    if _accuracy_metric is None:
        _accuracy_metric = evaluate.load("accuracy")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = _accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    return {"accuracy": accuracy}

In [12]:
# Hub repository, written as "<namespace>/<repo name>"; the training arguments
# use the part after the slash as the repo name.
model_id = 'kreimben/bert-base-uncase-sentiment-analysis'

In [13]:
# Evaluate and checkpoint once per epoch, reload the best checkpoint when
# training finishes, and configure the Hub upload for the end of training.
training_args = TrainingArguments(
    # Checkpoint location and evaluation / save cadence.
    output_dir="./saved_training/bert_lora_peft",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation settings.
    warmup_steps=1000,
    bf16=True,
    do_train=True,
    do_eval=True,
    # Hub upload: the repo name is the part of model_id after the slash.
    push_to_hub=True,
    hub_strategy='end',
    hub_model_id=model_id.split('/')[1],
    hub_token=os.getenv('HUGGINGFACE_WRITE_TOKEN'),
)

In [14]:
# Batch collator that pads the tokenized examples using this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
# Wire the LoRA-wrapped model, the tokenized splits and the accuracy metric
# into one Trainer.
# NOTE(review): the test split doubles as the evaluation set here, and the
# training arguments reload the best checkpoint based on it — confirm that
# reporting accuracy on the same split is intended.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [16]:
%%time

# Run fine-tuning; the returned object carries the training metrics that the
# next cell reads from `.metrics`.
training_result = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2843,0.237931,0.9172
2,0.2591,0.234216,0.92216
3,0.2442,0.228641,0.92544


CPU times: total: 8min 45s
Wall time: 21min 10s


In [17]:
# Collect the training metrics under their own name: the evaluation cell
# rebinds `metrics`, which would otherwise leave these values unreachable.
train_metrics = training_result.metrics
# The whole training split was used, so the sample count is simply its length
# (the original min(len, len) always reduced to this value).
max_train_samples = len(tokenized_train)
train_metrics["train_samples"] = max_train_samples
# Kept so that anything still reading `metrics` at this point sees the same dict.
metrics = train_metrics

In [18]:
# Score the model on the evaluation split and record how many samples it covered.
max_val_samples = len(tokenized_test)
metrics = trainer.evaluate()
metrics.update(eval_samples=min(max_val_samples, len(tokenized_test)))

In [19]:
# Display the evaluation metrics, including the added 'eval_samples' entry.
metrics

{'eval_loss': 0.22864128649234772,
 'eval_accuracy': 0.92544,
 'eval_runtime': 118.0727,
 'eval_samples_per_second': 211.734,
 'eval_steps_per_second': 26.467,
 'epoch': 3.0,
 'eval_samples': 25000}

In [24]:
# Publish the trained adapter to the Hub. The target repository comes from
# `model_id` so it cannot drift from the repo configured for training.
model.push_to_hub(
    model_id,
    commit_message='Adjust bf16 for mixed-precision training',
    token=os.getenv('HUGGINGFACE_WRITE_TOKEN'),
)

README.md:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


adapter_model.safetensors:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kreimben/bert-base-uncase-sentiment-analysis/commit/3fc5f3381b026c1c069acb228519bb6360db6a99', commit_message='Adjust bf16 for mixed-precision training', commit_description='', oid='3fc5f3381b026c1c069acb228519bb6360db6a99', pr_url=None, pr_revision=None, pr_num=None)

### Test!

In [25]:
import random

# Pick one unlabeled review at random for a qualitative check.
N = len(tokenized_unsupervised)

# randrange(N) yields 0..N-1. random.randint(1, N) is inclusive on both ends,
# so it could return N (out of range) and could never select index 0.
idx = random.randrange(N)

example = tokenized_unsupervised[idx]
text = example['text']
text

"Brilliant film and excellent acting from Eliza Dushku! I liked her even before when I watched Tru Calling. :) She is not very famous but very charming. Her best roles are acting scary in my opinion. For the moment there are too many artificial movies full of tragedies and sci-fi... not like this one...great script...tragedy...fear...and of course Eliza Dushku. Irecommend you this film as one of the good ones...because it's original...not one of the same old FBI movies and investigations that you wish you never watched to the end. The original is always the best decision...<br /><br />Great MOVIE!!! ENJOY watching it!!!"

In [26]:
# Run the sampled review through the fine-tuned model on CPU.
# truncation=True caps the input at the tokenizer's maximum length; without it
# a long review can exceed the model's 512 position embeddings and fail.
tokenised = tokenizer(text, return_tensors='pt', truncation=True)
model = model.to('cpu')
model.eval()  # make sure dropout (including the LoRA dropout) is disabled
with torch.no_grad():  # a single prediction needs no gradient bookkeeping
    res = model(**tokenised)

In [27]:
import torch.nn.functional as F

# Normalise the logits into per-class probabilities, then take the index of
# the most probable class for the single review in the batch.
class_scores = res.logits
probabilities = F.softmax(class_scores, dim=1)
predicted_class = probabilities.argmax(dim=1)
predicted_class[0]

tensor(1)

In [28]:
# Also keep a local copy of the fine-tuned model under saved_training/.
model.save_pretrained('saved_training/bert-base-uncased-sentiment-analysis')