In [None]:
import os

import evaluate
import numpy as np
from datasets import load_dataset
from dotenv import load_dotenv
from peft import LoraConfig, TaskType, get_peft_model
from transformers import TrainingArguments, Trainer, \
    DataCollatorWithPadding, BertTokenizer, AutoModelForSequenceClassification

In [None]:
# Load environment variables via python-dotenv; the Hugging Face tokens are read
# later in the notebook with os.getenv().
load_dotenv()

## Load the model. I will use the `bert-base-uncased` BERT model from Hugging Face.

In [None]:
# Base checkpoint shared by the tokenizer and the classification model.
base_checkpoint = 'bert-base-uncased'
# Hugging Face token taken from the environment (None when the variable is unset).
hf_read_token = os.getenv('HUGGINGFACE_TOKEN')

tokenizer = BertTokenizer.from_pretrained(base_checkpoint, token=hf_read_token)
model = AutoModelForSequenceClassification.from_pretrained(
    base_checkpoint,
    trust_remote_code=False,
    token=hf_read_token,
)

### Use PEFT and LoRA for efficiency.

In [None]:
# LoRA adapter settings for a sequence-classification fine-tune.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,                   # LoRA rank
    lora_alpha=32,         # LoRA scaling parameter
    lora_dropout=0.1,      # dropout applied inside the LoRA layers
    bias='none',           # bias terms are not trained
    inference_mode=False,  # the adapter is going to be trained
)

In [None]:
# Wrap the base model with the LoRA adapter described by lora_config.
# NOTE: `model` is rebound to the wrapped model, so re-running this cell passes
# the already-wrapped model back into get_peft_model.
model = get_peft_model(model, lora_config)

In [None]:
# Display the wrapped model's repr.
model

### EDA

In [None]:
# Load the "imdb" dataset and display it; the 'train', 'test' and
# 'unsupervised' splits are tokenized in the next cell.
imdb = load_dataset("imdb")
imdb

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of IMDB reviews.

    Args:
        examples: Batch passed in by ``Dataset.map(..., batched=True)``; only
            its ``'text'`` entry (the raw reviews) is read.

    Returns:
        The tokenizer output for the batch. Reviews are truncated to the
        tokenizer's maximum length but deliberately left unpadded: the
        ``DataCollatorWithPadding`` given to the ``Trainer`` pads each batch
        to its own longest sequence, so short reviews are not stored and
        processed at the full maximum length.
    """
    return tokenizer(examples['text'], truncation=True)


# Tokenize every split in batched mode; the split order on the right matches
# the variable order on the left.
tokenized_train, tokenized_test, tokenized_unsupervised = (
    imdb[split_name].map(preprocess_function, batched=True)
    for split_name in ('train', 'test', 'unsupervised')
)

In [None]:
# Display the tokenized training split.
tokenized_train

In [None]:
# Display the tokenized test split.
tokenized_test

In [None]:
# Accuracy metric, loaded on the first call to compute_metrics and then reused.
_accuracy_metric = None


def compute_metrics(eval_pred):
    """Compute classification accuracy for ``Trainer`` evaluation.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            row is the argmax of ``logits`` over the last axis.

    Returns:
        dict: ``{"accuracy": <value>}`` as reported by the ``accuracy`` metric.
    """
    global _accuracy_metric
    if _accuracy_metric is None:
        # Load the metric once instead of on every evaluation pass.
        _accuracy_metric = evaluate.load("accuracy")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = _accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    return {"accuracy": accuracy}

In [None]:
# Target Hub repository in "<namespace>/<name>" form; the part after the slash
# is passed as hub_model_id to TrainingArguments below.
model_id = 'kreimben/bert-sentiment-analysis'

In [None]:
training_args = TrainingArguments(
    # Local directory for training outputs.
    output_dir="./saved_model/bert_lora_peft",
    # Train and evaluate; evaluation and checkpoint saving both use the
    # "epoch" strategy, and the best model is loaded when training ends.
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation settings.
    warmup_steps=1000,
    bf16=True,
    # Hugging Face Hub upload settings; the repository name is the part of
    # model_id after the slash.
    push_to_hub=True,
    hub_strategy='end',
    hub_model_id=model_id.split('/')[1],
    hub_token=os.getenv('HUGGINGFACE_WRITE_TOKEN'),
)

In [None]:
# Collator that uses the tokenizer to pad the examples of each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Assemble the Trainer: the LoRA-wrapped model and its arguments, the
# tokenizer and collator used for batching, the train/eval splits, and the
# accuracy callback used during evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [None]:
%%time

# Run fine-tuning; the returned object's `.metrics` is read in the next cell.
training_result = trainer.train()

In [None]:
# Collect the training metrics and record how many samples were trained on.
# The whole training split is used, so the sample count is simply its length.
metrics = training_result.metrics
metrics["train_samples"] = len(tokenized_train)
# Display the training metrics here: `metrics` is rebound to the evaluation
# results in the next cell, so they would otherwise never be shown.
metrics

In [None]:
# Evaluate on the test split and record how many samples were scored.
# The whole test split is evaluated, so the sample count is simply its length.
metrics = trainer.evaluate()
metrics["eval_samples"] = len(tokenized_test)

In [None]:
# Display the evaluation metrics (including the "eval_samples" entry added above).
metrics

In [None]:
# Upload the fine-tuned model to the Hub repository named by `model_id`
# (defined once earlier) instead of repeating the repository string here.
model.push_to_hub(
    model_id,
    commit_message='Adjust bf16 for mixed-precision training',
    token=os.getenv('HUGGINGFACE_WRITE_TOKEN'),
)

### Test!

In [None]:
import random

# Pick one random review from the 'unsupervised' split to try the model on.
N = len(tokenized_unsupervised)

# randrange(N) yields 0..N-1, exactly the valid indices. randint(1, N) is
# inclusive on both ends, so it could return N (IndexError) and could never
# select the first review.
idx = random.randrange(N)

example = tokenized_unsupervised[idx]
text = example['text']
text

In [None]:
import torch

# Truncate to the tokenizer's maximum length, matching the truncation applied
# during preprocessing; otherwise a long review is passed through untruncated.
tokenised = tokenizer(text, return_tensors='pt', truncation=True)
# Inference only: move to CPU, switch to eval mode (turns off dropout,
# including the LoRA dropout configured earlier) and skip gradient tracking.
model = model.to('cpu').eval()
with torch.no_grad():
    res = model(**tokenised)

In [None]:
import torch.nn.functional as F

# Softmax over dim 1 turns the logits into probabilities; argmax over the same
# dim picks the predicted class, and [0] selects the single input example.
logits = res.logits
probabilities = F.softmax(logits, dim=1)
predicted_class = probabilities.argmax(dim=1)
predicted_class[0]

In [None]:
# Save the fine-tuned model locally under saved_training/.
model.save_pretrained('saved_training/bert-base-uncased-sentiment-analysis')