# Fine-tuning Sentiment Analysis

In [1]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

### Making the Dataset

In [7]:
orca_dataset = load_dataset("Open-Orca/OpenOrca")

# subsample size
N = 2000 

rand_idx_train = np.random.randint(24999, size=N) 
rand_idx_test = np.random.randint(24999, 49999, size=N) 

x_train = orca_dataset['train'][rand_idx_train]['system_prompt']
y_train = orca_dataset['train'][rand_idx_train]['question']
z_train = orca_dataset['train'][rand_idx_train]['response']

x_test = orca_dataset['train'][rand_idx_test]['system_prompt']
y_test = orca_dataset['train'][rand_idx_test]['question']
z_test = orca_dataset['train'][rand_idx_test]['response']

dataset = DatasetDict({'train':Dataset.from_dict({'system_prompt':x_train,'question':y_train,'response':z_train}),
                             'validation':Dataset.from_dict({'system_prompt':x_test,'question':y_test,'response':z_test})})

#dataset.save_to_disk("imdb-truncated")

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['system_prompt', 'question', 'response'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['system_prompt', 'question', 'response'],
        num_rows: 2000
    })
})

In [9]:
# display % of data with positive sentiment
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])

KeyError: "Column label not in the dataset. Current columns in the dataset: ['system_prompt', 'question', 'response']"

### Getting the Model

In [10]:
model_checkpoint = 'distilbert-base-uncased'

id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

NameError: name 'local_model_checkpoint' is not defined

In [89]:
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): MultiHeadSelfAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.01, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=76

### Tokenizer Stuff

In [90]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [91]:
def tokenize_function(examples):
    text = examples["text"]
    
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [92]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [93]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Evaluation Metric (accuracy)

In [94]:
accuracy = evaluate.load("accuracy")

In [95]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

### Try Untrained Model

In [96]:
text_list = ["Awesome movie", "Bad movie", "Not for me", "Perfect choice for movie night", "My eyes were literally bleeding"]

for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt")
    logits = model(inputs).logits
    predictions = torch.argmax(logits)

    print(logits)
    print(text + " - " + id2label[predictions.tolist()])

tensor([[-1.9095,  2.0475]], grad_fn=<AddmmBackward0>)
Awesome movie - Positive
tensor([[ 6.7187, -7.5494]], grad_fn=<AddmmBackward0>)
Bad movie - Negative
tensor([[-0.3641,  0.1762]], grad_fn=<AddmmBackward0>)
Not for me - Positive
tensor([[-0.7912,  0.6729]], grad_fn=<AddmmBackward0>)
Perfect choice for movie night - Positive
tensor([[ 0.4222, -0.6718]], grad_fn=<AddmmBackward0>)
My eyes were literally bleeding - Negative


### Train model

In [70]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['q_lin'])

In [71]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={})

In [72]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9306847223789819


In [73]:
# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 2

In [74]:
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [81]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.626243,{'accuracy': 0.879}
2,0.295500,0.547636,{'accuracy': 0.875}


Checkpoint destination directory distilbert-base-uncased-lora-text-classification\checkpoint-250 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory distilbert-base-uncased-lora-text-classification\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=500, training_loss=0.29547357177734374, metrics={'train_runtime': 105.2143, 'train_samples_per_second': 19.009, 'train_steps_per_second': 4.752, 'total_flos': 224571345628224.0, 'train_loss': 0.29547357177734374, 'epoch': 2.0})

### Test Trained Model

In [None]:
model.to('cpu')

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu")

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(logits)

    print(text + " - " + id2label[predictions.tolist()[0]])