# Fine-tuning using LoRA


In [3]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

### Dataset

In [2]:
# # how dataset was generated

# # load imdb data
# imdb_dataset = load_dataset("imdb")

# # define subsample size
# N = 1000 
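# # generate indexes for random subsample (randint samples with replacement, so duplicate rows are possible; the same indexes are reused for the train and test splits below)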
# rand_idx = np.random.randint(24999, size=N) 

# # extract train and test data
# x_train = imdb_dataset['train'][rand_idx]['text']
# y_train = imdb_dataset['train'][rand_idx]['label']

# x_test = imdb_dataset['test'][rand_idx]['text']
# y_test = imdb_dataset['test'][rand_idx]['label']

# # create new dataset
# dataset = DatasetDict({'train':Dataset.from_dict({'label':y_train,'text':x_train}),
#                              'validation':Dataset.from_dict({'label':y_test,'text':x_test})})

In [4]:
# load the pre-built IMDB subsample; the repr below shows its 'train' and 'validation' splits
# NOTE(review): '/imdb-truncated' reads as an absolute filesystem path (or a Hub dataset id
# that lost its namespace prefix) — confirm the intended location before re-running
dataset = load_dataset('/imdb-truncated')
dataset

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 1000/1000 [00:00<00:00, 105210.05 examples/s]
Generating validation split: 100%|██████████| 1000/1000 [00:00<00:00, 141284.20 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [5]:
# fraction of training examples with label=1 (the mean of a 0/1 label column)
np.mean(dataset['train']['label'])

np.float64(0.5)

### Model

In [36]:
# base checkpoint to fine-tune
model_checkpoint = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base'

# class index -> class name, and the inverse mapping built from it
id2label = {0: "Negative", 1: "Positive"}
label2id = {name: idx for idx, name in id2label.items()}

# sequence-classification model on top of the checkpoint, one output per label
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
# display architecture; the repr lists the submodule names (e.g. q_lin, k_lin, v_lin)
# that can be referenced when choosing which layers to adapt
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

### Preprocess data

In [39]:
# create the tokenizer that matches model_checkpoint
# NOTE(review): add_prefix_space looks aimed at the commented-out 'roberta-base'
# alternative — confirm it is needed (or harmless) for the distilbert checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# add pad token if none exists; the embedding matrix is resized so the new
# token id has a row in the model's embeddings
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [40]:
# create tokenize function
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Sets the shared tokenizer's truncation side to "left" and encodes the
    texts with truncation to at most 512 tokens, requesting NumPy outputs.
    No padding is applied here.

    Args:
        examples: mapping with a "text" entry holding the raw text(s).

    Returns:
        The tokenizer's output for the given text(s).
    """
    # truncate from the left so the end of an over-long text is what survives
    tokenizer.truncation_side = "left"

    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        return_tensors="np",
    )

In [41]:
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map: 100%|██████████| 1000/1000 [00:00<00:00, 3351.73 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [42]:
# create data collator; tokenize_function does not pad, so padding is left to
# this collator when batches are assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Evaluation

In [43]:
# load the accuracy metric from the `evaluate` library (used by compute_metrics)
accuracy = evaluate.load("accuracy")

In [44]:
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    """Compute classification accuracy for one evaluation pass.

    Args:
        p: a (predictions, labels) pair, where predictions holds per-class
            scores with the class dimension on axis 1.

    Returns:
        The dict produced by the accuracy metric, i.e. {"accuracy": <float>}.
    """
    scores, labels = p
    predicted_ids = np.argmax(scores, axis=1)

    # accuracy.compute already returns {"accuracy": value}; wrapping it in another
    # dict would log a nested {'accuracy': {'accuracy': ...}} instead of a scalar
    return accuracy.compute(predictions=predicted_ids, references=labels)

### Apply untrained model to text

In [45]:
# define list of examples
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass.", "It's amazing how much better it could"]

print("Untrained model predictions:")
print("----------------------------")
# inference only: make sure dropout is off and skip gradient tracking
model.eval()
with torch.no_grad():
    for text in text_list:
        # tokenize text (a batch containing a single sequence)
        inputs = tokenizer.encode(text, return_tensors="pt")
        # compute logits
        logits = model(inputs).logits
        # convert logits to a label index: argmax over the class dimension
        predictions = torch.argmax(logits, dim=-1)

        print(text + " - " + id2label[predictions.item()])

Untrained model predictions:
----------------------------
It was good. - Positive
Not a fan, don't recommed. - Positive
Better than the first one. - Positive
This is not worth watching even once. - Positive
This one is a pass. - Positive
It's amazing how much better it could - Positive


### Train model

In [16]:
# LoRA settings for sequence classification: rank-4 adapters on the 'q_lin'
# modules, with alpha=32 and a small adapter dropout
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    target_modules=['q_lin'],
    r=4,
    lora_alpha=32,
    lora_dropout=0.01,
)

In [17]:
# inspect the resulting configuration, including the fields left at their defaults
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))

In [18]:
# wrap the base model with the LoRA adapters described by peft_config
# NOTE: this rebinds `model` from itself, so the cell is not idempotent —
# reload the base model before running it a second time
model = get_peft_model(model, peft_config)
# report how many parameters remain trainable after wrapping
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [19]:
# hyperparameters
lr = 1e-3  # learning rate passed to TrainingArguments
batch_size = 4  # used as both the per-device train and eval batch size
num_epochs = 10  # number of training epochs

In [20]:
# define training arguments
training_args = TrainingArguments(
    output_dir=model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    # `evaluation_strategy` is the deprecated spelling of this argument;
    # `eval_strategy` is the current name
    eval_strategy="epoch",
    # save once per epoch so the save schedule lines up with evaluation,
    # which load_best_model_at_end relies on
    save_strategy="epoch",
    # no metric_for_best_model is given, so "best" follows the library default
    load_best_model_at_end=True,
)



In [21]:
# create trainer object: ties together the LoRA-wrapped model, the training
# arguments, the tokenized splits, the padding collator and the metric function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator, 
    compute_metrics=compute_metrics,
)

# train model (long-running: the recorded run below took roughly 1h44m)
trainer.train()

                                                    
 10%|█         | 250/2500 [07:56<52:45,  1.41s/it]

{'eval_loss': 0.3872629404067993, 'eval_accuracy': {'accuracy': 0.875}, 'eval_runtime': 146.0661, 'eval_samples_per_second': 6.846, 'eval_steps_per_second': 1.712, 'epoch': 1.0}


 20%|██        | 500/2500 [13:28<34:13,  1.03s/it]   

{'loss': 0.4432, 'grad_norm': 1.3231385946273804, 'learning_rate': 0.0008, 'epoch': 2.0}


                                                  
 20%|██        | 500/2500 [15:53<34:13,  1.03s/it]

{'eval_loss': 0.5263215899467468, 'eval_accuracy': {'accuracy': 0.856}, 'eval_runtime': 144.8385, 'eval_samples_per_second': 6.904, 'eval_steps_per_second': 1.726, 'epoch': 2.0}


                                                     
 30%|███       | 750/2500 [23:43<36:33,  1.25s/it]

{'eval_loss': 0.5357687473297119, 'eval_accuracy': {'accuracy': 0.886}, 'eval_runtime': 141.1364, 'eval_samples_per_second': 7.085, 'eval_steps_per_second': 1.771, 'epoch': 3.0}


 40%|████      | 1000/2500 [29:24<33:39,  1.35s/it]  

{'loss': 0.1983, 'grad_norm': 0.017277993261814117, 'learning_rate': 0.0006, 'epoch': 4.0}


                                                   
 40%|████      | 1000/2500 [31:52<33:39,  1.35s/it]

{'eval_loss': 0.59220290184021, 'eval_accuracy': {'accuracy': 0.893}, 'eval_runtime': 148.3626, 'eval_samples_per_second': 6.74, 'eval_steps_per_second': 1.685, 'epoch': 4.0}


                                                      
 50%|█████     | 1250/2500 [40:09<32:18,  1.55s/it]

{'eval_loss': 0.6617922186851501, 'eval_accuracy': {'accuracy': 0.895}, 'eval_runtime': 153.2092, 'eval_samples_per_second': 6.527, 'eval_steps_per_second': 1.632, 'epoch': 5.0}


 60%|██████    | 1500/2500 [45:49<19:00,  1.14s/it]   

{'loss': 0.0616, 'grad_norm': 0.0003634823369793594, 'learning_rate': 0.0004, 'epoch': 6.0}


                                                   
 60%|██████    | 1500/2500 [48:26<19:00,  1.14s/it]

{'eval_loss': 0.712582528591156, 'eval_accuracy': {'accuracy': 0.887}, 'eval_runtime': 157.3658, 'eval_samples_per_second': 6.355, 'eval_steps_per_second': 1.589, 'epoch': 6.0}


                                                      
 70%|███████   | 1750/2500 [56:59<16:39,  1.33s/it]

{'eval_loss': 0.7819281816482544, 'eval_accuracy': {'accuracy': 0.893}, 'eval_runtime': 157.4687, 'eval_samples_per_second': 6.35, 'eval_steps_per_second': 1.588, 'epoch': 7.0}


 80%|████████  | 2000/2500 [1:02:54<10:06,  1.21s/it] 

{'loss': 0.0193, 'grad_norm': 0.0003806292952504009, 'learning_rate': 0.0002, 'epoch': 8.0}


                                                     
 80%|████████  | 2000/2500 [1:05:27<10:06,  1.21s/it]

{'eval_loss': 0.9163217544555664, 'eval_accuracy': {'accuracy': 0.884}, 'eval_runtime': 153.2164, 'eval_samples_per_second': 6.527, 'eval_steps_per_second': 1.632, 'epoch': 8.0}


                                                       
 90%|█████████ | 2250/2500 [1:36:09<06:35,  1.58s/it]

{'eval_loss': 0.8861302733421326, 'eval_accuracy': {'accuracy': 0.892}, 'eval_runtime': 1517.8329, 'eval_samples_per_second': 0.659, 'eval_steps_per_second': 0.165, 'epoch': 9.0}


100%|██████████| 2500/2500 [1:41:33<00:00,  1.11s/it]    

{'loss': 0.0022, 'grad_norm': 10.465410232543945, 'learning_rate': 0.0, 'epoch': 10.0}


                                                     
100%|██████████| 2500/2500 [1:43:53<00:00,  1.11s/it]

{'eval_loss': 0.8551654815673828, 'eval_accuracy': {'accuracy': 0.892}, 'eval_runtime': 138.3034, 'eval_samples_per_second': 7.23, 'eval_steps_per_second': 1.808, 'epoch': 10.0}


100%|██████████| 2500/2500 [1:43:54<00:00,  2.49s/it]

{'train_runtime': 6234.4263, 'train_samples_per_second': 1.604, 'train_steps_per_second': 0.401, 'train_loss': 0.14492488350868224, 'epoch': 10.0}





TrainOutput(global_step=2500, training_loss=0.14492488350868224, metrics={'train_runtime': 6234.4263, 'train_samples_per_second': 1.604, 'train_steps_per_second': 0.401, 'total_flos': 1112883852759936.0, 'train_loss': 0.14492488350868224, 'epoch': 10.0})

### Generate prediction

In [28]:
# examples to classify with the trained model
# NOTE(review): the recorded output of the prediction cell further down shows the
# six sentences from the earlier example list, not this one — re-run to refresh it
text_list = ["this one is awful"]

In [34]:
# run inference on CPU; the inputs are moved to the same device below
model.to('cpu')
# inference only: switch off dropout and skip gradient tracking
model.eval()

print("Trained model predictions:")
print("--------------------------")
with torch.no_grad():
    for text in text_list:
        # tokenize text (a batch containing a single sequence)
        inputs = tokenizer.encode(text, return_tensors="pt").to("cpu")
        # compute logits
        logits = model(inputs).logits
        # convert logits to a label index: argmax over the class dimension
        predictions = torch.argmax(logits, dim=1)

        print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
It was good. - Positive
Not a fan, don't recommed. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Positive
This one is a pass. - Positive
It's amazing how much better it could be. - Positive
