# Updating the Lora Fine-tuning

We are going to do classification using a language model.

In [1]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer, # flexible class for loading the suitable tokenizer for the model.
    AutoConfig, # auto load the detail of model configuration e.g. num of layers, attention head etc.
    AutoModelForSequenceClassification, # load prtrained mdl specifically tailored for sequence classfction
    DataCollatorWithPadding, # for collating (nyusun) data with padding (lapis) form
    TrainingArguments, # defining train paramters, e.g.: epoch, num_of_gpu, lr, batch etc.
    Trainer # high-level for training and evaluating models in transformers. no need manually write backprop steps
)

from peft import ( # Parameter Efficient Fine Tunning
    PeftModel, # a wrapper for integrating the PEFT method with the model
    PeftConfig, # FT settings e.g. adapter (lora tec.), hyperparameter
    get_peft_model,
    LoraConfig # define lora config e.g. rank, scalling factor, & dropout. Rank adl besar kecilnya row&column
)

import evaluate
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


## Dataset Generation

In [2]:
# load imdb train data with shuffled 
imdb_dataset = load_dataset("imdb", split="train").shuffle(seed=3)

# select 2000 random rows
dataset = imdb_dataset.select(range(3000))

print(dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 3000
})


In [3]:
x_train = dataset['text'][:2000]
y_train = dataset['label'][:2000]

x_test = dataset['text'][:-2000]
y_test = dataset['label'][:-2000]

In [4]:
dataset = DatasetDict({'train':Dataset.from_dict({'label':y_train,'text':x_train}),
                              'validation':Dataset.from_dict({'label':y_test,'text':x_test})})

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

### Base Model Loading

In [7]:
model_checkpoint = 'distilbert-base-uncased'  # model name that will be used

# label maps: have to be both direction to avoid confusion
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# display model architecture
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

### Data Preprocessing

In [None]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True) # make sure compatibility
# jadi tokenisasi juga ngga sembarang tokenisasi, harus sesuai sama model yang kita mau finetune 

# add pad token if none exists
if tokenizer.pad_token is None: # checking the pad, pad is important to make sure all inputs have same length
    tokenizer.add_special_tokens({'pad_token': '[PAD]'}) # add the pad token to model
    model.resize_token_embeddings(len(tokenizer)) # update the embed size to match loken vocab length

In [10]:
# create tokenize function
def tokenize_function(examples):
    # take the text
    text = examples['text']

    # tokenize and truncate the text
    tokenizer.truncation_side = 'left'
    tokenized_inputs = tokenizer(
        text,
        return_tensors = 'np',
        truncation = True,
        max_length = 512
    )

    return tokenized_inputs


In [None]:
# tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True) # use map() to apply the function to data.
tokenized_dataset


Map: 100%|██████████| 2000/2000 [00:00<00:00, 3998.85 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 5589.41 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # penyusunan data biar biar siap ditrain

### Evaluation

In [13]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")

In [14]:
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

In [15]:
# define list of examples
text_list = ["should be better"]
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass.", "It's amazing how much better it could"]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
It was good. - Positive
Not a fan, don't recommed. - Positive
Better than the first one. - Positive
This is not worth watching even once. - Positive
This one is a pass. - Positive
It's amazing how much better it could - Positive


In [None]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4, # lebar matrix yang bakal ditrain
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['q_lin'])

In [17]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))

In [18]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [19]:
# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 10

In [None]:
# define training arguments
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification", # save model to local directory
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch", # save the model by epoch
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [21]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

 10%|█         | 500/5000 [10:49<1:34:50,  1.26s/it]

{'loss': 0.4805, 'grad_norm': 0.11535267531871796, 'learning_rate': 0.0009000000000000001, 'epoch': 1.0}


                                                    
 10%|█         | 500/5000 [13:05<1:34:50,  1.26s/it]

{'eval_loss': 0.21696944534778595, 'eval_accuracy': {'accuracy': 0.921}, 'eval_runtime': 136.1716, 'eval_samples_per_second': 7.344, 'eval_steps_per_second': 1.836, 'epoch': 1.0}


 20%|██        | 1000/5000 [51:42<1:35:36,  1.43s/it]  

{'loss': 0.3815, 'grad_norm': 16.98744010925293, 'learning_rate': 0.0008, 'epoch': 2.0}


                                                     
 20%|██        | 1000/5000 [54:00<1:35:36,  1.43s/it]

{'eval_loss': 0.1949460208415985, 'eval_accuracy': {'accuracy': 0.948}, 'eval_runtime': 138.7659, 'eval_samples_per_second': 7.206, 'eval_steps_per_second': 1.802, 'epoch': 2.0}


 30%|███       | 1500/5000 [1:05:40<1:32:21,  1.58s/it]

{'loss': 0.2958, 'grad_norm': 8.434636116027832, 'learning_rate': 0.0007, 'epoch': 3.0}


                                                       
 30%|███       | 1500/5000 [1:08:04<1:32:21,  1.58s/it]

{'eval_loss': 0.13149301707744598, 'eval_accuracy': {'accuracy': 0.965}, 'eval_runtime': 143.7457, 'eval_samples_per_second': 6.957, 'eval_steps_per_second': 1.739, 'epoch': 3.0}


 40%|████      | 2000/5000 [1:19:43<1:11:19,  1.43s/it] 

{'loss': 0.2432, 'grad_norm': 2.0960934162139893, 'learning_rate': 0.0006, 'epoch': 4.0}


                                                       
 40%|████      | 2000/5000 [1:22:02<1:11:19,  1.43s/it]

{'eval_loss': 0.07744618505239487, 'eval_accuracy': {'accuracy': 0.981}, 'eval_runtime': 138.9333, 'eval_samples_per_second': 7.198, 'eval_steps_per_second': 1.799, 'epoch': 4.0}


 50%|█████     | 2500/5000 [1:33:42<46:36,  1.12s/it]   

{'loss': 0.1623, 'grad_norm': 0.017084449529647827, 'learning_rate': 0.0005, 'epoch': 5.0}


                                                     
 50%|█████     | 2500/5000 [1:35:59<46:36,  1.12s/it]

{'eval_loss': 0.023435022681951523, 'eval_accuracy': {'accuracy': 0.99}, 'eval_runtime': 137.3969, 'eval_samples_per_second': 7.278, 'eval_steps_per_second': 1.82, 'epoch': 5.0}


 60%|██████    | 3000/5000 [2:17:31<47:28,  1.42s/it]     

{'loss': 0.1051, 'grad_norm': 0.0006026664050295949, 'learning_rate': 0.0004, 'epoch': 6.0}


                                                     
 60%|██████    | 3000/5000 [2:19:54<47:28,  1.42s/it]

{'eval_loss': 0.012323903851211071, 'eval_accuracy': {'accuracy': 0.998}, 'eval_runtime': 142.3723, 'eval_samples_per_second': 7.024, 'eval_steps_per_second': 1.756, 'epoch': 6.0}


 70%|███████   | 3500/5000 [2:32:06<41:32,  1.66s/it]   

{'loss': 0.0886, 'grad_norm': 0.0002075612428598106, 'learning_rate': 0.0003, 'epoch': 7.0}


                                                     
 70%|███████   | 3500/5000 [2:34:27<41:32,  1.66s/it]

{'eval_loss': 0.0010571195743978024, 'eval_accuracy': {'accuracy': 0.999}, 'eval_runtime': 141.1885, 'eval_samples_per_second': 7.083, 'eval_steps_per_second': 1.771, 'epoch': 7.0}


 80%|████████  | 4000/5000 [2:46:27<21:04,  1.26s/it]   

{'loss': 0.043, 'grad_norm': 0.0022691581398248672, 'learning_rate': 0.0002, 'epoch': 8.0}


                                                     
 80%|████████  | 4000/5000 [2:48:50<21:04,  1.26s/it]

{'eval_loss': 0.00019290494674351066, 'eval_accuracy': {'accuracy': 1.0}, 'eval_runtime': 142.9177, 'eval_samples_per_second': 6.997, 'eval_steps_per_second': 1.749, 'epoch': 8.0}


 90%|█████████ | 4500/5000 [3:00:37<12:46,  1.53s/it]   

{'loss': 0.0285, 'grad_norm': 0.0457051657140255, 'learning_rate': 0.0001, 'epoch': 9.0}


                                                     
 90%|█████████ | 4500/5000 [3:02:57<12:46,  1.53s/it]

{'eval_loss': 1.8194259610027075e-05, 'eval_accuracy': {'accuracy': 1.0}, 'eval_runtime': 140.1881, 'eval_samples_per_second': 7.133, 'eval_steps_per_second': 1.783, 'epoch': 9.0}


100%|██████████| 5000/5000 [3:14:43<00:00,  1.20s/it]  

{'loss': 0.0076, 'grad_norm': 8.574708772357553e-05, 'learning_rate': 0.0, 'epoch': 10.0}


                                                     
100%|██████████| 5000/5000 [3:17:06<00:00,  1.20s/it]

{'eval_loss': 5.509498350875219e-06, 'eval_accuracy': {'accuracy': 1.0}, 'eval_runtime': 142.6538, 'eval_samples_per_second': 7.01, 'eval_steps_per_second': 1.752, 'epoch': 10.0}


100%|██████████| 5000/5000 [3:17:07<00:00,  2.37s/it]

{'train_runtime': 11827.1756, 'train_samples_per_second': 1.691, 'train_steps_per_second': 0.423, 'train_loss': 0.18359914259910584, 'epoch': 10.0}





TrainOutput(global_step=5000, training_loss=0.18359914259910584, metrics={'train_runtime': 11827.1756, 'train_samples_per_second': 1.691, 'train_steps_per_second': 0.423, 'total_flos': 2229340846699296.0, 'train_loss': 0.18359914259910584, 'epoch': 10.0})

### Prediction

In [28]:
text_list = ["This is not worth watching even once."]

In [29]:
model.to('cpu') 

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu") 

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
This is not worth watching even once. - Negative


In [32]:
# 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

from huggingface_hub import login
login('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')

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\Lenovo\.cache\huggingface\token
Login successful


In [33]:
hf_name = 'isal-amir' # username
model_id = hf_name + "/" + model_checkpoint + "-lora-text-classification" # model name

In [34]:
model.push_to_hub(model_id) # save model

adapter_model.safetensors: 100%|██████████| 2.52M/2.52M [00:00<00:00, 2.73MB/s]


CommitInfo(commit_url='https://huggingface.co/isal-amir/distilbert-base-uncased-lora-text-classification/commit/b6cccf8a08874ee34108a78f6fff7601dc4bb3c5', commit_message='Upload model', commit_description='', oid='b6cccf8a08874ee34108a78f6fff7601dc4bb3c5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/isal-amir/distilbert-base-uncased-lora-text-classification', endpoint='https://huggingface.co', repo_type='model', repo_id='isal-amir/distilbert-base-uncased-lora-text-classification'), pr_revision=None, pr_num=None)

In [35]:
trainer.push_to_hub(model_id) # save trainer


adapter_model.bin:   0%|          | 0.00/2.52M [00:00<?, ?B/s]
training_args.bin: 100%|██████████| 5.24k/5.24k [00:00<00:00, 10.7kB/s]
adapter_model.bin: 100%|██████████| 2.52M/2.52M [00:01<00:00, 1.47MB/s]
Upload 2 LFS files: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]


CommitInfo(commit_url='https://huggingface.co/isal-amir/distilbert-base-uncased-lora-text-classification/commit/f62e1d33294aee38659607a8946f2b1c09111e7f', commit_message='isal-amir/distilbert-base-uncased-lora-text-classification', commit_description='', oid='f62e1d33294aee38659607a8946f2b1c09111e7f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/isal-amir/distilbert-base-uncased-lora-text-classification', endpoint='https://huggingface.co', repo_type='model', repo_id='isal-amir/distilbert-base-uncased-lora-text-classification'), pr_revision=None, pr_num=None)