<a href="https://colab.research.google.com/github/kvsnoufal/MSPhi2-for-classification-LoRA/blob/main/trainPhi2Lora%26Quantization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Fine-tuning Microsoft Phi-2 for sequence classification with QLoRA (bitsandbytes 4-bit quantization) using the Hugging Face Trainer

In [1]:
# Install/upgrade the third-party libraries used in this notebook.
# NOTE(review): versions are unpinned and `-U` upgrades to the latest release, so a re-run may
# pull library versions newer than the ones this notebook was executed with - consider pinning.
!pip install -U transformers peft datasets evaluate wandb einops bitsandbytes



In [2]:
# Number of target classes; sizes the classification head
NUM_LABELS = 2
# Hub id of the checkpoint used for both the tokenizer and the base model
MODEL_NM = "microsoft/phi-2"
# Maximum number of tokens kept per example (longer texts are truncated)
MAX_LEN = 512
# Learning rate passed to TrainingArguments
LR = 2e-4
# Checkpoint directory; also reused as the wandb project name
OUTPUT_DIR = 'Phi2-Seq-classification-QLoRa'
# Total number of optimizer steps to train for
MAX_STEPS = 3000

In [3]:
# data loading
from datasets import load_dataset,Dataset
import pandas as pd
dataset = load_dataset("mehdiiraqui/twitter_disaster")
# Hold out 20% of the labelled training split for validation (seeded so the split is reproducible)
data = dataset['train'].train_test_split(train_size=0.8, seed=42)
# train_test_split names the held-out part "test"; store it under "val" instead
data['val'] = data.pop("test")
# Attach the dataset's own test split under "test"
data['test'] = dataset['test']
# explore data: DataFrame.info() prints its report itself and returns None,
# so it is called directly rather than wrapped in print() (which would also print "None")
for split_name in ('train', 'val', 'test'):
    data[split_name].to_pandas().info()
# class balance of the training split
print(data['train'].to_pandas()['target'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6090 entries, 0 to 6089
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        6090 non-null   int64 
 1   keyword   6037 non-null   object
 2   location  4064 non-null   object
 3   text      6090 non-null   object
 4   target    6090 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 238.0+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1523 entries, 0 to 1522
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        1523 non-null   int64 
 1   keyword   1515 non-null   object
 2   location  1016 non-null   object
 3   text      1523 non-null   object
 4   target    1523 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 59.6+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 

In [4]:
# Class weights for the loss function, to counter class imbalance:
# weight_c = n_samples / (2 * n_samples_in_class_c), computed on the training split only
train_targets = data['train'].to_pandas().target
n_train = len(train_targets)
class_counts = train_targets.value_counts()
pos_weights = n_train / (2 * class_counts[1])
neg_weights = n_train / (2 * class_counts[0])
pos_weights,neg_weights

(1.1622137404580153, 0.877521613832853)

### Data prep

In [5]:
# tokenizer
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the MODEL_NM checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NM,trust_remote_code=True,)
# Reuse the EOS token as the padding token so batches can be padded
# (presumably the tokenizer ships without a pad token - the original note says training errors otherwise)
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
def preprocessing_function(examples):
    """Tokenize the ``text`` field of an example (or batch of examples).

    Inputs longer than ``MAX_LEN`` tokens are truncated; no padding is applied
    here. Returns whatever the tokenizer call returns.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, max_length=MAX_LEN)
    return encoded
# test: tokenize a single training example to sanity-check the output format
preprocessing_function(data['train'][0])

{'input_ids': [1820, 5290, 3252, 13, 3740, 1378, 83, 13, 1073, 14, 72, 39, 23, 8322, 89, 23, 76, 80, 18], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
# ref: https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
# Raw columns that are not needed once the text has been tokenized
col_to_delete = ['id', 'keyword','location', 'text']
# Tokenize every split in batches, drop the raw columns, and rename
# "target" to "label" (the column name the Hugging Face Trainer expects)
tokenized_datasets = (
    data
    .map(preprocessing_function, batched=True, remove_columns=col_to_delete)
    .rename_column("target", "label")
)
# Return torch tensors when indexing the datasets
tokenized_datasets.set_format("torch")
# test: inspect one processed training example
tokenized_datasets['train'][0]

{'label': tensor(0),
 'input_ids': tensor([1820, 5290, 3252,   13, 3740, 1378,   83,   13, 1073,   14,   72,   39,
           23, 8322,   89,   23,   76,   80,   18]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}

In [8]:
#  Collator that takes a batch of tokenized examples and pads the shorter ones
#  so every sequence in the batch has the same length.
#  NOTE(review): with default settings the target length is presumably the longest example
#  in each batch rather than a fixed MAX_LEN - confirm.
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Modeling

In [9]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch
from torch import nn
# ref: https://www.kaggle.com/code/archanghosh/quantized-mistral-7b-approach
# ref: https://huggingface.co/microsoft/phi-2/discussions/19
# bitsandbytes settings: load weights in 4-bit NF4, compute in float16, no double quantization
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype='float16',
        bnb_4bit_use_double_quant=False,
    )
# Map the root module ("") - i.e. the whole model - to device 0
device_map = {"": 0}


# Alternatives that were tried (kept for reference):
# load model in half precision - not working for training
# basemodel = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", torch_dtype="auto", device_map="cuda", trust_remote_code=True)
# loads model in full precision - works
# basemodel = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", torch_dtype=torch.float32, device_map="cuda", trust_remote_code=True)
# loads model with quantization
basemodel = AutoModelForCausalLM.from_pretrained(MODEL_NM, quantization_config=bnb_config, device_map=device_map, trust_remote_code=True)
# Original note: setting pretraining_tp to 1 ensures the linear layers are used to the max computation possible.
# NOTE(review): unclear whether the Phi-2 code reads this config field at all - confirm or drop.
basemodel.config.pretraining_tp = 1
# Record the pad token id on the config; the classification wrapper defined later reads
# config.pad_token_id to locate the last non-padding token of each sequence
basemodel.config.pad_token_id = tokenizer.pad_token_id
# Display the module tree
basemodel

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PhiForCausalLM(
  (transformer): PhiModel(
    (embd): Embedding(
      (wte): Embedding(51200, 2560)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (h): ModuleList(
      (0-31): 32 x ParallelBlock(
        (ln): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
        (mixer): MHA(
          (rotary_emb): RotaryEmbedding()
          (Wqkv): Linear4bit(in_features=2560, out_features=7680, bias=True)
          (out_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (inner_attn): SelfAttention(
            (drop): Dropout(p=0.0, inplace=False)
          )
          (inner_cross_attn): CrossAttention(
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
        (mlp): MLP(
          (fc1): Linear4bit(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear4bit(in_features=10240, out_features=2560, bias=True)
          (act): NewGELUActivation()
        )
      )

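The loaded causal-LM model has three main parts:
1. Transformer block – the backbone; its output has shape `batch x seq_len x 2560` (the hidden size).
2. `lm_head` – the language-modelling head; its output has shape `batch x seq_len x 51200` (the vocabulary size).
3. `loss` – the language-modelling loss function.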

To reconfigure the model for sequence classification, we take an existing sequence-classification wrapper and modify it as follows:
1. Reuse the transformer block (backbone) from the model loaded above.
2. Replace the LM head with a classification head.
3. Use a custom (class-weighted) loss function.

We are using PhiForSequenceClassification class source code : https://github.com/huggingface/transformers/blob/v4.36.1/src/transformers/models/phi/modeling_phi.py#L1165

In [10]:

from transformers.modeling_utils import PreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutputWithPast
from typing import List, Optional, Tuple, Union
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers.utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
    logging,
    replace_return_docstrings,
)
class PhiPreTrainedModel(PreTrainedModel):
    """Minimal ``PreTrainedModel`` base class for the custom Phi wrapper.

    Binds the wrapper to the config class of the already-loaded ``basemodel``
    and provides the weight-initialisation hook used for freshly created
    layers.
    """

    # Config class taken from the loaded base model so both share one config type
    config_class = basemodel.config_class
    # Attribute name under which the wrapper stores the backbone
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_cache_class = True

    def _init_weights(self, module):
        """Initialise ``module`` in place.

        Linear and embedding weights are drawn from a normal distribution with
        mean 0 and std ``config.initializer_range``; linear biases and the
        embedding row at ``padding_idx`` are zeroed. Other module types are
        left untouched.
        """
        std = self.config.initializer_range
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return
        module.weight.data.normal_(mean=0.0, std=std)
        if isinstance(module, nn.Linear):
            if module.bias is not None:
                module.bias.data.zero_()
        elif module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()


#custom class - modified from PhiForSequenceClassification
class PhiForSequenceClassificationModified(PhiPreTrainedModel):
    """Sequence-classification wrapper around the already-loaded Phi-2 backbone.

    Reuses ``basemodel.transformer`` (the backbone loaded earlier in the
    notebook) and replaces the language-modelling head with a bias-free linear
    layer that maps every hidden state to ``NUM_LABELS`` logits. The logits at
    the last non-padding token of each sequence are used as the prediction.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = NUM_LABELS#changed
        self.model = basemodel.transformer#changed
        self.score = nn.Linear(basemodel.config.hidden_size, NUM_LABELS, bias=False)#changed

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the backbone's token-embedding module."""
        return self.model.embd.wte#changed

    def set_input_embeddings(self, value):
        """Replace the backbone's token-embedding module with ``value``."""
        self.model.embd.wte = value#changed

    @add_start_docstrings_to_model_forward("PHI_INPUTS_DOCSTRING")
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Only `input_ids`, `attention_mask` and `past_key_values` are forwarded to the backbone; the remaining
        arguments are accepted for signature compatibility with the upstream class. Returns a
        `SequenceClassifierOutputWithPast` holding the loss (if `labels` is given) and the pooled logits, or a
        plain tuple `(loss, logits)` / `(logits,)` when `return_dict` is false.

        Raises `ValueError` if `input_ids` is missing, or if the batch size is greater than 1 while no padding
        token is configured.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is None:
            # `inputs_embeds` is never handed to the backbone, so without
            # `input_ids` there is nothing to run the model on.
            raise ValueError(
                "PhiForSequenceClassificationModified requires `input_ids`; `inputs_embeds` is not supported."
            )

        # The backbone returns the hidden states directly: batch x seq_len x hidden_size
        hidden_states = self.model(
            input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
        )#changed
        # Per-token class logits: batch x seq_len x num_labels
        logits = self.score(hidden_states)

        batch_size = input_ids.shape[0]
        seq_len = input_ids.shape[-1]

        if self.config.pad_token_id is None:
            if batch_size != 1:
                raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
            sequence_lengths = -1
        else:
            # Position of the first pad token minus one = last real token (assumes right padding).
            # A row without any pad token yields 0 - 1 = -1; the modulo maps that explicitly
            # to the final position instead of relying on negative indexing.
            first_pad = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1)
            sequence_lengths = ((first_pad - 1) % seq_len).to(logits.device)

        # Keep only the logits at the last real token of every sequence
        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                # Infer the problem type once from the number of labels and the label dtype
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            # The backbone output is a single tensor, not a tuple, so there are
            # no extra outputs to append after the logits.
            output = (pooled_logits,)
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=None,
            hidden_states=None,
            attentions=None,
        )#changed


In [11]:
# Build the classification wrapper; it reuses the backbone of the already-loaded `basemodel`
# and shares its config
model = PhiForSequenceClassificationModified(basemodel.config)

#### Set up PEFT (LoRA)
Ref: https://huggingface.co/microsoft/phi-2/discussions/19

In [12]:

from peft import get_peft_model, LoraConfig, TaskType
# LoRA adapters are attached to the attention projections (Wqkv, out_proj) of each block
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=32,
    lora_alpha=16,
    target_modules=['Wqkv', 'out_proj'],
    bias="none",
    lora_dropout=0.05, # Conventional
)
peft_model = get_peft_model(model, lora_config)
# Report how many parameters will actually be trained
peft_model.print_trainable_parameters()
# Move the wrapped model to the GPU
peft_model = peft_model.to(device='cuda')


trainable params: 15,733,760 || all params: 2,664,294,400 || trainable%: 0.5905413455810289


In [13]:
# evaluation metrics and loss function for trainer
# https://huggingface.co/blog/Lora-for-sequence-classification-with-Roberta-Llama-Mistral
# https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt

import evaluate
import numpy as np
from transformers import Trainer,TrainingArguments
# Metric objects are loaded once and reused; loading them again on every
# evaluation pass is unnecessary repeated work.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called ``name``, loading it on first use only."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute precision, recall, F1 and accuracy for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)`` as handed over by the Trainer.

    Returns:
        Dict with the keys ``"precision"``, ``"recall"``, ``"f1-score"`` and
        ``"accuracy"``; the Trainer expects metric names mapped to scores.
    """
    # All metrics are already predefined in the HF `evaluate` package
    precision_metric = _get_metric("precision")
    recall_metric = _get_metric("recall")
    f1_metric = _get_metric("f1")
    accuracy_metric = _get_metric("accuracy")

    logits, labels = eval_pred
    # Predicted class = index of the largest logit
    predictions = np.argmax(logits, axis=-1)
    precision = precision_metric.compute(predictions=predictions, references=labels)["precision"]
    recall = recall_metric.compute(predictions=predictions, references=labels)["recall"]
    f1 = f1_metric.compute(predictions=predictions, references=labels)["f1"]
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    return {"precision": precision, "recall": recall, "f1-score": f1, 'accuracy': accuracy}


class WeightedCELossTrainer(Trainer):
    """Trainer that replaces the model's own loss with class-weighted cross-entropy.

    The class weights are read from the notebook-level ``neg_weights`` (class 0)
    and ``pos_weights`` (class 1).
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted cross-entropy loss for one batch.

        Args:
            model: the model being trained or evaluated.
            inputs: batch dict; its ``"labels"`` entry is removed before the
                forward pass so the model does not compute its own loss.
            return_outputs: if true, return ``(loss, outputs)`` instead of the loss.
            num_items_in_batch: accepted because newer Trainer versions pass it;
                it is not used here (the loss is the mean over the batch).
        """
        labels = inputs.pop("labels")
        # Get model's predictions
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Build the class weights on the logits' device and dtype
        class_weights = torch.tensor(
            [neg_weights, pos_weights], device=logits.device, dtype=logits.dtype
        )
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        # Use the logits' own last dimension so the reshape always matches the head
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


In [14]:

# Training configuration: batch size 1 with 4 accumulation steps per device, fp16 training,
# checkpoint + evaluation every 100 steps, metrics reported to Weights & Biases.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",#for qlora
    save_steps=100,
    # keep only the two most recent checkpoints on disk
    save_total_limit =2,
    logging_steps=10,
    learning_rate=LR,
    fp16=True,#for qlora
    # bf16=True,
    max_grad_norm=.3,
    max_steps=MAX_STEPS,
    # NOTE(review): presumably ignored with lr_scheduler_type="constant"
    # ("constant_with_warmup" would apply it) - confirm intent
    warmup_ratio=.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to='wandb',
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy` - verify
    # against the installed version
    evaluation_strategy="steps",# Evaluate the model every specified number of steps
    eval_steps=100,
)
import wandb
# Start a wandb run; the project is named after the output directory
wandb.init(project=OUTPUT_DIR)
# Trainer using the class-weighted loss, evaluated on the validation split
phi2_trainer = WeightedCELossTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets["val"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
from copy import deepcopy

from transformers import TrainerCallback
class CustomCallback(TrainerCallback):
    """Callback that also evaluates on the training set at the end of an epoch.

    The extra evaluation only runs when the Trainer has flagged an evaluation
    for that point; its metrics are reported under the ``train`` prefix.
    """

    def __init__(self, trainer) -> None:
        super().__init__()
        # Keep a handle on the trainer so the callback can trigger evaluate()
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        """Evaluate on the training data and return the control state captured beforehand."""
        if not control.should_evaluate:
            return None
        # Snapshot the control flags before evaluate() runs, and hand the snapshot back
        control_snapshot = deepcopy(control)
        self._trainer.evaluate(
            eval_dataset=self._trainer.train_dataset,
            metric_key_prefix="train",
        )
        return control_snapshot
# Register the callback, giving it a reference to the trainer it should evaluate with
phi2_trainer.add_callback(CustomCallback(phi2_trainer))

[34m[1mwandb[0m: Currently logged in as: [33mkvsnoufal[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [15]:
# train: runs for MAX_STEPS optimizer steps, evaluating and checkpointing every 100 steps
phi2_trainer.train()

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy
100,1.3925,1.283911,0.580645,0.387097,0.464516,0.618516
200,1.2532,0.996144,0.68543,0.635945,0.659761,0.719632
300,1.0466,2.312223,0.876106,0.304147,0.451539,0.684176
400,1.4127,1.017044,0.765988,0.809524,0.787155,0.812869
500,1.1878,1.219186,0.61987,0.88172,0.727964,0.718319
600,1.192,0.916777,0.692403,0.854071,0.764787,0.775443
700,0.6855,1.089889,0.856884,0.726575,0.786367,0.831254
800,0.9064,0.934007,0.771341,0.777266,0.774292,0.806303
900,0.0885,1.229913,0.751037,0.834101,0.790393,0.8109
1000,0.8815,0.87057,0.829752,0.771121,0.799363,0.834537


Checkpoint destination directory Phi2-Seq-classification-QLoRa/checkpoint-100 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory Phi2-Seq-classification-QLoRa/checkpoint-200 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=3000, training_loss=0.9764829754432043, metrics={'train_runtime': 6659.8268, 'train_samples_per_second': 1.802, 'train_steps_per_second': 0.45, 'total_flos': 2829283971379200.0, 'train_loss': 0.9764829754432043, 'epoch': 1.97})

In [17]:
import os
from peft import PeftModel

# Checkpoint to reload: the one written at step `best_step`
best_step = 3000
best_model_ckpt = os.path.join(OUTPUT_DIR, f'checkpoint-{best_step}')

# Load the saved adapter weights onto the classification model, frozen for inference
loaded_model = PeftModel.from_pretrained(model, best_model_ckpt, is_trainable=False)

# Fresh trainer around the reloaded model, used only for evaluation and prediction
eval_trainer_kwargs = dict(
    model=loaded_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets["val"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
phi2_trainer = WeightedCELossTrainer(**eval_trainer_kwargs)

In [18]:
# Evaluate the reloaded checkpoint on the validation split (the trainer's eval_dataset)
phi2_trainer.evaluate()

{'eval_loss': 0.9753166437149048,
 'eval_precision': 0.8548672566371681,
 'eval_recall': 0.7419354838709677,
 'eval_f1-score': 0.794407894736842,
 'eval_accuracy': 0.8358502954694682,
 'eval_runtime': 71.1432,
 'eval_samples_per_second': 21.408,
 'eval_steps_per_second': 2.685}

In [19]:
# Run inference on the test split and turn the raw logits into class probabilities
preds = phi2_trainer.predict(tokenized_datasets['test'])
test_logits = torch.tensor(preds.predictions)
pred_probas = torch.softmax(test_logits, dim=1)
pred_probas

  _warn_prf(average, modifier, msg_start, len(result))


tensor([[4.6733e-01, 5.3267e-01],
        [1.9411e-04, 9.9981e-01],
        [2.4826e-04, 9.9975e-01],
        ...,
        [4.4520e-05, 9.9996e-01],
        [1.4466e-03, 9.9855e-01],
        [4.6839e-05, 9.9995e-01]])

In [20]:
# Predicted class = index of the highest probability in each row
pred_class = torch.argmax(pred_probas, dim=1)
pred_class

tensor([1, 1, 1,  ..., 1, 1, 1])

In [21]:
# Distribution of predicted classes over the test split
pd.Series(pred_class.cpu().numpy()).value_counts()

0    2088
1    1175
dtype: int64