# Mount Google Drive

In [None]:
# Mount Google Drive under /content/gdrive; the data and output paths used by later cells point into this mount.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Install Required Packages

In [None]:
!pip install evaluate datasets

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.9 MB/s[0m eta [3

# Import Required Packages

In [None]:
import os
import evaluate
import pandas as pd
import numpy as np

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from datasets import load_dataset
from scipy.special import softmax

# Set Environment

In [None]:
# Seed every random number generator the notebook relies on.
# The original cell only seeded CUDA, leaving the CPU torch generator and NumPy unseeded.
seed = 2024
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

# Prefer deterministic cuDNN kernels and turn off the cuDNN autotuner.
# Strict determinism checks stay off so operations without a deterministic
# implementation do not raise.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.use_deterministic_algorithms(False)

cuda


# Data Preparation

## Load dataset

In [None]:
def is_valid_text(example):
    """Return True when a row carries both a text and a label.

    Only ``None`` counts as missing; any other value (including an empty
    string) is accepted.
    """
    has_missing_field = example['text'] is None or example['labels'] is None
    return not has_missing_field

def dataset_load(path):
    """Read a CSV file into a dataset ready for classification.

    Parameters:
    - path: location of the CSV file; it must provide `text` and `labels` columns

    Returns:
    - the "train" split of the loaded CSV, with rows missing a text or a label
      removed and the `labels` column converted to a class-label feature
    """
    loaded = load_dataset('csv', data_files=path)
    complete_rows = loaded["train"].filter(is_valid_text)
    return complete_rows.class_encode_column('labels')

In [None]:
# Project folder on the mounted Drive; the input CSV and the training output directories are resolved relative to it.
source_dir = '/content/gdrive/MyDrive/Colab_Notebooks/ML-LoRA-E5/'
# Load the training CSV; dataset_load drops rows with a missing text/label and class-encodes `labels`.
dataset = dataset_load(path=os.path.join(source_dir, 'twitter_raid_data/raid_twitter_train.csv')) ## change to your dataset

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/228000 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/228000 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/228000 [00:00<?, ? examples/s]

In [None]:
# Class balance of the full dataset before it is split.
dataset.to_pandas()["labels"].value_counts()

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
1,138000
0,90000


## Split data into training and validation/test datasets

In [None]:
# Hold out 20% of the rows, stratified on the label so both splits keep the class ratio.
# The fixed seed makes the split repeatable.
# Note: `dataset` is rebound from a single dataset to a dict of splits here, so this cell
# cannot be re-run without re-running the loading cell first.
dataset = dataset.train_test_split(test_size=0.2, stratify_by_column='labels', seed=100)
for split_key, caption in (("train", "Train label distribution:"), ("test", "Test label distribution:")):
    split_counts = dataset[split_key].to_pandas()["labels"].value_counts()
    print(caption, split_counts)

Train label distribution: labels
1    110400
0     72000
Name: count, dtype: int64
Test label distribution: labels
1    27600
0    18000
Name: count, dtype: int64


## Tokenization

In [None]:
def text_tokenize(dataset, model_name, max_length=512, truncation=True, padding=False):
    """Tokenize the `text` column of a dataset with a pretrained tokenizer.

    Parameters:
    - dataset: dataset (or dict of splits) exposing a `text` column and a batched `map`.
    - model_name (str): checkpoint name passed to `AutoTokenizer.from_pretrained`,
      e.g. a model id from https://huggingface.co/models
    - max_length (int, optional): forwarded to the tokenizer as `max_length`. Defaults to 512.
    - truncation (bool, optional): forwarded to the tokenizer as `truncation`. Defaults to True.
    - padding (bool, optional): forwarded to the tokenizer as `padding`. Defaults to False,
      which leaves padding to be done later when batches are collated.

    Returns:
    - tokenized_datasets: the result of mapping the tokenizer over `dataset` in batches.
    - tokenizer: the tokenizer instance that was loaded.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encode_options = dict(max_length=max_length, truncation=truncation, padding=padding)

    def _encode_batch(batch):
        # `batch["text"]` is a list of strings because map() is called with batched=True.
        return tokenizer(batch["text"], **encode_options)

    return dataset.map(_encode_batch, batched=True), tokenizer

In [None]:
# Tokenize both splits with the E5-small tokenizer using text_tokenize's defaults
# (max_length=512, truncation on, no padding at this stage).
tokenized_datasets, tokenizer = text_tokenize(dataset, model_name='intfloat/e5-small')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/362 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



Map:   0%|          | 0/182400 [00:00<?, ? examples/s]

Map:   0%|          | 0/45600 [00:00<?, ? examples/s]

In [None]:
# Inspect the columns and row count of the tokenized training split.
tokenized_datasets['train']

Dataset({
    features: ['text', 'domain', 'model', 'attack', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 182400
})

In [None]:
# Inspect the columns and row count of the tokenized held-out split.
tokenized_datasets['test']

Dataset({
    features: ['text', 'domain', 'model', 'attack', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 45600
})

# Model Training

## Setup raw model

In [None]:
# Load the E5-small checkpoint with a 2-class sequence-classification head.
# The head is not part of the checkpoint, so its weights are newly initialized.
model_name = "intfloat/e5-small"
raw_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at intfloat/e5-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Check the parameters

In [None]:
# List every parameter of the base model and whether it is trainable, before LoRA is applied.
for name, param in raw_model.named_parameters():
    print(f"Parameter: {name} | Requires Grad: {param.requires_grad}")

Parameter: bert.embeddings.word_embeddings.weight | Requires Grad: True
Parameter: bert.embeddings.position_embeddings.weight | Requires Grad: True
Parameter: bert.embeddings.token_type_embeddings.weight | Requires Grad: True
Parameter: bert.embeddings.LayerNorm.weight | Requires Grad: True
Parameter: bert.embeddings.LayerNorm.bias | Requires Grad: True
Parameter: bert.encoder.layer.0.attention.self.query.weight | Requires Grad: True
Parameter: bert.encoder.layer.0.attention.self.query.bias | Requires Grad: True
Parameter: bert.encoder.layer.0.attention.self.key.weight | Requires Grad: True
Parameter: bert.encoder.layer.0.attention.self.key.bias | Requires Grad: True
Parameter: bert.encoder.layer.0.attention.self.value.weight | Requires Grad: True
Parameter: bert.encoder.layer.0.attention.self.value.bias | Requires Grad: True
Parameter: bert.encoder.layer.0.attention.output.dense.weight | Requires Grad: True
Parameter: bert.encoder.layer.0.attention.output.dense.bias | Requires Grad: T

## LoRA configuration

In [None]:
def lora_model(rank, raw_model, alpha=None, dropout=0.1, modules=None):
    """Wrap a sequence-classification model with LoRA adapters.

    Parameters:
    - rank: rank of the low-rank update matrices
    - raw_model: the model to fine-tune
    - alpha: LoRA scaling factor; when omitted it is set to twice the rank
    - dropout: dropout applied inside the LoRA layers
    - modules: names of the modules that should receive a LoRA adapter

    Returns:
    - model: the object returned by peft.get_peft_model() for `raw_model`
    """
    scaling = 2 * rank if alpha is None else alpha
    adapter_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=rank,
        lora_alpha=scaling,
        lora_dropout=dropout,
        target_modules=modules,
    )
    return get_peft_model(raw_model, adapter_config)

In [None]:
# Rank-8 LoRA (alpha defaults to 2 * rank = 16) on the query, key and value projections of self-attention.
e5_model = lora_model(rank=8, raw_model=raw_model, modules = ['attention.self.query', 'attention.self.key', 'attention.self.value'])

### Check the number of trainable parameters

In [None]:
def print_trainable_parameters(model):
    """
    Print how many parameters of `model` are trainable, how many exist in total,
    and the trainable share as a percentage.
    """
    sizes = [(weights.numel(), weights.requires_grad) for _, weights in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, is_trainable in sizes if is_trainable)
    percent = 100 * trainable_params / all_param
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percent}")

In [None]:
# Report how small the trainable share is once the LoRA adapters are attached.
print_trainable_parameters(e5_model)

trainable params: 221954 || all params: 33582724 || trainable%: 0.6609172025473574


### Check parameters

In [None]:
# List every parameter of the LoRA-wrapped model and whether it is trainable.
for name, param in e5_model.named_parameters():
    print(f"Parameter: {name} | Requires Grad: {param.requires_grad}")

Parameter: base_model.model.bert.embeddings.word_embeddings.weight | Requires Grad: False
Parameter: base_model.model.bert.embeddings.position_embeddings.weight | Requires Grad: False
Parameter: base_model.model.bert.embeddings.token_type_embeddings.weight | Requires Grad: False
Parameter: base_model.model.bert.embeddings.LayerNorm.weight | Requires Grad: False
Parameter: base_model.model.bert.embeddings.LayerNorm.bias | Requires Grad: False
Parameter: base_model.model.bert.encoder.layer.0.attention.self.query.base_layer.weight | Requires Grad: False
Parameter: base_model.model.bert.encoder.layer.0.attention.self.query.base_layer.bias | Requires Grad: False
Parameter: base_model.model.bert.encoder.layer.0.attention.self.query.lora_A.default.weight | Requires Grad: True
Parameter: base_model.model.bert.encoder.layer.0.attention.self.query.lora_B.default.weight | Requires Grad: True
Parameter: base_model.model.bert.encoder.layer.0.attention.self.key.base_layer.weight | Requires Grad: Fal

## LoRA training

### Implement Focal Loss
https://medium.com/visionwizard/understanding-focal-loss-a-quick-read-b914422913e7

In [None]:
class FocalLoss(torch.nn.Module):
    """Multi-class focal loss computed from logits.

    For each sample the loss is ``alpha_t * (1 - p_t) ** gamma * CE``, where
    ``CE`` is the cross entropy of the sample, ``p_t = exp(-CE)`` is the
    probability the model assigns to the true class, and ``alpha_t`` is the
    optional weight of the true class.
    """

    def __init__(self, gamma=2.0, alpha=None, reduction='mean'):
        """
        Parameters:
        - gamma: focusing exponent; with gamma=0 the loss is plain (optionally weighted) cross entropy
        - alpha: optional per-class weights indexed by class id; a 1-D tensor or a sequence of floats
        - reduction: 'mean' or 'sum'; any other value returns the unreduced per-sample losses
        """
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction

    def forward(self, inputs, targets):
        """
        Parameters:
        - inputs: logits, a tensor of size (batch_size, num_classes)
        - targets: integer class labels, one per sample

        Returns:
        - a scalar tensor for 'mean'/'sum', otherwise a tensor with one loss per sample
        """
        # Per-sample cross entropy; reduction is applied only at the very end.
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')

        # CE = -log(p_t), so exp(-CE) recovers the probability of the true class.
        pt = torch.exp(-ce_loss)

        # Down-weight samples the model already classifies confidently.
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss

        if self.alpha is not None:
            # Accept lists as well as tensors, and make sure the weights live on the
            # same device (and use the same dtype) as the loss before indexing.
            class_weights = torch.as_tensor(
                self.alpha, dtype=focal_loss.dtype, device=focal_loss.device
            )
            alpha_t = class_weights.gather(0, targets.reshape(-1))
            focal_loss = alpha_t * focal_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        if self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss

In [None]:
class FocalLossTrainer(Trainer):
    """Trainer that optimises a class-weighted focal loss instead of the model's own loss.
    """
    def __init__(self, *args, fl_alpha=None, **kwargs):
        """
        Parameters:
        - fl_alpha: per-class focal loss weights indexed by class id;
          defaults to [0.5, 0.5] (equal weights for two classes)
        - *args, **kwargs: forwarded unchanged to Trainer
        """
        super().__init__(*args, **kwargs)
        self.fl_alpha = fl_alpha if fl_alpha is not None else [0.5,0.5]

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Parameters:
        - model: model used to compute loss
        - inputs: a dictionary containing the batch data including keys "input_ids", "attention_mask", and "labels"
        - return_outputs: return outputs or not
        - num_items_in_batch: accepted so the method can be called by Trainer versions
          that pass this keyword; it is not used because the focal loss is a batch mean

        Returns:
        - the loss, or a (loss, outputs) tuple when return_outputs is True
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Calculate focal loss with the class weights on the same device as the logits.
        class_weights = torch.tensor(self.fl_alpha, device=logits.device)
        loss_fct = FocalLoss(gamma=2.0, alpha=class_weights)
        loss = loss_fct(logits, labels)

        # This method is also used for evaluation batches, so only record the value
        # under "train_loss" while the model is in training mode. self.log() already
        # appends the entry to self.state.log_history, so no manual append is needed.
        if model.training:
            self.log({"train_loss": loss.item()})

        return (loss, outputs) if return_outputs else loss

### Define evaluation metrics

In [None]:
accuracy_m = evaluate.load('accuracy')
def compute_metrics(eval_pred):
    """Turn the model's evaluation output into accuracy, weighted F1 and AUC.

    Parameters:
    - eval_pred: a (logits, labels) pair

    Returns:
    - a dictionary with the keys "accuracy", "f1" and "auc"; "auc" is None
      when the AUC computation raises a ValueError
    """
    logits, labels = eval_pred

    # Class probabilities are only needed for the AUC; the hard predictions
    # come straight from the logits.
    class_probs = softmax(logits, axis=-1)
    predicted = np.argmax(logits, axis=-1)

    weighted_f1 = f1_score(labels, predicted, average="weighted")
    accuracy_result = accuracy_m.compute(predictions=predicted, references=labels)

    try:
        if class_probs.shape[1] == 2:
            # Two classes: score with the probability of class 1.
            auc_value = roc_auc_score(labels, class_probs[:, 1])
        else:
            # More than two classes: one-vs-rest AUC, weighted by class support.
            auc_value = roc_auc_score(labels, class_probs, multi_class='ovr', average='weighted')
    except ValueError:
        # e.g. only one class is present in `labels`
        auc_value = None

    return {
        "accuracy": accuracy_result['accuracy'],
        "f1": weighted_f1,
        "auc": auc_value
    }



### Define training arguments and trainer

In [None]:
# Training configuration for the LoRA run; outputs are written below source_dir.
# NOTE(review): `evaluation_strategy` is spelled `eval_strategy` in newer transformers
# releases — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=os.path.join(source_dir, 'twitter_raid_data/results_LoRA_e5'),
    overwrite_output_dir=True,
    run_name='LoRA-E5',
    save_strategy="epoch",
    logging_strategy="steps",  # Log by step count (every `logging_steps` steps)
    logging_steps=10,
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    #learning_rate=2e-5,  (commented out, so the TrainingArguments default learning rate is used)
    per_device_train_batch_size=10,  # Training batch size per device
    per_device_eval_batch_size=1,  # Evaluation batch size per device; NOTE(review): 1 means one forward pass per example
    group_by_length=True, # group the tokenized text by length
    num_train_epochs=3
)

In [None]:
# Compute the class frequencies of the training split; they are used to derive the
# per-class weight (alpha) of the focal loss.
# Labels are 0/1, so the mean of the label column is the share of class 1.
# The column is read once instead of four times.
train_labels = dataset['train']['labels']
positive_share = sum(train_labels) / len(train_labels)
freq = [1 - positive_share, positive_share]

In [None]:
# Define the LoRA trainer.
model_name = "intfloat/e5-small"
# Tokenization used padding=False, so padding is left to this collator when batches are built.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
LoRA_trainer = FocalLossTrainer(
    model=e5_model,
    # Each class is weighted by (1 - its own frequency), so the rarer class gets the larger weight.
    fl_alpha = [1-x for x in freq],
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    # The held-out 20% split serves as the evaluation set.
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

## Training

In [None]:
# Weights & Biases settings, passed through environment variables.
# NOTE(review): WANDB_DISABLED is set to "false" (i.e. not disabled) together with
# WANDB_MODE="dryrun" — presumably to record runs locally without uploading; confirm the intent.
os.environ["WANDB_DISABLED"] = "false"
os.environ["WANDB_MODE"] = "dryrun"

In [None]:
LoRA_trainer.train()  ## Performance drops in the third epoch, so the checkpoint from the second epoch may be the better final model.

Epoch,Training Loss,Validation Loss,Accuracy,F1,Auc
1,0.0091,0.064852,{'accuracy': 0.8407675438596491},0.831234,0.965121
2,0.0073,0.044413,{'accuracy': 0.8903070175438597},0.887466,0.976069
3,0.0075,0.053699,{'accuracy': 0.8796271929824562},0.875316,0.976018


Trainer is attempting to log a value of "{'accuracy': 0.8407675438596491}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.8903070175438597}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.8796271929824562}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


TrainOutput(global_step=54720, training_loss=0.024202585291785966, metrics={'train_runtime': 5796.4135, 'train_samples_per_second': 94.403, 'train_steps_per_second': 9.44, 'total_flos': 2.1538237462044e+16, 'train_loss': 0.024202585291785966, 'epoch': 3.0})

# Raw Model Training and Evaluation

## Freeze all layers except the output layer

In [None]:
# Reload the raw model so the baseline starts from the original checkpoint,
# not from the object that was wrapped with LoRA above.
model_name = "intfloat/e5-small"
raw_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2) # the classification head is not in the checkpoint, so its weights are randomly initialized on load

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at intfloat/e5-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Baseline: freeze every parameter outside the classification head so that
# only the output layer is trained.
for param_name, weights in raw_model.named_parameters():
    if "classifier" in param_name:
        continue
    weights.requires_grad = False

In [None]:
# Check the number of trainable parameters: only the output (classifier) layer should remain trainable.
print_trainable_parameters(raw_model)

trainable params: 770 || all params: 33360770 || trainable%: 0.0023081002027231386


## Define training arguments and trainer



In [None]:
# Training configuration for the frozen-encoder baseline; same settings as the LoRA run
# except for the output directory and run name.
# NOTE(review): `evaluation_strategy` is spelled `eval_strategy` in newer transformers
# releases — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=os.path.join(source_dir, 'twitter_raid_data/results_raw_e5'),
    overwrite_output_dir=True,
    run_name='raw-E5',
    save_strategy="epoch",
    logging_strategy="steps",  # Log by step count (every `logging_steps` steps)
    logging_steps=10,
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    #learning_rate=2e-5,  (commented out, so the TrainingArguments default learning rate is used)
    per_device_train_batch_size=10,  # Training batch size per device
    per_device_eval_batch_size=1,  # Evaluation batch size per device; NOTE(review): 1 means one forward pass per example
    group_by_length=True,  # group the tokenized text by length
    num_train_epochs=3
)



In [None]:
# Define the trainer for the frozen-encoder baseline.
# Tokenization used padding=False, so padding is left to this collator when batches are built.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Class frequencies of the training split. Labels are 0/1, so the mean of the label
# column is the share of class 1. The column is read once instead of four times.
train_labels = dataset['train']['labels']
positive_share = sum(train_labels) / len(train_labels)
freq = [1 - positive_share, positive_share]

raw_trainer = FocalLossTrainer(
    model=raw_model,
    # Each class is weighted by (1 - its own frequency), so the rarer class gets the larger weight.
    fl_alpha = [1-x for x in freq],
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

## Training

In [None]:
# Weights & Biases settings, passed through environment variables.
# NOTE(review): WANDB_DISABLED is set to "false" (i.e. not disabled) together with
# WANDB_MODE="dryrun" — presumably to record runs locally without uploading; confirm the intent.
os.environ["WANDB_DISABLED"] = "false"
os.environ["WANDB_MODE"] = "dryrun"

In [None]:
# Train the baseline in which only the classifier layer is trainable.
raw_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Auc
1,0.075,0.080048,{'accuracy': 0.6316008771929824},0.635702,0.691894
2,0.0738,0.079073,{'accuracy': 0.6517105263157895},0.65273,0.696835
3,0.0732,0.078788,{'accuracy': 0.6485745614035088},0.651111,0.69827


Trainer is attempting to log a value of "{'accuracy': 0.6316008771929824}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.6517105263157895}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.6485745614035088}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


TrainOutput(global_step=54720, training_loss=0.08023347747212614, metrics={'train_runtime': 3122.7747, 'train_samples_per_second': 175.229, 'train_steps_per_second': 17.523, 'total_flos': 2.131758110278776e+16, 'train_loss': 0.08023347747212614, 'epoch': 3.0})