In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import torch

seed = 2024
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [None]:
## remove randomness
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.use_deterministic_algorithms(False)

## LoRA model with twitter dataset

### Prepare data

In [5]:
import os
import pandas as pd

In [6]:
source_dir = '/content/gdrive/MyDrive/Colab_Notebooks/ML-LoRA-E5/'

In [None]:
twitter_rewritten = pd.read_csv(os.path.join(source_dir, "twitter_data/merged_output.csv"))
twitter_rewritten = twitter_rewritten.drop_duplicates().reset_index(drop=True)

In [None]:
twitter_rewritten['labels'] = 1
twitter_rewritten.columns = ['text', 'labels']
display(twitter_rewritten)

Unnamed: 0,text,labels
0,Presenting the most significant analyst recomm...,1
1,Wells Fargo suggests purchasing Las Vegas Sand...,1
2,Piper Sandler has adjusted their recommendatio...,1
3,Analysts provide insights on Tesla's recent ea...,1
4,Analysts predict that Netflix and its competit...,1
...,...,...
89722,It evokes memories of Arnab Goswami conducting...,1
89723,When various BJP representatives assert that t...,1
89724,"Modi is symbolically positioned as the leader,...",1
89725,Check out what Forbes is saying: the nation sh...,1


In [None]:
twitter_1 = pd.read_csv(os.path.join(source_dir, "twitter_data/twitter_dataset.csv"))
twitter_1['labels'] = 0
twitter_1 = twitter_1[['Text','labels']]
twitter_1 = twitter_1.drop_duplicates().reset_index(drop=True)
twitter_1.columns = ['text', 'labels']
display(twitter_1)

Unnamed: 0,text,labels
0,Party least receive say or single. Prevent pre...,0
1,Hotel still Congress may member staff. Media d...,0
2,Nice be her debate industry that year. Film wh...,0
3,Laugh explain situation career occur serious. ...,0
4,Involve sense former often approach government...,0
...,...,...
9995,Agree reflect military box ability ever hold. ...,0
9996,Born which push still. Degree sometimes contro...,0
9997,You day agent likely region. Teacher data mess...,0
9998,Guess without successful save. Particular natu...,0


In [None]:
twitter_2 = pd.read_csv(os.path.join(source_dir, "twitter_data/Twitter_Data.csv"))
twitter_2['labels'] = 0
twitter_2 = twitter_2[['clean_text','labels']]
twitter_2 = twitter_2.drop_duplicates().reset_index(drop=True)
twitter_2.columns = ['text', 'labels']
display(twitter_2)

Unnamed: 0,text,labels
0,when modi promised ‚Äúminimum government maximum...,0
1,talk all the nonsense and continue all the dra...,0
2,what did just say vote for modi welcome bjp t...,0
3,asking his supporters prefix chowkidar their n...,0
4,answer who among these the most powerful world...,0
...,...,...
162972,why these 456 crores paid neerav modi not reco...,0
162973,dear rss terrorist payal gawar what about modi...,0
162974,did you cover her interaction forum where she ...,0
162975,there big project came into india modi dream p...,0


In [None]:
twitter_f1 = pd.read_csv(os.path.join(source_dir, "twitter_data/financial_train_data.csv"))
twitter_f2 = pd.read_csv(os.path.join(source_dir, "twitter_data/financial_valid_data.csv"))
twitter_3 = pd.concat([twitter_f1, twitter_f2], axis=0, ignore_index=True)
twitter_3['labels'] = 0
twitter_3 = twitter_3[['text','labels']]
twitter_3 = twitter_3.drop_duplicates().reset_index(drop=True)
twitter_3.columns = ['text', 'labels']
display(twitter_2)

Unnamed: 0,text,labels
0,when modi promised ‚Äúminimum government maximum...,0
1,talk all the nonsense and continue all the dra...,0
2,what did just say vote for modi welcome bjp t...,0
3,asking his supporters prefix chowkidar their n...,0
4,answer who among these the most powerful world...,0
...,...,...
162972,why these 456 crores paid neerav modi not reco...,0
162973,dear rss terrorist payal gawar what about modi...,0
162974,did you cover her interaction forum where she ...,0
162975,there big project came into india modi dream p...,0


In [None]:
df = pd.concat([twitter_rewritten, twitter_1, twitter_2, twitter_3], axis=0, ignore_index=True)
df = df.sample(frac=1).reset_index(drop=True)
print(df['labels'].value_counts())
df.to_csv(os.path.join(source_dir, 'twitter_data/twitter_processed.csv'), index=False)

labels
0    194084
1     89727
Name: count, dtype: int64


### Train data with E5 model with LoRA

In [7]:
!pip install evaluate datasets

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚î

In [8]:
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, AutoModelForSequenceClassification
import evaluate
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_dataset

import torch
import pandas as pd
import numpy as np

import os

In [9]:
dataset = load_dataset('csv', data_files=os.path.join(source_dir, 'twitter_data/twitter_processed.csv'))

def is_valid_text(example):
    return example['text'] is not None and example['labels'] is not None

dataset = dataset["train"].filter(is_valid_text) # drop rows with missing value
display(dataset)
dataset = dataset.class_encode_column('labels')

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/283811 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'labels'],
    num_rows: 283810
})

Stringifying the column:   0%|          | 0/283810 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/283810 [00:00<?, ? examples/s]

In [10]:
dataset = dataset.train_test_split(test_size=0.2, stratify_by_column='labels')
print("Train label distribution:", dataset["train"].to_pandas()["labels"].value_counts())
print("Test label distribution:", dataset["test"].to_pandas()["labels"].value_counts())

Train label distribution: labels
0    155266
1     71782
Name: count, dtype: int64
Test label distribution: labels
0    38817
1    17945
Name: count, dtype: int64


In [11]:
def tokenize_function(examples):
    try:
        return tokenizer(examples["text"], max_length=512, truncation=True)
    except Exception as e:
        print("Error during tokenization:", e)
        print("Offending examples:", examples["text"])
        raise e

tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-small")
tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/362 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



Map:   0%|          | 0/227048 [00:00<?, ? examples/s]

Map:   0%|          | 0/56762 [00:00<?, ? examples/s]

In [12]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [13]:
e_model = AutoModelForSequenceClassification.from_pretrained("intfloat/e5-small", num_labels=2)

#for name, module in e_model.named_modules():
#    print(name)
r = 8
lora_config_e5 = LoraConfig(task_type=TaskType.SEQ_CLS,
                         r=r,               # Low-rank adaptation rank
                         lora_alpha=2*r,     # Scaling factor
                         lora_dropout=0.1,  # Dropout for LoRA
                         target_modules = ['attention.self.query', 'attention.self.key', 'attention.self.value']
                         )
e5_model = get_peft_model(e_model, lora_config_e5)
print_trainable_parameters(e5_model)

config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at intfloat/e5-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 221954 || all params: 33582724 || trainable%: 0.6609172025473574


#### Raw evaluation

In [14]:
from torch.utils.data import DataLoader

In [None]:
## functions to calculate accuracy and F1 score with a given model on a given test dataset
def tokenize_data(example, tokenizer):
    return tokenizer(example['text'], truncation=True, max_length=512)

def custom_collate_fn(features):
    # Remove the 'text' and 'labels' fields to prevent errors
    filtered_features = [
        {k: v for k, v in feature.items() if k in ['input_ids', 'attention_mask', 'token_type_ids']}
        for feature in features
    ]
    return data_collator(filtered_features)

def inference_model(model_name, test_dataset):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    raw_model = AutoModelForSequenceClassification.from_pretrained(model_name)
    raw_model.to(device)
    raw_model.eval()
    for module in raw_model.modules():
        if isinstance(module, (torch.nn.Dropout, torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.LayerNorm)):
            module.eval()
    #data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    test_data = test_dataset.map(lambda x: tokenize_data(x, tokenizer), batched=True)
    dataloader = DataLoader(test_data, batch_size=1, collate_fn=custom_collate_fn, shuffle=False)
    predictions = []
    with torch.no_grad():
      for batch in dataloader:
          batch = {k: v.to(raw_model.device) for k, v in batch.items()}
          outputs = raw_model(**batch)
          logits = outputs.logits
          batch_predictions = logits.argmax(dim=-1)
          predictions.extend(batch_predictions.cpu().numpy())
    true_labels = test_data["labels"]
    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average="weighted")
    return accuracy, f1

In [16]:
accuracy_m = evaluate.load('accuracy')
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1 = f1_score(labels, predictions, average="weighted")
    accuracy = accuracy_m.compute(predictions=predictions, references=labels)
    return {
        "accuracy": accuracy,
        "f1": f1
    }

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
inference_model("intfloat/e5-small", dataset['test'])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at intfloat/e5-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/56762 [00:00<?, ? examples/s]

(0.6838553962157782, 0.5554612397055391)

#### LoRA training

In [17]:
import torch.nn.functional as F

class FocalLoss(torch.nn.Module):
    def __init__(self, gamma=2.0, alpha=None, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction

    def forward(self, inputs, targets):
        # Compute standard cross entropy
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')

        # Compute the probability of each class for the targets
        pt = torch.exp(-ce_loss)

        # Apply focal loss formula
        focal_loss = (1 - pt) ** self.gamma * ce_loss

        # Apply class weights if provided
        if self.alpha is not None:
            alpha_t = self.alpha.gather(0, targets.data.view(-1))
            focal_loss = alpha_t * focal_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss


In [18]:
class FocalLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Calculate focal loss
        loss_fct = FocalLoss(gamma=2.0, alpha=torch.tensor([0.33, 0.67]).to(logits.device))
        loss = loss_fct(logits, labels)

        self.log({"train_loss": loss.item()})
        self.state.log_history.append({"train_loss": loss.item()})

        return (loss, outputs) if return_outputs else loss

In [19]:
training_args = TrainingArguments(
    output_dir=os.path.join(source_dir, 'twitter_data/results_LoRA_e5'),
    overwrite_output_dir=True,
    run_name='LoRA-E5-no-filter',
    save_strategy="epoch",
    logging_strategy="steps",  # Ensure logging happens at each step
    logging_steps=10,
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    #learning_rate=2e-5,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=1,
    group_by_length=True,
    num_train_epochs=3
)

from transformers import TrainerCallback

class LogLossCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        # Log the current training loss
        if state.global_step % args.logging_steps == 0:
            print(f"Step {state.global_step}: loss = {state.log_history[-1]['loss']}")

trainer = FocalLossTrainer(
    model=e5_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # Pass the compute_metrics function
)




In [20]:
os.environ["WANDB_DISABLED"] = "false"
os.environ["WANDB_MODE"] = "dryrun"

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0007,0.002068,{'accuracy': 0.99309397131884},0.993109


Trainer is attempting to log a value of "{'accuracy': 0.99309397131884}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


KeyboardInterrupt: 

#### Read checkpoint to continue training

In [21]:
checkpoint_path = os.path.join(source_dir, 'twitter_data/results_LoRA_e5/checkpoint-22705')
e5_model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at intfloat/e5-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
trainer.train(resume_from_checkpoint=checkpoint_path)

  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


  checkpoint_rng_state = torch.load(rng_file)


Epoch,Training Loss,Validation Loss,Accuracy,F1
2,0.0067,0.001403,{'accuracy': 0.99545470561291},0.995462
3,0.0006,0.001343,{'accuracy': 0.9960008456361651},0.996006


Trainer is attempting to log a value of "{'accuracy': 0.99545470561291}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9960008456361651}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


TrainOutput(global_step=68115, training_loss=0.0008087576239607976, metrics={'train_runtime': 4184.4381, 'train_samples_per_second': 162.78, 'train_steps_per_second': 16.278, 'total_flos': 3101678113315200.0, 'train_loss': 0.0008087576239607976, 'epoch': 3.0})