In [1]:
pip install pandas numpy datasets transformers scikit-learn torch bitsandbytes accelerate



In [2]:
pip install peft accelerate transformers datasets bitsandbytes



In [3]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig
)
from sklearn.metrics import accuracy_score, f1_score
import torch  # Required for quantization dtypes

In [4]:
!pip install -U bitsandbytes>=0.41.1
!pip install -U accelerate transformers datasets



In [5]:
LABEL_NAMES = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring",
    "confusion", "curiosity", "desire", "disappointment", "disapproval",
    "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief",
    "joy", "love", "nervousness", "optimism", "pride", "realization",
    "relief", "remorse", "sadness", "surprise", "neutral"
]
NUM_LABELS = len(LABEL_NAMES)

In [7]:
# Load parquet file
#df = pd.read_parquet("train-00000-of-00001.parquet")
from datasets import load_dataset

# Load the GoEmotions dataset
dataset = load_dataset("go_emotions")

# If you specifically want the train split
train_dataset = load_dataset("go_emotions", split="train")

# Convert to pandas DataFrame if needed
df = train_dataset.to_pandas()
# Convert labels to multi-hot vectors
def encode_labels(example):
    multi_hot = np.zeros(NUM_LABELS)
    multi_hot[example['labels']] = 1  # Set 1s for active labels
    return {'labels': multi_hot.tolist()}

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df).map(encode_labels)

# Split into train/test (90%/10%)
dataset = dataset.train_test_split(test_size=0.1)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/9.40k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/350k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

In [8]:
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,               # 4-bit quantization
    bnb_4bit_quant_type="nf4",       # Normal Float 4 quantization
    bnb_4bit_use_double_quant=True,  # Second quantization for efficiency
    bnb_4bit_compute_dtype=torch.float16  # Computation in float16
)

In [9]:
try:
    from transformers import BitsAndBytesConfig
    import bitsandbytes as bnb
    print(f"bitsandbytes version: {bnb.__version__}")  # Should be ≥0.41.1
except ImportError:
    raise ImportError(
        "4-bit quantization requires bitsandbytes. Install with: "
        "`pip install -U bitsandbytes>=0.41.1`"
    )

# Proceed with quantization config
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

bitsandbytes version: 0.45.5


In [10]:
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer
)

# Initialize 4-bit quantized model
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification",
    quantization_config=quant_config,
    device_map="auto"
)

# Add LoRA adapters
peft_config = LoraConfig(
    r=16,  # Rank
    lora_alpha=32,
    target_modules=["query", "value"],  # For BERT
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # Should show ~0.1-1% of parameters

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 611,356 || all params: 110,115,128 || trainable%: 0.5552


In [11]:
print(model)  # Should show "4-bit" in the model description
print(next(model.parameters()).dtype)  # Should show torch.float16

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSdpaSelfAttention(
                  (query): lora.Linear4bit(
                    (base_layer): Linear4bit(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                     

In [12]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

tokenized_data = dataset.map(tokenize_function, batched=True)
tokenized_data.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/39069 [00:00<?, ? examples/s]

Map:   0%|          | 0/4341 [00:00<?, ? examples/s]

In [13]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./lora_bert_output",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=1e-3,
    num_train_epochs=3,
    eval_strategy="epoch",  # ← Changed from evaluation_strategy
    save_strategy="epoch",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    fp16=True,
    report_to="none",
)

In [14]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    logits = pred.predictions
    labels = pred.label_ids
    preds = (logits > 0).astype(int)  # Multi-label threshold
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_micro": f1_score(labels, preds, average="micro"),
        "f1_macro": f1_score(labels, preds, average="macro")
    }

In [15]:
def convert_labels_to_float(example):
    example["labels"] = example["labels"].float()
    return example

tokenized_data = tokenized_data.map(convert_labels_to_float)

Map:   0%|          | 0/39069 [00:00<?, ? examples/s]

Map:   0%|          | 0/4341 [00:00<?, ? examples/s]

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    compute_metrics=compute_metrics
)

trainer.train()

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


RuntimeError: result type Float can't be cast to the desired output type Long