In [None]:
!pip -q install "pyarrow>=14,<20"

!pip -q install \
  "transformers==4.44.2" \
  "datasets==2.19.1" \
  "evaluate==0.4.2" \
  "peft==0.11.1" \
  "accelerate==0.33.0" \
  "huggingface-hub>=0.23,<0.25" \
  "tokenizers>=0.19,<0.21" \
  "safetensors>=0.4" \
  --no-warn-conflicts


In [None]:
import os, time, random, numpy as np, torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, TrainingArguments, Trainer)

#Repro & device ---
def set_seed(seed=42):
    random.seed(seed); np.random.seed(seed)
    torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
set_seed(42)
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

#Ensure PEFT is available (for LoRA) ---
try:
    from peft import LoraConfig, get_peft_model, TaskType
except Exception as e:
    print("Installing peft ...")
    !pip -q install "peft==0.11.1" --no-deps
    from peft import LoraConfig, get_peft_model, TaskType

#Load dataset (NO DOWNSAMPLING) ---
dataset = load_dataset("armanc/pubmed-rct20k")
dataset = dataset.class_encode_column("label")

print(dataset)
feat = dataset["train"].features["label"]
label_names = list(feat.names)
label2id = {n:i for i,n in enumerate(label_names)}
id2label = {i:n for i,n in enumerate(label_names)}
num_labels = len(label_names)
print("Labels:", label_names)

#Tokenizer & preprocessing (dynamic padding) ---
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
max_length = 128

def preprocess(batch):
    enc = tokenizer(batch["text"], truncation=True, max_length=max_length)
    enc["labels"] = batch["label"]
    return enc

encoded = dataset.map(
    preprocess, batched=True, remove_columns=dataset["train"].column_names
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#Metrics using sklearn ---
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro"),
    }

def count_trainable(m): return sum(p.numel() for p in m.parameters() if p.requires_grad)
def count_all(m): return sum(p.numel() for p in m.parameters())

2025-10-15 06:40:51.889466: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760510452.095613     107 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760510452.156409     107 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Device: cuda


Downloading readme:   0%|          | 0.00/646 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading metadata: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/40.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.83M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.79M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/176642 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/29672 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/29578 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/176642 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/29672 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/29578 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['abstract_id', 'label', 'text', 'sentence_id'],
        num_rows: 176642
    })
    validation: Dataset({
        features: ['abstract_id', 'label', 'text', 'sentence_id'],
        num_rows: 29672
    })
    test: Dataset({
        features: ['abstract_id', 'label', 'text', 'sentence_id'],
        num_rows: 29578
    })
})
Labels: ['background', 'conclusions', 'methods', 'objective', 'results']


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Map:   0%|          | 0/176642 [00:00<?, ? examples/s]

Map:   0%|          | 0/29672 [00:00<?, ? examples/s]

Map:   0%|          | 0/29578 [00:00<?, ? examples/s]

In [None]:
# =========================
# Full Fine-tuning (DistilBERT)
# =========================
set_seed(42)
model_ft = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels, id2label=id2label, label2id=label2id
).to(device)

args_ft = TrainingArguments(
    output_dir="/kaggle/working/distilbert_ft_pubmed20k",
    save_strategy="no",          
    evaluation_strategy="no",    
    logging_steps=200,           
    load_best_model_at_end=False,
    save_total_limit=1,          
    save_safetensors=True,
    learning_rate=3e-5,                 
    per_device_train_batch_size=16,     
    per_device_eval_batch_size=32,
    num_train_epochs=2,                 
    #logging_steps=200,
    fp16=(device=="cuda"),
    report_to="none",
    seed=42,
    dataloader_num_workers=2,
    
)

trainer_ft = Trainer(
    model=model_ft,
    args=args_ft,
    train_dataset=encoded["train"],
    eval_dataset=encoded["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

t0 = time.time()
trainer_ft.train()
ft_time_min = round((time.time() - t0)/60, 2)

ft_val = trainer_ft.evaluate(encoded["validation"])
ft_test = trainer_ft.evaluate(encoded["test"])
print("\n[Full FT] Val:", ft_val, " Test:", ft_test, " Time(min):", ft_time_min)

preds_ft = trainer_ft.predict(encoded["test"])
y_true_ft = preds_ft.label_ids
y_pred_ft = np.argmax(preds_ft.predictions, axis=-1)
print("\n[Full FT] Confusion matrix:\n", confusion_matrix(y_true_ft, y_pred_ft))
print("\n[Full FT] Classification report:\n",
      classification_report(y_true_ft, y_pred_ft, target_names=label_names, digits=4))

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
200,0.7521
400,0.5284
600,0.4911
800,0.4479
1000,0.4611
1200,0.4654
1400,0.4512
1600,0.4461
1800,0.4584
2000,0.4444


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[Full FT] Val: {'eval_loss': 0.36123350262641907, 'eval_accuracy': 0.87712321380426, 'eval_f1_macro': 0.8166316859077194, 'eval_runtime': 41.8575, 'eval_samples_per_second': 708.882, 'eval_steps_per_second': 22.17, 'epoch': 2.0}  Test: {'eval_loss': 0.3902270197868347, 'eval_accuracy': 0.8673676381094056, 'eval_f1_macro': 0.8061397801144896, 'eval_runtime': 42.4731, 'eval_samples_per_second': 696.394, 'eval_steps_per_second': 21.779, 'epoch': 2.0}  Time(min): 27.53


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[Full FT] Confusion matrix:
 [[2332  220  121  390   14]
 [ 334 3745   53    5  434]
 [  86   56 9379   53  310]
 [ 834   57  102 1329   11]
 [  28  336  475    4 8870]]

[Full FT] Classification report:
               precision    recall  f1-score   support

  background     0.6453    0.7579    0.6971      3077
 conclusions     0.8484    0.8193    0.8336      4571
     methods     0.9259    0.9489    0.9372      9884
   objective     0.7462    0.5697    0.6461      2333
     results     0.9202    0.9132    0.9167      9713

    accuracy                         0.8674     29578
   macro avg     0.8172    0.8018    0.8061     29578
weighted avg     0.8687    0.8674    0.8665     29578



In [3]:
trainer_ft.evaluate(encoded["validation"])
trainer_ft.evaluate(encoded["test"])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{'eval_loss': 0.3902270197868347,
 'eval_accuracy': 0.8673676381094056,
 'eval_f1_macro': 0.8061397801144896,
 'eval_runtime': 42.4558,
 'eval_samples_per_second': 696.677,
 'eval_steps_per_second': 21.787,
 'epoch': 2.0}

In [None]:
# =========================
# LoRA (PEFT) on DistilBERT
# =========================
set_seed(42)
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels, id2label=id2label, label2id=label2id
)

peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8, lora_alpha=16, lora_dropout=0.1,
    target_modules=["q_lin","v_lin"]
)
model_lora = get_peft_model(base_model, peft_config).to(device)
model_lora.print_trainable_parameters()

args_lora = TrainingArguments(
    output_dir="/kaggle/working/distilbert_lora_pubmed20k",
    save_strategy="no",
    evaluation_strategy="no",
    logging_steps=200,
    load_best_model_at_end=False,
    save_total_limit=1,
    save_safetensors=True,
    learning_rate=2e-4,                 
    per_device_train_batch_size=32,     
    per_device_eval_batch_size=64,
    num_train_epochs=2,
    #logging_steps=200,
    fp16=(device=="cuda"),
    report_to="none",
    seed=42,
    dataloader_num_workers=2,
)

trainer_lora = Trainer(
    model=model_lora,
    args=args_lora,
    train_dataset=encoded["train"],
    eval_dataset=encoded["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

t0 = time.time()
trainer_lora.train()
lora_time_min = round((time.time() - t0)/60, 2)

lora_val = trainer_lora.evaluate(encoded["validation"])
lora_test = trainer_lora.evaluate(encoded["test"])
print("\n[LoRA] Val:", lora_val, " Test:", lora_test, " Time(min):", lora_time_min)

preds_lora = trainer_lora.predict(encoded["test"])
y_true_l = preds_lora.label_ids
y_pred_l = np.argmax(preds_lora.predictions, axis=-1)
print("\n[LoRA] Confusion matrix:\n", confusion_matrix(y_true_l, y_pred_l))
print("\n[LoRA] Classification report:\n",
      classification_report(y_true_l, y_pred_l, target_names=label_names, digits=4))

# --- (F) Summary ---
def g(d, k): return round(float(d[k]), 4) if k in d else None
summary = {
    "FT_acc_val": g(ft_val, "eval_accuracy"),
    "FT_f1_val": g(ft_val, "eval_f1_macro"),
    "FT_acc_test": g(ft_test, "eval_accuracy"),
    "FT_f1_test": g(ft_test, "eval_f1_macro"),
    "FT_params_all": sum(p.numel() for p in model_ft.parameters()),
    "FT_params_trainable": sum(p.numel() for p in model_ft.parameters() if p.requires_grad),
    "FT_time_min": ft_time_min,

    "LoRA_acc_val": g(lora_val, "eval_accuracy"),
    "LoRA_f1_val": g(lora_val, "eval_f1_macro"),
    "LoRA_acc_test": g(lora_test, "eval_accuracy"),
    "LoRA_f1_test": g(lora_test, "eval_f1_macro"),
    "LoRA_params_all": sum(p.numel() for p in model_lora.parameters()),
    "LoRA_params_trainable": sum(p.numel() for p in model_lora.parameters() if p.requires_grad),
    "LoRA_time_min": lora_time_min,
}
print("\n=== Summary ===\n", summary)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


trainable params: 741,893 || all params: 67,699,210 || trainable%: 1.0959


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
200,0.7681
400,0.5227
600,0.4911
800,0.4851
1000,0.4844
1200,0.4676
1400,0.4493
1600,0.445
1800,0.4439
2000,0.4585


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[LoRA] Val: {'eval_loss': 0.3639136552810669, 'eval_accuracy': 0.8661701267187921, 'eval_f1_macro': 0.803980412297809, 'eval_runtime': 47.4867, 'eval_samples_per_second': 624.848, 'eval_steps_per_second': 9.771, 'epoch': 2.0}  Test: {'eval_loss': 0.39251387119293213, 'eval_accuracy': 0.8576982892690513, 'eval_f1_macro': 0.7962485806841789, 'eval_runtime': 47.0215, 'eval_samples_per_second': 629.031, 'eval_steps_per_second': 9.847, 'epoch': 2.0}  Time(min): 18.77


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[LoRA] Confusion matrix:
 [[2302  266  139  345   25]
 [ 335 3649   68    4  515]
 [ 105   61 9324   52  342]
 [ 851   67  109 1294   12]
 [  31  321  557    4 8800]]

[LoRA] Classification report:
               precision    recall  f1-score   support

  background     0.6352    0.7481    0.6871      3077
 conclusions     0.8362    0.7983    0.8168      4571
     methods     0.9144    0.9433    0.9286      9884
   objective     0.7616    0.5547    0.6419      2333
     results     0.9078    0.9060    0.9069      9713

    accuracy                         0.8577     29578
   macro avg     0.8110    0.7901    0.7962     29578
weighted avg     0.8590    0.8577    0.8565     29578


=== Summary ===
 {'FT_acc_val': 0.8771, 'FT_f1_val': 0.8166, 'FT_acc_test': 0.8674, 'FT_f1_test': 0.8061, 'FT_params_all': 66957317, 'FT_params_trainable': 66957317, 'FT_time_min': 27.53, 'LoRA_acc_val': 0.8662, 'LoRA_f1_val': 0.804, 'LoRA_acc_test': 0.8577, 'LoRA_f1_test': 0.7962, 'LoRA_params_all': 67699210