# Library Install and Load

In [9]:
!pip install datasets transformers torch peft accelerate scikit-learn

from google.colab import drive
drive.mount('/content/drive')

from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from peft import get_peft_model, LoraConfig, TaskType
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import time
import gc
import os
import json
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

import wandb
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"
try:
    wandb.finish()
except:
    pass

print(f"GPU: {torch.cuda.is_available()}")

project_folder = '/content/drive/MyDrive/KMU/NLP'
os.makedirs(project_folder, exist_ok=True)
print(f"Project Folder: {project_folder}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
GPU: True
Project Folder: /content/drive/MyDrive/KMU/NLP


In [4]:
!pip install --upgrade --force-reinstall datasets transformers huggingface_hub fsspec

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.33.0-py3-none-any.whl.metadata (14 kB)
Collecting fsspec
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Collecting filelock (from datasets)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting numpy>=1.17 (from datasets)
  Downloading numpy-2.3.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-20.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.3.0-cp31

# Download Dataset and Preprocessing

In [3]:
print("Start Download Dataset")
print("=" * 50)

### Downlaod SST2
sst2_dataset = load_dataset("glue", "sst2", download_mode="force_redownload")

print("Complete loading SST2")
print(f"Train: {len(sst2_dataset['train'])}")
print(f"Valid: {len(sst2_dataset['validation'])}")

print("\nSST2 Sample:")
for i in range(2):
    sample = sst2_dataset['train'][i]
    label_text = "positive" if sample['label'] == 1 else "negative"
    print(f"Text: {sample['sentence']}")
    print(f"Label: {sample['label']} ({label_text})")
    print()


### Downlaod AG News
ag_news_dataset = load_dataset("ag_news")

print("Complete loading AG News")
print(f"Train: {len(ag_news_dataset['train'])}")
print(f"Test: {len(ag_news_dataset['test'])}")

train_data = ag_news_dataset['train']
df = pd.DataFrame({'text': train_data['text'], 'label': train_data['label']})

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'], df['label'], test_size=0.1, random_state=42, stratify=df['label']
)

# Reconstructing the AG News dataset
ag_news_split = DatasetDict({
    'train': Dataset.from_dict({'text': train_texts.tolist(), 'label': train_labels.tolist()}),
    'validation': Dataset.from_dict({'text': val_texts.tolist(), 'label': val_labels.tolist()}),
    'test': ag_news_dataset['test']
})

print(f"After Split - Train: {len(ag_news_split['train'])}, Valid: {len(ag_news_split['validation'])}")

ag_news_labels = ['World', 'Sports', 'Business', 'Sci/Tech']
print("\nAG News Sample:")
for i in range(2):
    sample = ag_news_split['train'][i]
    print(f"Text: {sample['text'][:80]}...")
    print(f"Label: {sample['label']} ({ag_news_labels[sample['label']]})")
    print()

# Download KLUE-NLI
klue_nli_dataset = load_dataset("klue", "nli")

print("Complete loading KLUE-NLI")
print(f"Origin Train: {len(klue_nli_dataset['train'])}")
print(f"Origin Valid: {len(klue_nli_dataset['validation'])}")

np.random.seed(42)

train_indices = np.random.choice(len(klue_nli_dataset['train']), 5000, replace=False)
klue_train_sample = klue_nli_dataset['train'].select(train_indices)

val_indices = np.random.choice(len(klue_nli_dataset['validation']), 1000, replace=False)
klue_val_sample = klue_nli_dataset['validation'].select(val_indices)

klue_nli_split = DatasetDict({
    'train': klue_train_sample,
    'validation': klue_val_sample
})

print(f"After Split - Train: {len(klue_nli_split['train'])}, Valid: {len(klue_nli_split['validation'])}")

klue_nli_labels = ['entailment', 'contradiction', 'neutral']
print("\nKLUE-NLI Sample:")
for i in range(2):
    sample = klue_nli_split['train'][i]
    print(f"Premise: {sample['premise']}")
    print(f"Hypothesis: {sample['hypothesis']}")
    print(f"Label: {sample['label']} ({klue_nli_labels[sample['label']]})")
    print()

Start Download Dataset


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Complete loading SST2
Train: 67349
Valid: 872

SST2 Sample:
Text: hide new secretions from the parental units 
Label: 0 (negative)

Text: contains no wit , only labored gags 
Label: 0 (negative)



README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Complete loading AG News
Train: 120000
Test: 7600
After Split - Train: 108000, Valid: 12000

AG News Sample:
Text: 10 seconds that change everything ATHENS - Ten seconds. Barely time enough to ti...
Label: 1 (Sports)

Text: Charline Labonte rises to challenge Charline Labonte has served notice she wants...
Label: 1 (Sports)



README.md:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.83M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/224k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/24998 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Complete loading KLUE-NLI
Origin Train: 24998
Origin Valid: 3000
After Split - Train: 5000, Valid: 1000

KLUE-NLI Sample:
Premise: 또한 대전시에 있는 대학, 연구기관, 공공기관 등 인적 역량이 뛰어난 기관들과의 협력체계를 잘 구축해 사업계획의 실현가능성에서 높은 점수를 받았다.
Hypothesis: 인적 역량이 뛰어난 기관들과의 협력체계를 잘 구축하면 사업계획에 높은 점수를 받을 수 있다.
Label: 0 (entailment)

Premise: 프리터인 가네코 뎃페이는 아침 출근 시간으로 북적대는 지하철을 타고 취직 면접을 보러 가는 중, 여중생에게 치한이라고 오해받는다.
Hypothesis: 여중생은 가네코 뎃페이가 치한이라고 생각했다.
Label: 0 (entailment)



# Tokenizer

In [4]:
# Load DistilBERT Tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_sst2(examples):
    return tokenizer(examples['sentence'], truncation=True, padding=True, max_length=128)

def tokenize_ag_news(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=256)

def tokenize_klue_nli(examples):
    return tokenizer(examples['premise'], examples['hypothesis'], truncation=True, padding=True, max_length=256)

# SST2 Tokenizing
sst2_tokenized = sst2_dataset.map(tokenize_sst2, batched=True, remove_columns=['sentence', 'idx'])
print("Complete Tokenizing SST2")

# AG News Tokenizing
ag_news_tokenized = ag_news_split.map(tokenize_ag_news, batched=True, remove_columns=['text'])
print("Complete Tokenizing AG News")

# KLUE-NLI Tokenizing
klue_nli_tokenized = klue_nli_split.map(
    tokenize_klue_nli,
    batched=True,
    remove_columns=['premise', 'hypothesis', 'source']
)
print("Complete Tokenizing KLUE-NLI")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

Complete Tokenizing SST2


Map:   0%|          | 0/108000 [00:00<?, ? examples/s]

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Complete Tokenizing AG News


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Complete Tokenizing KLUE-NLI


In [5]:
def get_gpu_memory():
    if torch.cuda.is_available():
        return torch.cuda.memory_allocated() / 1024**3
    return 0

def get_model_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total_params, trainable_params

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')

    return {'accuracy': accuracy, 'f1': f1}

def clear_memory():
    torch.cuda.empty_cache()
    gc.collect()
    print(f"Complete cleanup Memory - GPU Memory: {get_gpu_memory():.2f} GB")

# Prefix Tuning DistillBERT

In [7]:
def train_soft_prompt_model(model_name, train_data, eval_data, num_labels, prompt_length=20, num_epochs=3):
    print(f"\n{model_name} - Start Prefix Tuning!")
    print("-" * 50)

    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=num_labels
    )

    for param in model.parameters():
        param.requires_grad = False

    # prefix prompt embeddings
    hidden_size = model.distilbert.config.hidden_size  # 768
    soft_prompt = nn.Parameter(torch.randn(prompt_length, hidden_size) * 0.1)

    model.soft_prompt = soft_prompt
    model.prompt_length = prompt_length

    original_forward = model.forward

    def new_forward(input_ids, attention_mask=None, labels=None, **kwargs):
        batch_size = input_ids.size(0)

        soft_prompt_expanded = model.soft_prompt.unsqueeze(0).expand(batch_size, -1, -1)

        inputs_embeds = model.distilbert.embeddings.word_embeddings(input_ids)
        inputs_embeds = torch.cat([soft_prompt_expanded, inputs_embeds], dim=1)

        # Attention mask
        if attention_mask is not None:
            prompt_attention_mask = torch.ones(batch_size, model.prompt_length,
                                             dtype=attention_mask.dtype,
                                             device=attention_mask.device)
            attention_mask = torch.cat([prompt_attention_mask, attention_mask], dim=1)

        # DistilBERT
        outputs = model.distilbert(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask
        )

        sequence_output = outputs[0]
        cls_output = sequence_output[:, model.prompt_length, :]  # [CLS] token at shifted position

        # Classification head
        logits = model.classifier(cls_output)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))

        return {
            'loss': loss,
            'logits': logits
        }

    # Forward function change
    model.forward = new_forward

    # Parameter Infomation
    total_params, trainable_params = get_model_parameters(model)
    print(f"Prefix Prompt Model Info:")
    print(f"   Total Parameter: {total_params:,}")
    print(f"   Train Parameter: {trainable_params:,}")
    print(f"   Train Ratio: {trainable_params/total_params*100:.4f}%")
    print(f"   Prompt Length: {prompt_length}")

    # Train Setting
    output_dir = f"/content/drive/MyDrive/KMU/NLP/training_outputs/{model_name}_prefix_prompt"
    os.makedirs(output_dir, exist_ok=True)

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=1e-3,
        warmup_steps=500,
        weight_decay=0.01,
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        greater_is_better=True,
        report_to=[],
        dataloader_pin_memory=False,
    )

    # Trainer
    data_collator = DataCollatorWithPadding(tokenizer)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=eval_data,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
    )

    # Train
    start_time = time.time()
    start_memory = get_gpu_memory()

    train_result = trainer.train()
    eval_result = trainer.evaluate()

    end_time = time.time()
    max_memory = get_gpu_memory()

    # save results
    results = {
        'model_name': f"{model_name}_Prefix",
        'training_time': end_time - start_time,
        'max_memory_usage': max_memory,
        'accuracy': eval_result['eval_accuracy'],
        'f1_score': eval_result['eval_f1'],
        'total_params': total_params,
        'trainable_params': trainable_params,
        'prompt_length': prompt_length
    }

    print(f"{model_name} Complete Prefix Tuning!")
    print(f"   Train Time: {results['training_time']:.1f}sec")
    print(f"   Max Memory: {results['max_memory_usage']:.2f} GB")
    print(f"   Accuracy: {results['accuracy']:.4f}")
    print(f"   F1-Score: {results['f1_score']:.4f}")

    return model, results

In [10]:
# SST2 Prefix Fine-tuning
clear_memory()
sst2_soft_prompt_model, sst2_soft_prompt_results = train_soft_prompt_model(
    "SST2", sst2_tokenized['train'], sst2_tokenized['validation'], 2, prompt_length=20
)

Complete cleanup Memory - GPU Memory: 0.00 GB

SST2 - Start Prefix Tuning!
--------------------------------------------------


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Prefix Prompt Model Info:
   Total Parameter: 66,970,370
   Train Parameter: 15,360
   Train Ratio: 0.0229%
   Prompt Length: 20


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.493,0.488297,0.755734,0.754474
2,0.4197,0.481333,0.780963,0.780981
3,0.4246,0.468667,0.792431,0.792435


SST2 Complete Prefix Tuning!
   Train Time: 1009.4sec
   Max Memory: 0.27 GB
   Accuracy: 0.7924
   F1-Score: 0.7924


In [11]:
# AG News Prefix Fine-tuning
clear_memory()
ag_news_soft_prompt_model, ag_news_soft_prompt_results = train_soft_prompt_model(
    "AG_News", ag_news_tokenized['train'], ag_news_tokenized['validation'], 4, prompt_length=20
)

Complete cleanup Memory - GPU Memory: 0.27 GB

AG_News - Start Prefix Tuning!
--------------------------------------------------


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Prefix Prompt Model Info:
   Total Parameter: 66,971,908
   Train Parameter: 15,360
   Train Ratio: 0.0229%
   Prompt Length: 20


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3373,0.321583,0.894083,0.89411
2,0.3176,0.303981,0.901667,0.901459
3,0.3153,0.296875,0.905167,0.905175


AG_News Complete Prefix Tuning!
   Train Time: 5676.0sec
   Max Memory: 0.52 GB
   Accuracy: 0.9052
   F1-Score: 0.9052


In [12]:
# KLUE-NLI Prefix Fine-tuning
clear_memory()
klue_nli_soft_prompt_model, klue_nli_soft_prompt_results = train_soft_prompt_model(
    "KLUE_NLI", klue_nli_tokenized['train'], klue_nli_tokenized['validation'], 3, prompt_length=20
)

Complete cleanup Memory - GPU Memory: 0.52 GB

KLUE_NLI - Start Prefix Tuning!
--------------------------------------------------


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Prefix Prompt Model Info:
   Total Parameter: 66,971,139
   Train Parameter: 15,360
   Train Ratio: 0.0229%
   Prompt Length: 20


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.1036,1.102652,0.312,0.14839
2,1.1009,1.099744,0.341,0.18988
3,1.1004,1.099401,0.34,0.251549


KLUE_NLI Complete Prefix Tuning!
   Train Time: 289.9sec
   Max Memory: 0.77 GB
   Accuracy: 0.3410
   F1-Score: 0.1899
