In [1]:
from transformers import AutoTokenizer               

from evaluation.evaluator import EarlyExitEvaluator
from strategies.confidence_exit import ConfidenceExit
from models.gpt2_wrapper import GPT2WithEarlyExit
from evaluation.dataset_loaders.sst2 import load_sst2

# Shared GPT-2 tokenizer; the cells below pass this same object to every
# GPT2WithEarlyExit wrapper and EarlyExitEvaluator they build.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [3]:
# Smoke test: evaluate the early-exit wrapper on a small SST-2 sentiment sample.
# threshold=1 is the strictest confidence setting used in this notebook; the
# recorded run reports avg_layers_used == 12.0 for it.
strategy = ConfidenceExit(threshold=1, allowed_layers=[3, 6, 9])
model = GPT2WithEarlyExit("gpt2", strategy, tokenizer)

# 50 examples are requested; the wrapper above is built without a use_kv argument.
# NOTE(review): the keyword is named `task` but receives "train", which reads
# like a split name - confirm against load_sst2's signature.
dataset = load_sst2(number=50, task="train")

evaluator = EarlyExitEvaluator(tokenizer)

# Classification runs also pass dataset_name alongside the task type.
eval_kwargs = {
    "model": model,
    "strategy": strategy,
    "dataset": dataset,
    "task_type": "classification",
    "dataset_name": "sst2",
}
result = evaluator.evaluate(**eval_kwargs)

print(result)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Evaluating: 100%|███████████████████████████████| 50/50 [00:01<00:00, 25.75it/s]

{'metric': 'accuracy', 'score': np.float64(0.3), 'avg_latency_sec': np.float64(0.03865469455718994), 'tokens_per_sec': 25.870078950449127, 'avg_layers_used': np.float64(12.0), 'num_samples': 50}





### KV-Cache Behavior in Early-Exit GPT-2 Wrapper

The early-exit GPT-2 wrapper supports two execution modes depending on the task.
KV-cache is handled differently for classification vs generation tasks.

#### Classification Tasks (SST-2, AGNews) — No KV-Cache Used

#### Generation Tasks (Summarization, Translation, QA)

For generation, there are two modes, selected by the `use_kv` parameter.
If `use_kv = False` (full recompute, slow mode):
- Every new token recomputes all layers
- Early exit only skips layers inside one forward pass
- KV-cache is not stored
- Useful for reproducing naive early-exit results

If `use_kv = True` (KV-cached early exit, fast mode):
- Step 1: encode the prompt once, producing the prompt's hidden states and a KV pair for every layer.
- Step 2: decode tokens with early exit:
    - For each token, run the layers sequentially.
    - At each layer, compute the confidence.
        - If early exit triggers at layer L:
            - Layers 0..L compute normally and update their KV.
            - Layers L+1..final are skipped.
            - The skipped layers' KV is copied forward unchanged.

In [4]:
### testing with KV similar to CALM paper
from evaluation.dataset_loaders.sst2 import load_sst2
from evaluation.dataset_loaders.agnews import load_agnews
from evaluation.dataset_loaders.cnn_dm import load_cnndm
from evaluation.dataset_loaders.squad import load_squad
from evaluation.dataset_loaders.wmt_en_fr import load_wmt_enfr

strategy = ConfidenceExit(threshold=0.7, allowed_layers=[3, 6, 9])
# use_kv is passed as a real boolean. The previous value was the string "True",
# which only works by accident of truthiness and breaks any identity/equality
# check against the bool - TODO confirm how the wrapper reads the flag.
model = GPT2WithEarlyExit("gpt2", strategy, tokenizer, use_kv=True)

evaluator = EarlyExitEvaluator(tokenizer)

# (dataset name, loader function, task type) for every benchmark in the sweep.
datasets = [
    ("sst2", load_sst2, "classification"),
    ("agnews", load_agnews, "classification"),
    ("cnn_dm", load_cnndm, "summarization"),
    ("wmt14_enfr", load_wmt_enfr, "translation"),
    ("squad", load_squad, "qa"),
]

for name, loader, task in datasets:
    print(f"Testing {name}...")

    # Tiny slice of each dataset so the whole sweep finishes quickly; the
    # recorded run evaluated between 1 and 22 samples per dataset.
    dataset = loader(fraction=0.002)

    # ---------- IMPORTANT: pass dataset_name for classification ----------
    # Only classification runs pass dataset_name (e.g. "sst2" or "agnews");
    # the other task types call evaluate() without it.
    extra_kwargs = {"dataset_name": name} if task == "classification" else {}

    result = evaluator.evaluate(
        model=model,
        strategy=strategy,
        dataset=dataset,
        task_type=task,
        **extra_kwargs,
    )

    print(name, result)

Testing sst2...


Evaluating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 11.95it/s]


sst2 {'metric': 'accuracy', 'score': np.float64(1.0), 'avg_latency_sec': np.float64(0.08323502540588379), 'tokens_per_sec': 12.01417306144429, 'avg_layers_used': np.float64(12.0), 'num_samples': 1}
Testing agnews...


Evaluating: 100%|███████████████████████████████| 15/15 [00:00<00:00, 20.21it/s]


agnews {'metric': 'accuracy', 'score': np.float64(0.06666666666666667), 'avg_latency_sec': np.float64(0.04930445353190104), 'tokens_per_sec': 20.282143465051863, 'avg_layers_used': np.float64(12.0), 'num_samples': 15}
Testing cnn_dm...


Token indices sequence length is longer than the specified maximum sequence length for this model (1032 > 1024). Running this sequence through the model will result in indexing errors
Evaluating: 100%|███████████████████████████████| 22/22 [00:13<00:00,  1.59it/s]


cnn_dm {'metric': 'rougeL', 'score': np.float64(0.05079575095102862), 'avg_latency_sec': np.float64(0.6256155859340321), 'tokens_per_sec': 1.5984256506445873, 'avg_layers_used': np.float64(4.051136363636363), 'num_samples': 22}
Testing wmt14_enfr...


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

Evaluating: 100%|█████████████████████████████████| 6/6 [00:03<00:00,  1.94it/s]


wmt14_enfr {'metric': 'bleu', 'score': np.float64(0.0), 'avg_latency_sec': np.float64(0.5145148436228434), 'tokens_per_sec': 1.9435785233302878, 'avg_layers_used': np.float64(4.015625), 'num_samples': 6}
Testing squad...


Evaluating: 100%|███████████████████████████████| 21/21 [00:09<00:00,  2.32it/s]

squad {'metric': 'token_f1', 'score': np.float64(0.004896882945663434), 'avg_latency_sec': np.float64(0.4303716137295678), 'tokens_per_sec': 2.3235733215164354, 'avg_layers_used': np.float64(4.017857142857143), 'num_samples': 21}





In [5]:
### testing without kv
strategy = ConfidenceExit(threshold=0.7, allowed_layers=[3, 6, 9])
# use_kv must be the boolean False. The previous value was the string "False",
# which is a non-empty string and therefore truthy in Python, so any
# `if use_kv:` style check would have enabled the KV cache in this run.
# TODO confirm how the wrapper reads the flag, then re-run this cell: the
# recorded output was produced with the string value.
model = GPT2WithEarlyExit("gpt2", strategy, tokenizer, use_kv=False)

evaluator = EarlyExitEvaluator(tokenizer)

# (dataset name, loader function, task type) for every benchmark in the sweep.
# The loader functions are imported by an earlier cell of this notebook.
datasets = [
    ("sst2", load_sst2, "classification"),
    ("agnews", load_agnews, "classification"),
    ("cnn_dm", load_cnndm, "summarization"),
    ("wmt14_enfr", load_wmt_enfr, "translation"),
    ("squad", load_squad, "qa"),
]

for name, loader, task in datasets:
    print(f"Testing {name}...")

    # Tiny slice of each dataset so the whole sweep finishes quickly.
    dataset = loader(fraction=0.002)

    # Only classification runs pass dataset_name (e.g. "sst2" or "agnews");
    # the other task types call evaluate() without it.
    extra_kwargs = {"dataset_name": name} if task == "classification" else {}

    result = evaluator.evaluate(
        model=model,
        strategy=strategy,
        dataset=dataset,
        task_type=task,
        **extra_kwargs,
    )

    print(name, result)

Testing sst2...


Evaluating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 12.17it/s]


sst2 {'metric': 'accuracy', 'score': np.float64(1.0), 'avg_latency_sec': np.float64(0.08173990249633789), 'tokens_per_sec': 12.233926998442431, 'avg_layers_used': np.float64(12.0), 'num_samples': 1}
Testing agnews...


Evaluating: 100%|███████████████████████████████| 15/15 [00:00<00:00, 19.74it/s]


agnews {'metric': 'accuracy', 'score': np.float64(0.06666666666666667), 'avg_latency_sec': np.float64(0.050461435317993165), 'tokens_per_sec': 19.817113676975165, 'avg_layers_used': np.float64(12.0), 'num_samples': 15}
Testing cnn_dm...


Evaluating: 100%|███████████████████████████████| 22/22 [00:14<00:00,  1.50it/s]


cnn_dm {'metric': 'rougeL', 'score': np.float64(0.05079575095102862), 'avg_latency_sec': np.float64(0.6641134348782626), 'tokens_per_sec': 1.5057668577105479, 'avg_layers_used': np.float64(4.051136363636363), 'num_samples': 22}
Testing wmt14_enfr...


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

Evaluating: 100%|█████████████████████████████████| 6/6 [00:03<00:00,  1.90it/s]


wmt14_enfr {'metric': 'bleu', 'score': np.float64(0.0), 'avg_latency_sec': np.float64(0.5247013966242472), 'tokens_per_sec': 1.9058458895548298, 'avg_layers_used': np.float64(4.015625), 'num_samples': 6}
Testing squad...


Evaluating: 100%|███████████████████████████████| 21/21 [00:09<00:00,  2.26it/s]

squad {'metric': 'token_f1', 'score': np.float64(0.004896882945663434), 'avg_latency_sec': np.float64(0.44241390909467426), 'tokens_per_sec': 2.260326765146991, 'avg_layers_used': np.float64(4.017857142857143), 'num_samples': 21}





In [None]:
### Strategy 2 - Confidence threshold must be met in consecutive (continuous) layers

In [7]:
from strategies.continous_confidence_exit import ContinuousConfidenceExit

# Strategy 2 setup: same wrapper and evaluator as before, but driven by
# ContinuousConfidenceExit instead of ConfidenceExit.
# NOTE(review): required_consecutive=2 presumably means the threshold has to be
# met at two allowed layers in a row before exiting - confirm in the strategy.
exit_layers = [3, 6, 9, 11]
strategy = ContinuousConfidenceExit(
    threshold=0.75, required_consecutive=2, allowed_layers=exit_layers
)

# No use_kv argument is passed here, so the wrapper's own default applies.
model = GPT2WithEarlyExit("gpt2", strategy, tokenizer)
evaluator = EarlyExitEvaluator(tokenizer)