In [1]:
from transformers import AutoTokenizer               

from evaluation.evaluator import EarlyExitEvaluator
from strategies.confidence_exit import ConfidenceExit
from models.gpt2_wrapper import GPT2WithEarlyExit
from evaluation.dataset_loaders.sst2 import load_sst2

tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [6]:
# simple test one sentiment data
strategy = ConfidenceExit(threshold=0.8, allowed_layers=[3,6,9])
model = GPT2WithEarlyExit("gpt2", strategy, tokenizer)

dataset = load_sst2(number=50, task="train")   # Number of data to use = 100, without kv 

evaluator = EarlyExitEvaluator(tokenizer)

result = evaluator.evaluate(
    model=model,
    strategy=strategy,
    dataset=dataset,
    task_type="classification",
    dataset_name="sst2",
)

print(result)

Evaluating: 100%|███████████████████████████████| 50/50 [00:00<00:00, 50.29it/s]

{'metric': 'accuracy', 'score': np.float64(0.74), 'avg_latency_sec': np.float64(0.019784612655639647), 'tokens_per_sec': 50.544330455463715, 'avg_layers_used': np.float64(6.34), 'num_samples': 50}





### KV-Cache Behavior in Early-Exit GPT-2 Wrapper

Early-exit GPT-2 wrapper supports two execution modes depending on the task.
KV-cache is handled differently for classification vs generation tasks.

#### Classification Tasks (SST-2, AGNews) — No KV-Cache Used

#### Generation Tasks (Summarization, Translation, QA)

For generation, two modes depending on use_kv parameter. 
if use_kv = False (Full Recompute, slow mode)
- Every new token recomputes all layers
- Early exit only skips layers inside one forward pass
- KV-cache is not stored
- Useful for reproducing naive early-exit results

if use_kv = True (KV Vached Early Exit, fast mode)
- step 1: encode the prompt once and produce hidden states for the prompt, KV pair for every layer
- step 2: decode tokens with early exit:
    - for each token run layers sequenctiall..
    - At each layer, compute confidence
        - If early exit triggers at layer L:
        - Layers 0..L compute normally and update KV
        - Layers L+1..final are skipped
        - Their KV is copied forward unchanged

In [3]:
### testing with KV similar to CALM paper
from evaluation.dataset_loaders.sst2 import load_sst2
from evaluation.dataset_loaders.agnews import load_agnews
from evaluation.dataset_loaders.cnn_dm import load_cnndm
from evaluation.dataset_loaders.squad import load_squad
from evaluation.dataset_loaders.wmt_en_fr import load_wmt_enfr

strategy = ConfidenceExit(threshold=0.7, allowed_layers=[3,6,9])
model = GPT2WithEarlyExit("gpt2", strategy, tokenizer, use_kv="True")

evaluator = EarlyExitEvaluator(tokenizer)

datasets = [
    ("sst2", load_sst2, "classification"),
    ("agnews", load_agnews, "classification"),
    ("cnn_dm", load_cnndm, "summarization"),
    ("wmt14_enfr", load_wmt_enfr, "translation"),
    ("squad", load_squad, "qa"),
]

for name, loader, task in datasets:
    print(f"Testing {name}...")

    dataset = loader(fraction=0.002)

    # ---------- IMPORTANT: pass dataset_name for classification ----------
    if task == "classification":
        result = evaluator.evaluate(
            model=model,
            strategy=strategy,
            dataset=dataset,
            task_type=task,
            dataset_name=name,      # e.g. "sst2" or "agnews"
        )
    else:
        result = evaluator.evaluate(
            model=model,
            strategy=strategy,
            dataset=dataset,
            task_type=task,
        )

    print(name, result)

Testing sst2...


Evaluating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 23.52it/s]


sst2 {'metric': 'accuracy', 'score': np.float64(1.0), 'avg_latency_sec': np.float64(0.04217815399169922), 'tokens_per_sec': 23.70895606756054, 'avg_layers_used': np.float64(4.0), 'num_samples': 1}
Testing agnews...


Evaluating: 100%|███████████████████████████████| 15/15 [00:00<00:00, 44.23it/s]


agnews {'metric': 'accuracy', 'score': np.float64(0.26666666666666666), 'avg_latency_sec': np.float64(0.02246535619099935), 'tokens_per_sec': 44.51298218902248, 'avg_layers_used': np.float64(5.2), 'num_samples': 15}
Testing cnn_dm...


Token indices sequence length is longer than the specified maximum sequence length for this model (1032 > 1024). Running this sequence through the model will result in indexing errors
Evaluating: 100%|███████████████████████████████| 22/22 [00:14<00:00,  1.53it/s]


cnn_dm {'metric': 'rougeL', 'score': np.float64(0.05079575095102862), 'avg_latency_sec': np.float64(0.6509716402400624), 'tokens_per_sec': 1.5361652308405087, 'avg_layers_used': np.float64(4.051136363636363), 'num_samples': 22}
Testing wmt14_enfr...


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

Evaluating: 100%|█████████████████████████████████| 6/6 [00:03<00:00,  1.91it/s]


wmt14_enfr {'metric': 'bleu', 'score': np.float64(0.0), 'avg_latency_sec': np.float64(0.5234866937001547), 'tokens_per_sec': 1.910268230376043, 'avg_layers_used': np.float64(4.015625), 'num_samples': 6}
Testing squad...


Evaluating: 100%|███████████████████████████████| 21/21 [00:09<00:00,  2.22it/s]

squad {'metric': 'token_f1', 'score': np.float64(0.004896882945663434), 'avg_latency_sec': np.float64(0.4498912152789888), 'tokens_per_sec': 2.222759560619282, 'avg_layers_used': np.float64(4.017857142857143), 'num_samples': 21}





In [11]:
### testing without kv
from evaluation.dataset_loaders.sst2 import load_sst2
from evaluation.dataset_loaders.agnews import load_agnews
from evaluation.dataset_loaders.cnn_dm import load_cnndm
from evaluation.dataset_loaders.squad import load_squad
from evaluation.dataset_loaders.wmt_en_fr import load_wmt_enfr

strategy = ConfidenceExit(threshold=0.8, allowed_layers=[3,6,9])
model = GPT2WithEarlyExit("gpt2", strategy, tokenizer, use_kv="False")

evaluator = EarlyExitEvaluator(tokenizer)

datasets = [
    ("sst2", load_sst2, "classification"),
    ("agnews", load_agnews, "classification"),
    ("cnn_dm", load_cnndm, "summarization"),
    ("wmt14_enfr", load_wmt_enfr, "translation"),
    ("squad", load_squad, "qa"),
]

for name, loader, task in datasets:
    print(f"Testing {name}...")

    dataset = loader(fraction=0.002)

    # pass dataset_name for classification
    if task == "classification":
        result = evaluator.evaluate(
            model=model,
            strategy=strategy,
            dataset=dataset,
            task_type=task,
            dataset_name=name,      # e.g. "sst2" or "agnews"
        )
    else:
        result = evaluator.evaluate(
            model=model,
            strategy=strategy,
            dataset=dataset,
            task_type=task,
        )

    print(name, result)

Testing sst2...


Evaluating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 16.61it/s]


sst2 {'metric': 'accuracy', 'score': np.float64(0.0), 'avg_latency_sec': np.float64(0.05985116958618164), 'tokens_per_sec': 16.7081112518623, 'avg_layers_used': np.float64(7.0), 'num_samples': 1}
Testing agnews...


Evaluating: 100%|███████████████████████████████| 15/15 [00:00<00:00, 59.73it/s]


agnews {'metric': 'accuracy', 'score': np.float64(0.13333333333333333), 'avg_latency_sec': np.float64(0.016642173131306965), 'tokens_per_sec': 60.08830650360303, 'avg_layers_used': np.float64(4.2), 'num_samples': 15}
Testing cnn_dm...


Evaluating:   9%|██▉                             | 2/22 [00:10<01:47,  5.39s/it]


IndexError: index out of range in self

In [None]:
### testing without kv
strategy = ConfidenceExit(threshold=0.1, allowed_layers=[2,4,6,8,10])
model = GPT2WithEarlyExit("gpt2", strategy, tokenizer, use_kv="False")

evaluator = EarlyExitEvaluator(tokenizer)

datasets = [
    ("sst2", load_sst2, "classification"),
    ("agnews", load_agnews, "classification"),
    ("cnn_dm", load_cnndm, "summarization"),
    ("wmt14_enfr", load_wmt_enfr, "translation"),
    ("squad", load_squad, "qa"),
]

for name, loader, task in datasets:
    print(f"Testing {name}...")

    dataset = loader(fraction=0.002)

    # pass dataset_name for classification
    if task == "classification":
        result = evaluator.evaluate(
            model=model,
            strategy=strategy,
            dataset=dataset,
            task_type=task,
            dataset_name=name,      # e.g. "sst2" or "agnews"
        )
    else:
        result = evaluator.evaluate(
            model=model,
            strategy=strategy,
            dataset=dataset,
            task_type=task,
        )

    print(name, result)

In [11]:
### testing without kv
strategy = ConfidenceExit(threshold=0.1, allowed_layers=[2,4,6,8,10])
model = GPT2WithEarlyExit("gpt2", strategy, tokenizer, use_kv="False")

evaluator = EarlyExitEvaluator(tokenizer)

datasets = [
    ("sst2", load_sst2, "classification"),
    ("agnews", load_agnews, "classification"),
    ("cnn_dm", load_cnndm, "summarization"),
    ("wmt14_enfr", load_wmt_enfr, "translation"),
    ("squad", load_squad, "qa"),
]

for name, loader, task in datasets:
    print(f"Testing {name}...")

    dataset = loader(fraction=0.005)

    # pass dataset_name for classification
    if task == "classification":
        result = evaluator.evaluate(
            model=model,
            strategy=strategy,
            dataset=dataset,
            task_type=task,
            dataset_name=name,      # e.g. "sst2" or "agnews"
        )
    else:
        result = evaluator.evaluate(
            model=model,
            strategy=strategy,
            dataset=dataset,
            task_type=task,
        )

    print(name, result)

Testing sst2...


Evaluating: 100%|█████████████████████████████████| 4/4 [00:00<00:00, 53.81it/s]


sst2 {'metric': 'accuracy', 'score': np.float64(0.75), 'avg_latency_sec': np.float64(0.01846897602081299), 'tokens_per_sec': 54.14485344899922, 'avg_layers_used': np.float64(3.0), 'num_samples': 4}
Testing agnews...


Evaluating: 100%|███████████████████████████████| 38/38 [00:00<00:00, 75.34it/s]


agnews {'metric': 'accuracy', 'score': np.float64(0.15789473684210525), 'avg_latency_sec': np.float64(0.013190169083444695), 'tokens_per_sec': 75.81403950728156, 'avg_layers_used': np.float64(3.0), 'num_samples': 38}
Testing cnn_dm...


Evaluating: 100%|███████████████████████████████| 57/57 [00:32<00:00,  1.75it/s]


cnn_dm {'metric': 'rougeL', 'score': np.float64(0.050215135413532744), 'avg_latency_sec': np.float64(0.5711161128261633), 'tokens_per_sec': 1.7509574279939475, 'avg_layers_used': np.float64(3.0), 'num_samples': 57}
Testing wmt14_enfr...


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

Evaluating: 100%|███████████████████████████████| 15/15 [00:05<00:00,  2.59it/s]


wmt14_enfr {'metric': 'bleu', 'score': np.float64(0.0), 'avg_latency_sec': np.float64(0.3851939678192139), 'tokens_per_sec': 2.596094652420253, 'avg_layers_used': np.float64(3.0), 'num_samples': 15}
Testing squad...


Evaluating: 100%|███████████████████████████████| 52/52 [00:18<00:00,  2.76it/s]

squad {'metric': 'token_f1', 'score': np.float64(0.005145796465739077), 'avg_latency_sec': np.float64(0.3616780317746676), 'tokens_per_sec': 2.7648900738959434, 'avg_layers_used': np.float64(3.0), 'num_samples': 52}





In [None]:
### Strategy 2 - Confidence threshold should be (meet) in Continous layers

In [7]:
from strategies.continous_confidence_exit import ContinuousConfidenceExit

strategy = ContinuousConfidenceExit(
    threshold=0.75,
    required_consecutive=2,
    allowed_layers=[3, 6, 9, 11]
)

model = GPT2WithEarlyExit("gpt2", strategy, tokenizer)
evaluator = EarlyExitEvaluator(tokenizer)