In [1]:
from transformers import AutoTokenizer               

from evaluation.evaluator import EarlyExitEvaluator
from strategies.confidence_exit import ConfidenceExit
from models.gpt2_wrapper import GPT2WithEarlyExit
from evaluation.dataset_loaders.sst2 import load_sst2

# Load the pretrained GPT-2 tokenizer once; the cells below pass this same
# object to every GPT2WithEarlyExit model and EarlyExitEvaluator they build.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [2]:
# Smoke test: confidence-based early exit on a single sentiment dataset (SST-2).
# No use_kv argument is given, so the wrapper runs with its default KV setting.
strategy = ConfidenceExit(threshold=0.8, allowed_layers=[3, 6, 9])
model = GPT2WithEarlyExit("gpt2", strategy, tokenizer)

# number=500 requests 500 examples (the recorded run reports num_samples=500).
# task="train" presumably selects the training split -- confirm in load_sst2.
dataset = load_sst2(number=500, task="train")

evaluator = EarlyExitEvaluator(tokenizer)

# Gather the evaluation arguments in one mapping, then run a single evaluation.
sst2_eval_kwargs = dict(
    model=model,
    strategy=strategy,
    dataset=dataset,
    task_type="classification",
    dataset_name="sst2",
)
result = evaluator.evaluate(**sst2_eval_kwargs)

print(result)

The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Evaluating: 100%|█████████████████████████████| 500/500 [00:09<00:00, 50.89it/s]

{'metric': 'accuracy', 'score': np.float64(0.628), 'avg_latency_sec': np.float64(0.019555476188659666), 'tokens_per_sec': 51.13657117589935, 'avg_layers_used': np.float64(6.446), 'num_samples': 500}





### KV-Cache Behavior in Early-Exit GPT-2 Wrapper

The early-exit GPT-2 wrapper supports two execution modes, depending on the task.
The KV-cache is handled differently for classification and generation tasks.

#### Classification Tasks (SST-2, AGNews) — No KV-Cache Used

#### Generation Tasks (Summarization, Translation, QA)

For generation, there are two modes, selected by the `use_kv` parameter.

If `use_kv = False` (full recompute, slow mode):
- Every new token recomputes all layers
- Early exit only skips layers inside one forward pass
- KV-cache is not stored
- Useful for reproducing naive early-exit results

If `use_kv = True` (KV-cached early exit, fast mode):
- Step 1: encode the prompt once, producing the prompt's hidden states and a KV pair for every layer
- Step 2: decode tokens with early exit:
    - For each token, run the layers sequentially
    - At each layer, compute the confidence
        - If early exit triggers at layer L:
            - Layers 0..L compute normally and update their KV
            - Layers L+1..final are skipped
            - The skipped layers' KV is copied forward unchanged

In [3]:
### Testing with the KV cache enabled (KV handling similar to the CALM paper)
from evaluation.dataset_loaders.sst2 import load_sst2
from evaluation.dataset_loaders.agnews import load_agnews
from evaluation.dataset_loaders.cnn_dm import load_cnndm
from evaluation.dataset_loaders.squad import load_squad
from evaluation.dataset_loaders.wmt_en_fr import load_wmt_enfr

strategy = ConfidenceExit(threshold=0.5, allowed_layers=[3, 6, 9])

# use_kv is an on/off flag, so pass a real boolean. A string only "works" by
# accident: every non-empty string is truthy, including "False".
model = GPT2WithEarlyExit("gpt2", strategy, tokenizer, use_kv=True)

evaluator = EarlyExitEvaluator(tokenizer)

# One (dataset name, loader function, task type) entry per benchmark.
datasets = [
    ("sst2", load_sst2, "classification"),
    ("agnews", load_agnews, "classification"),
    ("cnn_dm", load_cnndm, "summarization"),
    ("wmt14_enfr", load_wmt_enfr, "translation"),
    ("squad", load_squad, "qa"),
]

for name, loader, task in datasets:
    print(f"Testing {name}...")

    # Presumably loads 1% of the dataset -- confirm against the loader signatures.
    dataset = loader(fraction=0.01)

    # IMPORTANT: dataset_name (e.g. "sst2" or "agnews") is passed only for
    # classification tasks; every other task type is evaluated without it.
    extra_kwargs = {"dataset_name": name} if task == "classification" else {}

    result = evaluator.evaluate(
        model=model,
        strategy=strategy,
        dataset=dataset,
        task_type=task,
        **extra_kwargs,
    )

    print(name, result)

Testing sst2...


Evaluating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 22.94it/s]


sst2 {'metric': 'accuracy', 'score': np.float64(0.0), 'avg_latency_sec': np.float64(0.04308724403381348), 'tokens_per_sec': 23.208725051322205, 'avg_layers_used': np.float64(4.0), 'num_samples': 1}
Testing agnews...


Evaluating: 100%|███████████████████████████████| 15/15 [00:00<00:00, 61.85it/s]


agnews {'metric': 'accuracy', 'score': np.float64(0.13333333333333333), 'avg_latency_sec': np.float64(0.016067647933959962), 'tokens_per_sec': 62.236862800960346, 'avg_layers_used': np.float64(4.0), 'num_samples': 15}
Testing cnn_dm...


Token indices sequence length is longer than the specified maximum sequence length for this model (1032 > 1024). Running this sequence through the model will result in indexing errors
Evaluating: 100%|███████████████████████████████| 22/22 [01:15<00:00,  3.45s/it]


cnn_dm {'metric': 'rougeL', 'score': np.float64(0.06866004172580169), 'avg_latency_sec': np.float64(3.4483406868847934), 'tokens_per_sec': 0.2899945483354758, 'avg_layers_used': np.float64(5.365056818181818), 'num_samples': 22}
Testing wmt14_enfr...


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

Evaluating: 100%|█████████████████████████████████| 6/6 [00:04<00:00,  1.31it/s]


wmt14_enfr {'metric': 'bleu', 'score': np.float64(0.0), 'avg_latency_sec': np.float64(0.7641123533248901), 'tokens_per_sec': 1.3087080658344148, 'avg_layers_used': np.float64(5.291666666666667), 'num_samples': 6}
Testing squad...


Evaluating: 100%|███████████████████████████████| 21/21 [00:24<00:00,  1.17s/it]

squad {'metric': 'token_f1', 'score': np.float64(0.005748605748605749), 'avg_latency_sec': np.float64(1.1665068240392775), 'tokens_per_sec': 0.8572603086343615, 'avg_layers_used': np.float64(5.2976190476190474), 'num_samples': 21}





In [None]:
### Testing without KV: no use_kv argument is passed, so the wrapper default applies
from evaluation.dataset_loaders.sst2 import load_sst2
from evaluation.dataset_loaders.agnews import load_agnews
from evaluation.dataset_loaders.cnn_dm import load_cnndm
from evaluation.dataset_loaders.squad import load_squad
from evaluation.dataset_loaders.wmt_en_fr import load_wmt_enfr

strategy = ConfidenceExit(threshold=0.5, allowed_layers=[3, 6, 9])
model = GPT2WithEarlyExit("gpt2", strategy, tokenizer)

evaluator = EarlyExitEvaluator(tokenizer)

# Each entry is (dataset name, loader function, task type).
datasets = [
    ("sst2", load_sst2, "classification"),
    ("agnews", load_agnews, "classification"),
    ("cnn_dm", load_cnndm, "summarization"),
    ("wmt14_enfr", load_wmt_enfr, "translation"),
    ("squad", load_squad, "qa"),
]

for name, loader, task in datasets:
    print(f"Testing {name}...")

    dataset = loader(fraction=0.01)

    # Arguments shared by every task type.
    eval_kwargs = dict(
        model=model,
        strategy=strategy,
        dataset=dataset,
        task_type=task,
    )
    # Classification additionally receives the dataset name, e.g. "sst2" or "agnews".
    if task == "classification":
        eval_kwargs["dataset_name"] = name

    result = evaluator.evaluate(**eval_kwargs)

    print(name, result)

Testing sst2...


Evaluating: 100%|█████████████████████████████████| 1/1 [00:00<00:00, 17.65it/s]


sst2 {'metric': 'accuracy', 'score': np.float64(0.0), 'avg_latency_sec': np.float64(0.05635786056518555), 'tokens_per_sec': 17.743753754515996, 'avg_layers_used': np.float64(7.0), 'num_samples': 1}
Testing agnews...


Evaluating: 100%|███████████████████████████████| 15/15 [00:00<00:00, 56.29it/s]


agnews {'metric': 'accuracy', 'score': np.float64(0.13333333333333333), 'avg_latency_sec': np.float64(0.017653989791870116), 'tokens_per_sec': 56.6444192949807, 'avg_layers_used': np.float64(4.2), 'num_samples': 15}
Testing cnn_dm...


Evaluating: 100%|███████████████████████████████| 22/22 [01:24<00:00,  3.82s/it]


cnn_dm {'metric': 'rougeL', 'score': np.float64(0.07329681276307656), 'avg_latency_sec': np.float64(3.821387919512662), 'tokens_per_sec': 0.261685026765754, 'avg_layers_used': np.float64(5.923295454545454), 'num_samples': 22}
Testing wmt14_enfr...


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

Evaluating: 100%|█████████████████████████████████| 6/6 [00:05<00:00,  1.23it/s]

In [12]:
### Testing without KV (full recompute), low threshold, exits allowed at layers 2/4/6/8/10
strategy = ConfidenceExit(threshold=0.1, allowed_layers=[2, 4, 6, 8, 10])

# use_kv must be the boolean False. The string "False" is non-empty and
# therefore truthy, so a truthiness check in the wrapper would enable the KV
# path in a run that is meant to measure the no-KV baseline.
model = GPT2WithEarlyExit("gpt2", strategy, tokenizer, use_kv=False)

evaluator = EarlyExitEvaluator(tokenizer)

# One (dataset name, loader function, task type) entry per benchmark.
# The loader functions must already have been imported by an earlier cell.
datasets = [
    ("sst2", load_sst2, "classification"),
    ("agnews", load_agnews, "classification"),
    ("cnn_dm", load_cnndm, "summarization"),
    ("wmt14_enfr", load_wmt_enfr, "translation"),
    ("squad", load_squad, "qa"),
]

for name, loader, task in datasets:
    print(f"Testing {name}...")

    # Presumably loads 0.2% of the dataset -- confirm against the loader signatures.
    dataset = loader(fraction=0.002)

    # dataset_name (e.g. "sst2" or "agnews") is passed only for classification.
    extra_kwargs = {"dataset_name": name} if task == "classification" else {}

    result = evaluator.evaluate(
        model=model,
        strategy=strategy,
        dataset=dataset,
        task_type=task,
        **extra_kwargs,
    )

    print(name, result)

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 5cbf51b6-40eb-4420-90e5-b00809b979a0)')' thrown while requesting HEAD https://huggingface.co/gpt2/resolve/main/config.json
Retrying in 1s [Retry 1/5].


Testing sst2...


KeyboardInterrupt: 

In [11]:
### Testing without KV (full recompute), low threshold, exits allowed at layers 2/4/6/8/10
strategy = ConfidenceExit(threshold=0.1, allowed_layers=[2, 4, 6, 8, 10])

# use_kv must be the boolean False. The string "False" is non-empty and
# therefore truthy, so a truthiness check in the wrapper would enable the KV
# path in a run that is meant to measure the no-KV baseline.
model = GPT2WithEarlyExit("gpt2", strategy, tokenizer, use_kv=False)

evaluator = EarlyExitEvaluator(tokenizer)

# One (dataset name, loader function, task type) entry per benchmark.
# The loader functions must already have been imported by an earlier cell.
datasets = [
    ("sst2", load_sst2, "classification"),
    ("agnews", load_agnews, "classification"),
    ("cnn_dm", load_cnndm, "summarization"),
    ("wmt14_enfr", load_wmt_enfr, "translation"),
    ("squad", load_squad, "qa"),
]

for name, loader, task in datasets:
    print(f"Testing {name}...")

    # Presumably loads 0.5% of the dataset -- confirm against the loader signatures.
    dataset = loader(fraction=0.005)

    # dataset_name (e.g. "sst2" or "agnews") is passed only for classification.
    extra_kwargs = {"dataset_name": name} if task == "classification" else {}

    result = evaluator.evaluate(
        model=model,
        strategy=strategy,
        dataset=dataset,
        task_type=task,
        **extra_kwargs,
    )

    print(name, result)

Testing sst2...


Evaluating: 100%|█████████████████████████████████| 4/4 [00:00<00:00, 53.81it/s]


sst2 {'metric': 'accuracy', 'score': np.float64(0.75), 'avg_latency_sec': np.float64(0.01846897602081299), 'tokens_per_sec': 54.14485344899922, 'avg_layers_used': np.float64(3.0), 'num_samples': 4}
Testing agnews...


Evaluating: 100%|███████████████████████████████| 38/38 [00:00<00:00, 75.34it/s]


agnews {'metric': 'accuracy', 'score': np.float64(0.15789473684210525), 'avg_latency_sec': np.float64(0.013190169083444695), 'tokens_per_sec': 75.81403950728156, 'avg_layers_used': np.float64(3.0), 'num_samples': 38}
Testing cnn_dm...


Evaluating: 100%|███████████████████████████████| 57/57 [00:32<00:00,  1.75it/s]


cnn_dm {'metric': 'rougeL', 'score': np.float64(0.050215135413532744), 'avg_latency_sec': np.float64(0.5711161128261633), 'tokens_per_sec': 1.7509574279939475, 'avg_layers_used': np.float64(3.0), 'num_samples': 57}
Testing wmt14_enfr...


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

Evaluating: 100%|███████████████████████████████| 15/15 [00:05<00:00,  2.59it/s]


wmt14_enfr {'metric': 'bleu', 'score': np.float64(0.0), 'avg_latency_sec': np.float64(0.3851939678192139), 'tokens_per_sec': 2.596094652420253, 'avg_layers_used': np.float64(3.0), 'num_samples': 15}
Testing squad...


Evaluating: 100%|███████████████████████████████| 52/52 [00:18<00:00,  2.76it/s]

squad {'metric': 'token_f1', 'score': np.float64(0.005145796465739077), 'avg_latency_sec': np.float64(0.3616780317746676), 'tokens_per_sec': 2.7648900738959434, 'avg_layers_used': np.float64(3.0), 'num_samples': 52}





In [None]:
### Strategy 2 - the confidence threshold must be met in consecutive layers

In [7]:
# NOTE: the module name is spelled "continous" here; it must match the file
# name under strategies/, so it is left as-is.
from strategies.continous_confidence_exit import ContinuousConfidenceExit

# Presumably exits only once the 0.75 confidence threshold has been met at
# `required_consecutive` checked layers in a row -- confirm against the
# ContinuousConfidenceExit implementation.
strategy = ContinuousConfidenceExit(
    threshold=0.75,
    required_consecutive=2,
    allowed_layers=[3, 6, 9, 11]
)

# No use_kv argument is passed, so the wrapper's default KV setting applies.
model = GPT2WithEarlyExit("gpt2", strategy, tokenizer)
evaluator = EarlyExitEvaluator(tokenizer)