In [1]:
# Hugging Face tokenizer loader. The `tokenizer` created at the bottom of this
# cell is handed to both the model wrapper and the evaluator in later cells.
from transformers import AutoTokenizer

# Project modules: evaluation harness, the confidence-based exit strategy,
# the GPT-2 wrapper, and the SST-2 loader.
from evaluation.evaluator import EarlyExitEvaluator
from strategies.confidence_exit import ConfidenceExit
from models.gpt2_wrapper import GPT2WithEarlyExit
from evaluation.dataset_loaders.sst2 import load_sst2

# Load the pretrained "gpt2" tokenizer once so every cell shares one instance.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [2]:
# Strategy 1: confidence-based exit with a 0.8 threshold.
# Presumably `allowed_layers` lists the layers at which an exit may be
# taken -- confirm against ConfidenceExit.
strategy = ConfidenceExit(
    allowed_layers=[3, 6, 9],
    threshold=0.8,
)

# Build the wrapper from the "gpt2" checkpoint name, the strategy above, and
# the shared tokenizer.
model = GPT2WithEarlyExit(
    "gpt2",
    strategy,
    tokenizer,
)

The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [3]:
# Single-dataset check: evaluate Strategy 1 on 10% of SST-2.
dataset = load_sst2(fraction=0.10)

evaluator = EarlyExitEvaluator(tokenizer)

result = evaluator.evaluate(
    task_type="classification",
    dataset=dataset,
    strategy=strategy,
    model=model,
)

# NOTE(review): the recorded output for this cell reports accuracy 0.0 over
# 87 samples. That is below chance for a two-class task, so the prediction /
# label comparison in the evaluator should be checked before trusting it.
print(result)

Evaluating: 100%|███████████████████████████████| 87/87 [00:02<00:00, 40.97it/s]

{'metric': 'accuracy', 'score': np.float64(0.0), 'avg_latency_sec': np.float64(0.024290558935581952), 'tokens_per_sec': 41.16825811427307, 'avg_layers_used': np.float64(5.114942528735632), 'num_samples': 87}





In [4]:
from evaluation.dataset_loaders.agnews import load_agnews
from evaluation.dataset_loaders.cnn_dm import load_cnndm
from evaluation.dataset_loaders.squad import load_squad
from evaluation.dataset_loaders.wmt_en_fr import load_wmt_enfr

# Benchmark suite for Strategy 1. Each entry is
# (display name, loader callable, task type passed to the evaluator).
datasets = [
    ("sst2", load_sst2, "classification"),
    ("agnews", load_agnews, "classification"),
    ("cnn_dm", load_cnndm, "summarization"),
    ("wmt14_enfr", load_wmt_enfr, "translation"),
    ("squad", load_squad, "qa"),
]

# Fraction of each dataset to evaluate on. Named so it can be tuned in one
# place and compared against the fraction used for other strategies.
CONFIDENCE_EVAL_FRACTION = 0.10

# Metrics per dataset, keyed by display name. `result` is overwritten on every
# iteration, so without this only the last dataset's metrics would remain
# available after the cell finishes.
confidence_results = {}

# NOTE(review): in the recorded output both classification datasets score
# accuracy 0.0, and cnn_dm triggers a tokenizer warning about a 1156-token
# sequence exceeding the 1024-token model maximum. Both should be investigated
# in the evaluator / loaders before these numbers are reported.
for name, loader, task in datasets:
    print(f"Testing {name}...")

    dataset = loader(fraction=CONFIDENCE_EVAL_FRACTION)

    result = evaluator.evaluate(
        model=model,
        strategy=strategy,
        dataset=dataset,
        task_type=task,
    )
    confidence_results[name] = result

    print(name, result)

Testing sst2...


Evaluating: 100%|███████████████████████████████| 87/87 [00:01<00:00, 53.04it/s]


sst2 {'metric': 'accuracy', 'score': np.float64(0.0), 'avg_latency_sec': np.float64(0.0187428984148749), 'tokens_per_sec': 53.35354105138677, 'avg_layers_used': np.float64(5.114942528735632), 'num_samples': 87}
Testing agnews...


Evaluating: 100%|█████████████████████████████| 760/760 [00:18<00:00, 41.38it/s]


agnews {'metric': 'accuracy', 'score': np.float64(0.0), 'avg_latency_sec': np.float64(0.024057050127732127), 'tokens_per_sec': 41.56785618729018, 'avg_layers_used': np.float64(5.859210526315789), 'num_samples': 760}
Testing cnn_dm...


Token indices sequence length is longer than the specified maximum sequence length for this model (1156 > 1024). Running this sequence through the model will result in indexing errors
Evaluating: 100%|███████████████████████████| 1336/1336 [03:16<00:00,  6.80it/s]


cnn_dm {'metric': 'rougeL', 'score': np.float64(0.025857347737891605), 'avg_latency_sec': np.float64(0.1468209526139105), 'tokens_per_sec': 6.811016971328759, 'avg_layers_used': np.float64(6.017964071856287), 'num_samples': 1336}
Testing wmt14_enfr...


Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

Evaluating: 100%|█████████████████████████████| 300/300 [00:07<00:00, 41.14it/s]


wmt14_enfr {'metric': 'bleu', 'score': np.float64(1.4693051022486856e-06), 'avg_latency_sec': np.float64(0.024202081362406414), 'tokens_per_sec': 41.318760358905344, 'avg_layers_used': np.float64(6.303333333333334), 'num_samples': 300}
Testing squad...


Evaluating: 100%|███████████████████████████| 1057/1057 [00:50<00:00, 21.09it/s]

squad {'metric': 'token_f1', 'score': np.float64(0.016823466137562637), 'avg_latency_sec': np.float64(0.04724731553582053), 'tokens_per_sec': 21.165223646237646, 'avg_layers_used': np.float64(6.811731315042573), 'num_samples': 1057}





In [None]:
### Strategy 2 - Confidence threshold must be met in consecutive (continuous) layers

In [12]:
# NOTE(review): the module name is spelled "continous"; it presumably matches
# the file name on disk, so it is left as-is -- confirm before renaming.
from strategies.continous_confidence_exit import ContinuousConfidenceExit

# Strategy 2: threshold 0.75, required_consecutive=2, over layers 3/6/9/11.
# NOTE(review): every run recorded further down reports
# avg_layers_used == 12.0, so an early exit appears never to be taken with
# these settings. Check how `required_consecutive` interacts with the
# non-adjacent `allowed_layers` -- TODO confirm.
strategy = ContinuousConfidenceExit(
    threshold=0.75,
    required_consecutive=2,
    allowed_layers=[3, 6, 9, 11]
)

# Rebinds `model` and `evaluator` (and `strategy` above), replacing the
# Strategy 1 objects created in earlier cells.
model = GPT2WithEarlyExit("gpt2", strategy, tokenizer)
evaluator = EarlyExitEvaluator(tokenizer)

In [14]:
# Benchmark suite for Strategy 2. Each entry is
# (display name, loader callable, task type passed to the evaluator).
datasets = [
    ("sst2", load_sst2, "classification"),
    ("agnews", load_agnews, "classification"),
    ("cnn_dm", load_cnndm, "summarization"),
    ("wmt14_enfr", load_wmt_enfr, "translation"),
    ("squad", load_squad, "qa"),
]

# Use 2% of each dataset.
# NOTE(review): the Strategy 1 sweep earlier in this notebook used 10%, so the
# two sets of numbers come from different sample sizes and are not directly
# comparable. Use the same fraction for both before drawing conclusions.
CONTINUOUS_EVAL_FRACTION = 0.02

# Metrics per dataset, keyed by display name. `result` is overwritten on every
# iteration, so without this only the last dataset's metrics would remain
# available after the cell finishes.
continuous_results = {}

for name, loader, task in datasets:
    print(f"\n========== Testing {name.upper()} ==========\n")

    dataset = loader(fraction=CONTINUOUS_EVAL_FRACTION)

    result = evaluator.evaluate(
        model=model,
        strategy=strategy,
        dataset=dataset,
        task_type=task,
    )
    continuous_results[name] = result

    print(name, result)





Map:   0%|          | 0/17 [00:00<?, ? examples/s]

Evaluating: 100%|███████████████████████████████| 17/17 [00:00<00:00, 21.62it/s]


sst2 {'metric': 'accuracy', 'score': np.float64(0.0), 'avg_latency_sec': np.float64(0.04608050514669979), 'tokens_per_sec': 21.70115099251724, 'avg_layers_used': np.float64(12.0), 'num_samples': 17}




Map:   0%|          | 0/152 [00:00<?, ? examples/s]

Evaluating: 100%|█████████████████████████████| 152/152 [00:08<00:00, 18.96it/s]


agnews {'metric': 'accuracy', 'score': np.float64(0.0), 'avg_latency_sec': np.float64(0.052560009454426015), 'tokens_per_sec': 19.025871767909113, 'avg_layers_used': np.float64(12.0), 'num_samples': 152}




Map:   0%|          | 0/267 [00:00<?, ? examples/s]

Evaluating: 100%|█████████████████████████████| 267/267 [01:19<00:00,  3.34it/s]


cnn_dm {'metric': 'rougeL', 'score': np.float64(0.029517060847254528), 'avg_latency_sec': np.float64(0.2987214992108863), 'tokens_per_sec': 3.3475996961773316, 'avg_layers_used': np.float64(12.0), 'num_samples': 267}




Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Evaluating: 100%|███████████████████████████████| 60/60 [00:03<00:00, 17.30it/s]


wmt14_enfr {'metric': 'bleu', 'score': np.float64(0.0), 'avg_latency_sec': np.float64(0.05763123830159505), 'tokens_per_sec': 17.351700735056443, 'avg_layers_used': np.float64(12.0), 'num_samples': 60}




Map:   0%|          | 0/211 [00:00<?, ? examples/s]

Evaluating: 100%|█████████████████████████████| 211/211 [00:16<00:00, 12.60it/s]

squad {'metric': 'token_f1', 'score': np.float64(0.045221055055178275), 'avg_latency_sec': np.float64(0.07913087555582489), 'tokens_per_sec': 12.637292244978694, 'avg_layers_used': np.float64(12.0), 'num_samples': 211}



