In [1]:
import torch
import transformers

from life_after_bert import LaBEvaluator

In [2]:
# Use the GPU when PyTorch reports that CUDA is available; otherwise stay on CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# First model under evaluation: the "roberta-large" checkpoint loaded with a
# masked-LM head, together with the tokenizer from the same checkpoint.
model = transformers.AutoModelForMaskedLM.from_pretrained(
    "roberta-large"
)
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "roberta-large"
)

In [3]:
# Tasks to run, as (task name, integer) pairs in a fixed order.
# NOTE(review): the integer is presumably the number of answer choices for the
# task -- confirm against LaBEvaluator.
task_infos = list(
    {
        "Age Comparison": 2,
        "Always Never": 5,
        "Antonym Negation": 2,
        "Multihop Composition": 3,
        "Size Comparison": 2,
        "Taxonomy Conjunction": 3,
    }.items()
)

# The evaluator is created once here and reused by the later cells.
evaluator = LaBEvaluator()

# Score the masked-LM loaded above on every task; the value displayed below
# maps each task name to its accuracy.
task_accs = evaluator.evaluate(
    model,
    tokenizer,
    task_infos,
    model_arch="encoder",
    device=device,
)
task_accs

2022-04-21 11:57:24 | INFO | data.py | Loading jsonl file from /home/kzhao/life-after-bert/tests/data/oLMpics_age_comparison_dev.jsonl


Evaluating:   0%|          | 0/32 [00:00<?, ?it/s]

2022-04-21 11:57:28 | INFO | eval.py | Accuracy on Age Comparison: 0.986
2022-04-21 11:57:28 | INFO | data.py | Loading jsonl file from /home/kzhao/life-after-bert/tests/data/oLMpics_always_never_dev.jsonl


Evaluating:   0%|          | 0/18 [00:00<?, ?it/s]

2022-04-21 11:57:29 | INFO | eval.py | Accuracy on Always Never: 0.1357142857142857
2022-04-21 11:57:29 | INFO | data.py | Loading jsonl file from /home/kzhao/life-after-bert/tests/data/oLMpics_antonym_negation_dev.jsonl


Evaluating:   0%|          | 0/32 [00:00<?, ?it/s]

2022-04-21 11:57:31 | INFO | eval.py | Accuracy on Antonym Negation: 0.744
2022-04-21 11:57:31 | INFO | data.py | Loading jsonl file from /home/kzhao/life-after-bert/tests/data/oLMpics_multihop_composition_dev.jsonl


Evaluating:   0%|          | 0/32 [00:00<?, ?it/s]

2022-04-21 11:57:32 | INFO | eval.py | Accuracy on Multihop Composition: 0.28
2022-04-21 11:57:32 | INFO | data.py | Loading jsonl file from /home/kzhao/life-after-bert/tests/data/oLMpics_size_comparison_dev.jsonl


Evaluating:   0%|          | 0/32 [00:00<?, ?it/s]

2022-04-21 11:57:34 | INFO | eval.py | Accuracy on Size Comparison: 0.874
2022-04-21 11:57:34 | INFO | data.py | Loading jsonl file from /home/kzhao/life-after-bert/tests/data/oLMpics_taxonomy_conjunction_dev.jsonl


Evaluating:   0%|          | 0/38 [00:00<?, ?it/s]

2022-04-21 11:57:36 | INFO | eval.py | Accuracy on Taxonomy Conjunction: 0.4540901502504174


{'Age Comparison': 0.986,
 'Always Never': 0.1357142857142857,
 'Antonym Negation': 0.744,
 'Multihop Composition': 0.28,
 'Size Comparison': 0.874,
 'Taxonomy Conjunction': 0.4540901502504174}

In [4]:
# Second model: the "gpt2-large" checkpoint loaded as a causal LM and evaluated
# with model_arch="decoder". "[MASK]" is registered as the tokenizer's mask token.
# `model`, `tokenizer` and `task_accs` are rebound in this cell, so the values
# from the previous run are no longer reachable under these names afterwards.
# Relies on `evaluator`, `task_infos` and `device` from earlier cells.
model = transformers.AutoModelForCausalLM.from_pretrained(
    "gpt2-large"
)
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "gpt2-large",
    mask_token="[MASK]",
)
task_accs = evaluator.evaluate(
    model,
    tokenizer,
    task_infos,
    model_arch="decoder",
    device=device,
)
task_accs

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


2022-04-21 11:57:52 | INFO | data.py | Loading jsonl file from /home/kzhao/life-after-bert/tests/data/oLMpics_age_comparison_dev.jsonl


Using pad_token, but it is not set yet.




Evaluating:   0%|          | 0/32 [00:00<?, ?it/s]

2022-04-21 11:58:00 | INFO | eval.py | Accuracy on Age Comparison: 0.696
2022-04-21 11:58:00 | INFO | data.py | Loading jsonl file from /home/kzhao/life-after-bert/tests/data/oLMpics_always_never_dev.jsonl


Evaluating:   0%|          | 0/18 [00:00<?, ?it/s]

2022-04-21 11:58:09 | INFO | eval.py | Accuracy on Always Never: 0.2571428571428571
2022-04-21 11:58:09 | INFO | data.py | Loading jsonl file from /home/kzhao/life-after-bert/tests/data/oLMpics_antonym_negation_dev.jsonl


Evaluating:   0%|          | 0/32 [00:00<?, ?it/s]

2022-04-21 11:58:17 | INFO | eval.py | Accuracy on Antonym Negation: 0.586
2022-04-21 11:58:17 | INFO | data.py | Loading jsonl file from /home/kzhao/life-after-bert/tests/data/oLMpics_multihop_composition_dev.jsonl


Evaluating:   0%|          | 0/32 [00:00<?, ?it/s]

2022-04-21 11:58:27 | INFO | eval.py | Accuracy on Multihop Composition: 0.338
2022-04-21 11:58:27 | INFO | data.py | Loading jsonl file from /home/kzhao/life-after-bert/tests/data/oLMpics_size_comparison_dev.jsonl


Evaluating:   0%|          | 0/32 [00:00<?, ?it/s]

2022-04-21 11:58:34 | INFO | eval.py | Accuracy on Size Comparison: 0.508
2022-04-21 11:58:34 | INFO | data.py | Loading jsonl file from /home/kzhao/life-after-bert/tests/data/oLMpics_taxonomy_conjunction_dev.jsonl


Evaluating:   0%|          | 0/38 [00:00<?, ?it/s]

2022-04-21 11:58:47 | INFO | eval.py | Accuracy on Taxonomy Conjunction: 0.39065108514190316


{'Age Comparison': 0.696,
 'Always Never': 0.2571428571428571,
 'Antonym Negation': 0.586,
 'Multihop Composition': 0.338,
 'Size Comparison': 0.508,
 'Taxonomy Conjunction': 0.39065108514190316}

In [5]:
# Third model: the "t5-large" checkpoint loaded as T5ForConditionalGeneration and
# evaluated with model_arch="encoder-decoder". "<extra_id_0>" is passed as the
# tokenizer's mask token.
# `model`, `tokenizer` and `task_accs` are rebound again in this cell.
# Relies on `evaluator`, `task_infos` and `device` from earlier cells.
model = transformers.T5ForConditionalGeneration.from_pretrained(
    "t5-large"
)
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "t5-large",
    mask_token="<extra_id_0>",
)
task_accs = evaluator.evaluate(
    model,
    tokenizer,
    task_infos,
    model_arch="encoder-decoder",
    device=device,
)
task_accs

2022-04-21 11:59:01 | INFO | data.py | Loading jsonl file from /home/kzhao/life-after-bert/tests/data/oLMpics_age_comparison_dev.jsonl


Evaluating:   0%|          | 0/32 [00:00<?, ?it/s]

2022-04-21 11:59:04 | INFO | eval.py | Accuracy on Age Comparison: 0.94
2022-04-21 11:59:04 | INFO | data.py | Loading jsonl file from /home/kzhao/life-after-bert/tests/data/oLMpics_always_never_dev.jsonl


Evaluating:   0%|          | 0/18 [00:00<?, ?it/s]

2022-04-21 11:59:05 | INFO | eval.py | Accuracy on Always Never: 0.2571428571428571
2022-04-21 11:59:05 | INFO | data.py | Loading jsonl file from /home/kzhao/life-after-bert/tests/data/oLMpics_antonym_negation_dev.jsonl


Evaluating:   0%|          | 0/32 [00:00<?, ?it/s]

2022-04-21 11:59:07 | INFO | eval.py | Accuracy on Antonym Negation: 0.646
2022-04-21 11:59:07 | INFO | data.py | Loading jsonl file from /home/kzhao/life-after-bert/tests/data/oLMpics_multihop_composition_dev.jsonl


Evaluating:   0%|          | 0/32 [00:00<?, ?it/s]

2022-04-21 11:59:09 | INFO | eval.py | Accuracy on Multihop Composition: 0.338
2022-04-21 11:59:09 | INFO | data.py | Loading jsonl file from /home/kzhao/life-after-bert/tests/data/oLMpics_size_comparison_dev.jsonl


Evaluating:   0%|          | 0/32 [00:00<?, ?it/s]

2022-04-21 11:59:12 | INFO | eval.py | Accuracy on Size Comparison: 0.832
2022-04-21 11:59:12 | INFO | data.py | Loading jsonl file from /home/kzhao/life-after-bert/tests/data/oLMpics_taxonomy_conjunction_dev.jsonl


Evaluating:   0%|          | 0/38 [00:00<?, ?it/s]

2022-04-21 11:59:14 | INFO | eval.py | Accuracy on Taxonomy Conjunction: 0.42237061769616024


{'Age Comparison': 0.94,
 'Always Never': 0.2571428571428571,
 'Antonym Negation': 0.646,
 'Multihop Composition': 0.338,
 'Size Comparison': 0.832,
 'Taxonomy Conjunction': 0.42237061769616024}