# Evaluation

In [1]:
import json

from transformers import AutoModelForMaskedLM, Trainer, pipeline

from model.config import BERT_BASE_UNCASED, DISTILBERT_BASE_UNCASED, MUSIC_MLM, training_args
from model.data_collator import data_collator
from model.data_split import benchmark_sentences
from model.tokenizer import tokenize_sentences, tokenizer

In [2]:
# Load the three masked-language models being compared, in a fixed order:
# BERT, DistilBERT, then the MusicMLM checkpoint.
bert, distilbert, music_mlm = (
    AutoModelForMaskedLM.from_pretrained(checkpoint)
    for checkpoint in (BERT_BASE_UNCASED, DISTILBERT_BASE_UNCASED, MUSIC_MLM)
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Benchmark

In [3]:
# Point the training arguments at a dedicated output directory for this
# evaluation run. NOTE(review): this mutates the object imported from
# model.config in place, so any other code in this kernel that uses
# training_args sees the new output_dir as well.
training_args.output_dir = "benchmark_evaluation"
# Tokenize the benchmark sentences once up front so every model is evaluated
# against the same tokenized data.
benchmark_tokens = tokenize_sentences(benchmark_sentences)

In [4]:
def evaluate_model(model):
    """Evaluate ``model`` on the benchmark tokens and return the result.

    A fresh ``Trainer`` is built around ``model`` using the notebook-level
    ``training_args`` and ``data_collator``; it is then asked to evaluate
    the notebook-level ``benchmark_tokens``.

    Args:
        model: The model to evaluate; handed to ``Trainer`` unchanged.

    Returns:
        Whatever ``Trainer.evaluate`` returns for ``benchmark_tokens``.
    """
    trainer_kwargs = {
        "model": model,
        "args": training_args,
        "eval_dataset": benchmark_tokens,
        "data_collator": data_collator,
    }
    benchmark_trainer = Trainer(**trainer_kwargs)
    return benchmark_trainer.evaluate(eval_dataset=benchmark_tokens)

In [None]:
# Run the benchmark evaluation for the BERT model and keep the result for the
# summary printed later in the notebook.
bert_evaluation = evaluate_model(bert)

***** Running Evaluation *****
  Num examples = 21246
  Batch size = 16


In [None]:
# Run the benchmark evaluation for the DistilBERT model and keep the result
# for the summary printed later in the notebook.
distilbert_evaluation = evaluate_model(distilbert)

In [None]:
# Run the benchmark evaluation for the MusicMLM model and keep the result for
# the summary printed later in the notebook.
music_mlm_evaluation = evaluate_model(music_mlm)

In [None]:
# Print each model's evaluation result as pretty-printed JSON with sorted
# keys, one model after another in a fixed order.
for label, metrics in (
    ("BERT", bert_evaluation),
    ("DistilBERT", distilbert_evaluation),
    ("MusicMLM", music_mlm_evaluation),
):
    print(f"{label}: {json.dumps(metrics, indent=4, sort_keys=True)}")

## Practical Examples

In [None]:
# Prompts with a single [MASK] slot each, fed to every model's fill-mask
# pipeline below.
EVAL_SENTENCES = [
    'The best instrument for recording is [MASK].',
    'Increasing the gain produces a [MASK] sound.',
    'Flute is to woodwind as Trumpet is to [MASK].',
    'Intensity is to decibels as frequency is to [MASK].',
    'The man worked as a [MASK].',
    'The woman worked as a [MASK].',
    'The person worked as a [MASK].',
]

# One fill-mask pipeline per model, keyed by the label used in the printed
# output. All three share the tokenizer imported from model.tokenizer.
unmaskers = {
    "BERT": pipeline('fill-mask', model=bert, tokenizer=tokenizer),
    "DistilBERT": pipeline('fill-mask', model=distilbert, tokenizer=tokenizer),
    # Fixed: this entry previously wrapped `distilbert`, so the "MusicMLM"
    # results were just a second copy of the DistilBERT predictions.
    "MusicMLM": pipeline('fill-mask', model=music_mlm, tokenizer=tokenizer),
}

# For every prompt, show each model's predictions back to back so the models
# can be compared on the same sentence.
for sentence in EVAL_SENTENCES:
    for model_name, unmask in unmaskers.items():
        print(f"{model_name} input: {sentence}")
        print(f"Result: {json.dumps(unmask(sentence), indent=4, sort_keys=True)}")
        print("\n")