# Evaluation

In [1]:
# Put the parent directory on the module search path. Presumably this is what
# lets the `model.*` imports in the next cell resolve -- confirm against the repo layout.
import sys 
sys.path.append("../")

In [2]:
import json

from transformers import AutoModelForMaskedLM, Trainer, pipeline

from model.config import BERT_BASE_UNCASED, DISTILBERT_BASE_UNCASED, MUSIC_MLM, training_args
from model.data_collator import data_collator
from model.data_split import benchmark_sentences
from model.tokenizer import tokenize_sentences, tokenizer

In [3]:
# Load the three masked-language models that the rest of the notebook compares.
# The checkpoints are loaded one after another, in the order listed.
bert, distilbert, music_mlm = (
    AutoModelForMaskedLM.from_pretrained(checkpoint)
    for checkpoint in (BERT_BASE_UNCASED, DISTILBERT_BASE_UNCASED, MUSIC_MLM)
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/599 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

## Benchmark

In [4]:
# Point the shared training arguments at a dedicated output directory for this run.
# NOTE(review): this mutates the imported `training_args` object in place.
training_args.output_dir = "benchmark_evaluation"
# Tokenize the benchmark sentences once so the same dataset can be reused below.
benchmark_tokens = tokenize_sentences(benchmark_sentences)

In [5]:
def evaluate_model(model):
    """Evaluate ``model`` on the tokenized benchmark set and return its metrics.

    A new ``Trainer`` is built on every call from the notebook-level
    ``training_args`` and ``data_collator``, with ``benchmark_tokens`` as the
    evaluation dataset.

    Args:
        model: The masked-language model to score.

    Returns:
        Whatever ``Trainer.evaluate`` returns for the benchmark dataset.
    """
    trainer_kwargs = dict(
        model=model,
        args=training_args,
        data_collator=data_collator,
        eval_dataset=benchmark_tokens,
    )
    metrics = Trainer(**trainer_kwargs).evaluate(benchmark_tokens)
    return metrics

In [6]:
# Evaluate the model loaded from BERT_BASE_UNCASED on the benchmark set.
bert_evaluation = evaluate_model(bert)

***** Running Evaluation *****
  Num examples = 21246
  Batch size = 16


In [7]:
# Evaluate the model loaded from DISTILBERT_BASE_UNCASED on the benchmark set.
distilbert_evaluation = evaluate_model(distilbert)

***** Running Evaluation *****
  Num examples = 21246
  Batch size = 16


In [8]:
# Evaluate the model loaded from MUSIC_MLM on the benchmark set.
music_mlm_evaluation = evaluate_model(music_mlm)

***** Running Evaluation *****
  Num examples = 21246
  Batch size = 16


In [9]:
# Pretty-print each model's evaluation metrics as indented JSON with sorted keys.
for label, metrics in (
    ("BERT", bert_evaluation),
    ("DistilBERT", distilbert_evaluation),
    ("MusicMLM", music_mlm_evaluation),
):
    print(f"{label}: {json.dumps(metrics, indent=4, sort_keys=True)}")

BERT: {
    "eval_loss": 3.73119854927063,
    "eval_runtime": 775.6254,
    "eval_samples_per_second": 27.392,
    "eval_steps_per_second": 1.712
}
DistilBERT: {
    "eval_loss": 3.7848682403564453,
    "eval_runtime": 492.4635,
    "eval_samples_per_second": 43.142,
    "eval_steps_per_second": 2.697
}
MusicMLM: {
    "eval_loss": 2.4179506301879883,
    "eval_runtime": 491.6283,
    "eval_samples_per_second": 43.216,
    "eval_steps_per_second": 2.701
}


## Practical Examples

In [10]:
# Cloze prompts for a qualitative comparison of the models. Each contains a
# single [MASK] token for the fill-mask pipelines to predict.
EVAL_SENTENCES = [
    'The best instrument for recording is [MASK].',
    'Increasing the gain produces a [MASK] sound.',
    'Flute is to woodwind as Trumpet is to [MASK].',
    'Intensity is to decibels as frequency is to [MASK].',
    'The man worked as a [MASK].',
    'The woman worked as a [MASK].',
    'The person worked as a [MASK].',
]

# One fill-mask pipeline per model, all sharing the same tokenizer.
# Each label must wrap its own model object: the "MusicMLM" entry was previously
# built from `distilbert`, so its results were DistilBERT's under the wrong label.
unmaskers = {
    "BERT": pipeline('fill-mask', model=bert, tokenizer=tokenizer),
    "DistilBERT": pipeline('fill-mask', model=distilbert, tokenizer=tokenizer),
    "MusicMLM": pipeline('fill-mask', model=music_mlm, tokenizer=tokenizer),
}

# Print every model's predictions for each prompt, grouped by prompt so the
# models can be compared side by side.
for sentence in EVAL_SENTENCES:
    for model_name, unmask in unmaskers.items():
        print(f"{model_name} input: {sentence}")
        print(f"Result: {json.dumps(unmask(sentence), indent=4, sort_keys=True)}")
        print("\n")

BERT input: The best instrument for recording is [MASK].
Result: [
    {
        "score": 0.11105170845985413,
        "sequence": "the best instrument for recording is piano.",
        "token": 3682,
        "token_str": "p i a n o"
    },
    {
        "score": 0.10771017521619797,
        "sequence": "the best instrument for recording is violin.",
        "token": 6710,
        "token_str": "v i o l i n"
    },
    {
        "score": 0.09162741154432297,
        "sequence": "the best instrument for recording is guitar.",
        "token": 2858,
        "token_str": "g u i t a r"
    },
    {
        "score": 0.06992664188146591,
        "sequence": "the best instrument for recording is brass.",
        "token": 8782,
        "token_str": "b r a s s"
    },
    {
        "score": 0.03644818440079689,
        "sequence": "the best instrument for recording is bass.",
        "token": 3321,
        "token_str": "b a s s"
    }
]


DistilBERT input: The best instrument for recording is [M