## F1 score on bigrams

In [1]:
import re
import string
from nltk.util import bigrams
from collections import Counter

regex_punctuation = re.compile('[%s]' % re.escape(string.punctuation))

In [2]:
gt = "Ben Shneiderman is an active and longtime research contributor with more than 400 publications  since 1972. Shneiderman's published work includes 208 journal articles  and 197 proceedings papers"
pred = "Ben Shneiderman is a highly productive scientist and computer scientist. He has published over 400 works since 1972, including 208 journal articles and 197 proceedings papers. He is well known for his pioneering work on human-computer interaction and information visualization. He is a Professor of Computer Science at the University of Maryland and founding Director of the Human-Computer Interaction Lab. He has won numerous awards and honors, including the Association for Computing Machinery's Software System Award and the ACM CHI Lifetime."

In [3]:
def string_to_tokens(s):
    '''
    This is a very basic way of tokenizing your text.
    Probably not useful for real life datasets.
    '''
    return ['<bos>'] + re.sub(regex_punctuation, '' , s.lower()).split() + ['<eos>']

In [4]:
gt_words = string_to_tokens(gt)
pred_words = string_to_tokens(pred)
print(gt_words, pred_words)

['<bos>', 'vanilla', 'is', 'the', 'best', 'ice', 'cream', 'flavor', 'in', 'the', 'world', '<eos>'] ['<bos>', 'vanilla', '<eos>']


In [5]:
gt_bigrams = list(bigrams(gt_words)) 
pred_bigrams = list(bigrams(pred_words)) 
print(gt_bigrams, pred_bigrams)

[('<bos>', 'vanilla'), ('vanilla', 'is'), ('is', 'the'), ('the', 'best'), ('best', 'ice'), ('ice', 'cream'), ('cream', 'flavor'), ('flavor', 'in'), ('in', 'the'), ('the', 'world'), ('world', '<eos>')] [('<bos>', 'vanilla'), ('vanilla', '<eos>')]


![f1 formula](https://wikimedia.org/api/rest_v1/media/math/render/svg/f5c869c51dba6f1df65a6e6630c516de161632d4)

In [6]:
def f1(pred_bigrams, gt_bigrams):
    shared_ngrams = Counter(pred_bigrams) & Counter(gt_bigrams)
    num_same = sum(shared_ngrams.values())
    
    if num_same == 0: return 0
    precision = 1.0 * num_same / len(pred_bigrams)
    recall = 1.0 * num_same / len(gt_bigrams)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

In [7]:
f1(pred_bigrams, gt_bigrams)

0.15384615384615385

In [8]:
pred_bigrams = list(bigrams(string_to_tokens('Chocolate is the best ice cream flavor in the world.')))
f1(pred_bigrams, gt_bigrams)

0.8181818181818182

In [9]:
pred_bigrams = list(bigrams(string_to_tokens('Vanilla is not the best ice cream flavor in the world.')))
f1(pred_bigrams, gt_bigrams)

0.8695652173913043

## BERTscore

In [10]:
# !pip install evaluate
# !pip install bert_score
# !pip install pyarrow==11.0.0

In [11]:
 import evaluate

bertscore = evaluate.load('bertscore')

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
bertscore.compute(
    predictions=['Vanilla is the best ice cream flavor in the world.'],
    references=['Vanilla.'],
    model_type="distilbert-base-uncased"
)

{'precision': [0.6896928548812866],
 'recall': [0.7891448736190796],
 'f1': [0.7360748052597046],
 'hashcode': 'distilbert-base-uncased_L5_no-idf_version=0.3.12(hug_trans=4.29.2)'}

In [13]:
bertscore.compute(
    predictions=['Vanilla is the best ice cream flavor in the world.'],
    references=['Chocolate is the best ice cream flavor in the world.'],
    model_type="distilbert-base-uncased"
)

{'precision': [0.9863595962524414],
 'recall': [0.9863595962524414],
 'f1': [0.9863595962524414],
 'hashcode': 'distilbert-base-uncased_L5_no-idf_version=0.3.12(hug_trans=4.29.2)'}

In [14]:
bertscore.compute(
    predictions=['Vanilla is the best ice cream flavor in the world.'],
    references=['Vanilla is not the best ice cream flavor in the world.'],
    model_type="distilbert-base-uncased"
)

{'precision': [0.9881718158721924],
 'recall': [0.9713627099990845],
 'f1': [0.979695200920105],
 'hashcode': 'distilbert-base-uncased_L5_no-idf_version=0.3.12(hug_trans=4.29.2)'}

## Perplexity

In [15]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
from torch import nn
import numpy as np

# Code from a wonderful Kaggle Notebook by Pilipp Singer
# https://www.kaggle.com/code/philippsinger/h2ogpt-perplexity-ranking

class Perplexity(nn.Module):
    def __init__(self, reduce: bool = True):
        super().__init__()
        self.loss_fn = nn.CrossEntropyLoss()
        self.reduce = reduce

    def forward(self, logits, labels):
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()

        perplexity = []
        for i in range(labels.shape[0]):
            perplexity.append(self.loss_fn(shift_logits[i], shift_labels[i]))
        perplexity = torch.stack(perplexity, dim=0)
        if self.reduce:
            perplexity = torch.mean(perplexity)
        return perplexity 
    
perp = Perplexity()

In [16]:
def perplexity(model, prompt):
    tokenizer = AutoTokenizer.from_pretrained(model)

    model = AutoModelForCausalLM.from_pretrained(
        model,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    
    with torch.no_grad():
        inputs = tokenizer([prompt], return_tensors="pt", add_special_tokens=False, truncation=True).to("cuda")
        logits = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]).logits
        labels = inputs["input_ids"]
        return perp(logits[0].unsqueeze(0), labels[0].unsqueeze(0)).item()

In [17]:
prompt = """
    Q: Which ice cream flavor is the best?
    A: Vanilla is the best ice cream flavor in the world.
"""

In [18]:
perplexity("tiiuae/falcon-7b", prompt)

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.19s/it]


2.111328125

In [19]:
perplexity("mosaicml/mpt-7b", prompt)

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Instantiating an MPTForCausalLM model from /home/radek/.cache/huggingface/modules/transformers_modules/mosaicml/mpt-7b/72e5f594ce36f9cabfa2a9fd8f58b491eb467ee7/modeling_mpt.py
You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.


Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.07s/it]


1.77734375