In [28]:
import os
import tempfile
import warnings

import pandas as pd
from minicons import scorer
from tqdm import tqdm_notebook
from tqdm.auto import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer


def load_hf_model(model_name: str, no_cache=False, device="cuda", **kwargs):
    """Load a Hugging Face causal LM and wrap it in a minicons incremental scorer.

    Args:
        model_name: Identifier handed to ``from_pretrained`` for both the model
            and the tokenizer.
        no_cache: When true, ``cache_dir`` points at a temporary directory that
            is removed as soon as the model and tokenizer have been loaded.
        device: Device argument passed to ``scorer.IncrementalLMScorer``.
            Defaults to ``"cuda"``, the value that was previously hard-coded.
        **kwargs: Forwarded unchanged to BOTH
            ``AutoModelForCausalLM.from_pretrained`` and
            ``AutoTokenizer.from_pretrained``.

    Returns:
        A ``scorer.IncrementalLMScorer`` wrapping the model and tokenizer, or
        ``None`` (after emitting a ``RuntimeWarning``) when loading either of
        them raised ``OSError``.
    """

    def _load(**cache_kwargs):
        # Model first, tokenizer second; both receive the same keyword arguments.
        loaded_model = AutoModelForCausalLM.from_pretrained(
            model_name, **cache_kwargs, **kwargs
        )
        loaded_tokenizer = AutoTokenizer.from_pretrained(
            model_name, **cache_kwargs, **kwargs
        )
        return loaded_model, loaded_tokenizer

    try:
        if no_cache:
            with tempfile.TemporaryDirectory() as tmpdirname:
                model, tokenizer = _load(cache_dir=tmpdirname)
        else:
            model, tokenizer = _load()
    except OSError as err:
        # Keep the "return None on failure" contract, but say why it failed and
        # never continue with a model that has no tokenizer.
        warnings.warn(
            f"Could not load {model_name!r}: {err}",
            RuntimeWarning,
            stacklevel=2,
        )
        return None

    return scorer.IncrementalLMScorer(
        model,
        device,
        tokenizer=tokenizer,
    )


def score_tse(model, tse_df, max_length=None):
    """Score every (sen, wrong_sen) pair of ``tse_df`` and store the results.

    The dataframe is modified IN PLACE (and also returned): callers may keep
    using the object they passed in. Five columns are written:

    * ``sen_prob`` / ``wrong_prob``: the unreduced scores returned by
      ``score_pair`` for each sentence, converted with ``.tolist()``.
    * ``sen_nll`` / ``wrong_nll``: the negated sum of those scores.
    * ``delta``: ``wrong_nll - sen_nll``.

    Args:
        model: Scorer object handed straight to ``score_pair``.
        tse_df: DataFrame with ``sen`` and ``wrong_sen`` columns.
        max_length: Optional length limit forwarded to ``score_pair``;
            ``None`` (the default) disables the limit.

    Returns:
        The same ``tse_df`` object, with the columns above added or overwritten.
        Pairs that ``score_pair`` skipped get ``None`` in the ``*_prob`` columns
        and NaN in ``sen_nll``, ``wrong_nll`` and ``delta``.
    """
    sen_probs, wrong_probs = [], []
    sen_nlls, wrong_nlls = [], []
    missing = float("nan")

    pairs = zip(tse_df["sen"], tse_df["wrong_sen"])
    # An explicit total lets the progress bar show a percentage and an ETA.
    for sen, wrong_sen in tqdm(pairs, total=len(tse_df)):
        sen_prob, wrong_prob = score_pair(model, sen, wrong_sen, max_length)

        # score_pair answers with plain floats when it skips an over-long pair;
        # those have no .sum()/.tolist(), so record the pair as missing.
        if not (hasattr(sen_prob, "tolist") and hasattr(wrong_prob, "tolist")):
            sen_probs.append(None)
            wrong_probs.append(None)
            sen_nlls.append(missing)
            wrong_nlls.append(missing)
            continue

        sen_probs.append(sen_prob.tolist())
        wrong_probs.append(wrong_prob.tolist())
        sen_nlls.append(-sen_prob.sum().item())
        wrong_nlls.append(-wrong_prob.sum().item())

    # Assign whole columns at once: faster than per-row .at/.loc writes and
    # well-defined even when the index contains duplicate labels.
    index = tse_df.index
    tse_df["sen_prob"] = pd.Series(sen_probs, index=index, dtype=object)
    tse_df["wrong_prob"] = pd.Series(wrong_probs, index=index, dtype=object)
    tse_df["sen_nll"] = pd.Series(sen_nlls, index=index, dtype=float)
    tse_df["wrong_nll"] = pd.Series(wrong_nlls, index=index, dtype=float)
    tse_df["delta"] = pd.Series(
        [wrong - right for right, wrong in zip(sen_nlls, wrong_nlls)],
        index=index,
        dtype=float,
    )

    return tse_df


def score_pair(ilm_model, sen, wrong_sen, max_length):
    """Score a sentence and its counterpart together with ``ilm_model``.

    Both sentences are always tokenized with ``ilm_model.tokenizer`` to get
    their token counts. When ``max_length`` is not ``None`` and either count
    is greater than or equal to it, nothing is scored and the float pair
    ``(0.0, 0.0)`` is returned. Otherwise the result of
    ``ilm_model.sequence_score([sen, wrong_sen], ...)`` is returned, called
    with an identity ``reduction`` so the scores come back unreduced.
    """
    tokenizer = ilm_model.tokenizer
    token_counts = [len(tokenizer.tokenize(text)) for text in (sen, wrong_sen)]

    if max_length is not None:
        if any(count >= max_length for count in token_counts):
            return 0.0, 0.0

    def keep_unreduced(scores):
        # Identity reduction: hand the scores back exactly as received.
        return scores

    return ilm_model.sequence_score([sen, wrong_sen], reduction=keep_unreduced)

In [24]:
from datasets import get_dataset_config_names, load_dataset

# Hub identifier of the benchmark to load.
dataset_name = "jumelet/multiblimp"

# Print every config the dataset offers so the valid choices are visible.
configs = get_dataset_config_names(dataset_name)
print(configs)

# Restrict to the languages you actually want to load data for.
configs = ['nld']

# One pandas DataFrame per selected config, built from its 'train' split.
datasets = dict(
    (cfg, load_dataset(dataset_name, cfg)['train'].to_pandas())
    for cfg in configs
)

['abk', 'aln', 'amh', 'apu', 'aqz', 'arb', 'azz', 'bel', 'ben', 'bho', 'bor', 'bre', 'bua', 'bul', 'cat', 'ces', 'chu', 'cym', 'dan', 'deu', 'egy', 'ell', 'eng', 'est', 'eus', 'fao', 'fas', 'fin', 'fra', 'frm', 'fro', 'gla', 'gle', 'glg', 'got', 'grc', 'guj', 'hbo', 'hbs', 'heb', 'hin', 'hit', 'hsb', 'hun', 'hye', 'hyw', 'isl', 'ita', 'kat', 'kaz', 'kir', 'kmr', 'koi', 'kpv', 'krl', 'kxh', 'lat', 'lav', 'lij', 'lit', 'mar', 'mdf', 'mkd', 'myv', 'nds', 'nhi', 'nld', 'olo', 'orv', 'ota', 'pcm', 'pol', 'por', 'quc', 'ron', 'rus', 'sah', 'san', 'slk', 'slv', 'sme', 'sms', 'spa', 'sqi', 'swe', 'tam', 'tpn', 'ttc', 'tur', 'uig', 'ukr', 'urb', 'urd', 'uzb', 'vep', 'wbp', 'wol', 'xcl', 'xnr', 'xpg', 'yrl']


In [36]:
model = load_hf_model("Qwen/Qwen3-0.6B", no_cache=True)

# load_hf_model returns None when the model could not be loaded. Stop here so
# the failure is not reported later as an AttributeError inside scoring.
if model is None:
    raise RuntimeError("Failed to load model 'Qwen/Qwen3-0.6B'.")

In [30]:
# Score the items for a specific language.
# score_tse modifies the dataframe it is given (and returns it): it adds the unreduced scores
# (sen_prob, wrong_prob), the negated sum of those scores, i.e. the negative log-likelihood of
# the right and wrong sentence (sen_nll, wrong_nll), and 'delta' = wrong_nll - sen_nll.

score_tse(model, datasets['nld'])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for idx, row in tqdm_notebook(tse_df.iterrows()):


0it [00:00, ?it/s]

Unnamed: 0,sen,verb,verb_idx,cop,cop_idx,child,child_idx,child_features,child_upos,head,...,grammatical_feature,ungrammatical_feature,wo,inflect_item,agreement_certainty,sen_prob,wrong_prob,sen_nll,wrong_nll,delta
0,"Als de heren hier even willen wachten, dan zal...",waarschuwen,13,zal,9.0,ik,10,"{'Case': 'Nom', 'Person': '1', 'PronType': 'Pr...",PRON,zal,...,1,2,VS,aux,+,"[-5.682960510253906, -9.580001831054688, -8.48...","[-5.682960510253906, -9.580001831054688, -8.48...",161.726547,163.282043,1.555496
1,Volgens mevrouw-hier zou ik andere grond krijg...,gehoord,17,heb,10.0,ik,11,"{'Case': 'Nom', 'Person': '1', 'PronType': 'Pr...",PRON,heb,...,1,2,VS,aux,+,"[-5.785015106201172, -6.127483367919922, -11.0...","[-5.785015106201172, -6.127483367919922, -11.0...",245.389984,246.871719,1.481735
2,"omdat je het vraagt, zal ik het je zeggen.",zeggen,9,zal,5.0,ik,6,"{'Case': 'Nom', 'Person': '1', 'PronType': 'Pr...",PRON,zal,...,1,2,VS,aux,+,"[-11.708929061889648, -11.690223693847656, -7....","[-11.708929061889648, -11.690223693847656, -7....",97.852386,100.059875,2.207489
3,Ook zal ik de komende zomer weer gewoon de fie...,beklimmen,10,zal,1.0,ik,2,"{'Case': 'Nom', 'Person': '1', 'PronType': 'Pr...",PRON,zal,...,1,2,VS,aux,+,"[-8.171564102172852, -10.296234130859375, -5.1...","[-8.171564102172852, -10.296234130859375, -9.5...",134.002014,136.095581,2.093567
4,""" Voor de wedstrijd heb ik mijn spelers attent...",gemaakt,9,heb,4.0,ik,5,"{'Case': 'Nom', 'Person': '1', 'PronType': 'Pr...",PRON,heb,...,1,2,VS,aux,+,"[-8.597238540649414, -6.177814483642578, -8.72...","[-8.597238540649414, -6.177814483642578, -8.72...",189.671631,191.044739,1.373108
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2326,"Vervolgens heeft hij, volgens zijn broer, tot ...",gezeten,16,heeft,1.0,hij,2,"{'Case': 'Nom', 'Person': '3', 'PronType': 'Pr...",PRON,heeft,...,3,1,VS,aux,+,"[-9.937719345092773, -6.257926940917969, -7.31...","[-9.937719345092773, -6.257926940917969, -7.31...",293.141968,291.980011,-1.161957
2327,Vanaf die plek heeft hij de slachtoffers de ze...,gegeven,9,heeft,3.0,hij,4,"{'Case': 'Nom', 'Person': '3', 'PronType': 'Pr...",PRON,heeft,...,3,1,VS,aux,+,"[-8.500349044799805, -16.31214141845703, -10.2...","[-8.500349044799805, -16.31214141845703, -10.2...",145.606812,144.076263,-1.530548
2328,Voor het onderzoek naar het vraagstuk van de s...,genomen,18,heeft,10.0,men,11,"{'Case': 'Nom', 'Person': '3', 'PronType': 'Pr...",PRON,heeft,...,3,1,VS,aux,+,"[-9.14107894897461, -5.0951385498046875, -2.98...","[-9.14107894897461, -5.0951385498046875, -2.98...",391.001282,390.924805,-0.076477
2329,In augustus 2005 heeft ze aangekondigd dat ze ...,aangekondigd,5,heeft,3.0,ze,4,"{'Person': '3', 'PronType': 'Prs', 'lemma': 'ze'}",PRON,heeft,...,3,1,VS,aux,+,"[-12.071830749511719, -0.24288177490234375, -8...","[-12.071830749511719, -0.24288177490234375, -8...",289.919983,285.580109,-4.339874


In [34]:
import numpy as np


# Accuracy = fraction of pairs whose delta (wrong_nll - sen_nll) is strictly positive,
# i.e. pairs where the wrong sentence received the higher negative log-likelihood.
# NOTE(review): a value near 0.5 would be chance level for a two-way choice - worth
# double-checking the scoring setup if that is what this cell reports.
accuracy = np.mean(datasets['nld']['delta'].gt(0))
accuracy

np.float64(0.5023595023595023)