# Pip install

In [None]:
!pip install -U datasets

In [None]:
!pip install -U accelerate

In [None]:
!pip install -i https://test.pypi.org/simple/ bitsandbytes

In [None]:
!pip install -U transformers

# Imports

In [4]:
import pandas as pd
import datetime
import json
import numpy as np
import seaborn as sns
import tqdm
import os
import gc
import glob
import torch
from argparse import Namespace
from transformers import set_seed
from datasets import load_dataset, load_from_disk
from torch.nn.functional import softmax
from matplotlib import pyplot as plt
import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForMaskedLM,
    AutoModelForSeq2SeqLM,
)

# Pick the compute device; the benchmark is only meant to run on a GPU runtime,
# so the cell fails loudly when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
    raise Exception("Change runtime type to include a GPU.")

# Functions

## --Probe helpers

In [2]:
def get_probe_function(prefix):
    """Return the first probe function whose name contains ``prefix``.

    ``prefix`` is lower-cased and matched as a substring against the names of
    the candidate probe functions, in the order listed below. ``None`` is
    returned when no candidate matches.
    """
    candidates = (
        probe_gpt,
        probe_bert,
        probe_llama,
        probe_t5,
        probe_stablelm,
        probe_mpt,
        probe_redpajama,
        probe_falcon,
    )
    needle = prefix.lower()
    return next((probe for probe in candidates if needle in probe.__name__), None)

In [3]:
def probe_t5(model, tokenizer, target_id, context):
    """Return the probability assigned to ``target_id`` at the first content step.

    The context is tokenized (truncated to 512 tokens) and up to four new
    tokens are generated. The scores of the earliest generation step whose
    most likely token is not a sentinel, padding or whitespace token are
    converted to probabilities with softmax.

    Args:
        model: model exposing ``generate``.
        tokenizer: tokenizer used to encode ``context`` and decode token ids.
        target_id: scalar tensor holding the vocabulary id to look up.
        context: prompt string.

    Returns:
        The probability of ``target_id`` at the selected step. When every
        generated step is topped by a skipped token, the first step is used.

    Raises:
        RuntimeError: if ``generate`` returned no step scores at all.
    """
    skipped_tokens = ("<extra_id_0>", "", " ", "<pad>")

    # tokenize context
    input_ids = tokenizer(
        context,
        padding="longest",
        max_length=512,
        truncation=True,
        return_tensors="pt",
    ).input_ids
    # use model to solicit a prediction
    outputs = model.generate(
        input_ids=input_ids.to(device),
        output_scores=True,
        return_dict=True,
        return_dict_in_generate=True,
        max_new_tokens=4,
    )

    # Find the left-most step whose top token is not special. Generation may
    # stop before max_new_tokens, so iterate over the steps that exist rather
    # than over a fixed range.
    chosen_probs = None
    for step_logits in outputs["scores"]:
        step_probs = softmax(step_logits, dim=-1).detach().cpu().numpy()
        if chosen_probs is None:
            # fallback in case no step qualifies below
            chosen_probs = step_probs
        if tokenizer.decode([np.argmax(step_probs)]) not in skipped_tokens:
            chosen_probs = step_probs
            break

    if chosen_probs is None:
        raise RuntimeError("model.generate returned no scores to probe")

    return chosen_probs[0][target_id.item()]


def probe_stablelm(model, tokenizer, target_id, context):
    """Return the probability assigned to ``target_id`` at the first content step.

    The context is tokenized (truncated to 4096 tokens) and up to four new
    tokens are generated with the tokenizer's EOS id used for padding. The
    scores of the earliest generation step whose most likely token is not an
    end-of-text, padding or whitespace token are converted to probabilities.

    Args:
        model: model exposing ``generate``.
        tokenizer: tokenizer used to encode ``context`` and decode token ids.
        target_id: scalar tensor holding the vocabulary id to look up.
        context: prompt string.

    Returns:
        The probability of ``target_id`` at the selected step. When every
        generated step is topped by a skipped token, the first step is used.

    Raises:
        RuntimeError: if ``generate`` returned no step scores at all.
    """
    skipped_tokens = ("<|endoftext|>", "<|padding|>", "", " ")

    # tokenize context
    input_ids = tokenizer(
        context,
        padding="longest",
        max_length=4096,
        truncation=True,
        return_tensors="pt",
    ).input_ids
    # use model to solicit a prediction
    outputs = model.generate(
        pad_token_id=tokenizer.eos_token_id,
        input_ids=input_ids.to(device),
        output_scores=True,
        return_dict=True,
        return_dict_in_generate=True,
        max_new_tokens=4,
    )

    # Find the left-most step whose top token is not special. Generation may
    # stop before max_new_tokens, so iterate over the steps that exist rather
    # than over a fixed range.
    chosen_probs = None
    for step_logits in outputs["scores"]:
        step_probs = softmax(step_logits, dim=-1).detach().cpu().numpy()
        if chosen_probs is None:
            # fallback in case no step qualifies below
            chosen_probs = step_probs
        if tokenizer.decode([np.argmax(step_probs)]) not in skipped_tokens:
            chosen_probs = step_probs
            break

    if chosen_probs is None:
        raise RuntimeError("model.generate returned no scores to probe")

    return chosen_probs[0][target_id.item()]


def probe_falcon(model, tokenizer, target_id, context):
    """Return the probability assigned to ``target_id`` at the first content step.

    The context is tokenized (truncated to 2048 tokens, without token type
    ids) and up to three new tokens are generated. The scores of the earliest
    generation step whose most likely token is not an end-of-text, padding or
    whitespace token are converted to probabilities with softmax.

    Args:
        model: model exposing ``generate``.
        tokenizer: tokenizer used to encode ``context`` and decode token ids.
        target_id: scalar tensor holding the vocabulary id to look up.
        context: prompt string.

    Returns:
        The probability of ``target_id`` at the selected step. When every
        generated step is topped by a skipped token, the first step is used.

    Raises:
        RuntimeError: if ``generate`` returned no step scores at all.
    """
    skipped_tokens = ("<|endoftext|>", "<|padding|>", "", " ")

    # tokenize context
    input_ids = tokenizer(
        context,
        padding="longest",
        max_length=2048,
        truncation=True,
        return_token_type_ids=False,
        return_tensors="pt",
    ).input_ids
    # use model to solicit a prediction
    outputs = model.generate(
        input_ids=input_ids.to(device),
        output_scores=True,
        return_dict=True,
        return_dict_in_generate=True,
        max_new_tokens=3,
    )

    # Find the left-most step whose top token is not special. Generation may
    # stop before max_new_tokens, so iterate over the steps that exist rather
    # than over a fixed range.
    chosen_probs = None
    for step_logits in outputs["scores"]:
        step_probs = softmax(step_logits, dim=-1).detach().cpu().numpy()
        if chosen_probs is None:
            # fallback in case no step qualifies below
            chosen_probs = step_probs
        if tokenizer.decode([np.argmax(step_probs)]) not in skipped_tokens:
            chosen_probs = step_probs
            break

    if chosen_probs is None:
        raise RuntimeError("model.generate returned no scores to probe")

    return chosen_probs[0][target_id.item()]


def probe_redpajama(model, tokenizer, target_id, context):
    """Return the probability assigned to ``target_id`` at the first content step.

    The context is tokenized (truncated to 2048 tokens) and up to four new
    tokens are generated with the tokenizer's EOS id used for padding. The
    scores of the earliest generation step whose most likely token is not an
    end-of-text, padding or whitespace token are converted to probabilities.

    Args:
        model: model exposing ``generate``.
        tokenizer: tokenizer used to encode ``context`` and decode token ids.
        target_id: scalar tensor holding the vocabulary id to look up.
        context: prompt string.

    Returns:
        The probability of ``target_id`` at the selected step. When every
        generated step is topped by a skipped token, the first step is used.

    Raises:
        RuntimeError: if ``generate`` returned no step scores at all.
    """
    skipped_tokens = ("<|endoftext|>", "<|padding|>", "", " ")

    # tokenize context
    input_ids = tokenizer(
        context,
        padding="longest",
        max_length=2048,
        truncation=True,
        return_tensors="pt",
    ).input_ids
    # use model to solicit a prediction
    outputs = model.generate(
        pad_token_id=tokenizer.eos_token_id,
        input_ids=input_ids.to(device),
        output_scores=True,
        return_dict=True,
        return_dict_in_generate=True,
        max_new_tokens=4,
    )

    # Find the left-most step whose top token is not special. Generation may
    # stop before max_new_tokens, so iterate over the steps that exist rather
    # than over a fixed range.
    chosen_probs = None
    for step_logits in outputs["scores"]:
        step_probs = softmax(step_logits, dim=-1).detach().cpu().numpy()
        if chosen_probs is None:
            # fallback in case no step qualifies below
            chosen_probs = step_probs
        if tokenizer.decode([np.argmax(step_probs)]) not in skipped_tokens:
            chosen_probs = step_probs
            break

    if chosen_probs is None:
        raise RuntimeError("model.generate returned no scores to probe")

    return chosen_probs[0][target_id.item()]


def probe_mpt(model, tokenizer, target_id, context):
    """Return the probability assigned to ``target_id`` at the first content step.

    The context is tokenized (truncated to 2048 tokens) and up to four new
    tokens are generated. The scores of the earliest generation step whose
    most likely token is not an end-of-text, padding or whitespace token are
    converted to probabilities with softmax.

    Args:
        model: model exposing ``generate``.
        tokenizer: tokenizer used to encode ``context`` and decode token ids.
        target_id: scalar tensor holding the vocabulary id to look up.
        context: prompt string.

    Returns:
        The probability of ``target_id`` at the selected step. When every
        generated step is topped by a skipped token, the first step is used.

    Raises:
        RuntimeError: if ``generate`` returned no step scores at all.
    """
    skipped_tokens = ("<|endoftext|>", "<|padding|>", "", " ")

    # tokenize context
    input_ids = tokenizer(
        context,
        padding="longest",
        max_length=2048,
        truncation=True,
        return_tensors="pt",
    ).input_ids
    # use model to solicit a prediction
    outputs = model.generate(
        input_ids=input_ids.to(device),
        output_scores=True,
        return_dict=True,
        return_dict_in_generate=True,
        max_new_tokens=4,
    )

    # Find the left-most step whose top token is not special. Generation may
    # stop before max_new_tokens, so iterate over the steps that exist rather
    # than over a fixed range.
    chosen_probs = None
    for step_logits in outputs["scores"]:
        step_probs = softmax(step_logits, dim=-1).detach().cpu().numpy()
        if chosen_probs is None:
            # fallback in case no step qualifies below
            chosen_probs = step_probs
        if tokenizer.decode([np.argmax(step_probs)]) not in skipped_tokens:
            chosen_probs = step_probs
            break

    if chosen_probs is None:
        raise RuntimeError("model.generate returned no scores to probe")

    return chosen_probs[0][target_id.item()]


def probe_gpt(model, tokenizer, target_id, context):
    """Return the probability that ``target_id`` follows ``context``.

    The context is run through the model once and the logits at the last
    position are converted to a probability distribution with softmax.

    Args:
        model: callable model returning a mapping with a ``"logits"`` entry.
        tokenizer: tokenizer used to encode ``context``.
        target_id: scalar tensor holding the vocabulary id to look up.
        context: prompt string.

    Returns:
        The probability of ``target_id``, or ``None`` when the id lies
        outside the range covered by the model's logits.
    """
    # tokenize context
    input_ids = tokenizer(
        context,
        return_tensors="pt",
    ).input_ids.to(device)

    target_index = int(target_id.item())

    # Inference only: no autograd graph is needed, and hidden states are not
    # requested because only the logits are read.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, return_dict=True)

    # every token in the model's vocab gets a prediction score at the last position
    logits = outputs["logits"][0, -1]
    probs = softmax(logits, dim=-1).detach().cpu().numpy()

    # the target id must be a valid index into the distribution
    if not 0 <= target_index < len(probs):
        print("target index not in model vocabulary scope; returning None")
        return None

    return probs[target_index]


def probe_bert(model, tokenizer, target_id, context):
    """Return the probability of ``target_id`` at the masked position.

    The context is tokenized (truncated to 512 tokens), the position of the
    tokenizer's mask token is located, and the logits at that position are
    converted to a probability distribution with softmax.

    Args:
        model: callable model whose output exposes ``.logits``.
        tokenizer: tokenizer providing ``mask_token_id``.
        target_id: scalar tensor holding the vocabulary id to look up.
        context: prompt string expected to contain exactly one mask token.

    Returns:
        The probability of ``target_id`` at the mask position.

    Raises:
        ValueError: if the tokenized context does not contain exactly one
            mask token (for example because truncation removed it).
    """
    # tokenize context
    input_ids = tokenizer(
        context,
        padding="longest",
        max_length=512,
        truncation=True,
        return_tensors="pt",
    ).input_ids

    mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]
    mask_count = mask_token_index.numel()
    if mask_count != 1:
        raise ValueError(
            f"expected exactly one mask token in the context, found {mask_count}"
        )

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = model(input_ids=input_ids.to(device)).logits
    mask_token_logits = logits[0, mask_token_index, :]

    # Convert our prediction scores to a probability distribution with softmax
    probs = torch.squeeze(softmax(mask_token_logits, dim=-1))
    probs = probs.detach().cpu().numpy()

    return probs[target_id.item()]


def probe_llama(model, tokenizer, target_id, context):
    """Return the probability that ``target_id`` follows ``context``.

    The context is run through the model once and the logits at the last
    position are converted to a probability distribution with softmax.

    Args:
        model: callable model returning a mapping with a ``"logits"`` entry.
        tokenizer: tokenizer used to encode ``context``.
        target_id: scalar tensor holding the vocabulary id to look up.
        context: prompt string.

    Returns:
        The probability of ``target_id``, or ``None`` when the id lies
        outside the range covered by the model's logits.
    """
    # tokenize context
    input_ids = tokenizer(
        context,
        return_tensors="pt",
    ).input_ids.to(device)

    target_index = int(target_id.item())

    # Inference only: no autograd graph is needed, and hidden states are not
    # requested because only the logits are read.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, return_dict=True)

    # every token in the model's vocab gets a prediction score at the last position
    logits = outputs["logits"][0, -1]
    probs = softmax(logits, dim=-1).detach().cpu().numpy()

    # the target id must be a valid index into the distribution
    if not 0 <= target_index < len(probs):
        print("target index not in model vocabulary scope; returning None")
        return None

    return probs[target_index]

## --Get model and tokenizer function

In [4]:
def get_model_and_tokenizer(model_name):
    """Load the tokenizer and model that match ``model_name``.

    The model family is chosen by substring checks on the lower-cased
    ``model_name``, tried in the order written below; the first match decides
    which model class and loading options are used. The tokenizer is always
    loaded before the model.

    Returns:
        A ``(tokenizer, model)`` tuple, or ``None`` when no family matches.
    """
    name = model_name.lower()

    def four_bit_config():
        # 4-bit NF4 quantization settings shared by the MPT and Falcon branches
        return transformers.BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )

    if "t5" in name:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.float16,
        )
        return tokenizer, model

    if any(tag in name for tag in ("gpt", "opt", "pythia", "bloom")):
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            load_in_8bit=True,
            device_map="auto",
            torch_dtype=torch.float16,
        )
        return tokenizer, model

    if "stablelm" in name or "redpajama" in name:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = "<|padding|>"
        # loaded in half precision without quantization
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.float16,
            pad_token_id=tokenizer.eos_token_id,
            offload_folder='./offload',
        )
        return tokenizer, model

    if "mpt" in name:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = "<|padding|>"
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=four_bit_config(),
            device_map={"": 0},
            trust_remote_code=True,
        )
        return tokenizer, model

    if "bert" in name:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForMaskedLM.from_pretrained(
            model_name, torch_dtype=torch.float16
        ).to(device)
        return tokenizer, model

    if "llama" in name:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = tokenizer.eos_token
        model = transformers.LlamaForCausalLM.from_pretrained(
            model_name,
            load_in_8bit=True,
            device_map="auto",
            torch_dtype=torch.float16,
        )
        return tokenizer, model

    if "mistral" in name:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = tokenizer.eos_token
        model = transformers.AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.float16,
        )
        return tokenizer, model

    if "falcon" in name:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=four_bit_config(),
            device_map="auto",
            trust_remote_code=True,
        )
        return tokenizer, model

    # no known family matched
    return None

## --Evaluate model

In [1]:
def _resolve_model_family(model_name):
    """Map a model name to ``(prefix, probe_name)`` using ordered substring rules."""
    name = model_name.lower()
    # (substrings, prefix used for tokenization rules, name passed to get_probe_function)
    # Order matters: e.g. "gpt-neo" must be tested before "gpt", "roberta" before "bert".
    rules = (
        (("t5",), "t5", "t5"),
        (("gpt-neo", "gpt-j", "pythia"), "eleutherai", "gpt"),
        (("gpt",), "gpt", "gpt"),
        (("opt",), "opt", "gpt"),
        (("roberta",), "roberta", "bert"),
        (("bert",), "bert", "bert"),
        (("llama",), "llama", "llama"),
        (("mistral",), "mistral", "llama"),
        (("bloom",), "bloom", "gpt"),
        (("stablelm",), "stablelm", "stablelm"),
        (("mpt",), "mpt", "mpt"),
        (("redpajama",), "redpajama", "redpajama"),
        (("falcon",), "falcon", "falcon"),
    )
    for needles, prefix, probe_name in rules:
        if any(needle in name for needle in needles):
            return prefix, probe_name
    raise ValueError(f"Model {model_name} not supported.")


def _first_target_token_id(prefix, tokenizer, entity):
    """Return the vocabulary id (scalar tensor) used as the probe target for ``entity``."""
    if prefix == "t5":
        target_ids = tokenizer.encode(
            " " + entity,
            padding="longest",
            max_length=512,
            truncation=True,
            return_tensors="pt",
        ).tolist()
        # drop the first space-only token, if the tokenizer emitted one
        space_only_token = tokenizer.encode(" ")[0]
        if space_only_token in target_ids[0]:
            target_ids[0].remove(space_only_token)
        return torch.tensor(target_ids).to(device)[0][0]

    if prefix in ("gpt", "eleutherai", "bloom", "stablelm", "mpt", "redpajama"):
        return tokenizer.encode(" " + entity, return_tensors="pt").to(device)[0][0]

    if prefix == "falcon":
        return tokenizer.encode(
            " " + entity, return_token_type_ids=False, return_tensors="pt"
        ).to(device)[0][0]

    if prefix == "opt":
        # index 1: presumably skips a leading special token -- TODO confirm
        return tokenizer.encode(" " + entity, return_tensors="pt").to(device)[0][1]

    if prefix == "roberta":
        # index 1: presumably skips a leading special token -- TODO confirm
        return tokenizer.encode(
            " " + entity,
            padding="longest",
            max_length=512,
            truncation=True,
            return_tensors="pt",
        ).to(device)[0][1]

    if prefix == "bert":
        # index 1: presumably skips a leading special token -- TODO confirm
        return tokenizer.encode(
            entity,
            padding="longest",
            max_length=512,
            truncation=True,
            return_tensors="pt",
        ).to(device)[0][1]

    if prefix in ("llama", "mistral"):
        # index 2: presumably skips leading special/space tokens -- TODO confirm
        return tokenizer.encode(" " + entity, return_tensors="pt").to(device)[0][2]

    return None


def evaluate_model(model_name, input_dataset, tokenizer, model, lang, p_true_freq_list):
    """Score a model on fact/counterfact rows and return accuracy and confidence.

    Each row of ``input_dataset`` is a mapping with the keys ``"stem"``
    (context, or several contexts joined by ``" <br> "``), ``"true"`` (the
    correct entity) and ``"false"`` (one or more counterfactual entities joined
    by ``" <br> "``). For every entity the probability of its first target
    token is obtained from the probe function selected for the model family.

    Args:
        model_name: model name; its family is detected by substring matching.
        input_dataset: iterable of row mappings as described above.
        tokenizer: tokenizer matching ``model``.
        model: model passed through to the probe function.
        lang: language label appended to ``p_true_freq_list[1]`` for every row.
        p_true_freq_list: two lists ``[positions, languages]`` that are
            extended in place; the position is the 1-based rank of the true
            fact among the counterfacts (ties rank the true fact lower).

    Returns:
        ``(accuracy, confidence_index)``. Accuracy is the fraction of rows in
        which the true fact's probability exceeds the mean counterfact
        probability. The confidence index is the rounded mean, over rows, of
        the ratio between the highest and second-highest probability; rows
        where either is zero are skipped. Each value is ``nan`` when it has
        no rows to average over.

    Raises:
        ValueError: if ``model_name`` matches no supported model family.
    """
    prefix, probe_name = _resolve_model_family(model_name)
    probe_func = get_probe_function(probe_name)

    true_count = 0
    fact_count = 0
    conf_idx_sum = 0.0
    conf_idx_count = 0

    for entities_dict in tqdm.tqdm(input_dataset):
        # the true entity first, then every counterfact
        # (str.split returns the whole string when the separator is absent)
        entities = [entities_dict["true"]]
        entities.extend(entities_dict["false"].split(" <br> "))

        # if multiple stems are stored, the zeroth belongs to the true fact
        # and the following ones to the counterfacts
        stems = entities_dict["stem"]
        if " <br> " in stems:
            stems = stems.split(" <br> ")

        p_true = 0.0
        false_probs = []
        for entity_count, entity in enumerate(entities):
            context = stems[entity_count] if isinstance(stems, list) else stems
            # masked-LM families need an explicit mask slot to predict
            if prefix == "roberta":
                context += " <mask>."
            elif prefix == "bert":
                context += " [MASK]."

            # probe the first token of the entity, e.g. <Tok> for Tokyo -> <Tok> <yo>
            target_id = _first_target_token_id(prefix, tokenizer, entity)
            model_prob = probe_func(model, tokenizer, target_id, context)
            # A probe returns None when the target is outside the model's
            # vocabulary; count that as zero probability. Converting to a
            # Python float keeps the arithmetic below in double precision.
            prob = 0.0 if model_prob is None else float(model_prob)

            if entity_count == 0:
                p_true = prob
            else:
                false_probs.append(prob)

        # mean counterfact probability (there is always at least one counterfact)
        p_false = sum(false_probs) / len(false_probs)

        # logging data for position analytics
        false_probs.sort(reverse=True)
        pos = 1
        for false_prob in false_probs:
            if p_true > false_prob:
                break
            pos += 1
        p_true_freq_list[0].append(pos)
        p_true_freq_list[1].append(lang)

        # confidence: ratio of the best to the second-best probability
        ranked = sorted(false_probs + [p_true], reverse=True)
        if ranked[0] != 0 and ranked[1] != 0:
            conf_idx_sum += ranked[0] / ranked[1]
            conf_idx_count += 1

        # update counts based on probs
        if p_true > p_false:
            true_count += 1
        fact_count += 1

    # guard the averages against empty inputs instead of dividing by zero
    accuracy = true_count / fact_count if fact_count else float("nan")
    if conf_idx_count:
        confidence = round(conf_idx_sum / conf_idx_count)
    else:
        confidence = float("nan")
    return accuracy, confidence

# Benchmark

## --Benchmark arguments

In [6]:
# Benchmark arguments

# models to benchmark, in evaluation order
model_names = [
    "bert-base-multilingual-cased",
    "xlm-roberta-large",
    "xlm-roberta-base",
    "google/mt5-small",
    "google/mt5-large",
    "stabilityai/stablelm-3b-4e1t",
    "stabilityai/stablelm-zephyr-3b",
    "togethercomputer/RedPajama-INCITE-Base-3B-v1",
]

# dataset splits (one per language) to evaluate
langs = ["English", "Czech", "Ukrainian"]

# number of rows selected from each language split
data_size = 1000

## --Checking arguments

In [7]:
# Validate the benchmark arguments before any model or dataset is loaded.

# check model names
compatible_model_prefixes = [
    "t5",
    "pythia",
    "gpt",
    "opt",
    "llama",
    "roberta",
    "bert",
    "bloom",
    "stablelm",
    "mpt",
    "redpajama",
    "falcon",
    "mistral",
]
for model_name in model_names:
    # a model is supported when its lower-cased name contains a known prefix
    model_supported = any(
        model_prefix in model_name.lower()
        for model_prefix in compatible_model_prefixes
    )
    if not model_supported:
        raise Exception(f"Model {model_name} not supported.")


# Check language name
supported_languages = [
    "English",
    "French",
    "Spanish",
    "German",
    "Ukrainian",
    "Romanian",
    "Bulgarian",
    "Catalan",
    "Danish",
    "Croatian",
    "Hungarian",
    "Italian",
    "Dutch",
    "Polish",
    "Portuguese",
    "Russian",
    "Slovenian",
    "Serbian",
    "Swedish",
    "Czech",
]
for lang in langs:
    if lang not in supported_languages:
        raise Exception(f'Language {lang} not supported')

## --Dataset download/read

In [8]:
# Load one dataset per language: reuse the copy saved under ./local_datasets
# when it exists, otherwise download the split, keep the first `data_size`
# rows and save them for the next run.
# NOTE(review): the cache path depends only on the language, so changing
# `data_size` does not refresh an already saved dataset.
local_dataset_path = './local_datasets'
if not os.path.exists(local_dataset_path):
    os.makedirs(local_dataset_path)

datasets = []
for lang in langs:
    local_path = os.path.join(local_dataset_path, f'{lang}_dataset')
    if not os.path.exists(local_path):
        dataset = load_dataset(
            "Polyglot-or-Not/Fact-Completion", split=lang
        ).select(range(data_size))
        dataset.save_to_disk(local_path)
        print(f'Dataset for {lang.lower()} language downloaded and saved locally successfully!')
    else:
        dataset = load_from_disk(local_path)
        print(f'Dataset for {lang.lower()} language loaded from local storage successfully!')
    datasets.append(dataset)

Dataset for english language loaded from local storage successfully!
Dataset for czech language loaded from local storage successfully!
Dataset for ukrainian language loaded from local storage successfully!


## --Create/read tables

In [5]:
# CSV files the benchmark results are read from and written to
acc_df_name, fact_pos_name, conf_idx_name = (
    "accuracy_table.csv",
    "fact_position_stats.csv",
    "confidence_index.csv",
)

In [6]:
# Resume from the saved confidence-index table when it exists; on a first run
# (no CSV yet) start from an empty table with one column per language.
if os.path.exists(conf_idx_name):
    conf_idx_df = pd.read_csv(conf_idx_name)
else:
    conf_idx_df = pd.DataFrame(columns=['Model name'] + langs)
conf_idx_df

Unnamed: 0,Model name,English,Czech,Ukrainian,Model type,Params count (mil)
0,bert-base-multilingual-cased,15911,2509,1440,MaskedLM,178
1,xlm-roberta-large,35580,58335,52950,MaskedLM,560
2,xlm-roberta-base,20401,11904,25385,MaskedLM,278
3,mt5-small,1740,5140,3622,text2text,300
4,mt5-large,1997,10427,9913,text2text,1230
5,stablelm-3b-4e1t,56429,77932,6022,CausalLM,2795
6,stablelm-zephyr-3b,170648,36613,2108,CausalLM,2795
7,RedPajama-INCITE-Base-3B-v1,40689,53117,5287,CausalLM,2776


In [7]:
# Resume from the saved accuracy table when it exists; on a first run (no CSV
# yet) start from an empty table with one column per language plus the
# parameter count.
if os.path.exists(acc_df_name):
    acc_df = pd.read_csv(acc_df_name)
else:
    acc_df = pd.DataFrame(columns=['Model name'] + langs + ['Param count (mil)'])
acc_df

Unnamed: 0,Model name,English,Czech,Ukrainian,Param count (mil),Mean accuracy,Model type,Efficienty
0,bert-base-multilingual-cased,0.714,0.632,0.562,178,0.636,MaskedLM,35.730337
1,xlm-roberta-large,0.599,0.573,0.554,560,0.575,MaskedLM,10.267857
2,xlm-roberta-base,0.61,0.57,0.536,278,0.572,MaskedLM,20.57554
3,mt5-small,0.517,0.461,0.475,300,0.484,text2text,16.133333
4,mt5-large,0.575,0.484,0.502,1230,0.52,text2text,4.227642
5,stablelm-3b-4e1t,0.848,0.713,0.632,2795,0.731,CausalLM,2.615385
6,stablelm-zephyr-3b,0.743,0.571,0.521,2795,0.612,CausalLM,2.189624
7,RedPajama-INCITE-Base-3B-v1,0.822,0.684,0.605,2776,0.704,CausalLM,2.536023


## --Benchmark itself

In [8]:
# Evaluate every model that is not yet present in the accuracy table, one
# language at a time, and append its results to the accuracy and confidence
# tables. Per-row rank positions of the true fact are collected in
# stat_pos_dict (one list per model plus a shared 'Language' list).
stat_pos_dict = {}

for model_name in model_names:
    # tables are keyed by the name without the organisation prefix
    short_name = model_name.split('/')[-1]
    if short_name in acc_df['Model name'].tolist():
        continue

    stat_pos = [[], []]

    tokenizer, model = get_model_and_tokenizer(model_name)
    model_name = short_name
    try:
        print(f'\n\n{model_name}')

        acc_row = pd.DataFrame(columns=['Model name'] + langs + ['Param count (mil)'], data={'Model name': model_name}, index=[0])
        acc_row.loc[0, 'Model name'] = model_name
        acc_row.loc[0, 'Param count (mil)'] = round(sum(p.numel() for p in model.parameters()) / 1000000)

        conf_row = pd.DataFrame(columns=['Model name'] + langs, data={'Model name': model_name}, index=[0])
        conf_row.loc[0, 'Model name'] = model_name

        # language columns start at position 1, right after 'Model name'
        for j in range(1, len(langs) + 1):
            acc_row.iloc[0, j], conf_row.iloc[0, j] = evaluate_model(model_name, datasets[j-1], tokenizer, model, langs[j-1], stat_pos)
            print(f"{langs[j-1]} language: accuracy is {acc_row.iloc[0, j]}, confidence index is {conf_row.iloc[0, j]}")

        # logging data for accuracy and confidence
        acc_df = pd.concat([acc_df, acc_row], ignore_index=True)
        conf_idx_df = pd.concat([conf_idx_df, conf_row], ignore_index=True)

        if 'Language' not in stat_pos_dict:
            stat_pos_dict['Language'] = stat_pos[1]
        stat_pos_dict[model_name] = stat_pos[0]
    finally:
        # Release the model even when evaluation fails: drop the references
        # first, then collect garbage and return cached GPU memory.
        del tokenizer
        del model
        gc.collect()
        torch.cuda.empty_cache()
    

NameError: name 'model_names' is not defined

In [None]:
# in case of a loop crash, clear memory

# gc.collect()
# del tokenizer
# del model
# torch.cuda.empty_cache()

# Results

In [13]:
# Table of rank positions collected by the benchmark loop: a 'Language' column
# plus one column per evaluated model (built from stat_pos_dict).
fact_pos_df = pd.DataFrame(stat_pos_dict)
fact_pos_df.head(10)

Unnamed: 0,Language,stablelm-3b-4e1t,stablelm-zephyr-3b,RedPajama-INCITE-Base-3B-v1
0,English,1,2,1
1,English,1,1,1
2,English,1,1,1
3,English,1,1,1
4,English,1,1,1
5,English,1,1,1
6,English,1,1,1
7,English,1,1,1
8,English,1,1,1
9,English,1,2,1


In [14]:
acc_df

Unnamed: 0,Model name,English,Czech,Ukrainian,Param count (mil)
0,bert-base-multilingual-cased,0.714,0.632,0.562,178
1,xlm-roberta-large,0.599,0.573,0.554,560
2,xlm-roberta-base,0.61,0.57,0.536,278
3,mt5-small,0.517,0.461,0.475,300
4,mt5-large,0.575,0.484,0.502,1230
5,stablelm-3b-4e1t,0.848,0.713,0.632,2795
6,stablelm-zephyr-3b,0.743,0.571,0.521,2795
7,RedPajama-INCITE-Base-3B-v1,0.822,0.684,0.605,2776


In [15]:
conf_idx_df

Unnamed: 0,Model name,English,Czech,Ukrainian
0,bert-base-multilingual-cased,15911,2509,1440
1,xlm-roberta-large,35580,58335,52950
2,xlm-roberta-base,20401,11904,25385
3,mt5-small,1740,5140,3622
4,mt5-large,1997,10427,9913
5,stablelm-3b-4e1t,56429,77932,6022
6,stablelm-zephyr-3b,170648,36613,2108
7,RedPajama-INCITE-Base-3B-v1,40689,53117,5287


## --Save results

In [16]:
# Write the accuracy table to acc_df_name without the index column
acc_df.to_csv(acc_df_name, index=False)

In [17]:
# Write the confidence-index table to conf_idx_name without the index column
conf_idx_df.to_csv(conf_idx_name, index=False)

In [None]:
# Write the fact-position table to fact_pos_name without the index column
fact_pos_df.to_csv(fact_pos_name, index=False)