In [3]:
from collections import namedtuple
from tqdm import tqdm
import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer

In [4]:
from dataclasses import dataclass

@dataclass
class Model:
    model: AutoModelForCausalLM
    tokenizer: AutoTokenizer
    device: str

## Загружаем Mistral 7B из HF

- Блогпост: https://mistral.ai/news/announcing-mistral-7b/
- Модель на HF: https://huggingface.co/mistralai/Mistral-7B-v0.1


In [5]:
mistral_device = 'cuda:0'
mistral = Model(
    model=AutoModelForCausalLM.from_pretrained(
        "mistralai/Mistral-7B-v0.1",
        device_map=mistral_device,
        torch_dtype="auto",
        use_flash_attention_2=True,
    ), 
    tokenizer=AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1"),
    device=mistral_device
    
)

mistral.tokenizer.pad_token = mistral.tokenizer.eos_token
mistral.tokenizer.padding_side = "left"
mistral.tokenizer.model_max_length = mistral.model.config.max_position_embeddings
print('max_length', mistral.model.config.max_position_embeddings)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


max_length 32768


Process ForkProcess-18:
Process ForkProcess-17:
Process ForkProcess-23:
Process ForkProcess-2:
Process ForkProcess-11:
Process ForkProcess-8:
Process ForkProcess-5:
Process ForkProcess-7:
Process ForkProcess-12:
Process ForkProcess-21:
Process ForkProcess-31:
Process ForkProcess-32:
Process ForkProcess-19:
Process ForkProcess-1:
Process ForkProcess-30:
Process ForkProcess-9:
Process ForkProcess-22:
Process ForkProcess-10:
Process ForkProcess-20:
Process ForkProcess-24:
Process ForkProcess-4:
Process ForkProcess-25:
Process ForkProcess-3:
Process ForkProcess-13:
Process ForkProcess-27:
Traceback (most recent call last):
Process ForkProcess-26:
Traceback (most recent call last):
Process ForkProcess-28:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback 

In [5]:
prompt = "In order to prepare pasta, first you need to"

model_inputs = mistral.tokenizer([prompt], return_tensors="pt").to(mistral_device)

generated_ids = mistral.model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
mistral.tokenizer.batch_decode(generated_ids)[0]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'<s> In order to prepare pasta, first you need to get your ingredients ready. In addition to the flour and the egg, you will also need some salt, olive oil, and sometimes water. Mix them together to make the dough for pasta. You may add some herbs and vegetables as a variation. Mix them in a bowl or in a dough machine first, and kneed them into shape. You could use your hands or a rolling pin to spread and thin the dough evenly. After you have added the right amount of liquid you will be'

## Загружаем Llama 7B из локального чекпоинта
- Модель на HF (нужно запрашивать доступ) https://huggingface.co/meta-llama/Llama-2-7b

In [6]:
llama_device = "cuda:1"
llama = Model(
    model = AutoModelForCausalLM.from_pretrained(
        "llama-7b-hf-model",
        device_map=llama_device,
        torch_dtype="auto",
        use_flash_attention_2=True,

    ),
    tokenizer = AutoTokenizer.from_pretrained("llama-7b-hf-model"),
    device=llama_device
)

llama.tokenizer.pad_token = llama.tokenizer.eos_token
llama.tokenizer.padding_side = "left"
llama.tokenizer.model_max_length = llama.model.config.max_position_embeddings
print('max_length', llama.model.config.max_position_embeddings)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


max_length 2048


In [7]:
prompt = "In order to prepare pasta, first you need to"

model_inputs = llama.tokenizer([prompt], return_tensors="pt").to(llama_device)

generated_ids = llama.model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
llama.tokenizer.batch_decode(generated_ids)[0]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'<s> In order to prepare pasta, first you need to wash them well, then bring in water with a lot of salt. That is the key to maintaining its original flavor. First, the pasta must be softened and then fry it for a few minutes over high heat.\nAfter that, put the pasta into a different bowl. If pasta is cooked too much, then it is likely that the cooking process will not be very successful. If it lacks salt, the pasta will be so hard'

### Функция для оценки вероятности продолжения при условии префикса
Вычисляем $\log P\left(\texttt{suffix} | \texttt{prefix}\right)$

In [7]:
import torch
from collections import namedtuple
import numpy as np

def score_completion(model_obj, prefix, suffix):
    model = model_obj.model
    tokenizer = model_obj.tokenizer
    device = model_obj.device
    max_length = tokenizer.model_max_length
        
    # Encode input sequence, preserving preffix-suffix split
    tokenizer.truncation_size = "left"
    prefix_encoded = tokenizer(
        [prefix], 
        add_special_tokens=True, 
        max_length=max_length, 
        truncation=True,
        return_tensors="pt",
    )

    tokenizer.truncation_size = "right"
    suffix_encoded = tokenizer(
        [suffix], 
        add_special_tokens=False, 
        max_length=max_length, 
        truncation=True,
        return_tensors="pt",
    )
    
    input_ids = torch.concat([prefix_encoded.input_ids, suffix_encoded.input_ids], dim=-1).to(device)
    attention_mask = torch.concat([prefix_encoded.attention_mask, suffix_encoded.attention_mask], dim=-1).to(device)
    
    # Apply model
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)  # shape: (1, seqlen + 1, vocab_size)
    log_probs = torch.log_softmax(outputs.logits, dim=-1)  # shape: (1, seqlen + 1, vocab_size)
    
    # Collect logprobs of correspondent tokens
    # shape: (1, seqlen + 1)
    scores = -torch.gather(log_probs, dim=2, index=input_ids[:, 1:, None]).squeeze(-1)
    scores = scores[0].detach().cpu().numpy()
    
    # Ignore <s> in the beginning
    prefix_len = prefix_encoded.input_ids.shape[-1] - 1
    
    # Split logprobs into prefix and suffix
    return {
        'prefix': scores[:prefix_len],
        'suffix': scores[prefix_len:]
    }
    
def toprob(x):
    return np.exp(-np.sum(x))

def score_suffix(model, prefix, suffix):
    scores = score_completion(model, prefix, suffix)
    return toprob(scores['prefix']), toprob(scores['suffix'])

**Аккуратно с токенизатором** - между префиксом и суффиксом не должно быть пробелов

In [21]:
s1 = score_completion(mistral, "2 + 2 =", "4 ")
s2 = score_completion(mistral, "2 + 2 = 4", " ")

print('s1', s1['prefix'], s1['suffix'])
print('s2', s2['prefix'], s2['suffix'])

s1 [ 4.402521    1.4552702  10.406805    0.5393598   0.4020606   0.43647316] [0.25132608 0.7172515  4.507877  ]
s2 [ 4.402521    1.4552702  10.406805    0.5393598   0.4020606   0.43647316
  0.25132608  0.7172515 ] [6.789127]


**Вероятности предложений совпали** при разном разбиении на префикс и суффикс
$$P(S) = P(\texttt{suff} | \texttt{pref}) \cdot P(\texttt{pref})$$

In [23]:
a = score_suffix(mistral, "2 + 2 =", "4 .")
b = score_suffix(mistral, "2 + 2 = 4", ".")

a[0] * a[1], b[0] * b[1]

(1.6335237e-11, 1.6335254e-11)

# OpenBook QA
- Загружаем скачанный локально датасет
- https://allenai.org/data/open-book-qa

In [24]:
!ls bench_data/

openbook_en_jsonlines.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [8]:
import json
with open('bench_data/openbook_en_jsonlines.json', 'r') as fin:
    open_book_dataset = [json.loads(x) for x in fin.readlines()]

In [31]:
open_book_dataset[:2], len(open_book_dataset)

([{'options': ['puppies learning new tricks',
    'children growing up and getting old',
    'flowers wilting in a vase',
    'plants sprouting, blooming and wilting'],
   'task': 'The sun is responsible for',
   'true_answer_id': 3,
   'true_answer_text': 'plants sprouting, blooming and wilting'},
  {'options': ['the mountains seem very close',
    'the mountains are boring',
    'the mountains look the same as from up close',
    'the mountains seem smaller than in photographs'],
   'task': 'When standing miles away from Mount Rushmore',
   'true_answer_id': 3,
   'true_answer_text': 'the mountains seem smaller than in photographs'}],
 5957)

Пошафлим датасет, чтобы удобно замеряться на сулчайном подсемлпе для скорости

In [32]:
np.random.shuffle(open_book_dataset)

In [33]:
open_book_dataset[:2]

[{'options': ['Venus', 'Mars', 'Neptune', 'our Moon'],
  'task': 'This white object is visible due to being close to earth',
  'true_answer_id': 3,
  'true_answer_text': 'our Moon'},
 {'options': ['yellow', 'gold', 'blue', 'gray'],
  'task': 'A sky that is mostly this color will likely be precipitating:',
  'true_answer_id': 3,
  'true_answer_text': 'gray'}]

## Замерим мистраль и ламу в разных режимах

### Скоринг без нормализации

In [27]:
def apply_on_dataset_simple(model, dataset):
    results = []
    for row in tqdm(dataset, position=0):
        
        # Score simple argmax [P(option | task)]
        prefix = row['task']
        option_scores = np.array([
            score_suffix(model, prefix, option)[1]
            for option in row['options']
        ])
        
        selected_option = np.argmax(option_scores)
        
        results.append(selected_option == row['true_answer_id'])

    return np.array(results, dtype=np.float32)

In [35]:
mistral_opbqa_simple_20 = apply_on_dataset_simple(mistral, open_book_dataset[:20])
mistral_opbqa_simple_20

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:03<00:00,  5.34it/s]


array([0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0.,
       1., 1., 0.], dtype=float32)

In [36]:
np.mean(mistral_opbqa_simple_20)

0.4

In [38]:
mistral_opbqa_simple_200 = apply_on_dataset_simple(mistral, open_book_dataset[:200])
np.mean(mistral_opbqa_simple_200)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:33<00:00,  5.92it/s]


0.4

In [39]:
llama_opbqa_simple_200 = apply_on_dataset_simple(llama, open_book_dataset[:200])
np.mean(llama_opbqa_simple_200)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:31<00:00,  6.40it/s]


0.35

### Добавим нормализацию на длину

In [13]:
def apply_on_dataset_simple_len_norm(model, dataset):
    results = []
    for row in tqdm(dataset, position=0):
        
        # Score simple argmax [P(option | task)]
        prefix = row['task']
        option_scores = np.array([
            score_suffix(model, prefix, option)[1]
            for option in row['options']
        ])

        # Add len normalization
        option_scores /= np.array([len(r) for r in row['options']])
        
        selected_option = np.argmax(option_scores)
        results.append(selected_option == row['true_answer_id'])

    return np.array(results, dtype=np.float32)

In [41]:
mistral_opbqa_simple_len_norm_200 = apply_on_dataset_simple_len_norm(mistral, open_book_dataset[:200])
np.mean(mistral_opbqa_simple_len_norm_200)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:35<00:00,  5.60it/s]


0.39

In [42]:
llama_opbqa_simple_len_norm_200 = apply_on_dataset_simple_len_norm(llama, open_book_dataset[:200])
np.mean(llama_opbqa_simple_len_norm_200)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:31<00:00,  6.45it/s]


0.345

In [12]:
def apply_on_dataset_simple_norm_on_prior(model, dataset, prior_prefix):
    results = []
    for row in tqdm(dataset, position=0):
        
        # Score simple argmax [P(option | task)]
        prefix = row['task']
        option_scores = np.array([
            score_suffix(model, prefix, option)[1]
            for option in row['options']
        ])

        assert isinstance(prior_prefix, str)
        priors = np.array([
            score_suffix(model, prior_prefix, option)[1]
            for option in row['options']
        ])
        option_scores /= priors

        selected_option = np.argmax(option_scores)
        results.append(selected_option == row['true_answer_id'])

    return np.array(results, dtype=np.float32)

In [21]:
mistral_opbqa_simple_norm_on_prior_200 = apply_on_dataset_simple_norm_on_prior(
    mistral, open_book_dataset[:200], 
    prior_prefix="Answer:"
)
np.mean(mistral_opbqa_simple_norm_on_prior_200)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [01:10<00:00,  2.84it/s]


0.52

In [23]:
np.save('cache/mistral_opbqa_simple_norm_on_prior_200.npy', mistral_opbqa_simple_norm_on_prior_200)

In [22]:
llama_opbqa_simple_norm_on_prior_200 = apply_on_dataset_simple_norm_on_prior(
    llama, open_book_dataset[:200], 
    prior_prefix="Answer:"
)
np.mean(llama_opbqa_simple_norm_on_prior_200)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [01:02<00:00,  3.18it/s]


0.485

In [24]:
np.save('cache/llama_opbqa_simple_norm_on_prior_200.npy', llama_opbqa_simple_norm_on_prior_200)

### Превратим задачу в MCQ (multiple choice question)

In [11]:
def make_mcq(row):
    options = row['options']
    task = row['task']
    prompt = "\n".join([f"{i + 1}. {o}" for i, o in enumerate(options)])
    return {
        'task': task + '\n' + prompt + '\nAnswer:',
        'options': [f'{i + 1}' for i in range(len(options))],
        'true_answer_id': row['true_answer_id']
    }

def apply_on_dataset_with_fromat_fn(model, dataset, normalization=None, format_fn=None):
    results = []
    if format_fn is not None:
        dataset = [format_fn(x) for x in dataset]
        
    for row in tqdm(dataset, position=0):
        prefix = row['task']
        option_scores = np.array([
            score_suffix(model, prefix, option)[1]
            for option in row['options']
        ])            
        results.append(np.argmax(option_scores) == row['true_answer_id'])
        
    return results

In [53]:
make_mcq(open_book_dataset[0])

{'task': 'This white object is visible due to being close to earth\n1. Venus\n2. Mars\n3. Neptune\n4. our Moon\nAnswer:',
 'options': ['1', '2', '3', '4'],
 'true_answer_id': 3}

In [14]:
mistral_opbqa_mcq_200 = apply_on_dataset_with_fromat_fn(
    mistral, open_book_dataset[:200], 
    format_fn=make_mcq
)
np.mean(mistral_opbqa_mcq_200)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:35<00:00,  5.58it/s]


0.585

In [52]:
llama_opbqa_mcq_200 = apply_on_dataset_with_fromat_fn(
    llama, open_book_dataset[:200], 
    format_fn=make_mcq
)
np.mean(llama_opbqa_mcq_200)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:32<00:00,  6.23it/s]


0.33

### Соберем few-shot подводку

In [15]:
fewshot_rows = [make_mcq(r) for r in open_book_dataset[-5:]]
fewshot_prompt = "\n\n".join([f"{r['task']} {r['options'][r['true_answer_id']]}" for r in fewshot_rows])
fewshot_prompt

'A person is heating water in order to cook pasta. He spills the pot of water on his leg and finds that the water\n1. scalds\n2. cools\n3. toasts\n4. freezes\nAnswer: 1\n\nPasta may be cooked in water when\n1. the water is warm\n2. the water is on the stove\n3. water is bubbling from applied warmth\n4. the pasta is very fresh\nAnswer: 3\n\nA decrease in diseases\n1. has no impact on a population\n2. leads to more sick people\n3. leads to less sick people\n4. leads to an uptick in emergency room visits\nAnswer: 3\n\nWhen soil is viewed in a scientific way, what is seen and viewed is actually\n1. insects like big beetles\n2. tiny lifeforms in dirt\n3. small mammals living there\n4. a lot of tiny pebbles\nAnswer: 2\n\nSome animals use a liquid coming from their skin to adjust to\n1. cold\n2. water\n3. heat\n4. humidity\nAnswer: 3'

In [33]:
def apply_on_dataset_with_fromat_fn_and_prompt(model, dataset, normalization=None, format_fn=None, prompt=None):
    results = []
    if format_fn is not None:
        dataset = [format_fn(x) for x in dataset]
        
    for row in tqdm(dataset, position=0):
        prefix = row['task']
        if prompt:
            prefix = prompt + "\n\n" + prefix
            
        option_scores = np.array([
            score_suffix(model, prefix, option)[1]
            for option in row['options']
        ])            
        results.append(np.argmax(option_scores) == row['true_answer_id'])
        
    return np.array(results, dtype=np.float32)

In [34]:
mistral_opbqa_mcq_fewshot_200 = apply_on_dataset_with_fromat_fn_and_prompt(
    mistral, open_book_dataset[:200], 
    format_fn=make_mcq,
    prompt=fewshot_prompt
)
np.mean(mistral_opbqa_mcq_fewshot_200)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:47<00:00,  4.25it/s]


0.76

In [38]:
np.save('cache/mistral_opbqa_mcq_fewshot_200.npy', mistral_opbqa_mcq_fewshot_200)

In [36]:
llama_opbqa_mcq_fewshot_200 = apply_on_dataset_with_fromat_fn_and_prompt(
    llama, open_book_dataset[:200], 
    format_fn=make_mcq,
    prompt=fewshot_prompt
)
np.mean(llama_opbqa_mcq_fewshot_200)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:34<00:00,  5.79it/s]


0.51

In [39]:
np.save('cache/llama_opbqa_mcq_fewshot_200.npy', llama_opbqa_mcq_fewshot_200)


## Статзначимость
- Проверим бутстрап и альтернативные способы оценки статзначимости

In [58]:
def bootstrap_test(baseline, test, n_samples=500000, paired=True):
    baseline = np.array(baseline, dtype=np.float32)
    test = np.array(test, dtype=np.float32)
    assert len(baseline) == len(test)
    indices = np.random.randint(0, len(test), (n_samples, len(test)))
    if paired:
        indices2 = indices
    else:
        indices2 = np.random.randint(0, len(test), (n_samples, len(test)))
    
    test_sampled_scores = np.mean(test[indices], axis=-1)
    baseline_sampled_scores = np.mean(baseline[indices2], axis=-1)

    diff = test_sampled_scores - baseline_sampled_scores
    
    same_cnt = np.sum(diff == 0)
    test_better_cnt = np.sum(diff > 0)
    test_worse_cnt = np.sum(diff < 0)
    pval_left = 1.0 - test_better_cnt / n_samples
#     return pval_left
    
    pval_right = 1.0 - test_worse_cnt / n_samples
    pval_twosided = min(min(pval_left, pval_right) * 2.0, 1.0)
    return pval_twosided


In [27]:
len(mistral_opbqa_mcq_fewshot_200), len(llama_opbqa_mcq_fewshot_200)

(200, 200)

In [40]:
print('llama', np.mean(mistral_opbqa_mcq_fewshot_200))
print('mistral', np.mean(llama_opbqa_mcq_fewshot_200))
print('pvalue', bootstrap_test(baseline=mistral_opbqa_mcq_fewshot_200, test=llama_opbqa_mcq_fewshot_200))

llama 0.76
mistral 0.51
pvalue 0.0


In [64]:
from scipy.stats import mannwhitneyu, ttest_ind, wilcoxon, ttest_rel
def all_tests(A, B):
    tests = [
        (mannwhitneyu, "mannwhitneyu"), 
        (ttest_ind, "ttest_ind"), 
        (wilcoxon, "wilcoxon"), 
        (ttest_rel, "ttest_rel")
    ]
    for fn, name in tests:
        print(f"{name}: {fn(A, B).pvalue}")
    print(f"bootstrap: {bootstrap_test(A, B, paired=True)}")
    print(f"bootstrap-unpaired: {bootstrap_test(A, B, paired=False)}")

In [65]:
all_tests(mistral_opbqa_mcq_fewshot_200, llama_opbqa_mcq_fewshot_200)

mannwhitneyu: 2.1503481599517891e-07
ttest_ind: 1.3851098841591184e-07
wilcoxon: 6.1590119706630185e-09
ttest_rel: 1.3570433906500782e-09
bootstrap: 0.0
bootstrap-unpaired: 0.0


In [66]:
all_tests(llama_opbqa_simple_norm_on_prior_200, mistral_opbqa_simple_norm_on_prior_200)

mannwhitneyu: 0.4847809081775155
ttest_ind: 0.4851576177708934
wilcoxon: 0.32698934959801507
ttest_rel: 0.32822703308821277
bootstrap: 0.35995599999999994
bootstrap-unpaired: 0.517272


In [70]:
add_1 = (np.random.rand(len(llama_opbqa_simple_norm_on_prior_200)) - 0.5) * 0.0001
add_2 = (np.random.rand(len(llama_opbqa_simple_norm_on_prior_200)) - 0.5) * 0.0001

all_tests(
    llama_opbqa_simple_norm_on_prior_200 + add_1,
    mistral_opbqa_simple_norm_on_prior_200 + add_2
)

mannwhitneyu: 0.3783468435580267
ttest_ind: 0.4851384255369875
wilcoxon: 0.21781019839200122
ttest_rel: 0.32820996272670244
bootstrap: 0.3153760000000001
bootstrap-unpaired: 0.47297199999999995


In [71]:
all_tests(mistral_opbqa_mcq_fewshot_200 + add_1, llama_opbqa_mcq_fewshot_200 + add_2)

mannwhitneyu: 3.330958373896012e-05
ttest_ind: 1.3853408041564848e-07
wilcoxon: 4.945252167103879e-06
ttest_rel: 1.3572904826876695e-09
bootstrap: 0.0
bootstrap-unpaired: 0.0


### Замерим непрокрасившийся сетап на семпле побольше

In [72]:
mistral_opbqa_simple_norm_on_prior_2000 = apply_on_dataset_simple_norm_on_prior(
    mistral, open_book_dataset[:2000], 
    prior_prefix="Answer:"
)
np.save('cache/mistral_opbqa_simple_norm_on_prior_2000.npy', mistral_opbqa_simple_norm_on_prior_2000)
np.mean(mistral_opbqa_simple_norm_on_prior_2000)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [12:03<00:00,  2.77it/s]


0.5655

In [92]:
llama_opbqa_simple_norm_on_prior_2000 = apply_on_dataset_simple_norm_on_prior(
    llama, open_book_dataset[:2000], 
    prior_prefix="Answer:"
)
np.save('cache/mistral_opbqa_simple_norm_on_prior_2000.npy', llama_opbqa_simple_norm_on_prior_2000)
np.mean(llama_opbqa_simple_norm_on_prior_2000)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [10:52<00:00,  3.07it/s]


0.5225

In [96]:
print('mistral_opbqa_simple_norm_on_prior_2000', np.mean(mistral_opbqa_simple_norm_on_prior_2000))
print('llama_opbqa_simple_norm_on_prior_2000', np.mean(llama_opbqa_simple_norm_on_prior_2000))
print('-------------')

all_tests(mistral_opbqa_simple_norm_on_prior_2000, llama_opbqa_simple_norm_on_prior_2000)

mistral_opbqa_simple_norm_on_prior_2000 0.5655
llama_opbqa_simple_norm_on_prior_2000 0.5225
-------------
mannwhitneyu: 0.006337404430601847
ttest_ind: 0.006322441753302127
wilcoxon: 1.880218804695599e-05
ttest_rel: 1.8119041325330304e-05
bootstrap: 1.6000000000016e-05
bootstrap-unpaired: 0.006783999999999901


## Посмотрим на распределение скоров

In [89]:
def visualize(task, options, option_scores, true_answer_id):
    scored_options_str = "\n".join([f"{s} - {o}{' [TRUE]' if i == true_answer_id else ''}" for i, (o, s) in enumerate(zip(options, option_scores))])
    print(f"{task}\nOptions:\n{scored_options_str}")
    
def apply_on_dataset_with_all_tools(model, dataset, normalization=None, format_fn=None, prompt=None):
    results = []
    if format_fn is not None:
        dataset = [format_fn(x) for x in dataset]
        
    for i, row in tqdm(enumerate(dataset), position=0):
        prefix = row['task']

        if prompt:
            prefix = prompt + "\n\n" + prefix
            
        option_scores = np.array([
            score_suffix(model, prefix, option)[1]
            for option in row['options']
        ])  

        if normalization == 'length':
            option_scores /= np.array([len(r) for r in row['options']])
        elif normalization is not None:
            prior_prefix = normalization
            priors = np.array([
                score_suffix(model, prior_prefix, option)[1]
                for option in row['options']
            ])
            option_scores /= priors
        
#         if i in (0, 10, 20):
        visualize(row['task'], row['options'], option_scores, row['true_answer_id'])
        
        results.append(np.argmax(option_scores) == row['true_answer_id'])
        
    return np.array(results, dtype=np.float32)

### MCQ + Fewshot

In [82]:
apply_on_dataset_with_all_tools(
    llama, open_book_dataset[:3], 
    format_fn=make_mcq,
    prompt=fewshot_prompt
)

2it [00:00,  5.41it/s]

The sun is responsible for
1. puppies learning new tricks
2. children growing up and getting old
3. flowers wilting in a vase
4. plants sprouting, blooming and wilting
Answer:
Options:
0.1635913848876953 - 1
0.14436888694763184 - 2
0.18537330627441406 - 3
0.503896951675415 - 4 [TRUE]
When standing miles away from Mount Rushmore
1. the mountains seem very close
2. the mountains are boring
3. the mountains look the same as from up close
4. the mountains seem smaller than in photographs
Answer:
Options:
0.3910847008228302 - 1
0.18473531305789948 - 2
0.18473531305789948 - 3
0.23720481991767883 - 4 [TRUE]


3it [00:00,  5.48it/s]

When food is reduced in the stomach
1. the mind needs time to digest
2. take a second to digest what I said
3. nutrients are being deconstructed
4. reader's digest is a body of works
Answer:
Options:
0.18901626765727997 - 1
0.14720602333545685 - 2
0.5822111368179321 - 3 [TRUE]
0.07879370450973511 - 4





array([1., 0., 1.], dtype=float32)

In [83]:
apply_on_dataset_with_all_tools(
    mistral, open_book_dataset[:3], 
    format_fn=make_mcq,
    prompt=fewshot_prompt
)

1it [00:00,  4.21it/s]

The sun is responsible for
1. puppies learning new tricks
2. children growing up and getting old
3. flowers wilting in a vase
4. plants sprouting, blooming and wilting
Answer:
Options:
0.014951090328395367 - 1
0.03165145590901375 - 2
0.024650176987051964 - 3
0.9249911904335022 - 4 [TRUE]


2it [00:00,  4.22it/s]

When standing miles away from Mount Rushmore
1. the mountains seem very close
2. the mountains are boring
3. the mountains look the same as from up close
4. the mountains seem smaller than in photographs
Answer:
Options:
0.20462031662464142 - 1
0.15935847163200378 - 2
0.07527562230825424 - 3
0.5562157034873962 - 4 [TRUE]


3it [00:00,  4.23it/s]

When food is reduced in the stomach
1. the mind needs time to digest
2. take a second to digest what I said
3. nutrients are being deconstructed
4. reader's digest is a body of works
Answer:
Options:
0.011937608011066914 - 1
0.028636841103434563 - 2
0.9483218789100647 - 3 [TRUE]
0.007240525912493467 - 4





array([1., 1., 1.], dtype=float32)

### MCQ + 0-Shot


In [84]:
apply_on_dataset_with_all_tools(
    llama, open_book_dataset[:3], 
    format_fn=make_mcq,
    prompt=None
)

2it [00:00,  5.86it/s]

The sun is responsible for
1. puppies learning new tricks
2. children growing up and getting old
3. flowers wilting in a vase
4. plants sprouting, blooming and wilting
Answer:
Options:
0.11462289839982986 - 1
0.042167410254478455 - 2
0.042167410254478455 - 3
0.05414402484893799 - 4 [TRUE]
When standing miles away from Mount Rushmore
1. the mountains seem very close
2. the mountains are boring
3. the mountains look the same as from up close
4. the mountains seem smaller than in photographs
Answer:
Options:
0.14511708915233612 - 1
0.12806537747383118 - 2
0.12806537747383118 - 3
0.12806537747383118 - 4 [TRUE]


3it [00:00,  5.89it/s]

When food is reduced in the stomach
1. the mind needs time to digest
2. take a second to digest what I said
3. nutrients are being deconstructed
4. reader's digest is a body of works
Answer:
Options:
0.1293146163225174 - 1
0.07368124276399612 - 2
0.07843327522277832 - 3 [TRUE]
0.1293146163225174 - 4





array([0., 0., 0.], dtype=float32)

In [85]:
apply_on_dataset_with_all_tools(
    mistral, open_book_dataset[:3], 
    format_fn=make_mcq,
    prompt=None
)

2it [00:00,  5.55it/s]

The sun is responsible for
1. puppies learning new tricks
2. children growing up and getting old
3. flowers wilting in a vase
4. plants sprouting, blooming and wilting
Answer:
Options:
0.04783881828188896 - 1
0.054208479821681976 - 2
0.061426255851984024 - 3
0.07887287437915802 - 4 [TRUE]
When standing miles away from Mount Rushmore
1. the mountains seem very close
2. the mountains are boring
3. the mountains look the same as from up close
4. the mountains seem smaller than in photographs
Answer:
Options:
0.06996961683034897 - 1
0.048089370131492615 - 2
0.05119086056947708 - 3
0.0744822695851326 - 4 [TRUE]


3it [00:00,  5.53it/s]

When food is reduced in the stomach
1. the mind needs time to digest
2. take a second to digest what I said
3. nutrients are being deconstructed
4. reader's digest is a body of works
Answer:
Options:
0.2256922572851181 - 1
0.10660947114229202 - 2
0.06074425205588341 - 3 [TRUE]
0.0325140617787838 - 4





array([1., 1., 0.], dtype=float32)

### Скоринг

In [86]:
apply_on_dataset_with_all_tools(
    llama, open_book_dataset[:3], 
    format_fn=None,
    prompt=None
)

2it [00:00,  6.27it/s]

The sun is responsible for
Options:
5.724991336032714e-14 - puppies learning new tricks
1.0741314088180687e-11 - children growing up and getting old
6.48289364899024e-12 - flowers wilting in a vase
2.2158153201276738e-14 - plants sprouting, blooming and wilting [TRUE]
When standing miles away from Mount Rushmore
Options:
1.2116071701484543e-08 - the mountains seem very close
6.121588791430099e-10 - the mountains are boring
1.011798934566488e-12 - the mountains look the same as from up close
2.0233129408020467e-12 - the mountains seem smaller than in photographs [TRUE]


3it [00:00,  6.24it/s]

When food is reduced in the stomach
Options:
5.628290368486777e-12 - the mind needs time to digest
7.758056158474412e-17 - take a second to digest what I said
6.080840692090916e-12 - nutrients are being deconstructed [TRUE]
6.066636316517671e-20 - reader's digest is a body of works





array([0., 0., 1.], dtype=float32)

In [87]:
apply_on_dataset_with_all_tools(
    mistral, open_book_dataset[:3], 
    format_fn=None,
    prompt=None
)

2it [00:00,  5.33it/s]

The sun is responsible for
Options:
5.811069941213942e-13 - puppies learning new tricks
1.394638744767196e-12 - children growing up and getting old
2.326297574711811e-12 - flowers wilting in a vase
7.426364211082781e-13 - plants sprouting, blooming and wilting [TRUE]
When standing miles away from Mount Rushmore
Options:
1.0746776801795477e-09 - the mountains seem very close
4.674062178966487e-11 - the mountains are boring
4.422852871704269e-13 - the mountains look the same as from up close
2.4759210306979362e-11 - the mountains seem smaller than in photographs [TRUE]


3it [00:00,  5.34it/s]

When food is reduced in the stomach
Options:
4.944046907984223e-12 - the mind needs time to digest
3.686249749732844e-18 - take a second to digest what I said
8.703655743173833e-13 - nutrients are being deconstructed [TRUE]
6.309999773249721e-24 - reader's digest is a body of works





array([0., 0., 0.], dtype=float32)

### Скоринг с нормализацией

In [90]:
apply_on_dataset_with_all_tools(
    llama, open_book_dataset[:3], 
    format_fn=None,
    prompt=None,
    normalization='Answer:'
)

1it [00:00,  3.21it/s]

The sun is responsible for
Options:
0.4955384135246277 - puppies learning new tricks
118.44405364990234 - children growing up and getting old
20.253536224365234 - flowers wilting in a vase
11538.96875 - plants sprouting, blooming and wilting [TRUE]


2it [00:00,  3.20it/s]

When standing miles away from Mount Rushmore
Options:
71569.8203125 - the mountains seem very close
132.78067016601562 - the mountains are boring
627455.625 - the mountains look the same as from up close
422023.375 - the mountains seem smaller than in photographs [TRUE]


3it [00:00,  3.20it/s]

When food is reduced in the stomach
Options:
3.0362017154693604 - the mind needs time to digest
0.004176686517894268 - take a second to digest what I said
4901.15771484375 - nutrients are being deconstructed [TRUE]
0.007643749471753836 - reader's digest is a body of works





array([1., 0., 1.], dtype=float32)

In [91]:
apply_on_dataset_with_all_tools(
    mistral, open_book_dataset[:3], 
    format_fn=None,
    prompt=None,
    normalization='Answer:'
)

1it [00:00,  2.76it/s]

The sun is responsible for
Options:
19.41446876525879 - puppies learning new tricks
53.42073440551758 - children growing up and getting old
7.065333843231201 - flowers wilting in a vase
255618.84375 - plants sprouting, blooming and wilting [TRUE]


2it [00:00,  2.77it/s]

When standing miles away from Mount Rushmore
Options:
2011.4620361328125 - the mountains seem very close
16.4169921875 - the mountains are boring
130829.78125 - the mountains look the same as from up close
731890.0 - the mountains seem smaller than in photographs [TRUE]


3it [00:01,  2.77it/s]

When food is reduced in the stomach
Options:
4.504613399505615 - the mind needs time to digest
0.0017585433088243008 - take a second to digest what I said
9947.365234375 - nutrients are being deconstructed [TRUE]
4.1989492274296936e-06 - reader's digest is a body of works





array([1., 1., 1.], dtype=float32)