In [304]:
# Execute main.py in the notebook namespace; -n keeps __name__ from being
# '__main__', so any script-only entry point in main.py is skipped.
%run -n main.py
# Read the dotenv file and export its entries to this process's environment.
lines = read_lines(DOTENV_PATH)
pairs = parse_dotenv(lines)
os.environ.update(pairs)
# NOTE(review): main.py is run a second time after the environment update,
# presumably so module-level code that reads os.environ sees the new values — confirm.
%run -n main.py

# rsg scores var

In [None]:
%run -n main.py
table = pd.DataFrame(RSG_LB)

xs, ys = [], []
tasks = ['RCB', 'PARus', 'MuSeRC', 'TERRa', 'RUSSE', 'RWSD', 'DaNetQA', 'RuCoS']
for task_index, task in enumerate(tasks):
    for score in table[task]:
        xs.append(score)
        ys.append(task_index + random.random() / 10)

fig, ax = plt.subplots()
ax.scatter(xs, ys, alpha=0.5)
ax.set_xlim(0, 1)
ax.set_yticks(range(len(tasks)))
ax.set_yticklabels(tasks)

# tasks

## terra

In [None]:
path = 'data/rsg/TERRa/val.jsonl'
lines = read_lines(path)
items = list(parse_jsonl(lines))

In [None]:
random.seed(0)
items = (
    random.sample([_ for _ in items if _['label'] == 'entailment'], 50)
    + random.sample([_ for _ in items if _['label'] == 'not_entailment'], 50)
)
random.shuffle(items)
for item in items:
    item['id'] = item.pop('idx')

In [None]:
lines = format_jsonl(items)
write_lines('tasks/terra.jsonl', lines)

## danetqa

In [None]:
path = 'data/rsg/DaNetQA/val.jsonl'
lines = read_lines(path)
items = list(parse_jsonl(lines))

In [None]:
random.seed(0)
items = (
    random.sample([_ for _ in items if _['label'] is True], 50)
    + random.sample([_ for _ in items if _['label'] is False], 50)
)
random.shuffle(items)
for item in items:
    item['id'] = item.pop('idx')

In [None]:
lines = format_jsonl(items)
write_lines('tasks/danetqa.jsonl', lines)

## parus

In [None]:
path = 'data/rsg/PARus/val.jsonl'
lines = read_lines(path)
items = list(parse_jsonl(lines))

In [None]:
random.seed(0)
items = (
    random.sample([_ for _ in items if _['question'] == 'effect'], 48)
    + random.sample([_ for _ in items if _['question'] == 'cause'], 52)
)
random.shuffle(items)
for item in items:
    item['id'] = item.pop('idx')

In [None]:
lines = format_jsonl(items)
write_lines('tasks/parus.jsonl', lines)

## rwsd

In [None]:
path = 'data/rsg/RWSD/val.jsonl'
lines = read_lines(path)
items = list(parse_jsonl(lines))

In [None]:
random.seed(0)
items = (
    random.sample([_ for _ in items if _['label'] == True], 50)
    + random.sample([_ for _ in items if _['label'] == False], 50)
)
random.shuffle(items)
for item in items:
    item['id'] = item.pop('idx')

In [None]:
lines = format_jsonl(items)
write_lines('tasks/rwsd.jsonl', lines)

## russe

In [None]:
path = 'data/rsg/RUSSE/val.jsonl'
lines = read_lines(path)
items = list(parse_jsonl(lines))

In [None]:
random.seed(0)
items = (
    random.sample([_ for _ in items if _['label'] == True], 50)
    + random.sample([_ for _ in items if _['label'] == False], 50)
)
random.shuffle(items)
for item in items:
    item['id'] = item.pop('idx')

In [None]:
lines = format_jsonl(items)
write_lines('tasks/russe.jsonl', lines)

## rucola

In [None]:
path = 'data/rucola/out_of_domain_dev.csv'
items = list(read_csv(path))

In [None]:
random.seed(0)
items = (
    random.sample([_ for _ in items if _['acceptable'] == '0' and _['error_type'] == 'Hallucination'], 50)
    + random.sample([_ for _ in items if _['acceptable'] == '1'], 50)
)
random.shuffle(items)
for item in items:
    item['label'] = item.pop('acceptable')

In [None]:
lines = format_jsonl(items)
write_lines('tasks/rucola.jsonl', lines)

# eval apis

In [None]:
%run -n main.py
# Task to evaluate; DANETQA is not defined in this notebook, presumably it comes from main.py.
task = DANETQA
# Name of the eval run; later cells use it to build the evals/<name>.jsonl path.
# NOTE(review): this shadows the builtin `eval`. Later cells read this exact
# name, so any rename has to be done in all of them together.
eval = f'17_curie_{task}'

In [None]:
%run -n main.py
# Load the prepared items of the selected task from tasks/<task>.jsonl.
lines = read_lines(f'tasks/{task}.jsonl')
task_items = list(parse_jsonl(lines))

In [None]:
%run -n main.py
# Preview the prompt built for one randomly picked item. The pick depends on the
# current state of the global `random` generator, so it can differ between runs.
task_item = random.choice(task_items)
prompt = TASK_PROMPTS[task](task_item)
print(prompt)

In [None]:
%run -n main.py
# Request a single completion for the current `prompt`, using '---' as the stop
# sequence, and show the result.
response = openai_completions(prompt, model=TEXT_CURIE_001, stop='---')
print(response)

In [None]:
# Resume from previously stored responses of this eval run, if any.
path = Path(f'evals/{eval}.jsonl')
if path.exists():
    lines = read_lines(path)
    eval_items = list(parse_jsonl(lines))
else:
    # No stored file yet: start with an empty result list.
    eval_items = []
len(eval_items)

In [None]:
%run -n main.py
cache_ids = {_['id'] for _ in eval_items}
nocache_task_items = [_ for _ in task_items if _['id'] not in cache_ids]

for task_item in log_progress(nocache_task_items):
    prompt = TASK_PROMPTS[task](task_item)

    response = openai_completions(prompt, model=TEXT_CURIE_001, stop='---')
    sleep(2)

    # response = openai_chat_completions(prompt, model=GPT_35_TURBO_0301, stop='---')
    # sleep(2)

    # response = cohere_generate(prompt, end_sequences=['---'])
    # sleep(12)

    eval_items.append({
        'id': task_item['id'],
        'response': response
    })


In [None]:
# Display every collected {'id', 'response'} record for inspection.
eval_items

In [None]:
# Labels of the first ten task items.
[item['label'] for item in task_items[:10]]

In [None]:
# Persist everything in eval_items to evals/<eval>.jsonl.
lines = format_jsonl(eval_items)
write_lines(f'evals/{eval}.jsonl', lines)

# score

In [317]:
%run -n main.py
# Show the mapping from (model, task) to the name of the eval file to score.
MODEL_TASK_EVALS

{('text-davinci-003', 'terra'): '01_davinci_terra',
 ('text-davinci-003', 'danetqa'): '02_davinci_danetqa',
 ('text-davinci-003', 'parus'): '03_davinci_parus',
 ('gpt-3.5-turbo-0301', 'parus'): '04_turbo_parus',
 ('gpt-3.5-turbo-0301', 'danetqa'): '05_turbo_danetqa',
 ('gpt-3.5-turbo-0301', 'terra'): '06_turbo_terra',
 ('xlarge', 'parus'): '07_cohere_parus',
 ('xlarge', 'danetqa'): '08_cohere_danetqa',
 ('xlarge', 'terra'): '09_cohere_terra',
 ('gpt-3.5-turbo-0301', 'rwsd'): '12_turbo_rwsd',
 ('gpt-3.5-turbo-0301', 'russe'): '13_turbo_russe',
 ('gpt-3.5-turbo-0301', 'rucola'): '14_turbo_rucola',
 ('text-curie-001', 'parus'): '15_curie_parus',
 ('text-curie-001', 'terra'): '16_curie_terra',
 ('text-curie-001', 'danetqa'): '17_curie_danetqa',
 ('sberbank-ai/rugpt3small_based_on_gpt2', 'terra'): '18_rugpt3_small_terra',
 ('sberbank-ai/rugpt3small_based_on_gpt2',
  'danetqa'): '19_rugpt3_small_danetqa',
 ('sberbank-ai/rugpt3small_based_on_gpt2', 'parus'): '20_rugpt3_small_parus',
 ('sberba

In [318]:
%run -n main.py
model_task_scores = []
for (model, task), eval in MODEL_TASK_EVALS.items():
    lines = read_lines(f'tasks/{task}.jsonl')
    id_targets = {
        _['id']: _['label']
        for _ in parse_jsonl(lines)
    }
    
    lines = read_lines(f'evals/{eval}.jsonl')
    norm_response = NORM_RESPONSES[task]
    id_preds = {
        _['id']: norm_response(_['response'])
        for _ in parse_jsonl(lines)
    }
    score = acc_score(id_targets, id_preds)
    model_task_scores.append((model, task, score))

table = scores_table(model_task_scores)
table

task,terra,danetqa,parus,rwsd,russe,rucola
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
human,0.92,0.915,0.982,0.84,0.805,0.84
sota,0.877,0.917,0.908,0.675,0.823,0.82
openai/turbo,0.86,"0.82, 3!","0.91, 8!",0.79,0.68,0.53
openai/davinci,0.91,0.79,0.93,-,-,-
openai/curie,0.57,"0.71, 1!",0.60,-,-,-
cohere/xlarge,0.72,0.50,0.54,-,-,-
sberbank-ai/rugpt3small_based_on_gpt2,"0.66, 71!","0.61, 36!","0.51, 51!","0.32, 59!","0.57, 54!","0.48, 46!"
sberbank-ai/rugpt3medium_based_on_gpt2,"0.48, 25!","0.54, 28!","0.70, 67!","0.55, 16!","0.49, 17!","0.51, 14!"
sberbank-ai/rugpt3large_based_on_gpt2,"0.59, 56!","0.51, 27!","0.51, 63!","0.36, 34!","0.52, 42!","0.51, 55!"
facebook/xglm-1.7B,"0.56, 84!","0.52, 79!","?, 95!","0.35, 83!","0.31, 84!","0.58, 74!"


In [8]:
#!c1.4
# Rebuild the target and prediction maps for a single eval run so they can be
# inspected by hand.
task = 'parus'
eval = '20_rugpt3_small_parus'

# Expected label per item id.
lines = read_lines(f'tasks/{task}.jsonl')
id_targets = dict(
    (item['id'], item['label'])
    for item in parse_jsonl(lines)
)

# Normalised model response per item id.
lines = read_lines(f'evals/{eval}.jsonl')
norm_response = NORM_RESPONSES[task]
id_preds = dict(
    (item['id'], norm_response(item['response']))
    for item in parse_jsonl(lines)
)

In [11]:
# Item ids present in both the predictions and the targets.
set(id_preds).intersection(id_targets)

{1, 11, 26, 52, 53, 66, 74, 82, 91, 95}