In [None]:
# Load main.py's definitions into the notebook namespace. With -n, __name__ is
# not set to "__main__", so any `if __name__ == "__main__":` block is skipped.
%run -n main.py
# Read the dotenv file and export its entries into this process's environment
# (presumably KEY=VALUE pairs — verify against parse_dotenv).
lines = read_lines(DOTENV_PATH)
pairs = parse_dotenv(lines)
os.environ.update(pairs)
# Run main.py again now that the environment is populated — presumably so that
# module-level code reading os.environ sees the new values; TODO confirm.
%run -n main.py

# tasks

## terra

In [None]:
# Load the raw TERRa records from val.jsonl into a list.
path = 'data/rsg/TERRa/val.jsonl'
lines = read_lines(path)
items = [*parse_jsonl(lines)]

In [None]:
# Fixed seed so the same subset and order are produced on every run.
random.seed(0)

# Split by label, then draw 50 items per class for a balanced 100-item task.
entailed = [item for item in items if item['label'] == 'entailment']
not_entailed = [item for item in items if item['label'] == 'not_entailment']
items = random.sample(entailed, 50) + random.sample(not_entailed, 50)

# Mix the two classes together.
random.shuffle(items)

# Rename the 'idx' key to 'id' (the eval and score cells key on 'id').
for item in items:
    item['id'] = item.pop('idx')

In [None]:
# Serialize the sampled TERRa items to JSONL and write the task file.
task_path = 'tasks/terra.jsonl'
lines = format_jsonl(items)
write_lines(task_path, lines)

## danetqa

In [None]:
# Load the raw DaNetQA records from val.jsonl into a list.
path = 'data/rsg/DaNetQA/val.jsonl'
lines = read_lines(path)
items = [*parse_jsonl(lines)]

In [None]:
# Fixed seed so the same subset and order are produced on every run.
random.seed(0)

# Labels are compared by identity against the booleans True / False.
yes_items = [item for item in items if item['label'] is True]
no_items = [item for item in items if item['label'] is False]

# Draw 50 items per answer for a balanced 100-item task, then mix them.
items = random.sample(yes_items, 50) + random.sample(no_items, 50)
random.shuffle(items)

# Rename the 'idx' key to 'id' (the eval and score cells key on 'id').
for item in items:
    item['id'] = item.pop('idx')

In [None]:
# Serialize the sampled DaNetQA items to JSONL and write the task file.
task_path = 'tasks/danetqa.jsonl'
lines = format_jsonl(items)
write_lines(task_path, lines)

## parus

In [None]:
# Load the raw PARus records from val.jsonl into a list.
path = 'data/rsg/PARus/val.jsonl'
lines = read_lines(path)
items = [*parse_jsonl(lines)]

In [None]:
# Seed with a fixed value, as the TERRa and DaNetQA cells do. A bare
# random.seed() reseeds from system entropy, so the selection and order written
# to tasks/parus.jsonl would differ on every run.
random.seed(0)

# Split by question type. The 48 / 52 counts are not balanced like the other
# tasks — presumably they match what the split provides; TODO confirm.
effect_items = [item for item in items if item['question'] == 'effect']
cause_items = [item for item in items if item['question'] == 'cause']
items = random.sample(effect_items, 48) + random.sample(cause_items, 52)

# Mix the two question types together.
random.shuffle(items)

# Rename the 'idx' key to 'id' (the eval and score cells key on 'id').
for item in items:
    item['id'] = item.pop('idx')

In [None]:
# Serialize the sampled PARus items to JSONL and write the task file.
task_path = 'tasks/parus.jsonl'
lines = format_jsonl(items)
write_lines(task_path, lines)

# eval

In [None]:
%run -n main.py
lines = read_lines('tasks/parus.jsonl')
task_items = list(parse_jsonl(lines))

In [None]:
# Ids that already have a saved response, so the evaluation loop can skip them.
# Always a set (the previous `{}` placeholder was an empty dict). The records
# are read without rebinding `items`.
cache_path = 'evals/03_openai_parus.jsonl'
try:
    lines = read_lines(cache_path)
    cache_ids = {_['id'] for _ in parse_jsonl(lines)}
except FileNotFoundError:
    # First run: no results file yet, so nothing is cached.
    cache_ids = set()
len(cache_ids)

In [None]:
# Result list that the evaluation loop appends to. It is created only when it
# does not exist yet, so re-running this cell never discards responses collected
# in the current session. On a fresh kernel it is seeded from the saved results
# file (if present), so writing `eval_items` back later keeps earlier responses.
if 'eval_items' not in globals():
    try:
        eval_items = list(parse_jsonl(read_lines('evals/03_openai_parus.jsonl')))
    except FileNotFoundError:
        eval_items = []

In [None]:
%run -n main.py
for task_item in log_progress([_ for _ in task_items if _['id'] not in cache_ids]):
    prompt = parus_prompt(task_item)
    response = join_tokens(openai_generate_stream(prompt, model=TEXT_DAVINCHI_003))
    eval_items.append({
        'id': task_item['id'],
        'response': response
    })
    sleep(2)

In [None]:
# Number of response records currently held in eval_items.
len(eval_items)

In [None]:
# Show one example prompt. It is built from the first task item rather than read
# from the loop variable `prompt`, which is undefined when every task was already
# cached and the evaluation loop body never ran.
print(parus_prompt(task_items[0]))

In [None]:
# Display every collected {'id', 'response'} record for manual inspection.
eval_items

In [None]:
# Gold labels of the first five task items — presumably for a quick visual
# comparison with the responses.
[task_item['label'] for task_item in task_items[:5]]

In [None]:
# Serialize the collected responses to JSONL and write the results file.
eval_path = 'evals/03_openai_parus.jsonl'
lines = format_jsonl(eval_items)
write_lines(eval_path, lines)

# score

In [None]:
%run -n main.py
model_task_evals = [
    (TEXT_DAVINCHI_003, TERRA, '01_openai_terra'),
    (TEXT_DAVINCHI_003, DANETQA, '02_openai_danetqa'),
    (TEXT_DAVINCHI_003, PARUS, '03_openai_parus'),
]
data = []
for model, task, eval in model_task_evals:
    lines = read_lines(f'tasks/{task}.jsonl')
    id_targets = {
        _['id']: _['label']
        for _ in parse_jsonl(lines)
    }
    
    lines = read_lines(f'evals/{eval}.jsonl')
    norm_response = NORM_RESPONSE[task]
    id_preds = {
        _['id']: norm_response(_['response'])
        for _ in parse_jsonl(lines)
    }
    score = acc_score(id_targets, id_preds)
    data.append((model, task, score))
data