In [10]:
# Execute main.py inside the notebook namespace. With `-n`, IPython does not
# set __name__ to '__main__', so code guarded by `if __name__ == '__main__':`
# in main.py is not run.
%run -n main.py
# Pass the lines read from DOTENV_PATH through parse_dotenv and merge the
# resulting pairs into this process's environment variables.
lines = read_lines(DOTENV_PATH)
pairs = parse_dotenv(lines)
os.environ.update(pairs)
# NOTE(review): the second run presumably lets main.py pick up the variables
# exported just above — confirm; if main.py never reads os.environ at load
# time this repeat is redundant.
%run -n main.py

# tasks

## terra

In [None]:
# Source file for the terra task; every parsed record is kept in `items`.
# presumably one JSON object per line (helpers come from main.py) — confirm
path = 'data/rsg/TERRa/val.jsonl'
lines = read_lines(path)
items = list(parse_jsonl(lines))

In [None]:
# Fixed seed: the subset and its order are the same on every run.
random.seed(0)
# Draw 50 records per label ('entailment' first, then 'not_entailment') and
# concatenate the two samples in that order.
items = [
    chosen
    for wanted in ('entailment', 'not_entailment')
    for chosen in random.sample(
        [row for row in items if row['label'] == wanted],
        50
    )
]
# Mix the two label groups together.
random.shuffle(items)
# Rename each record's 'idx' key to 'id'.
for item in items:
    item['id'] = item.pop('idx')

In [None]:
# Serialize the sampled records and write them to tasks/terra.jsonl.
lines = format_jsonl(items)
write_lines('tasks/terra.jsonl', lines)

## danetqa

In [None]:
# Source file for the danetqa task; every parsed record is kept in `items`.
# presumably one JSON object per line (helpers come from main.py) — confirm
path = 'data/rsg/DaNetQA/val.jsonl'
lines = read_lines(path)
items = list(parse_jsonl(lines))

In [None]:
# Fixed seed: the subset and its order are the same on every run.
random.seed(0)
# Draw 50 records whose label is the True singleton, then 50 whose label is
# the False singleton, and concatenate the two samples in that order.
items = [
    chosen
    for wanted in (True, False)
    for chosen in random.sample(
        [row for row in items if row['label'] is wanted],
        50
    )
]
# Mix the two label groups together.
random.shuffle(items)
# Rename each record's 'idx' key to 'id'.
for item in items:
    item['id'] = item.pop('idx')

In [None]:
# Serialize the sampled records and write them to tasks/danetqa.jsonl.
lines = format_jsonl(items)
write_lines('tasks/danetqa.jsonl', lines)

## parus

In [None]:
# Source file for the parus task; every parsed record is kept in `items`.
# presumably one JSON object per line (helpers come from main.py) — confirm
path = 'data/rsg/PARus/val.jsonl'
lines = read_lines(path)
items = list(parse_jsonl(lines))

In [None]:
# Seed with a fixed value, as the terra and danetqa cells do. A bare
# random.seed() seeds from OS entropy / the current time, so the sampled
# records and their shuffled order would differ on every run and
# tasks/parus.jsonl could not be regenerated identically.
random.seed(0)
# 48 'effect' + 52 'cause' records, concatenated in that order.
# NOTE(review): the uneven split presumably mirrors how many records of each
# question type the source file contains — confirm.
items = (
    random.sample([_ for _ in items if _['question'] == 'effect'], 48)
    + random.sample([_ for _ in items if _['question'] == 'cause'], 52)
)
# Mix the two question types together.
random.shuffle(items)
# Rename each record's 'idx' key to 'id'.
for item in items:
    item['id'] = item.pop('idx')

In [None]:
# Serialize the sampled records and write them to tasks/parus.jsonl.
lines = format_jsonl(items)
write_lines('tasks/parus.jsonl', lines)

# eval

In [153]:
# Task to evaluate; later cells read its items from tasks/{task}.jsonl and
# pick its prompt builder from TASK_PROMPTS[task].
task = TERRA
# Name of this evaluation run; responses are cached in and written to
# evals/{eval}.jsonl.
# NOTE(review): this name shadows Python's builtin eval() for the rest of the
# kernel session.
eval = '07_code_cushman_terra'

In [171]:
# Re-execute main.py in the notebook namespace (`-n`: __name__ is not set to
# '__main__'), then read the records of the selected task.
%run -n main.py
lines = read_lines(f'tasks/{task}.jsonl')
task_items = list(parse_jsonl(lines))

In [177]:
# Start from whatever responses were already saved for this run, if any.
path = Path(f'evals/{eval}.jsonl')
eval_items = []
if path.exists():
    lines = read_lines(path)
    # In-place extension with the parsed records (same as list.extend).
    eval_items += parse_jsonl(lines)
# Number of cached responses (0 when no file exists yet).
len(eval_items)

0

In [178]:
%run -n main.py
cache_ids = {_['id'] for _ in eval_items}
nocache_task_items = [_ for _ in task_items if _['id'] not in cache_ids]

for task_item in log_progress(nocache_task_items):
    prompt = TASK_PROMPTS[task](task_item)
    response = join_tokens(openai_completions_stream(prompt, model=CODE_CUSHMAN_001))
    eval_items.append({
        'id': task_item['id'],
        'response': response
    })
    sleep(2)

100%|██████████| 3/3 [00:09<00:00,  3.03s/it]


In [179]:
# Display the responses collected so far (one {'id', 'response'} dict each).
eval_items

[{'id': 131, 'response': 'No'},
 {'id': 72, 'response': 'Yes'},
 {'id': 167, 'response': 'No'}]

In [180]:
# Labels of the first ten task records, for eyeballing against the responses.
[record['label'] for record in task_items[:10]]

['entailment',
 'not_entailment',
 'entailment',
 'entailment',
 'entailment',
 'not_entailment',
 'not_entailment',
 'not_entailment',
 'not_entailment',
 'entailment']

In [150]:
# Persist every collected response to evals/{eval}.jsonl — the same file the
# cache-loading cell reads, so a later run can skip ids already answered.
lines = format_jsonl(eval_items)
write_lines(f'evals/{eval}.jsonl', lines)

# score

In [152]:
%run -n main.py
model_task_evals = [
    (TEXT_DAVINCHI_003, TERRA, '01_davinci_terra'),
    (TEXT_DAVINCHI_003, DANETQA, '02_davinci_danetqa'),
    (TEXT_DAVINCHI_003, PARUS, '03_davinci_parus'),
    (GPT_35_TURBO_0301, PARUS, '04_turbo_parus'),
    (GPT_35_TURBO_0301, DANETQA, '05_turbo_danetqa'),
    (GPT_35_TURBO_0301, TERRA, '06_turbo_terra'),
]
data = []
for model, task, eval in model_task_evals:
    lines = read_lines(f'tasks/{task}.jsonl')
    id_targets = {
        _['id']: _['label']
        for _ in parse_jsonl(lines)
    }
    
    lines = read_lines(f'evals/{eval}.jsonl')
    norm_response = NORM_RESPONSES[task]
    id_preds = {
        _['id']: norm_response(_['response'])
        for _ in parse_jsonl(lines)
    }
    score = acc_score(id_targets, id_preds)
    data.append((model, task, score))
data

[('text-davinci-003', 'terra', (0.91, 0)),
 ('text-davinci-003', 'danetqa', (0.79, 0)),
 ('text-davinci-003', 'parus', (0.93, 0)),
 ('gpt-3.5-turbo-0301', 'parus', (0.9130434782608695, 8)),
 ('gpt-3.5-turbo-0301', 'danetqa', (0.8350515463917526, 3)),
 ('gpt-3.5-turbo-0301', 'terra', (0.86, 0))]

# explore

In [131]:
# Disabled error-inspection snippet: for every record of the current task
# whose prediction differs from its target, display the record, the target and
# the prediction. It relies on `task`, `id_targets` and `id_preds` being left
# in the kernel by earlier cells.
# lines = read_lines(f'tasks/{task}.jsonl')
# id_task_items = {
#     _['id']: _
#     for _ in parse_jsonl(lines)
# }

# for id, task_item in id_task_items.items():
#     target = id_targets[id]
#     pred = id_preds[id]
#     if target == pred:
#         continue
        
#     display(task_item)
#     display(target)
#     display(pred)