In [None]:
# Execute main.py in the notebook namespace; `-n` runs it without setting
# __name__ to '__main__', so a main-guard in the script does not fire.
%run -n main.py
# Read the OpenAI token from the local `.secret` file rather than hard-coding it.
# read_lines / parse_dotenv / openai are presumably provided by main.py — TODO confirm.
lines = read_lines('.secret')
env = dict(parse_dotenv(lines))
openai.api_key = env['OPENAI_TOKEN']

# tasks

In [None]:
# table = pd.read_excel('tasks/user_oriented_annot.xlsx')
# table = table[~table.done.isnull()]
# table = table.where(pd.notnull(table), None)

# view = table[['id', 'orig_instruction', 'orig_input']]
# view = view.rename(columns={
#     'orig_instruction': 'instruction',
#     'orig_input': 'input'
# })
# items = view.to_dict(orient='records')
# lines = format_jsonl(items)
# write_lines('tasks/user_oriented_en.jsonl', lines)

# view = table[['id', 'instruction', 'input']]
# items = view.to_dict(orient='records')
# lines = format_jsonl(items)
# write_lines('tasks/user_oriented_ru.jsonl', lines)

In [None]:
# table = pd.read_excel('tasks/vicuna_question_annot.xlsx')

# view = table[['id', 'category', 'instruction']]
# items = view.to_dict(orient='records')
# lines = format_jsonl(items)
# write_lines('tasks/vicuna_question_ru.jsonl', lines)

# view = table[['id', 'category', 'orig_instruction']]
# view = view.rename(columns={'orig_instruction': 'instruction'})
# items = view.to_dict(orient='records')
# lines = format_jsonl(items)
# write_lines('tasks/vicuna_question_en.jsonl', lines)

# models

## gusev_7b_ru_alpaca_lora

In [None]:
# Messages that presumably motivated the setting below (kept verbatim):
#   LLM.int8() requires Turing or Ampere GPUs.
#   WARNING: No libcudart.so found! Install CUDA or the cudatoolkit package (anaconda)!
# LD_LIBRARY_PATH is pointed at the CUDA 11.7 libraries before torch / peft /
# transformers are imported.
# NOTE(review): this replaces any existing LD_LIBRARY_PATH value and the path
# is machine-specific — confirm for other environments.
import os
os.environ['LD_LIBRARY_PATH'] = '/usr/local/cuda-11.7/targets/x86_64-linux/lib'

import torch
from peft import (
    PeftModel,
    PeftConfig
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig
)

In [None]:
# Adapter id; its PEFT config supplies the id of the base checkpoint to load.
model_name = 'IlyaGusev/llama_7b_ru_turbo_alpaca_lora'
config = PeftConfig.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    load_in_8bit=True,

    # Passed explicitly; presumably addresses the loader message
    # "Overriding torch_dtype=None with `torch_dtype=torch.float16`".
    torch_dtype=torch.float16,
    
    # Presumably addresses the loader message
    # "A device map needs to be passed to run convert models into mixed-int8".
    device_map='auto'
)
# The tokenizer is loaded from the adapter id, not from the base checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Wrap the base model with the adapter; `model` now refers to the PeftModel.
model = PeftModel.from_pretrained(model, model_name)

## gusev_13b_ru_alpaca_lora

In [None]:
# Adapter id; its PEFT config supplies the id of the base checkpoint to load.
model_name = 'IlyaGusev/llama_13b_ru_turbo_alpaca_lora'
config = PeftConfig.from_pretrained(model_name)
# Base checkpoint loaded with load_in_8bit, float16 dtype and device_map='auto'.
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map='auto'
)
# The tokenizer is loaded from the adapter id, not from the base checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Wrap the base model with the adapter; `model` now refers to the PeftModel.
model = PeftModel.from_pretrained(model, model_name)

## gusev_7b_en_alpaca_lora

In [None]:
model_name = 'IlyaGusev/alpaca_7b_lora_reproduce'

config = PeftConfig.from_pretrained(model_name)
# Replace the base checkpoint recorded in the adapter config with an explicit id.
# NOTE(review): the reason is not visible here; presumably the recorded value
# cannot be loaded as-is — confirm.
config.base_model_name_or_path = 'decapoda-research/llama-7b-hf'

# Base checkpoint loaded with load_in_8bit, float16 dtype and device_map='auto'.
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map='auto'
)
# The tokenizer is loaded from the adapter id, not from the base checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Wrap the base model with the adapter; `model` now refers to the PeftModel.
model = PeftModel.from_pretrained(model, model_name)

## chainyo_7b_en_alpaca_lora

In [None]:
# Unlike the cells above there is no PeftConfig / PeftModel step here:
# tokenizer and model are both loaded directly from model_name.
model_name = 'chainyo/alpaca-lora-7b'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Same loading options as the other LLaMA cells: 8-bit, float16, auto device map.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map='auto',
)

## wortega_instruct_rugpt_large

In [None]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

model_name = 'AlexWortega/instruct_rugptlarge'
device = 'cuda:0'

# Tokenizer first: the extra markup tokens are registered on it before the
# model's embedding matrix is resized to the new vocabulary size.
tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
special_tokens_dict = dict(
    additional_special_tokens=[
        '<code>',
        '</code>',
        '<instructionS>',
        '<instructionE>',
        '<next>',
    ]
)
tokenizer.add_special_tokens(special_tokens_dict)

# Load the weights, move them to the GPU, then match the embedding size to
# the tokenizer length.
model = GPT2LMHeadModel.from_pretrained(model_name)
model = model.to(device)
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Build a prompt for every pending task, generate a completion with the loaded
# model and collect the result as an EvalRecord(id, prompt, output).
# NOTE(review): task_records and eval_records are assigned in the "evals"
# section further down, so that section has to be run before this cell.
for record in log_progress(task_records):
    prompt = instruct_rugpt_prompt(record)
    output = instruct_rugpt_complete(prompt, model, tokenizer)
    eval_records.append(EvalRecord(record.id, prompt, output))

# evals

In [None]:
# (Re)run main.py to load its helpers, then choose the task set to evaluate
# and the file that holds the eval results for it.
%run -n main.py
task_path = 'tasks/user_oriented_ru.jsonl'
eval_path = 'evals/user_oriented_ru/openai_turbo.jsonl'

In [None]:
# Results already stored at eval_path act as a cache: only tasks whose id has
# no stored result remain in task_records.
eval_records = list(load_eval(eval_path))
cache_ids = set(record.id for record in eval_records)
task_records = [
    record
    for record in load_task(task_path)
    if record.id not in cache_ids
]
print('eval:', len(eval_records))
print('task:', len(task_records))

In [None]:
# (Re)run main.py before evaluating the pending tasks.
%run -n main.py
for task_record in log_progress(task_records):
    eval_record = eval_ru_openai(task_record, GPT_35_TURBO)

    # Every result is printed as it arrives, but only truthy ones are kept
    # (eval_ru_openai presumably returns a falsy value on failure — confirm).
    print(eval_record)
    if eval_record:
        eval_records.append(eval_record) 

In [None]:
# Save the full list (previously loaded + newly appended records) to eval_path.
# dump_eval is presumably defined in main.py — TODO confirm.
dump_eval(eval_path, eval_records)

# annot

In [None]:
# List the eval files available for the ru_vicuna_question task set.
!ls evals/ru_vicuna_question

In [None]:
# (Re)run main.py, then pick the two eval runs to compare side by side.
%run -n main.py
a_name = 'gusev_7b_ru_alpaca_lora'
b_name = 'gusev_13b_ru_alpaca_lora'

# Parse each run's JSONL file into a list of items; `lines` is reused for both.
lines = read_lines(f'evals/ru_vicuna_question/{a_name}.jsonl')
a_items = list(parse_jsonl(lines))

lines = read_lines(f'evals/ru_vicuna_question/{b_name}.jsonl')
b_items = list(parse_jsonl(lines))

In [None]:
import pandas as pd

# Build the side-by-side (SbS) annotation workbook for runs a and b: one row
# per task id present in both eval files, with an empty `label` column.
id_a_items = {_['id']: _ for _ in a_items}
id_b_items = {_['id']: _ for _ in b_items}
items = []
# `item_id` rather than `id`: a top-level loop variable named `id` would
# shadow the builtin for the rest of the notebook session.
for item_id in sorted(id_a_items.keys() & id_b_items.keys()):
    a_item = id_a_items[item_id]
    b_item = id_b_items[item_id]

    items.append({
        'id': item_id,
        # The prompt is taken from run b; presumably both runs share the same
        # prompt for a given id — TODO confirm.
        'prompt': b_item['prompt'],
        'a': a_item['output'],
        'b': b_item['output'],
        'label': None
    })

# Explicit columns keep the header row even when the two runs share no ids.
table = pd.DataFrame(items, columns=['id', 'prompt', 'a', 'b', 'label'])
# NOTE(review): this overwrites an existing workbook at the same path,
# including any labels already entered — confirm before re-running.
table.to_excel(f'sbs/ru_vicuna_question/{a_name}_{b_name}.xlsx', index=False)

In [None]:
# Open the sbs directory with the system `open` command
# (presumably macOS; not portable to other platforms — confirm).
!open sbs

# report

In [None]:
# Render matplotlib figures inline in the notebook, using the high-resolution
# 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [None]:
from pathlib import Path

# pandas is imported here as well so the report section does not depend on
# the annotation cell further up having been executed first.
import pandas as pd

# Runs whose SbS workbooks (each compared against openai_davinci_003) are read.
names = [
#     'openai_turbo',
    'openai_davinci_002',
    'gusev_7b_ru_alpaca_lora',
#     'gusev_13b_ru_alpaca_lora',
    'wortega_instruct_rugpt_large',
]

# Raw annotation label -> legend title. The key order is also the column
# order of the report, so order and titles are defined in one place.
label_titles = {
    '2': 'лучше',
    '1': 'получше',
    '0': 'примерно одинаково',
    '-1': 'похуже',
    '-2': 'хуже',
    '?': 'пустой ответ',
}

# Count how often each label occurs in every workbook, keyed by (name, label).
data = {}
for name in names:
    path = f'sbs/ru_vicuna_question/openai_davinci_003_{name}.xlsx'
    # dtype='str' keeps labels such as '-1' and '?' as strings, matching the
    # keys of label_titles.
    annot = pd.read_excel(path, dtype='str')

    label_counts = annot.label.value_counts()
    for label, count in label_counts.items():
        data[name, label] = count

# data['openai_davinci_003', '0'] = 30

# Pivot to one row per run and one column per label.
table = pd.Series(data)
table = table.unstack()

# Fix the row order of the chart and the column order of the labels.
table = table.reindex(
    index=[
        'wortega_instruct_rugpt_large',

#         'gusev_13b_ru_alpaca_lora',
        'openai_davinci_002',
        'gusev_7b_ru_alpaca_lora',

#         'openai_turbo',
    ],
    columns=list(label_titles)
)
# Fill only after reindexing: reindex introduces NaN for labels (or runs) that
# never occurred, and those must count as 0 as well.
table = table.fillna(0)
table = table.rename(columns=label_titles)

# Stacked horizontal bars, one per run; the legend is placed outside the axes.
table.plot(
    title='SbS с openai_davinci_003 на ru_vicuna_question',
    kind='barh',
    stacked=True,
    width=0.9,
    alpha=0.7,
    xlabel='# заданий'
).legend(
    loc='upper left',
    bbox_to_anchor=(1.0, 1.0)
)

In [None]:
# Open the sbs directory with the system `open` command
# (presumably macOS; not portable to other platforms — confirm).
!open sbs