In [137]:
# Load the project helpers defined in main.py into the notebook namespace.
# With -n, __name__ is not set to '__main__', so main-guarded code in the script is skipped.
%run -n main.py
# Secrets are read from a local .env file instead of being hard-coded in the notebook.
# NOTE(review): read_dotenv presumably yields (key, value) pairs — confirm in main.py.
dotenv = dict(read_dotenv('.env'))
openai.api_key = dotenv['OPENAI_TOKEN']

# sources

## alpaca

In [None]:
# Download the self-instruct "user oriented instructions" file into data/sources/alpaca.
# -L makes curl follow HTTP redirects; the response body is redirected into the target file.
!mkdir -p data/sources/alpaca
!curl -L https://github.com/yizhongw/self-instruct/raw/main/human_eval/user_oriented_instructions.jsonl \
    > data/sources/alpaca/user_oriented_instructions.jsonl

In [None]:
%run -n main.py
items = read_jsonl('data/sources/alpaca/user_oriented_instructions.jsonl')
alpaca_items = list(parse_alpaca(items))

## vicuna

In [None]:
# Download the Vicuna blog-eval question set into data/sources/vicuna.
# -L makes curl follow HTTP redirects; the response body is redirected into the target file.
!mkdir -p data/sources/vicuna
!curl -L https://github.com/lm-sys/vicuna-blog-eval/raw/main/eval/table/question.jsonl \
    > data/sources/vicuna/question.jsonl

In [None]:
%run -n main.py
items = read_jsonl('data/sources/vicuna/question.jsonl')
vicuna_items = list(parse_vicuna(items))

## arena

In [None]:
!mkdir -p data/sources/arena
!curl -L curl -L https://huggingface.co/datasets/lmsys/chatbot_arena_conversations/resolve/main/data/train-00000-of-00001-cced8514c7ed782a.parquet \
    > data/sources/arena/train-00000-of-00001-cced8514c7ed782a.parquet

In [None]:
%run -n main.py
records = pd.read_parquet('data/sources/arena/train-00000-of-00001-cced8514c7ed782a.parquet').itertuples()
arena_items = list(parse_arena(records))

# orig

In [None]:
# Start the combined pool with every alpaca item followed by every vicuna item.
orig_items = [*alpaca_items, *vicuna_items]

In [None]:
# Deduplicate English arena prompts by instruction text (the last occurrence wins).
instruction_items = {
    _['instruction']: _
    for _ in arena_items
    if _['lang'] == 'English'
}
arena_pool = list(instruction_items.values())

# Use a dedicated, seeded RNG so the sample is reproducible and the global
# `random` state is left untouched. Cap the sample size so a short pool
# does not raise ValueError.
arena_sample = random.Random(0).sample(arena_pool, min(1000, len(arena_pool)))

# Rebuild the list instead of extending it in place, so re-running this cell
# does not append another batch of arena items.
orig_items = alpaca_items + vicuna_items + arena_sample

In [None]:
# Persist the combined instruction pool; later cells reload it from this file.
write_jsonl('data/orig.jsonl', orig_items)

In [12]:
# Reload the saved pool and display five random items as a sanity check.
orig_items = [*read_jsonl('data/orig.jsonl')]
random.sample(orig_items, k=5)

[{'id': '6711af7d-4165-45e2-a45e-755c4eca5026',
  'source': 'arena',
  'source_id': '1ad24688c2a545e4b40be8c8f0129a8f',
  'lang': 'English',
  'instruction': 'convert movie to emoji : midnight express, shinning'},
 {'id': '418214b0-fdb9-4634-ad50-0701b1bab193',
  'source': 'arena',
  'source_id': 'ca5b0426f2ab48f6b0a82ddeace44480',
  'lang': 'English',
  'instruction': 'can you create me 3 music band name with that acronym: DIK and give a description'},
 {'id': '4d5561ff-d153-4f84-8def-8d15525e0013',
  'source': 'alpaca',
  'source_id': 'user_oriented_task_67',
  'instruction': 'You should choose a YouTube video title based on the video\'s content. A video\'s title tells viewers what to expect from it. It should be direct, honest, and clear. The title of the video needs to capture the attention of viewers, so do not use an unclear or ambiguous one.\n\n"A research study has been conducted to determine if exercise really can "boost" your metabolism."'},
 {'id': 'e96d61ad-7d3a-4fbf-b2b4-6

# translate

In [None]:
# One translation job per original item; `answer` stays None until a worker fills it in.
translate_items = [
    dict(id=item['id'], instruction=item['instruction'], answer=None)
    for item in orig_items
]

In [None]:
%run -n main.py
items = [_ for _ in translate_items if not _['answer']]
queue = iter(tqdm(items))
workers = [openai_translate_worker(queue) for _ in range(10)]
await asyncio.gather(*workers);

In [None]:
# Save the translation jobs (including any answers filled in so far).
write_jsonl('data/translate.jsonl', translate_items)

# label studio

In [None]:
%run -n main.py
# Connect to the Label Studio instance on localhost using the token from .env.
label_studio = label_studio_sdk.Client('http://localhost:8080', dotenv['LABELSTUDIO_TOKEN'])
# Last expression of the cell: its result is displayed as the cell output.
label_studio.check_connection()

In [None]:
# Index the Label Studio projects by title (a later project with the same title wins).
title_projects = {}
for project in label_studio.list_projects():
    title_projects[project.title] = project

# Both projects must already exist; a missing title raises KeyError.
translate_project = title_projects['translate']
classify_project = title_projects['classify']

# translate annot

In [None]:
# Materialise the file contents, as the other cells that reuse a read_jsonl result do:
# without list(), `translate_items` would be left as a spent iterator after the
# comprehension below if read_jsonl is lazy.
translate_items = list(read_jsonl('data/translate.jsonl'))
# Convert every translation item into a Label Studio task payload.
annot_items = [translate_annot_item(_) for _ in translate_items]
# Display one random task as a sanity check.
random.choice(annot_items)

In [None]:
# Destructive: removes every task currently in the "translate" project before
# uploading the freshly built ones. Trailing `;` suppresses the cell output.
translate_project.delete_all_tasks();
translate_project.import_tasks(annot_items);

In [None]:
# Pull the annotated tasks back and convert them to translation items.
annot_items = translate_project.export_tasks()
translate_items = list(map(annot_translate_item, annot_items))
# Display three random items as a sanity check.
random.sample(translate_items, k=3)

In [None]:
# Overwrite the translation file with the items exported from Label Studio.
write_jsonl('data/translate.jsonl', translate_items)

# classify

In [None]:
%run -n main.py
# Items to classify, materialised as a list because later cells iterate it several times.
classify_items = list(read_jsonl('data/classify.jsonl'))
# Embedding cache, indexed by item id in the cells below.
# NOTE(review): read_pickle presumably unpickles the file — only load files produced by this notebook.
id_embeddings = read_pickle('data/embeddings.pkl')

In [None]:
%run -n main.py
items = [
    _ for _ in classify_items
    if _['id'] not in id_embeddings
]
for index in tqdm(range(0, len(items), 64)):
    batch = items[index:index + 64]
    texts = [_['instruction'] for _ in batch]
    embeddings = openai_embed_batch(texts)
    for item, embedding in zip(batch, embeddings):
        id_embeddings[item['id']] = np.array(embedding)
write_pickle('data/embeddings.pkl', id_embeddings)

In [None]:
# Reference items: tagged, not marked as bad, and NOT carrying a `max_sim` marker.
# `max_sim` is set by the loop below, so its presence means the tags were propagated
# by an earlier run of this cell and must not be used as a reference label.
target_items = [
    _ for _ in classify_items
    if _['tags'] and 'max_sim' not in _ and 'bad instruction' not in _['tags']
]
# Items to (re)label: untagged ones plus those labelled by an earlier run of this cell.
items = [_ for _ in classify_items if not _['tags'] or 'max_sim' in _]

for item in tqdm(items):
    # Nearest-neighbour search over the reference items; only strictly positive
    # similarities count as a match.
    max_sim = 0
    best_tags = []
    for target_item in target_items:
        sim = cosine_sim(
            id_embeddings[item['id']],
            id_embeddings[target_item['id']]
        )
        if sim > max_sim:
            max_sim = sim
            best_tags = target_item['tags']
    # Copy the list so the item does not share one list object with its neighbour.
    item['tags'] = list(best_tags)
    item['max_sim'] = max_sim

In [None]:
# Keep items that received propagated tags (truthy `max_sim`) including the 'enumerate' tag.
items = [
    item for item in classify_items
    if item.get('max_sim') and 'enumerate' in item['tags']
]

# Least similar (least certain) matches first.
items.sort(key=lambda item: item['max_sim'])
annot_items = [classify_annot_item(item) for item in items]
len(annot_items)

In [None]:
# Destructive: removes every task currently in the "classify" project before
# uploading the freshly built ones. Trailing `;` suppresses the cell output.
classify_project.delete_all_tasks();
classify_project.import_tasks(annot_items);

In [None]:
%run -n main.py
annot_items = classify_project.export_tasks()
items = (annot_classify_item(_) for _ in annot_items)
id_tags = {
    _['id']: _['tags']
    for _ in items
}
for item in classify_items:
    tags = id_tags.get(item['id'])
    if tags is not None:
        item['tags'] = tags
        item.pop('max_sim', None)

In [None]:
# Items that still carry a truthy `max_sim` were never confirmed: clear their tags
# (and drop the marker) before saving.
for item in classify_items:
    max_sim = item.pop('max_sim', None)
    if max_sim:
        item['tags'] = []

write_jsonl('data/classify.jsonl', classify_items)

# tasks

In [13]:
# Map item id -> tags for every item that has a non-empty tag list.
items = read_jsonl('data/classify.jsonl')
id_tags = dict(
    (item['id'], item['tags'])
    for item in items
    if item['tags']
)
len(id_tags)

551

In [14]:
# Map item id -> source dataset name for every original item.
items = read_jsonl('data/orig.jsonl')
id_sources = dict((item['id'], item['source']) for item in items)
len(id_sources)

1317

In [16]:
# Build the final task list: translated instruction + source + tags,
# skipping anything tagged as a bad instruction.
task_items = []
items = read_jsonl('data/translate.jsonl')
for item in items:
    # Named `item_id` rather than `id` so the builtin id() is not shadowed
    # in the notebook namespace.
    item_id = item['id']
    # Items without tags are kept, with an empty tag list.
    tags = id_tags.get(item_id, [])
    if 'bad instruction' in tags:
        continue

    task_items.append({
        'id': item_id,
        'source': id_sources[item_id],
        # The task instruction is the translated text, not the original.
        'instruction': item['answer'],
        'tags': tags
    })
write_jsonl('data/tasks.jsonl', task_items)

In [12]:
# Reload the saved tasks as a list so that later cells can iterate them repeatedly.
task_items = list(read_jsonl('data/tasks.jsonl'))

In [27]:
%run -n main.py
source_items = defaultdict(list)
for item in task_items:
    source_items[item['source']].append(item)

with open('data/tasks.md', 'w') as file:
    with redirect_stdout(file):
        for source in ['alpaca', 'vicuna', 'arena']:
            print(f'<h1>{source}</h1>')

            items = [_ for _ in source_items[source] if _['tags']]
            for item in random.sample(items, 30):
                print(' '.join(f'<code>#{_}</code>' for _ in item['tags']))
                print('<br/>')
                instruction = html.escape(item['instruction'])
                print('<br/>\n'.join(instruction.splitlines()))
                print('<br/><br/>')

# infer

In [273]:
%run -n main.py
# Inference jobs; the cells below filter them by `model` and by `answer is None`.
infer_items = list(read_jsonl('data/infer.jsonl'))

In [279]:
# Enqueue one pending 'yagpt_chat' job per task. Tasks that already have a job for
# this model are skipped, so re-running the cell does not create duplicates.
infer_model = 'yagpt_chat'
queued_ids = {_['id'] for _ in infer_items if _['model'] == infer_model}
infer_items.extend(
    {
        'id': _['id'],
        'model': infer_model,
        'instruction': _['instruction'],
        # Filled in later by the inference workers.
        'answer': None
    }
    for _ in task_items
    if _['id'] not in queued_ids
)

In [281]:
# Persist the inference jobs, including any answers collected so far.
write_jsonl('data/infer.jsonl', infer_items)

In [294]:
# Collect the 'yagpt_chat' jobs that already have an answer and report how many there are.
items = [
    item for item in infer_items
    if item['model'] == 'yagpt_chat' and item['answer'] is not None
]
print('|items| =', len(items))

# Optional: random.shuffle(items) to inspect a random sample instead of the tail.
for item in items[-10:]:
    # Empty answers are not None but are not worth printing.
    if not item['answer']:
        continue
    print(item['instruction'], '----', item['answer'], '---\n\n', sep='\n')

|items| = 834
Напиши официальное письмо потенциальному деловому партнеру, предложи создать совместный бизнес?
----
Уважаемый [Имя партнера],

Я хотел бы выразить свой интерес к созданию совместного бизнеса с вами. Я убежден, что наши знания и опыт могут быть объединены для достижения общих целей.

Мы предлагаем рассмотреть возможность сотрудничества в [области/сфере деятельности], где мы можем предложить наши уникальные навыки и экспертизу. Мы готовы обсудить детали нашего возможного партнерства и предоставить дополнительную информацию о наших возможностях.

Если вы заинтересованы в создании совместного бизнеса, пожалуйста, свяжитесь со мной для дальнейшего обсуждения. Я буду рад ответить на любые ваши вопросы и предоставить более подробную информацию.

С уважением,
[Ваше имя]
---


Напиши заявление об увольнении моему нынешнему работодателю, вырази благодарность за предоставленные возможности.
----
Уважаемый (имя работодателя),

Я пишу, чтобы сообщить вам о своем решении уволиться с д

## turbo

In [None]:
%run -n main.py
items = [_ for _ in infer_items if _['answer'] is None and _['model'] == 'turbo_2']
queue = iter(tqdm(items))
workers = [openai_infer_worker(queue, model='gpt-3.5-turbo-0613') for _ in range(20)]
await asyncio.gather(*workers);

## gpt4

In [None]:
%run -n main.py
items = [_ for _ in infer_items if _['answer'] is None and _['model'] == 'gpt4_2']
queue = iter(tqdm(items))
workers = [openai_infer_worker(queue, model='gpt-4-0613', request_timeout=1200) for _ in range(20)]
await asyncio.gather(*workers);

## gigachat

In [74]:
%run -n main.py
# Request headers for GigaChat are kept in a local .gigachat file, not in the notebook.
# NOTE(review): read_headers presumably yields (name, value) pairs — confirm in main.py.
headers = dict(read_headers('.gigachat'))
gigachat_client = gigachat_client_init(headers)

# After ~5 min / 260 answers blocked for ~1 hour
# {'result': 'rejected', 'reason': 'UserBlocked', 'user_blocked_until': '2023-08-25T11:00:24+00:00'}

# "Bans are issued semi-automatically; after 3 temporary bans they may decide to ban permanently again.
# So after a temporary ban it is better to wait for a while."

# "It is because of requests like "People die when they are killed, where does this saying come from?".
# The censor catches such requests a number of times and then issues a temporary ban."

In [None]:
%run -n main.py
items = [_ for _ in infer_items if _['answer'] is None and _['model'] == 'gigachat']
queue = iter(tqdm(items[:100]))
workers = [gigachat_infer_worker(gigachat_client, queue) for _ in range(2)]
await asyncio.gather(*workers);

## yagpt

In [162]:
# Ask the Yandex Cloud CLI for an IAM token; `lines = !cmd` captures the command's
# output as a list of lines, and the first line is taken as the token.
lines = !~/yandex-cloud/bin/yc iam create-token
YAGPT_TOKEN = lines[0]

# Look up the folder named "default" and read its id from the CLI's JSON output.
lines = !~/yandex-cloud/bin/yc resource-manager folder get --name default --format json
data = json.loads(''.join(lines))
YAGPT_FOLDER_ID = data['id']

# The token expires roughly every 12 hours; re-run this cell to obtain a fresh one.

In [None]:
%run -n main.py
# Build the YaGPT client from the IAM token and folder id obtained via the `yc` CLI.
yagpt_client = yagpt_client_init(YAGPT_TOKEN, YAGPT_FOLDER_ID)

In [None]:
%run -n main.py
items = [_ for _ in infer_items if _['answer'] is None and _['model'] == 'yagpt_instruct']
queue = iter(tqdm(items))
limiter = Limiter(min_delay=1.2)
workers = [yagpt_infer_worker(yagpt_client, limiter, queue, mode='instruct') for _ in range(5)]
await asyncio.gather(*workers);

In [None]:
%run -n main.py
items = [_ for _ in infer_items if _['answer'] is None and _['model'] == 'yagpt_chat']
queue = iter(tqdm(items))
limiter = Limiter(min_delay=1.2)
workers = [yagpt_infer_worker(yagpt_client, limiter, queue, mode='chat') for _ in range(5)]
await asyncio.gather(*workers);