In [113]:
# Execute main.py so its definitions become available in the notebook namespace
# (-n: __name__ is not set to '__main__', so a main-guard block is skipped).
%run -n main.py
# Load settings from the local .env file into a plain dict.
# NOTE(review): read_dotenv is defined outside this cell (presumably main.py);
# assumed to produce key/value pairs — confirm.
dotenv = dict(read_dotenv('.env'))
# Raises KeyError if OPENAI_TOKEN is not present in .env.
openai.api_key = dotenv['OPENAI_TOKEN']

# sources

## alpaca

In [None]:
!mkdir -p data/sources/alpaca
!curl -L https://github.com/yizhongw/self-instruct/raw/main/human_eval/user_oriented_instructions.jsonl \
    > data/sources/alpaca/user_oriented_instructions.jsonl

In [99]:
# Re-execute main.py to pick up its current definitions (-n skips its main guard).
%run -n main.py
# Read the downloaded JSONL file and convert its records with parse_alpaca.
# NOTE(review): read_jsonl and parse_alpaca are defined outside this cell
# (presumably main.py); their exact record schema is not visible here.
items = read_jsonl('data/sources/alpaca/user_oriented_instructions.jsonl')
alpaca_items = list(parse_alpaca(items))
# Preview three randomly chosen parsed items. No seed is set in this cell, so
# the displayed items may differ between runs.
random.sample(alpaca_items, 3)

[{'id': '7610be0f-8237-4035-96a8-5820219d619e',
  'source': 'alpaca',
  'source_id': 'user_oriented_task_224',
  'instruction': 'Convert natural language into a list of turn-by-turn directions.\n\n"Go west on I-10 until you hit the 101 then take it north. You will eventually see the Hollywood sign."'},
 {'id': 'a58bfb9a-4f0d-4330-8154-4afd17d573c6',
  'source': 'alpaca',
  'source_id': 'user_oriented_task_238',
  'instruction': 'Categorize the given product into one of Electronics, Computers, Smart Home, or Arts & Crafts departments.\n\n"Google Nest Learning Thermostat"'},
 {'id': '63ce006e-7056-46cd-842b-3615c2d8317b',
  'source': 'alpaca',
  'source_id': 'user_oriented_task_104',
  'instruction': 'List the personality traits that are required to be successful in the given job.\n\n"Social Media Marketer"'}]

## vicuna

In [None]:
!mkdir -p data/sources/vicuna
!curl -L https://github.com/lm-sys/vicuna-blog-eval/raw/main/eval/table/question.jsonl \
    > data/sources/vicuna/question.jsonl

In [100]:
# Re-execute main.py to pick up its current definitions (-n skips its main guard).
%run -n main.py
# Read the downloaded question file and convert its records with parse_vicuna.
# NOTE(review): read_jsonl and parse_vicuna are defined outside this cell
# (presumably main.py); their exact record schema is not visible here.
items = read_jsonl('data/sources/vicuna/question.jsonl')
vicuna_items = list(parse_vicuna(items))
# Preview three randomly chosen parsed items. No seed is set in this cell, so
# the displayed items may differ between runs.
random.sample(vicuna_items, 3)

[{'id': '2aea7930-c299-4be5-aad4-048d669c231d',
  'source': 'vicuna',
  'source_id': 53,
  'category': 'counterfactual',
  'instruction': 'What if the Black Death had not occurred in the 14th century?'},
 {'id': '5b49aa61-1300-495a-ac10-1d88a3186939',
  'source': 'vicuna',
  'source_id': 73,
  'category': 'writing',
  'instruction': 'Use an appropriate format to structure a formal letter of recommendation for a student applying to a prestigious graduate program in computer science.'},
 {'id': '7c08ea72-8c64-4b93-9bde-445f20dbe3b1',
  'source': 'vicuna',
  'source_id': 57,
  'category': 'counterfactual',
  'instruction': 'What if the Suez Canal had never been constructed?'}]

## arena

In [None]:
!mkdir -p data/sources/arena
!curl -L curl -L https://huggingface.co/datasets/lmsys/chatbot_arena_conversations/resolve/main/data/train-00000-of-00001-cced8514c7ed782a.parquet \
    > data/sources/arena/train-00000-of-00001-cced8514c7ed782a.parquet

In [115]:
# Re-execute main.py to pick up its current definitions (-n skips its main guard).
%run -n main.py
# Load the parquet file and iterate over its rows as named tuples.
records = pd.read_parquet('data/sources/arena/train-00000-of-00001-cced8514c7ed782a.parquet').itertuples()
# NOTE(review): parse_arena is defined outside this cell (presumably main.py);
# how it maps a row to an item is not visible here.
arena_items = list(parse_arena(records))
# Preview three randomly chosen parsed items. No seed is set in this cell, so
# the displayed items may differ between runs.
random.sample(arena_items, 3)

[{'id': 'd843ac4a-4b59-4b4f-bc3e-d6de101a7953',
  'source': 'arena',
  'source_id': '924c59203b4e4aeea5e627011da0b283',
  'lang': 'English',
  'instruction': 'what are hybrid text classification methods?'},
 {'id': '584f5fa3-37cc-44b4-bf28-dc2dd7e71682',
  'source': 'arena',
  'source_id': '37c2c4edb3b14e49b5920613d9b6befe',
  'lang': 'English',
  'instruction': 'What strains of cannabis have been reported by individuals with borderline personality disorder as preferable or effective in the management of their symptoms?'},
 {'id': 'bea19674-14cd-4996-a350-d68d2d0c0317',
  'source': 'arena',
  'source_id': '2005e97ef88246468f890495101550ea',
  'lang': 'English',
  'instruction': 'How big is the Eiffeltower?'}]

# orig

In [116]:
# Create the output directory for the persisted source items
# (-p: no error if it already exists).
!mkdir -p data/orig

In [117]:
# Persist the parsed alpaca and vicuna items, one file per source.
# NOTE(review): write_jsonl is defined outside this cell (presumably main.py).
write_jsonl('data/orig/alpaca.jsonl', alpaca_items)
write_jsonl('data/orig/vicuna.jsonl', vicuna_items)

In [122]:
# Deduplicate the English arena prompts by their instruction text: keying the
# dict on the instruction keeps one item per distinct prompt (a later duplicate
# replaces the earlier item under the same key).
instruction_items = {
    item['instruction']: item
    for item in arena_items
    if item['lang'] == 'English'
}
# Draw the 1000-prompt subset with a dedicated, fixed-seed generator so that
# data/orig/arena.jsonl is the same on every run and the global `random` state
# is left untouched. Still raises ValueError if fewer than 1000 unique English
# prompts are available.
items = random.Random(0).sample(list(instruction_items.values()), 1000)
write_jsonl('data/orig/arena.jsonl', items)

In [124]:
# Reload all three sources from the persisted files so later cells work from
# what is on disk rather than from in-memory results of earlier cells.
alpaca_items = list(read_jsonl('data/orig/alpaca.jsonl'))
vicuna_items = list(read_jsonl('data/orig/vicuna.jsonl'))
# Note: this rebinds arena_items to the contents of data/orig/arena.jsonl (the
# sampled subset written above), no longer the full parsed list.
arena_items = list(read_jsonl('data/orig/arena.jsonl'))

# translate

In [178]:
# Build the translation work list: one record per vicuna item carrying its id
# and instruction, with an empty 'answer' slot still to be filled.
translate_items = [
    {'id': item['id'], 'instruction': item['instruction'], 'answer': None}
    for item in vicuna_items
]

In [175]:
# Re-execute main.py to pick up its current definitions (-n skips its main guard).
%run -n main.py
# Select only the items whose 'answer' is still empty/None — presumably so that
# re-running this cell retries just the unfinished items.
items = [_ for _ in translate_items if not _['answer']]
# One shared iterator wrapped in a tqdm progress bar; the bar advances as items
# are pulled from the iterator.
queue = iter(tqdm(items))
# Start ten workers that all receive the same iterator.
# NOTE(review): openai_translate_worker is defined outside this cell; presumably
# each worker pulls items from the queue and fills item['answer'] in place — confirm.
workers = [openai_translate_worker(queue) for _ in range(10)]
# Top-level await (supported by IPython); the trailing ';' suppresses display of
# the gather result.
await asyncio.gather(*workers);

100%|██████████| 156/156 [01:07<00:00,  2.30it/s]


In [188]:
# Create the output directory (-p: no error if it already exists) and persist
# the translation work list, including whatever answers have been filled in.
!mkdir -p data/translate
write_jsonl('data/translate/vicuna.jsonl', translate_items)