In [1]:
import json
import openai
import datetime
from langame import LangameClient
# Path to the YAML config file, one directory above the notebook.
CONFIG_PATH = "../config.yaml"

# Client handle kept as `c`: a later cell reads `c._firestore_client` from it.
c = LangameClient(path_to_config_file=CONFIG_PATH)

In [None]:
from langame.conversation_starters import get_existing_conversation_starters
import logging
# Named logger handed to the loader below.
logger = logging.getLogger("generation")

# Fetch the existing conversation starters through the client's Firestore
# handle. The helper returns three values, unpacked on the next line.
# NOTE(review): `confirmed=True` presumably restricts the result to confirmed
# starters -- verify against langame.conversation_starters.
starters_bundle = get_existing_conversation_starters(
    c._firestore_client,
    logger=logger,
    confirmed=True,
)
memes, index, embeddings_model = starters_bundle

In [4]:
# Dated output path for the fine-tuning dataset (JSON Lines: one record per line).
out_file_name = f"../data/fine_tune_generation_{datetime.date.today().strftime('%d_%m_%Y')}.jsonl"

# Open the file once and truncate it ("w"). The previous version reopened the
# file in append mode for every record, so re-running the cell on the same day
# appended a second copy of the whole dataset.
with open(out_file_name, "w") as outfile:
    for e in memes:
        record = {
            # Prompt: comma-joined topics followed by the " ###" separator.
            "prompt": f"{','.join(e['topics'])} ###",
            # Completion: leading space, the starter text, and a "\n" terminator.
            "completion": f" {e['content']}\n",
        }
        outfile.write(json.dumps(record) + "\n")

In [5]:
!head -n3 $out_file_name | jq .

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[1;39m{
  [0m[34;1m"prompt"[0m[1;39m: [0m[0;32m"ice breaker ###"[0m[1;39m,
  [0m[34;1m"completion"[0m[1;39m: [0m[0;32m" When is a time when you know for sure you'll soon have to exchange a nice, meaningful conversation with someone new?\n"[0m[1;39m
[1;39m}[0m
[1;39m{
  [0m[34;1m"prompt"[0m[1;39m: [0m[0;32m"space exploration,space travel ###"[0m[1;39m,
  [0m[34;1m"completion"[0m[1;39m: [0m[0;32m" Do you think humans are the only intelligent life in the universe?\n"[0m[1;39m
[1;39m}[0m
[1;39m{
  [0m[34;1m"prompt"[0m[1;39m: [0m[0;32m"ecology ###"[0m[1;39m,
  [0m[34;1m"completion"[0m[1;39m: [0m[0;32m" Have natural disasters gotten worse with the increase in hu

In [5]:
!openai tools fine_tunes.prepare_data -f $out_file_name

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Analyzing...

- Your file contains 1363 prompt-completion pairs
- There are 26 duplicated prompt-completion sets. These are rows: [243, 410, 412, 434, 436, 474, 497, 567, 601, 604, 619, 646, 735, 818, 820, 842, 851, 956, 966, 1024, 1036, 1083, 1149, 1262, 1304, 1316]
- All prompts end with suffix ` ###`
- All completions end with suffix `\n`

Based on the analysis we will perform the following actions:
- [Recommended] Remove 26 duplicate rows [Y/n]: ^C



In [4]:
# Split the dataset into two disjoint files: the first 80% of the lines for
# training and the remaining 20% for validation.
TRAIN_FRACTION = 0.8

with open(out_file_name, "r") as infile:
    lines = infile.readlines()

# Compute the cut point once, from the full dataset. The previous version
# overwrote `lines` with the training slice and then sliced that again, so the
# validation file was a subset of the training file and the last 20% of the
# data was never used.
split_at = int(len(lines) * TRAIN_FRACTION)
train_lines = lines[:split_at]
valid_lines = lines[split_at:]

with open(out_file_name.replace(".jsonl", "_train.jsonl"), "w") as outfile:
    outfile.writelines(train_lines)
with open(out_file_name.replace(".jsonl", "_valid.jsonl"), "w") as outfile:
    outfile.writelines(valid_lines)
        
# Upload both splits to OpenAI. Each file is opened in a `with` block so the
# handle is closed once the upload call returns (the handles were previously
# left open).
with open(out_file_name.replace(".jsonl", "_train.jsonl")) as train_file:
    train = openai.File.create(
        file=train_file,
        purpose="fine-tune",
    )
with open(out_file_name.replace(".jsonl", "_valid.jsonl")) as valid_file:
    validation = openai.File.create(
        file=valid_file,
        purpose="fine-tune",
    )

# Start a fine-tune of the "curie" base model on the uploaded files.
# `ft["id"]` is read by the retrieval cell that follows.
ft = openai.FineTune.create(
    training_file=train["id"],
    validation_file=validation["id"],
    model="curie",
)

In [5]:
# Look up the fine-tune job created above by its id; the bare last expression
# renders the returned object as the cell output.
fine_tune_id = ft["id"]
results = openai.FineTune.retrieve(fine_tune_id)
results

<FineTune fine-tune id=ft-zZ2pHxog8LQ3MqL6W0THUnkn at 0x105903a40> JSON: {
  "created_at": 1649407681,
  "events": [
    {
      "created_at": 1649407681,
      "level": "info",
      "message": "Created fine-tune: ft-zZ2pHxog8LQ3MqL6W0THUnkn",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "learning_rate_multiplier": null,
    "n_epochs": 4,
    "prompt_loss_weight": 0.1
  },
  "id": "ft-zZ2pHxog8LQ3MqL6W0THUnkn",
  "model": "curie",
  "object": "fine-tune",
  "organization_id": "org-KwcHNgfGe4pqdKDLQIJt99UZ",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 204646,
      "created_at": 1649407678,
      "filename": "fine_tune_generation_08_04_2022_train.jsonl",
      "id": "file-R1YxqZwTJObItTy5rKmBPHx1",
      "object": "file",
      "purpose": "fine-tune",
      "status": "uploaded",
      "status_details": null
    }
  ],
  "updated_at": 1649407681,
  "validation_files": 