# Prepare training set for OpenAI

In [9]:
import json
import os
import subprocess
from datetime import date

import requests

## Fetch the Advent of Code dataset from Hugging Face

In [59]:
# Download the Advent of Code dataset (a single JSON document) from Hugging Face.
dataset_url = "https://huggingface.co/datasets/isavita/advent-of-code/resolve/main/train.json"
response = requests.get(dataset_url, timeout=600)
# Fail fast on an HTTP error instead of letting .json() choke on an error page.
response.raise_for_status()
dataset_raw = response.json()

# Keep only the Go solutions, because Go is the only language for which the
# dataset has all challenges solved. Filtering from `dataset_raw` keeps the
# unfiltered download available and makes this cell safe to re-run.
dataset = [part for part in dataset_raw if part["solution_lang"] == "go"]

## System Constants

In [60]:
class Constants:
    """Notebook-wide settings for the target language and the working directories.

    Every value is exposed through a static accessor, so callers write
    ``Constants.lang()`` etc. Directory accessors expand ``~`` at call time.
    """

    # Raw values; the accessors below are the public way to read them.
    _FILE_EXT = ".clj"
    _LANG = "clojure"
    _OUTPUT_DIR = "~/code/advent_generated/training_data"
    _INPUT_DIR = "~/code/advent_generated/clojure"

    @staticmethod
    def file_ext():
        """Return the file extension (including the dot) of the solution files."""
        return Constants._FILE_EXT

    @staticmethod
    def lang():
        """Return the name of the target programming language."""
        return Constants._LANG

    @staticmethod
    def output_dir():
        """Return the directory for the exported training files, with ``~`` expanded."""
        return os.path.expanduser(Constants._OUTPUT_DIR)

    @staticmethod
    def input_dir():
        """Return the directory holding the solution files, with ``~`` expanded."""
        return os.path.expanduser(Constants._INPUT_DIR)

## Prompt Templates with the task, but no example solution

In [61]:
sys_msg = f"""You are a highly experienced programmer with a PhD in computer science participating in a coding challenge.
Write clean, efficient code without unnecessary comments, demonstrating your advanced skills by solving problems practically and concisely.
Aim to produce optimal and concise solutions, leveraging your decade of industry experience."""

def user_part1_fn(task):
    """Build the user prompt for a challenge.

    Args:
        task: The challenge description to embed in the prompt.

    Returns:
        The prompt text: an instruction naming ``Constants.lang()``, a blank
        line, a "Coding challenge:" header, the task, and a trailing newline.
    """
    prompt_lines = [
        f"Write an {Constants.lang()} program that reads input from a file called input.txt and prints the output to standard output.",
        "Focus on writing clean, efficient code that demonstrates your programming skills by concisely solving the challenge.",
        "",
        "Coding challenge:",
        f"{task}",
        "",  # yields the trailing newline after the task
    ]
    return "\n".join(prompt_lines)

def user_part2_fn(task):
    """Build the user prompt that asks for part two of a challenge.

    Args:
        task: The challenge description to embed in the prompt.

    Returns:
        The prompt text: an instruction naming ``Constants.lang()``, a blank
        line, a "Solve part two of the coding challenge:" header, the task,
        and a trailing newline.
    """
    prompt_lines = [
        f"Write an {Constants.lang()} program that reads input from a file called input.txt and prints the output to standard output.",
        "Focus on writing clean, efficient code that demonstrates your programming skills by concisely solving the challenge.",
        "",
        "Solve part two of the coding challenge:",
        f"{task}",
        "",  # yields the trailing newline after the task
    ]
    return "\n".join(prompt_lines)

def assistant_solution_fn(solution):
    """Wrap a solution in the assistant reply used for training.

    Args:
        solution: Source code of the solution.

    Returns:
        An introductory sentence followed by the solution inside a fenced code
        block tagged with ``Constants.lang()``, ending with a newline.
    """
    fence = "```"
    reply_lines = [
        "Here is a solution of the challenge:",
        f"{fence}{Constants.lang()}",
        f"{solution}",
        fence,
        "",  # yields the trailing newline after the closing fence
    ]
    return "\n".join(reply_lines)

## Training Data Preparation

In [62]:
# List the generated solution files. Sorting makes the pick below deterministic
# when several files share a prefix (os.listdir returns entries in arbitrary order).
solutions = sorted(os.listdir(Constants.input_dir()))
# Match on the actual extension: a substring test would also accept names such
# as "x.cljs" or "x.clj.bak".
solved = [s for s in solutions if s.endswith(Constants.file_ext())]

# The original guard was `False and ...`, so the part-two template was never
# selected. The named flag keeps that behavior while making the switch explicit.
USE_PART2_PROMPT = False

# Build one chat-format training example per dataset entry that has a solution file.
data = []
for day in dataset:
    # First solution file whose name starts with the dataset entry's name.
    filename = next((s for s in solved if s.startswith(day["name"])), None)
    if filename is None:
        continue  # no solution for this entry; skip it

    # Read with an explicit encoding so the result does not depend on the locale.
    with open(os.path.join(Constants.input_dir(), filename), "r", encoding="utf-8") as f:
        solution = f.read().strip()

    system_msg = {"role": "system", "content": sys_msg}

    if USE_PART2_PROMPT and "part2_" in day["name"]:
        user_content = user_part2_fn(day["task"])
    else:
        user_content = user_part1_fn(day["task"])

    user_msg = {"role": "user", "content": user_content}
    assistant_msg = {"role": "assistant", "content": assistant_solution_fn(solution)}

    messages = {
        "messages": [
            system_msg,
            user_msg,
            assistant_msg
        ]
    }

    data.append(messages)

# Examples are exported in the reverse of the dataset order.
data.reverse()

## Data export to JSONL format

In [63]:
# Create the target directory if needed so the export also works on a fresh checkout.
os.makedirs(Constants.output_dir(), exist_ok=True)

# File name pattern: <language>_<number of examples>_<today's date>.jsonl
output_path = os.path.join(
    Constants.output_dir(),
    f"{Constants.lang()}_{len(data)}_{date.today()}.jsonl",
)

# JSONL: one JSON object per line.
with open(output_path, "w", encoding="utf-8") as f:
    for item in data:
        json.dump(item, f)
        f.write("\n")

## Upload the training file

In [64]:
# The Authorization header is fed to curl through stdin ("-H @-" reads header
# lines from stdin) so the API key never appears in the process argument list.
# A missing OPENAI_API_KEY raises KeyError here, before anything is sent.
auth_header = f"Authorization: Bearer {os.environ['OPENAI_API_KEY']}\n"

cmd = [
    "curl",
    "--silent", "--show-error",  # no progress meter; keep real error messages on stderr
    "https://api.openai.com/v1/files",
    "-H", "@-",
    "-F", "purpose=fine-tune",
    "-F", f"file=@{output_path}"
]

result = subprocess.run(cmd, input=auth_header, capture_output=True, text=True)
resp = result.stdout        # response body, parsed as JSON in the next cell
status = result.returncode  # curl's exit code (0 = transfer completed), not the HTTP status

print(f"Status: {status}")
print(f"Response: {resp}")

# Stop here on a transport failure rather than letting the next cell fail on an
# empty response. The command line is deliberately not included in the message.
if status != 0:
    raise RuntimeError(f"curl exited with status {status}: {result.stderr.strip()}")

python(56857) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Status: 0
Response: {
  "object": "file",
  "id": "file-IGw5teHdu3CNodRPap3Ue3Um",
  "purpose": "fine-tune",
  "filename": "clojure_137_2024-08-23.jsonl",
  "bytes": 585757,
  "created_at": 1724438022,
  "status": "processed",
  "status_details": null
}



In [65]:
# Decode the JSON body captured from the upload command.
file_upload = json.loads(resp)
print(file_upload)

# A response without an "id" (e.g. an error payload) cannot be used for
# fine-tuning; report the payload instead of failing with a bare KeyError.
if "id" not in file_upload:
    raise RuntimeError(f"Upload response does not contain a file id: {file_upload}")

file_id = file_upload["id"]
print(f"File ID: {file_id}")

{'object': 'file', 'id': 'file-IGw5teHdu3CNodRPap3Ue3Um', 'purpose': 'fine-tune', 'filename': 'clojure_137_2024-08-23.jsonl', 'bytes': 585757, 'created_at': 1724438022, 'status': 'processed', 'status_details': None}
File ID: file-IGw5teHdu3CNodRPap3Ue3Um


## Create a fine-tuned model

### Create the training job from the OpenAI UI [here](https://platform.openai.com/finetune)