In [10]:
import sys

In [11]:
import json
import os
import random

In [12]:
def read_jsonl(file_path):
    """
    Read a JSONL file and return its records as a list.

    Each non-empty line is parsed as one JSON document. Blank or
    whitespace-only lines (for example a trailing newline at the end of the
    file) are skipped instead of raising a decode error.

    :param file_path: Path to the JSONL file
    :return: A list containing one parsed JSON object per non-empty line
    :raises json.JSONDecodeError: If a non-empty line is not valid JSON
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # `line.strip()` is falsy for empty/whitespace-only lines, which
        # json.loads would otherwise reject.
        return [json.loads(line) for line in file if line.strip()]

        
def write_jsonl(data, file_path):
    """
    Serialize records to a JSONL file, one JSON document per line.

    The target file is opened in write mode with UTF-8 encoding, so any
    existing content is replaced.

    :param data: List of dictionaries to write
    :param file_path: Path to the JSONL file where the data will be written
    """
    with open(file_path, 'w', encoding='utf-8') as out:
        # Every record is followed by a newline, including the last one.
        out.writelines(json.dumps(record) + '\n' for record in data)

In [18]:
from datasets import load_dataset
# Load the "train" split of the `deepmind/code_contests` dataset into `data`.
# The recorded output of this cell shows the data being downloaded and the
# splits being generated when it ran, so expect it to take several minutes
# on a machine that has not fetched the dataset before.
data = load_dataset("deepmind/code_contests", split="train")

Downloading readme: 100%|██████████| 13.0k/13.0k [00:00<00:00, 8.65MB/s]
Downloading metadata: 100%|██████████| 4.52k/4.52k [00:00<00:00, 10.4MB/s]
Downloading data: 100%|██████████| 39/39 [08:13<00:00, 12.64s/files]
Downloading data: 100%|██████████| 63.1M/63.1M [00:05<00:00, 11.5MB/s]
Downloading data: 100%|██████████| 51.8M/51.8M [00:05<00:00, 10.3MB/s]
Generating train split: 100%|██████████| 13328/13328 [01:24<00:00, 157.08 examples/s]
Generating test split: 100%|██████████| 165/165 [00:00<00:00, 220.58 examples/s]
Generating valid split: 100%|██████████| 117/117 [00:00<00:00, 140.69 examples/s]


In [29]:
# Sampling configuration: fixed so that re-running the cell selects the same
# problems every time.
SEED = 42
N_SAMPLES = 25
OUTPUT_PATH = "./data/CodeContest/Train-sampled.jsonl"

# Project each dataset record onto the fields that are written out,
# renaming some keys (cf_tags -> tags, cf_contest_id -> id,
# public_tests -> sample_io, private_tests -> test_list).
sample_data = [
    {
        "name": problem["name"],
        "description": problem["description"],
        "tags": problem["cf_tags"],
        "difficulty": problem["difficulty"],
        "id": problem["cf_contest_id"],
        "sample_io": problem["public_tests"],
        "test_list": problem["private_tests"],
    }
    for problem in data
]

# Shuffle in place with a dedicated, seeded RNG: the selection is reproducible
# and the global `random` state is left untouched.
random.Random(SEED).shuffle(sample_data)

# Make sure the destination directory exists before writing; otherwise open()
# raises FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
write_jsonl(sample_data[:N_SAMPLES], OUTPUT_PATH)

## Next Steps
1. Run GPT-4 with each of the 5 prompting strategies on the 25 problems sampled from the training data and saved by the cell above (`./data/CodeContest/Train-sampled.jsonl`). Use **Rust** as the programming language.
2. Randomly mix the resulting data, which yields 125 samples (25 problems × 5 prompting strategies).
3. Convert the data into the message format accepted by OpenAI.
4. Finetune *gpt-3.5-turbo-0125*. 

**Note:** This finetuning strategy is motivated by the [FireAct](https://arxiv.org/abs/2310.05915) work.