In [7]:
from tqdm import tqdm


In [9]:
# --- Data-generation settings -------------------------------------------------
# High-level description of the model we want to train; it is interpolated into
# the data-generation prompts further down.
prompt = "A model that takes in a question in the field of 'professional medicine' and is given with a list of possible answers. The models produces an answer that is most likely to be correct. The model should just generate the answer and not provide any explanation or reasoning."
# Sampling temperature passed to the chat-completion calls.
temperature = 0.5
# Number of prompt/response pairs to request from the API.
number_of_examples = 100

In [10]:
import os
from dotenv import load_dotenv
# Pull variables from a local .env file (if any) into the process environment,
# then read the API key from there so it is never hard-coded in the notebook.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [11]:
from openai import OpenAI

# Single shared API client used by every generation call below.
client = OpenAI(api_key=OPENAI_API_KEY)

In [12]:
import random

def generate_example(prompt, prev_examples, temperature=0.5, max_attempts=3):
    """Ask the chat model for one new `<prompt>...</prompt><response>...</response>` sample.

    Args:
        prompt: High-level description of the model the data is meant to train.
        prev_examples: Previously generated samples (strings). They are replayed
            as assistant turns so the model can avoid repeating itself. At most
            10 are sent; when more are available a random subset of 10 is used.
            The caller's list is never modified.
        temperature: Sampling temperature forwarded to the API.
        max_attempts: How many times to call the API before giving up when the
            reply is empty or does not contain a `<prompt>` tag. Values below 1
            are treated as 1.

    Returns:
        The text of the reply starting at its first `<prompt>` tag (anything the
        model wrote before the tag is discarded).

    Raises:
        ValueError: If no attempt produced a reply containing a `<prompt>` tag.
    """
    system_prompt = f"You are generating data which will be used to train a machine learning model.\n\nYou will be given a high-level description of the model we want to train, and from that, you will generate data samples, each with a prompt/response pair.\n\nYou will do so in this format:\n```\n<prompt>prompt</prompt>\n<response>response_goes_here</response>\n```\n\nOnly one prompt/response pair should be generated per turn.\n\nFor each turn, make the example slightly more complex than the last, while ensuring diversity.\n\nMake sure your samples are unique and diverse, yet high-quality and complex enough to train a well-performing model.\n\nHere is the type of model we want to train:\n`{prompt}`"

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f'Now, generate a prompt/response pair for `{prompt}`. Do so in the exact format requested:\n```\n<prompt>prompt</prompt>\n<response>response_goes_here</response>\n```\n\nOnly one prompt/response pair should be generated per turn.'},
    ]

    # Cap the replayed history so the request stays small; rebinding the local
    # name leaves the caller's list untouched.
    if len(prev_examples) > 10:
        prev_examples = random.sample(prev_examples, 10)

    # Replay each earlier sample as an assistant turn followed by a user nudge.
    for example in prev_examples:
        messages.append({
            "role": "assistant",
            "content": example
        })
        messages.append({
            "role": "user",
            "content": 'Now, generate another prompt/response pair. Make it unique.'
        })

    attempts = max(1, max_attempts)
    for _ in range(attempts):
        resp = client.chat.completions.create(
            messages=messages,
            model="gpt-4o-mini",
            temperature=temperature
        )
        content = resp.choices[0].message.content

        # The reply can be empty or ignore the requested format; retry instead
        # of failing with an opaque IndexError/AttributeError.
        if content and '<prompt>' in content:
            return '<prompt>' + content.split('<prompt>')[1]

    raise ValueError(
        f"Model reply did not contain a '<prompt>' tag after {attempts} attempt(s)."
    )

# Build the dataset one sample at a time. Each call receives the samples
# collected so far so the model can steer away from repeating them.
prev_examples = []
for _ in tqdm(range(number_of_examples)):
    prev_examples.append(generate_example(prompt, prev_examples, temperature))

100%|██████████| 100/100 [01:26<00:00,  1.15it/s]


In [13]:
def generate_system_message(prompt):
    """Ask the chat model for a concise inference-time system prompt.

    Args:
        prompt: High-level description of the model being trained; surrounding
            whitespace is stripped before it is sent.

    Returns:
        The message content of the first completion choice, unmodified.

    Note:
        The sampling temperature is read from the module-level `temperature`
        variable rather than from an argument.
    """
    instructions = {
        "role": "system",
        "content": "You will be given a high-level description of the model we are training, and from that, you will generate a simple system prompt for that model to use. Remember, you are not generating the system message for data generation -- you are generating the system message to use for inference. A good format to follow is `Given WHAT_THE_MODEL_SHOULD_DO.`.\n\nMake it as concise as possible. Include nothing but the system prompt in your response.\n\nFor example, never write: `\"SYSTEM_PROMPT_HERE`."
    }
    request = {
        "role": "user",
        "content": f"Here is the prompt: `{prompt.strip()}`. Write a fantastic system message.",
    }

    completion = client.chat.completions.create(
        messages=[instructions, request],
        model="gpt-4o-mini",
        temperature=temperature
    )
    return completion.choices[0].message.content

# Generate the inference-time system prompt from the model description.
system_message = generate_system_message(prompt)

# Show it wrapped in backticks so leading/trailing whitespace is visible.
print(f'The system message is: `{system_message}`')

The system message is: `Given a question in the field of professional medicine and a list of possible answers, provide the most likely correct answer without any explanation or reasoning.`


In [14]:
import pandas as pd

# Collect the parsed prompt/response texts, and count the examples that could
# not be parsed so that data loss is visible instead of silent.
prompts = []
responses = []
skipped_examples = 0

# Extract the text between the <prompt>/<response> tag pairs of each example.
# The loop variables are deliberately NOT called `prompt`/`response`: `prompt`
# is the notebook-level model description used by the generation cells, and
# rebinding it here would corrupt any later re-run of those cells.
for example in prev_examples:
    try:
        prompt_start = example.index('<prompt>') + len('<prompt>')
        prompt_end = example.index('</prompt>')
        example_prompt = example[prompt_start:prompt_end].strip()

        response_start = example.index('<response>') + len('<response>')
        response_end = example.index('</response>')
        example_response = example[response_start:response_end].strip()
    except (ValueError, IndexError):
        # str.index raises ValueError when a tag is missing; skip the example
        # but remember that we did.
        skipped_examples += 1
        continue

    prompts.append(example_prompt)
    responses.append(example_response)

# Create a DataFrame with one row per parsed example
df = pd.DataFrame({
    'prompt': prompts,
    'response': responses
})

# Remove exact duplicate prompt/response rows
df = df.drop_duplicates()

if skipped_examples:
    print('Skipped ' + str(skipped_examples) + ' example(s) with missing or malformed tags.')
print('There are ' + str(len(df)) + ' successfully-generated examples. Here are the first few:')
df.head()

There are 99 successfully-generated examples. Here are the first few:


Unnamed: 0,prompt,response
0,What is the primary treatment for Type 1 Diabe...,A) Insulin therapy
1,Which imaging modality is most commonly used t...,B) CT Pulmonary Angiography
2,What is the first-line medication for managing...,B) ACE inhibitors
3,Which of the following vaccines is recommended...,C) Pneumococcal vaccine
4,"In the management of chronic asthma, which cla...",C) Inhaled corticosteroids


In [15]:
# Hold out 10% of the rows for testing: sample 90% for training with a fixed
# seed, then take every row whose index label was not sampled.
train_df = df.sample(frac=0.9, random_state=42)
test_df = df.drop(index=train_df.index)

# Write both splits as JSON Lines (one record per line).
jsonl_options = dict(orient='records', lines=True)
train_df.to_json('train.jsonl', **jsonl_options)
test_df.to_json('test.jsonl', **jsonl_options)