In [None]:
# Install/upgrade the packages this notebook imports into the kernel's environment,
# then clear the cell output so pip's install log is not left in the notebook.
%pip install --upgrade python-dotenv nest_asyncio pydantic google-genai requests pandas pyyaml

from IPython.display import clear_output ; clear_output()

In [None]:
import os
import json
import asyncio

from dotenv import load_dotenv
import nest_asyncio

from textwrap import dedent
from IPython.display import display, Markdown

from pydantic import BaseModel, Field
from enum import Enum

import pandas as pd

import yaml

from google import genai
from google.genai import types

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Patch asyncio so coroutines can be awaited/run from inside the notebook's
# already-running event loop.
nest_asyncio.apply()

# Async Gemini client; the API key is read from the GEMINI_API_KEY environment variable.
_gemini_client_aio = genai.Client(api_key=os.getenv('GEMINI_API_KEY')).aio

# Model identifiers used by `gemini()`: pro=True selects G25PRO, otherwise G25FLASH.
G25PRO = 'gemini-2.5-pro-preview-03-25'
G25FLASH = 'gemini-2.5-flash-preview-04-17'

async def gemini(
        prompt,
        pro = False, max_tokens = None, temperature = None,
        budget = None, schema = None):
    """Send `prompt` to a Gemini model and return its answer.

    Args:
        prompt: Content passed as `contents` to `generate_content`.
        pro: When true, use the G25PRO model; otherwise G25FLASH.
        max_tokens: Optional value for `max_output_tokens`.
        temperature: Optional sampling temperature.
        budget: Optional thinking budget, sent as
            `thinking_config={'thinking_budget': budget}`.
        schema: Optional response schema. When given, a JSON response is
            requested and the parsed object is returned.

    Returns:
        `response.parsed` when `schema` is given, otherwise `response.text`.
    """
    # Only settings the caller actually supplied are forwarded; `is not None`
    # (rather than truthiness) keeps explicit zeros such as temperature=0.
    optional_settings = {
        'max_output_tokens': max_tokens,
        'temperature': temperature,
        'thinking_config': None if budget is None else {'thinking_budget': budget},
    }
    config = {
        key: value
        for key, value in optional_settings.items()
        if value is not None
    }

    wants_structured = schema is not None
    if wants_structured:
        config['response_mime_type'] = 'application/json'
        config['response_schema'] = schema

    model_name = G25PRO if pro else G25FLASH
    response = await _gemini_client_aio.models.generate_content(
        model=model_name,
        contents=prompt,
        config=config,
    )

    return response.parsed if wants_structured else response.text

def md(str):
    """Render a Markdown-formatted string as rich notebook output."""
    rendered = Markdown(str)
    display(rendered)

def display_df(df):
    """Display a DataFrame with left/top-aligned, wrapping, half-width cells."""
    cell_css = {
        'text-align': 'left',
        'vertical-align': 'top',
        'white-space': 'pre-wrap',  # keep line breaks inside long text cells
        'width': '50%',
    }
    styled = df.style.set_properties(**cell_css)
    display(styled)

In [None]:
# Load the full dataset from a CSV in the working directory; later cells read
# its 'input' and 'output' columns.
dataset = pd.read_csv('dataset.csv')

# Preview the first rows rather than dumping the whole frame.
display_df(dataset.head(3))

print(f'{len(dataset)} items in dataset.')

In [None]:
# Fixed positional split: rows 0-24 train, 25-49 validate, 50-99 test.
training_dataset = dataset.iloc[0:25].reset_index(drop=True)
validation_dataset = dataset.iloc[25:50].reset_index(drop=True)
testing_dataset = dataset.iloc[50:100].reset_index(drop=True)

# Report each split's shape and show its last row as a sanity check.
for split_name, split_df in (
        ('training', training_dataset),
        ('validation', validation_dataset),
        ('testing', testing_dataset)):
    print(f'{split_name}: {split_df.shape}')
    display_df(split_df.tail(1))

---

---

In [None]:
def compare_responses(res1, res2):
    """Return whether two YAML documents parse to equal Python values.

    Both arguments are parsed with `yaml.safe_load` and the parsed values are
    compared, so differences that only affect formatting do not count as a
    mismatch.

    Args:
        res1: First YAML document (typically the expected output).
        res2: Second YAML document (typically the model's response).

    Returns:
        The result of comparing the parsed values; False if either document
        cannot be parsed (invalid YAML, non-text input such as None, ...).
    """
    try:
        return yaml.safe_load(res1) == yaml.safe_load(res2)
    except Exception:
        # Best-effort comparison: any parse failure is simply "no match".
        # `Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit
        # propagate so a long-running loop can still be interrupted.
        return False

def _format_training_samples(training_dataset):
    """Render every training row as a <sample> block inside <training-samples>."""
    # Iterate the columns directly instead of `iterrows()`, which builds one
    # Series per row and can upcast mixed numeric values before `str()`.
    samples = ''.join(
        f"<sample>\n<input>\n{sample_input!s}\n</input>\n"
        f"<output>\n{sample_output!s}\n</output>\n</sample>\n"
        for sample_input, sample_output in zip(
            training_dataset['input'], training_dataset['output'])
    )
    return '<training-samples>\n' + samples + '</training-samples>'


def _build_epoch_prompt(training_sample_prompt, epochs):
    """Build the meta-prompt asking the LLM to write (or improve) the transform prompt."""
    # The template is dedented BEFORE any data is inserted: dedenting after
    # interpolation is a no-op because the inserted samples start at column 0.
    sections = [dedent("""\
        You are an expert AI engineer.
        Your goal is to create the most accurate and effective prompt for an LLM.
        Below you are provided with a set of training samples.
        Each samples consists of an input and an output.
        You should create a prompt that will generate the output given the input.

        Instructions: thinking carefully about the training samples to understand the exact transformation required.
        Output: output only the generated prompt, without any additional text or structure (no quoting, no JSON, no XML, etc...)
        """)]
    if epochs:
        # Feed back every earlier attempt (prompt, accuracy, per-sample results).
        sections.append(
            "You have information about the previous training epochs:\n"
            "<previous-epochs>\n"
            f"{json.dumps(epochs)}\n"
            "</previous-epochs>\n"
            "\n"
            "You need to improve the prompt.\n"
            "Remember that you can rewrite the prompt completely if needed -\n"
            "the previous prompt is provided here for your review.\n"
        )
    sections.append(training_sample_prompt)
    return '\n'.join(sections)


def _build_validation_prompt(transform_prompt, input_text):
    """Append one validation input, wrapped in <input> tags, to the candidate prompt."""
    return f"{transform_prompt}\n\n<input>\n{input_text}\n</input>\n"


async def discover_prompt(training_dataset, validation_dataset,
                          max_epochs=24, target_accuracy=0.9):
    """Iteratively ask the LLM for a prompt that maps dataset inputs to outputs.

    Each epoch asks `gemini` to write a prompt from the training samples (and,
    after the first epoch, from the history of earlier attempts), then scores
    that prompt on the validation set by running it on every validation input
    and comparing the answers with `compare_responses`.

    Args:
        training_dataset: DataFrame with 'input' and 'output' columns, shown
            to the LLM as examples.
        validation_dataset: DataFrame with 'input' and 'output' columns, used
            to score each candidate prompt. Must not be empty.
        max_epochs: Maximum number of epochs to run (at least one always runs).
        target_accuracy: Stop as soon as an epoch's validation accuracy is
            strictly greater than this value.

    Returns:
        `(prompt, validation_accuracy)` of the best-scoring epoch; on ties the
        most recent epoch wins.

    Raises:
        ValueError: If `validation_dataset` has no rows.
    """
    if len(validation_dataset) == 0:
        # Fail before spending any LLM calls; accuracy would divide by zero.
        raise ValueError('validation_dataset must contain at least one row.')

    # Loop-invariant pieces are built once instead of once per epoch.
    training_sample_prompt = _format_training_samples(training_dataset)
    validation_inputs = [str(value) for value in validation_dataset['input']]
    expected = [str(value) for value in validation_dataset['output']]

    epochs = []
    while True:
        print(f'Epoch {len(epochs) + 1}\n\n')

        epoch_prompt = _build_epoch_prompt(training_sample_prompt, epochs)
        # A thinking budget is passed only for the prompt-writing call.
        transform_prompt = await gemini(epoch_prompt, budget=12345)
        print(f'New prompt:\n---\n{transform_prompt}\n---\n')

        # Score the candidate: all validation requests are issued concurrently.
        results = await asyncio.gather(*(
            gemini(_build_validation_prompt(transform_prompt, input_text))
            for input_text in validation_inputs
        ))
        validation_results = [
            {'expected': exp, 'result': res, 'match': compare_responses(exp, res)}
            for exp, res in zip(expected, results)
        ]
        matches = sum(1 for r in validation_results if r['match'])
        validation_accuracy = matches / len(validation_results)

        epochs.append({
            'epoch_number': len(epochs),
            'prompt': transform_prompt,
            'validation_accuracy': validation_accuracy,
            'validation_results': validation_results
        })
        print(f"Validation accuracy: {validation_accuracy:.2%}\n---\n\n")

        if validation_accuracy > target_accuracy or len(epochs) >= max_epochs:
            break

    # If the epoch cap was hit, the last prompt is not necessarily the best
    # one; `reversed` makes the most recent epoch win ties.
    best = max(reversed(epochs), key=lambda epoch: epoch['validation_accuracy'])
    return best['prompt'], best['validation_accuracy']


In [None]:

# Run the prompt-discovery loop on the training/validation splits (top-level
# `await` works inside a notebook cell) and report what it returned.
transform_prompt, transform_validation_accuracy = await discover_prompt(training_dataset, validation_dataset)

print(f"Transform prompt:\n---\n{transform_prompt}\n---\n")
print(f"Validation accuracy: {transform_validation_accuracy:.2%}\n---\n")