#### Libraries

In [1]:
import dspy
import random
from dotenv import load_dotenv
from dspy.datasets import DataLoader
from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, LabeledFewShot

from src.starling import StarlingLM # <- Custom Local Model Client for Starling7B

_ = load_dotenv()

#### LLM

The model that will be used in this notebook is [Starling7B](https://huggingface.co/Nexusflow/Starling-LM-7B-beta), and the evaluation model will be [GPT-4 Turbo](https://openai.com/gpt-4).

In [2]:
# Generation settings passed unchanged to every model client in this notebook.
# NOTE(review): the recorded ChainOfThought output further down looks cut off at a
# blank line, presumably because of the "\n\n" stop sequence — confirm.
generation_args = dict(
    temperature=0,
    max_tokens=500,
    stop="\n\n",
    model_type="chat",
    n=1,
)
# Settings that differ per model, keyed by a short alias.
model_info = {
    "gpt-4": dict(model="gpt-4-0125-preview", api_base="https://api.openai.com/v1/"),
    "starling": dict(model="Nexusflow/Starling-LM-7B-beta"),
}

In [3]:
# Set up the models: `lm` is the local Starling client being evaluated and
# `evaluator_lm` is the OpenAI client that `correctness_metric` uses as judge.
# Each one gets its model-specific settings plus the shared generation_args.
lm = StarlingLM(**model_info["starling"], **generation_args)
evaluator_lm = dspy.OpenAI(**model_info["gpt-4"], **generation_args)

# Register Starling as the LM used by DSPy calls that do not override it via dspy.context.
dspy.configure(lm=lm)

In [4]:
# Smoke test: call the Starling client directly with a raw prompt.
# The recorded output shows it returns a list containing the completion text.
lm("What is the capital of Colombia?")

[' The capital of Colombia is Bogotá. It is the largest city in the country and serves as the political, economic, and cultural center of Colombia. Located in the Andean region of the country, Bogotá has a rich history and is known for its vibrant arts scene, diverse architecture, and numerous museums and cultural institutions.']

#### Load dataset

The dataset that will be used in this notebook is [gretelai/synthetic_text_to_sql](https://huggingface.co/datasets/gretelai/synthetic_text_to_sql)

In [5]:
# Seed Python's global `random` module so the dataset sampling below is repeatable.
# NOTE(review): this presumes `DataLoader.sample` draws from the global `random` state — confirm.
random.seed(1399)

In [6]:
# Load dataset
dl = DataLoader()

# Both splits come from the same Hugging Face dataset and use the same fields.
_hf_kwargs = dict(
    dataset_name="gretelai/synthetic_text_to_sql",  # Dataset name on Hugging Face
    fields=("sql_prompt", "sql_context", "sql"),  # Fields needed
    input_keys=("sql_prompt", "sql_context"),  # What our model expects to receive to generate an output
)
trainset = dl.from_huggingface(split="train", **_hf_kwargs)
testset = dl.from_huggingface(split="test", **_hf_kwargs)

# Keep a 100-example sample of the train split and a 75-example sample of the test split.
trainset = dl.sample(dataset=trainset, n=100)
testset = dl.sample(dataset=testset, n=75)

# Hold out 25% of the sampled training data for validation.
_trainval = dl.train_test_split(dataset=trainset, test_size=0.25, random_state=1399)
trainset = _trainval["train"]
valset = _trainval["test"]

len(trainset), len(valset), len(testset)

(75, 25, 75)

In [7]:
# Draw one training example and print each of its fields under an upper-cased header.
sample = dl.sample(dataset=trainset, n=1)[0]
for field_name, field_value in sample.items():
    print(f"\n{field_name.upper()}:\n")
    print(field_value)


SQL_PROMPT:

List the top 5 countries with the highest number of satellites in orbit as of 2022-01-01, ordered by the number of satellites in descending order.

SQL_CONTEXT:

CREATE TABLE countries(id INT, name VARCHAR(255), population INT, satellites_in_orbit INT, last_census_date DATE);

SQL:

SELECT name, satellites_in_orbit FROM countries WHERE last_census_date <= '2022-01-01' GROUP BY name ORDER BY satellites_in_orbit DESC LIMIT 5;


#### Signature (Input/Output)

In [8]:
# DSPy signature for the text-to-SQL task: two input fields and one output field.
# NOTE: the class docstring is prompt text, not just documentation — it appears verbatim
# as the instruction line in the `lm.inspect_history` output later in this notebook —
# so treat any edit to it (or to the `desc` strings) as a prompt change.
class TextToSql(dspy.Signature):
    """Transform a natural language query into a SQL query."""

    sql_prompt = dspy.InputField(desc="Natural language query")  # the question, in plain language
    sql_context = dspy.InputField(desc="Context for the query")  # e.g. the CREATE TABLE statement shown in the sample above
    sql = dspy.OutputField(desc="SQL query")  # the field the model has to generate

### Inference

#### Baseline Inference

In [9]:
# Baseline: a plain Predict module over the signature (the recorded output has only the SQL field).
generate_sql_query = dspy.Predict(signature=TextToSql)

result = generate_sql_query(sql_prompt=sample["sql_prompt"], sql_context=sample["sql_context"])

# Print every field of the prediction under an upper-cased header.
for field_name, field_value in result.items():
    print(f"\n{field_name.upper()}:\n")
    print(field_value)


SQL:

SELECT name, satellites_in_orbit
FROM countries
WHERE last_census_date = '2022-01-01'
ORDER BY satellites_in_orbit DESC
LIMIT 5;


#### ChainOfThought Inference

In [10]:
# ChainOfThought variant: the prediction carries a RATIONALE field in addition to SQL.
# This rebinds `generate_sql_query`, so the evaluation cells below score this module.
# NOTE(review): in the recorded output the rationale stops after one line and the SQL
# field holds prose rather than a query.
generate_sql_query = dspy.ChainOfThought(signature=TextToSql)

result = generate_sql_query(sql_prompt=sample["sql_prompt"], sql_context=sample["sql_context"])

# Print every field of the prediction under an upper-cased header.
for field_name, field_value in result.items():
    print(f"\n{field_name.upper()}:\n")
    print(field_value)


RATIONALE:

produce the SQL query. We need to:

SQL:

1. Select the relevant columns from the table: In this case, we need to select the country name and the number of satellites in orbit.
2. Filter the data based on the date: We need to filter the data to only include records with a last_census_date of 2022-01-01.
3. Order the results: We need to order the results by the number of satellites in orbit in descending order.


### Metric of evaluation

#### Metric definition

In [11]:
# DSPy signature for the LLM judge: it receives the question, the schema context and a
# candidate SQL query, and must produce a verdict in the `correct` field.
# NOTE: the docstring and the `desc` strings are prompt text — they appear verbatim in the
# `evaluator_lm.inspect_history` output below — so edits to them change the judge's prompt.
class Correctness(dspy.Signature):
    """Assess if the SQL query accurately answers the given natural language query based on the provided context."""

    sql_prompt = dspy.InputField(desc="Natural language query ")
    sql_context = dspy.InputField(desc="Context for the query")
    sql = dspy.InputField(desc="SQL query")  # the candidate query being judged
    # `prefix` replaces the default field label: the prompt shows this field as "Yes/No:".
    correct = dspy.OutputField(desc="Indicate whether the SQL query correctly answers the natural language query based on the given context", prefix="Yes/No:")

In [12]:
def correctness_metric(example, pred, trace=None):
    """Judge a predicted SQL query with the evaluator LM.

    The `Correctness` signature is run under `evaluator_lm`, and the verdict
    counts as correct when its first word is "yes". Surrounding whitespace,
    letter case and trailing punctuation are ignored, so answers such as
    "Yes.", " yes" or "Yes, because ..." are accepted. An exact comparison
    with "Yes" would score all of those as wrong.

    Args:
        example: Dataset example providing `sql_prompt` and `sql_context`.
        pred: Prediction whose `sql` attribute is the query to assess.
        trace: Only checked for being None or not. When it is not None the
            result is returned as a bool instead of an int.

    Returns:
        1 or 0 when `trace` is None, otherwise True or False.
    """
    judge = dspy.Predict(Correctness)

    # Always judge with the evaluator model, whatever LM is currently configured.
    with dspy.context(lm=evaluator_lm):
        verdict = judge(
            sql_prompt=example.sql_prompt,
            sql_context=example.sql_context,
            sql=pred.sql,
        )

    # Normalize the free-form answer and look only at its first word.
    verdict_text = (verdict.correct or "").strip().lower()
    first_word = verdict_text.split(maxsplit=1)[0] if verdict_text else ""
    is_correct = first_word.strip(".,;:!\"'*") == "yes"

    if trace is not None:
        return is_correct

    return int(is_correct)

#### Evaluate single data point

In [13]:
# Score the ChainOfThought prediction produced above for the sampled example.
_correctness = correctness_metric(example=sample, pred=result)
print(f"Correct SQL query: {'Yes' if _correctness else 'No'}")

Correct SQL query: No


In [14]:
# Show the most recent prompt sent to the evaluator model, to see exactly what the judge was asked.
evaluator_lm.inspect_history(n=1)





Assess if the SQL query accurately answers the given natural language query based on the provided context.

---

Follow the following format.

Sql Prompt: Natural language query

Sql Context: Context for the query

Sql: SQL query

Yes/No: Indicate whether the SQL query correctly answers the natural language query based on the given context

---

Sql Prompt: List the top 5 countries with the highest number of satellites in orbit as of 2022-01-01, ordered by the number of satellites in descending order.

Sql Context: CREATE TABLE countries(id INT, name VARCHAR(255), population INT, satellites_in_orbit INT, last_census_date DATE);

Sql: 1. Select the relevant columns from the table: In this case, we need to select the country name and the number of satellites in orbit. 2. Filter the data based on the date: We need to filter the data to only include records with a last_census_date of 2022-01-01. 3. Order the results: We need to order the results by the number of satellites in orbit in 

#### Evaluate entire dataset - GPT 3.5

<div style="background-color: #F0F0F0; padding: 10px; border-radius: 5px;"> <p style="color: #4B4B4B; font-size: 18px; font-weight: bold; margin: 0;"> 📊 Baseline Evaluation </p> <p style="color: #4B4B4B; font-size: 16px; margin: 5px 0 0;"> Without any optimization, <strong>GPT-3.5 Turbo</strong> achieves <strong>80% correctness in validation (25 samples)</strong> and <strong>70.67% correctness in test (75 samples).</strong> </p> </div>

In [16]:
print("GPT 3.5 Turbo - Validation Score: \n")
# Generate with GPT-3.5 Turbo for this block only; the metric still switches to
# `evaluator_lm` internally when judging each prediction.
with dspy.context(
    lm=dspy.OpenAI(
        model="gpt-3.5-turbo-0125",
        api_base="https://api.openai.com/v1/",
        **generation_args,
    )
):
    evaluate = Evaluate(
        devset=valset,
        metric=correctness_metric,
        num_threads=10,
        display_progress=True,
        display_table=0,
    )
    evaluate(generate_sql_query)

GPT 3.5 Turbo - Validation Score: 



  0%|          | 0/25 [00:00<?, ?it/s]

Average Metric: 20 / 25  (80.0): 100%|██████████| 25/25 [00:09<00:00,  2.77it/s]

Average Metric: 20 / 25  (80.0%)



  df = df.applymap(truncate_cell)


In [17]:
print("GPT 3.5 Turbo - Test Score: \n")
# Same GPT-3.5 Turbo baseline as the validation run, scored on the held-out test sample.
with dspy.context(
    lm=dspy.OpenAI(
        model="gpt-3.5-turbo-0125",
        api_base="https://api.openai.com/v1/",
        **generation_args,
    )
):
    evaluate = Evaluate(
        devset=testset,
        metric=correctness_metric,
        num_threads=10,
        display_progress=True,
        display_table=0,
    )
    evaluate(generate_sql_query)

GPT 3.5 Turbo - Test Score: 



Average Metric: 53 / 75  (70.7): 100%|██████████| 75/75 [00:14<00:00,  5.06it/s] 

Average Metric: 53 / 75  (70.7%)



  df = df.applymap(truncate_cell)


#### Evaluate entire dataset - Starling7B

<div style="background-color: #FFCCCB; padding: 10px; border-radius: 5px;"> <p style="color: #8B0000; font-size: 18px; font-weight: bold; margin: 0;"> ⚠️ Evaluation Stage 1 </p> <p style="color: #8B0000; font-size: 16px; margin: 5px 0 0;"> Without any optimization, <strong>Starling7B</strong> achieves <strong>72% correctness in validation (25 samples)</strong> and <strong>50.67% correctness in test (75 samples).</strong> </p> </div>

In [19]:
print("Starling7b - Validation Score: \n")
# No dspy.context override here, so generation uses the LM set with dspy.configure.
evaluate = Evaluate(
    devset=valset,
    metric=correctness_metric,
    num_threads=10,
    display_progress=True,
    display_table=0,
)
evaluate(generate_sql_query)

Starling7b - Validation Score: 



Average Metric: 18 / 25  (72.0): 100%|██████████| 25/25 [00:10<00:00,  2.44it/s]

Average Metric: 18 / 25  (72.0%)



  df = df.applymap(truncate_cell)


72.0

In [20]:
print("Starling7b - Test Score: \n")
# Same un-optimized module as the validation run, scored on the held-out test sample.
evaluate = Evaluate(
    devset=testset,
    metric=correctness_metric,
    num_threads=10,
    display_progress=True,
    display_table=0,
)
evaluate(generate_sql_query)

Starling7b - Test Score: 



Average Metric: 38 / 75  (50.7): 100%|██████████| 75/75 [00:31<00:00,  2.37it/s]

Average Metric: 38 / 75  (50.7%)



  df = df.applymap(truncate_cell)


50.67

### Optimize for Text2SQL

#### Create program

In [21]:
# Define the program ~ you can think of this as a PyTorch model.
class TextToSqlProgram(dspy.Module):
    """DSPy module that answers a text-to-SQL request with one ChainOfThought step."""

    def __init__(self):
        super().__init__()
        # Single predictor built from the TextToSql signature.
        self.program = dspy.ChainOfThought(signature=TextToSql)

    def forward(self, sql_prompt, sql_context):
        """Return the ChainOfThought prediction for the given question and schema context."""
        prediction = self.program(sql_prompt=sql_prompt, sql_context=sql_context)
        return prediction

### FewShot

In [22]:
# Execute the optimizer -> this only adds few-shot examples (k=4) from trainset to the prompt.
# NOTE: the result is bound to `optmized_program` (sic); later cells rely on this exact spelling.
optimizer = LabeledFewShot(k=4)
optmized_program = optimizer.compile(student=TextToSqlProgram(), trainset=trainset)

In [23]:
# Run the few-shot program on the same sample used in the earlier inference cells.
optmized_program(sql_context=sample["sql_context"], sql_prompt=sample["sql_prompt"])

Prediction(
    rationale='produce the SQL query. We need to filter the countries based on the date and then order them by the number of satellites in orbit.',
    sql="SELECT name, satellites_in_orbit\nFROM countries\nWHERE last_census_date <= '2022-01-01'\nORDER BY satellites_in_orbit DESC\nLIMIT 5;"
)

#### What is happening inside?

In [24]:
# Show the last prompt sent to Starling: the few-shot examples appear before the actual question.
lm.inspect_history(n=1)





Transform a natural language query into a SQL query.

---

Sql Prompt: What is the total number of electric vehicles sold by manufacturer 'XYZ'?
Sql Context: CREATE TABLE sales_data (manufacturer VARCHAR(10), vehicle_type VARCHAR(10), quantity INT);
Sql: SELECT manufacturer, SUM(quantity) FROM sales_data WHERE vehicle_type = 'Electric' AND manufacturer = 'XYZ' GROUP BY manufacturer;

Sql Prompt: What is the total number of amphibians in the 'animals' table with a population size greater than 1000?
Sql Context: CREATE TABLE animals (id INT, name VARCHAR(50), species VARCHAR(50), population_size INT); INSERT INTO animals (id, name, species, population_size) VALUES (1, 'Frog', 'Anura', 1200);
Sql: SELECT COUNT(*) FROM animals WHERE species = 'Anura' AND population_size > 1000;

Sql Prompt: What is the minimum depth a marine species can live at?
Sql Context: CREATE TABLE species (id INT, name VARCHAR(255), habitat VARCHAR(255), depth FLOAT); INSERT INTO species (id, name, habitat, dept

#### Evaluate the optimized program


<div style="background-color: #FFF8DC; padding: 10px; border-radius: 5px;"> <p style="color: #DAA520; font-size: 18px; font-weight: bold; margin: 0;"> 🌟 Evaluation Stage 2 </p> <p style="color: #DAA520; font-size: 16px; margin: 5px 0 0;"> With <strong>Few Shot</strong> optimization, <strong>Starling7B</strong> achieves <strong>64% correctness in validation (25 samples)</strong> and <strong>60% correctness in test (75 samples).</strong> </p> </div>

In [25]:
print("Starling7b + FewShotOptimizer - Validation Score: \n")
# Score the LabeledFewShot-compiled program on the validation split.
evaluate = Evaluate(
    devset=valset,
    metric=correctness_metric,
    num_threads=10,
    display_progress=True,
    display_table=0,
)
evaluate(optmized_program)

Starling7b + FewShotOptimizer - Validation Score: 



Average Metric: 16 / 25  (64.0): 100%|██████████| 25/25 [00:15<00:00,  1.59it/s]

Average Metric: 16 / 25  (64.0%)



  df = df.applymap(truncate_cell)


64.0

In [26]:
print("Starling7b + FewShotOptimizer - Test Score: \n")
# Score the LabeledFewShot-compiled program on the held-out test sample.
evaluate = Evaluate(
    devset=testset,
    metric=correctness_metric,
    num_threads=10,
    display_progress=True,
    display_table=0,
)
evaluate(optmized_program)

Starling7b + FewShotOptimizer - Test Score: 



Average Metric: 45 / 75  (60.0): 100%|██████████| 75/75 [00:31<00:00,  2.35it/s]

Average Metric: 45 / 75  (60.0%)





60.0

### BootstrapFewShotWithRandomSearch

[DSPy docs](https://dspy-docs.vercel.app/docs/building-blocks/optimizers) recommend that in a setup like the one we have at hand, with ~50 samples, the best option is to use `BootstrapFewShotWithRandomSearch`:

![image](assets/dspy.png)

In [27]:
# Search over candidate programs, scoring each one with the LLM-judge metric.
# NOTE(review): candidates are scored on `valset` (the log shows 25-example runs), so the
# validation score reported for the winning program is not an independent estimate —
# rely on the test score for that.
optimizer2 = BootstrapFewShotWithRandomSearch(
    metric=correctness_metric,
    max_bootstrapped_demos=2,
    num_candidate_programs=8,
    num_threads=5,
)
optmized_program_2 = optimizer2.compile(
    student=TextToSqlProgram(),
    trainset=trainset,
    valset=valset,
)

Going to sample between 1 and 2 traces per predictor.
Will attempt to train 8 candidate sets.


Average Metric: 18 / 25  (72.0): 100%|██████████| 25/25 [00:15<00:00,  1.64it/s]
  df = df.applymap(truncate_cell)


Average Metric: 18 / 25  (72.0%)
Score: 72.0 for set: [0]
New best score: 72.0 for seed -3
Scores so far: [72.0]
Best score: 72.0


Average Metric: 19 / 25  (76.0): 100%|██████████| 25/25 [00:24<00:00,  1.00it/s]


Average Metric: 19 / 25  (76.0%)
Score: 76.0 for set: [16]
New best score: 76.0 for seed -2
Scores so far: [72.0, 76.0]
Best score: 76.0


  4%|▍         | 3/75 [00:11<04:44,  3.95s/it]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 18 / 25  (72.0): 100%|██████████| 25/25 [00:25<00:00,  1.02s/it]


Average Metric: 18 / 25  (72.0%)
Score: 72.0 for set: [16]
Scores so far: [72.0, 76.0, 72.0]
Best score: 76.0
Average of max per entry across top 1 scores: 0.76
Average of max per entry across top 2 scores: 0.84
Average of max per entry across top 3 scores: 0.84
Average of max per entry across top 5 scores: 0.84
Average of max per entry across top 8 scores: 0.84
Average of max per entry across top 9999 scores: 0.84


  4%|▍         | 3/75 [00:10<04:08,  3.45s/it]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 20 / 25  (80.0): 100%|██████████| 25/25 [00:26<00:00,  1.07s/it]


Average Metric: 20 / 25  (80.0%)
Score: 80.0 for set: [16]
New best score: 80.0 for seed 0
Scores so far: [72.0, 76.0, 72.0, 80.0]
Best score: 80.0
Average of max per entry across top 1 scores: 0.8
Average of max per entry across top 2 scores: 0.84
Average of max per entry across top 3 scores: 0.88
Average of max per entry across top 5 scores: 0.88
Average of max per entry across top 8 scores: 0.88
Average of max per entry across top 9999 scores: 0.88


  1%|▏         | 1/75 [00:02<03:34,  2.90s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 20 / 25  (80.0): 100%|██████████| 25/25 [00:25<00:00,  1.01s/it]


Average Metric: 20 / 25  (80.0%)
Score: 80.0 for set: [16]
Scores so far: [72.0, 76.0, 72.0, 80.0, 80.0]
Best score: 80.0
Average of max per entry across top 1 scores: 0.8
Average of max per entry across top 2 scores: 0.88
Average of max per entry across top 3 scores: 0.88
Average of max per entry across top 5 scores: 0.88
Average of max per entry across top 8 scores: 0.88
Average of max per entry across top 9999 scores: 0.88


  1%|▏         | 1/75 [00:03<04:09,  3.37s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 20 / 25  (80.0): 100%|██████████| 25/25 [00:26<00:00,  1.08s/it]


Average Metric: 20 / 25  (80.0%)
Score: 80.0 for set: [16]
Scores so far: [72.0, 76.0, 72.0, 80.0, 80.0, 80.0]
Best score: 80.0
Average of max per entry across top 1 scores: 0.8
Average of max per entry across top 2 scores: 0.88
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 0.96
Average of max per entry across top 8 scores: 0.96
Average of max per entry across top 9999 scores: 0.96


  1%|▏         | 1/75 [00:03<04:00,  3.25s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 19 / 25  (76.0): 100%|██████████| 25/25 [00:23<00:00,  1.05it/s]


Average Metric: 19 / 25  (76.0%)
Score: 76.0 for set: [16]
Scores so far: [72.0, 76.0, 72.0, 80.0, 80.0, 80.0, 76.0]
Best score: 80.0
Average of max per entry across top 1 scores: 0.8
Average of max per entry across top 2 scores: 0.88
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 0.96
Average of max per entry across top 8 scores: 0.96
Average of max per entry across top 9999 scores: 0.96


  1%|▏         | 1/75 [00:03<04:18,  3.49s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 20 / 25  (80.0): 100%|██████████| 25/25 [00:25<00:00,  1.04s/it]


Average Metric: 20 / 25  (80.0%)
Score: 80.0 for set: [16]
Scores so far: [72.0, 76.0, 72.0, 80.0, 80.0, 80.0, 76.0, 80.0]
Best score: 80.0
Average of max per entry across top 1 scores: 0.8
Average of max per entry across top 2 scores: 0.88
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  5%|▌         | 4/75 [00:13<03:57,  3.34s/it]


Bootstrapped 2 full traces after 5 examples in round 0.


Average Metric: 20 / 25  (80.0): 100%|██████████| 25/25 [00:25<00:00,  1.02s/it]


Average Metric: 20 / 25  (80.0%)
Score: 80.0 for set: [16]
Scores so far: [72.0, 76.0, 72.0, 80.0, 80.0, 80.0, 76.0, 80.0, 80.0]
Best score: 80.0
Average of max per entry across top 1 scores: 0.8
Average of max per entry across top 2 scores: 0.88
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  1%|▏         | 1/75 [00:03<04:52,  3.95s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 19 / 25  (76.0): 100%|██████████| 25/25 [00:27<00:00,  1.10s/it]


Average Metric: 19 / 25  (76.0%)
Score: 76.0 for set: [16]
Scores so far: [72.0, 76.0, 72.0, 80.0, 80.0, 80.0, 76.0, 80.0, 80.0, 76.0]
Best score: 80.0
Average of max per entry across top 1 scores: 0.8
Average of max per entry across top 2 scores: 0.88
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0


  4%|▍         | 3/75 [00:11<04:41,  3.90s/it]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 20 / 25  (80.0): 100%|██████████| 25/25 [00:27<00:00,  1.09s/it]

Average Metric: 20 / 25  (80.0%)
Score: 80.0 for set: [16]
Scores so far: [72.0, 76.0, 72.0, 80.0, 80.0, 80.0, 76.0, 80.0, 80.0, 76.0, 80.0]
Best score: 80.0
Average of max per entry across top 1 scores: 0.8
Average of max per entry across top 2 scores: 0.88
Average of max per entry across top 3 scores: 0.96
Average of max per entry across top 5 scores: 1.0
Average of max per entry across top 8 scores: 1.0
Average of max per entry across top 9999 scores: 1.0
11 candidate programs found.





In [28]:
# Run the bootstrapped program on the same sample used in the earlier inference cells.
optmized_program_2(sql_context=sample["sql_context"], sql_prompt=sample["sql_prompt"])

Prediction(
    rationale='produce the SQL query. We need to find the top 5 countries with the highest number of satellites in orbit as of 2022-01-01, ordered by the number of satellites in descending order.',
    sql="SELECT name, satellites_in_orbit\nFROM countries\nWHERE last_census_date <= '2022-01-01'\nORDER BY satellites_in_orbit DESC\nLIMIT 5;"
)

In [29]:
# Show the last prompt sent to Starling to see which demonstrations the optimizer selected.
lm.inspect_history(n=1)





Transform a natural language query into a SQL query.

---

Sql Prompt: What is the total number of open pedagogy courses offered by each country?
Sql Context: CREATE TABLE country (country_id INT, country_name VARCHAR(255)); CREATE TABLE open_pedagogy_courses (country_id INT, course_id INT); INSERT INTO country (country_id, country_name) VALUES (6001, 'Country X'), (6002, 'Country Y'), (6003, 'Country Z'); INSERT INTO open_pedagogy_courses (country_id, course_id) VALUES (6001, 7001), (6001, 7002), (6002, 7003);
Sql: SELECT country_name, COUNT(course_id) as total_courses FROM country JOIN open_pedagogy_courses ON country.country_id = open_pedagogy_courses.country_id GROUP BY country_name;

Sql Prompt: Top 3 most popular songs of 2021 in 'concert_ticket_sales' table?
Sql Context: CREATE TABLE concert_ticket_sales (ticket_id INT, song_id INT, quantity INT, price FLOAT, sale_date DATE);
Sql: SELECT song_id, SUM(quantity) as total_quantity FROM concert_ticket_sales WHERE sale_date >= '2

#### Evaluate the optimized program

<div style="background-color: #E0F8E0; padding: 10px; border-radius: 5px;"> <p style="color: #006400; font-size: 18px; font-weight: bold; margin: 0;"> ✅ Evaluation Stage 3 </p> <p style="color: #006400; font-size: 16px; margin: 5px 0 0;"> With <strong>BootstrapFewShotWithRandomSearch</strong> optimization, <strong>Starling7B</strong> achieves an <strong>80% correctness in validation (25 samples)</strong> and <strong>68% correctness in test (75 samples).</strong> </p> </div>

In [30]:
print("Starling7b + BootstrapFewShotWithRandomSearch - Validation Score: \n")
# Score the BootstrapFewShotWithRandomSearch-compiled program on the validation split.
evaluate = Evaluate(
    devset=valset,
    metric=correctness_metric,
    num_threads=10,
    display_progress=True,
    display_table=0,
)
evaluate(optmized_program_2)

Starling7b + BootstrapFewShotWithRandomSearch - Validation Score: 





Average Metric: 20 / 25  (80.0): 100%|██████████| 25/25 [00:20<00:00,  1.24it/s]

Average Metric: 20 / 25  (80.0%)



  df = df.applymap(truncate_cell)


80.0

In [31]:
print("Starling7b + BootstrapFewShotWithRandomSearch - Test Score: \n")
# Score the BootstrapFewShotWithRandomSearch-compiled program on the held-out test sample.
evaluate = Evaluate(
    devset=testset,
    metric=correctness_metric,
    num_threads=10,
    display_progress=True,
    display_table=0,
)
evaluate(optmized_program_2)

Starling7b + BootstrapFewShotWithRandomSearch - Test Score: 



Average Metric: 51 / 75  (68.0): 100%|██████████| 75/75 [01:01<00:00,  1.22it/s]

Average Metric: 51 / 75  (68.0%)



  df = df.applymap(truncate_cell)


68.0