In [1]:
#================================================================
# Evaluator
#================================================================

import re
import string
import unicodedata

def normalize_text(s):
    """Canonicalise a string so two answers can be compared token by token.

    The transformations are applied in this order:

    1. Unicode NFD decomposition.
    2. Lowercasing.
    3. Deletion of every character listed in ``string.punctuation``.
    4. Replacement of the standalone words "a", "an" and "the" with a space.
    5. Collapsing of whitespace runs into single spaces, with leading and
       trailing whitespace dropped.

    Args:
        s: The text to normalise.

    Returns:
        The normalised text.
    """
    decomposed = unicodedata.normalize('NFD', s)
    lowered = decomposed.lower()
    # Punctuation is deleted, not replaced by a space: "17,121" -> "17121".
    drop_punctuation = str.maketrans('', '', string.punctuation)
    unpunctuated = lowered.translate(drop_punctuation)
    # Articles are matched only after punctuation removal, on word boundaries.
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', unpunctuated)
    return ' '.join(without_articles.split())


def f1_score_strings(label, pred):
    """Token-level F1 between a reference answer and a predicted answer.

    Both strings go through ``normalize_text`` and are split on whitespace.
    Tokens are compared as sets, so a word repeated within one string is
    counted only once.

    Args:
        label: The reference (gold) answer.
        pred: The predicted answer.

    Returns:
        The F1 score as a float, or the integer ``0`` when the two strings
        share no token (which includes either one normalising to nothing).
    """
    gold_tokens = set(normalize_text(label).split())
    predicted_tokens = set(normalize_text(pred).split())

    shared = len(gold_tokens & predicted_tokens)
    # Guard first: with no overlap precision + recall would be 0 below.
    if shared == 0:
        return 0

    # shared + (predicted-only) is the size of the predicted set, and
    # shared + (gold-only) is the size of the gold set.
    precision = shared / len(predicted_tokens)
    recall = shared / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)

In [2]:
#================================================================
# Data Loader
#================================================================

import json

def load_data_minor():
    """Load QA examples from ``data._json`` and split them into train/val/test.

    The file (resolved against the current working directory) must contain a
    JSON list of records in this raw format::

        [
            {
                "question": "...",
                "docs": [...],
                "answer": "..."
            },
            ...
        ]

    Each record is converted into an ``(input, output)`` pair where ``input``
    is the tuple ``(question, docs)`` and ``output`` is the answer.

    Returns:
        A 3-tuple ``(train, val, test)``: ``train`` holds the first 5 pairs,
        ``val`` is always ``None`` and ``test`` holds every remaining pair.

    Raises:
        FileNotFoundError: If ``data._json`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a record lacks "question", "docs" or "answer".
    """
    # Pin UTF-8 explicitly: the platform default encoding (e.g. cp1252 on
    # Windows) would otherwise corrupt or reject non-ASCII text.
    with open("data._json", "r", encoding="utf-8") as f:
        records = json.load(f)

    # Build (input, output) pairs without shadowing the builtin `input`.
    pairs = [
        ((record["question"], record["docs"]), record["answer"])
        for record in records
    ]
    return pairs[:5], None, pairs[5:]

In [None]:
#================================================================
# Optimizer Set Up
#================================================================
from cognify.optimizer.control_param import ControlParameter
from cognify.cog_hub.optim_setup.qa import QASetup

# ================= Overall Control Parameter =================
# Single configuration object reused by every cognify call in this notebook:
# it is passed as `control_param` to optimize(), load_workflow() and evaluate().
optimize_control_param = ControlParameter(
    opt_setup=QASetup(),
    # Directory name also seen in the optimizer log paths (opt_results/...).
    opt_history_log_dir='opt_results',
    # NOTE(review): presumably the number of samples evaluated together;
    # confirm against the ControlParameter documentation.
    evaluator_batch_size=2,
)

In [4]:
# load_data_minor() returns (first 5 pairs, None, remaining pairs), so `val`
# is None here and no separate validation split exists.
train, val, test = load_data_minor()

In [6]:
import cognify.run.optimize

# Run the optimizer on the workflow script, scoring outputs with the
# token-level F1 defined above. `val` is None (see load_data_minor), so no
# validation set is supplied. The call's three results are unpacked into
# opt_cost, pareto_frontier and opt_logs; their exact types are not visible
# in this notebook.
opt_cost, pareto_frontier, opt_logs = cognify.run.optimize.optimize(
    script_path="cognify_workflow.py",
    control_param=optimize_control_param,
    train_set=train,
    val_set=val,
    eval_fn=f1_score_strings,
)

[INFO 2024-11-13 22:38:09] Dry run on train set: 2 samples for optimizer analysis
[INFO 2024-11-13 22:38:09] Loading existing dry run result at opt_results/dry_run_train.json
[INFO 2024-11-13 22:38:09] ----------------- Start Optimization -----------------
[INFO 2024-11-13 22:38:09] Loading outer_loop params from opt_results/outer_loop/opt_params.json


> outer_loop | (best score: 0.00, lowest cost@1000: 0.00 $):  50%|#####     | 2/4 [00:00<?, ?it/s]

------> Evaluation in outer_loop_2 | (avg score: 0.00, avg cost@1000: 0.00 $):   0%|          | 0/5 [00:00<?, …

------> Evaluation in outer_loop_3 | (avg score: 0.00, avg cost@1000: 0.00 $):   0%|          | 0/5 [00:00<?, …

---> inner_loop in outer_loop_2 | (best score: 0.00, lowest cost@1000: 0.00 $):   0%|          | 0/2 [00:00<?,…

---> inner_loop in outer_loop_3 | (best score: 0.00, lowest cost@1000: 0.00 $):   0%|          | 0/2 [00:00<?,…

------> Evaluation in outer_loop_3.inner_loop_0 | (avg score: 0.00, avg cost@1000: 0.00 $):   0%|          | 0…

------> Evaluation in outer_loop_3.inner_loop_1 | (avg score: 0.00, avg cost@1000: 0.00 $):   0%|          | 0…

------> Evaluation in outer_loop_2.inner_loop_0 | (avg score: 0.00, avg cost@1000: 0.00 $):   0%|          | 0…

------> Evaluation in outer_loop_2.inner_loop_1 | (avg score: 0.00, avg cost@1000: 0.00 $):   0%|          | 0…

[INFO 2024-11-13 22:38:49] ----------------- Optimization Finished -----------------


Num Pareto Frontier: 1
--------------------------------------------------------
# 0-th Pareto solution
  Quality = 0.402, Cost per 1K invocation = 0.78 $
  Applied Optimization: outer_loop_3.inner_loop_0


In [7]:
from cognify.run.evaluate import evaluate, load_workflow

In [8]:
# Load the workflow for the configuration that the optimizer output listed as
# the 0-th Pareto solution ('outer_loop_3.inner_loop_0').
new_workflow = load_workflow(control_param=optimize_control_param, config_id='outer_loop_3.inner_loop_0')

In [9]:
# Hand-written example in the same (question, docs) shape that
# load_data_minor() builds for each input.
# NOTE(review): the name `input` shadows the builtin input() for the rest of
# the kernel session; consider renaming once all uses are known.
input = (
    "What was the 2010 population of the birthplace of Gerard Piel?", 
    [
        'Gerard Piel | Gerard Piel (1 March 1915 in Woodmere, N.Y. – 5 September 2004) was the publisher of the new Scientific American magazine starting in 1948. He wrote for magazines, including "The Nation", and published books on science for the general public. In 1990, Piel was presented with the "In Praise of Reason" award by the Committee for Skeptical Inquiry (CSICOP).',
        'Woodmere, New York | Woodmere is a hamlet and census-designated place (CDP) in Nassau County, New York, United States. The population was 17,121 at the 2010 census.',
    ],
)

# Last expression of the cell, so the workflow's answer is shown as output.
new_workflow(input)

'The 2010 population of Woodmere, New York, the birthplace of Gerard Piel, was 17,121.'

In [None]:
# Score the selected configuration on the held-out `test` split using the
# same F1 metric that was passed to the optimizer.
eval_result = evaluate(
    control_param=optimize_control_param,
    # Same configuration id that was loaded into `new_workflow`.
    config_id='outer_loop_3.inner_loop_0',
    test_set=test,
    # NOTE(review): presumably the number of concurrent evaluations; confirm.
    n_parallel=10,
    eval_fn=f1_score_strings,
    # NOTE(review): presumably the path the results are written to; confirm.
    save_to='eval_results.json',
)

----- Testing select trial outer_loop_1.inner_loop_0 -----
  Params: {'qa_agent_sampler_1_few_shot': 'qa_agent_sampler_1_demos_8583cf5e-9847-4df3-8572-d016af28c712', 'qa_agent_sampler_1_reasoning': 'NoChange', 'qa_agent_aggregator_few_shot': 'qa_agent_aggregator_demos_784f480c-2313-4615-84ad-4f3fd0ef1ecf', 'qa_agent_aggregator_reasoning': 'ZeroShotCoT', 'qa_agent_sampler_0_few_shot': 'qa_agent_sampler_0_demos_6ed9a41d-0094-421f-b1e9-181e4b36ca65', 'qa_agent_sampler_0_reasoning': 'ZeroShotCoT', 'qa_agent_sampler_2_few_shot': 'NoChange', 'qa_agent_sampler_2_reasoning': 'ZeroShotCoT'}
  Quality = 0.374, Cost per 1K invocation = 1.45 $


> Evaluation in outer_loop_1.inner_loop_0 | (avg score: 0.00, avg cost@1000: 0.00 $):   0%|          | 0/10 [0…

[INFO 2024-11-13 22:39:18] Retrying request to /chat/completions in 0.390986 seconds
[INFO 2024-11-13 22:39:18] Retrying request to /chat/completions in 0.482437 seconds
[INFO 2024-11-13 22:39:18] Retrying request to /chat/completions in 0.433917 seconds
[INFO 2024-11-13 22:39:18] Retrying request to /chat/completions in 0.409458 seconds
[INFO 2024-11-13 22:39:18] Retrying request to /chat/completions in 0.475266 seconds
[INFO 2024-11-13 22:39:18] Retrying request to /chat/completions in 0.454701 seconds
[INFO 2024-11-13 22:39:18] Retrying request to /chat/completions in 0.472313 seconds
[INFO 2024-11-13 22:39:18] Retrying request to /chat/completions in 0.489201 seconds
[INFO 2024-11-13 22:39:18] Retrying request to /chat/completions in 0.457788 seconds
[INFO 2024-11-13 22:39:18] Retrying request to /chat/completions in 0.472920 seconds


  Quality = 0.514, Cost per 1K invocation = 1.48 $
