In [1]:
import asyncio
import json
import os

import pandas as pd
from jinja2 import Template
from tqdm import tqdm

from oai.audio_senders import FileAudioSender
from oai.client import VoiceOption
from oai.client import connect_aoai as connect
from oai.events import generate_response
from oai.listeners import capture
from ut.aoai import gpt_call
from ut.gsdf import gc, get_sheet_by_name, set_with_dataframe

# Directory whose entries are fed to the realtime model one by one.
dataset_path = "data/phone"

# Session instructions: the model is asked to paraphrase the caller's utterance.
system_msg = (
  "Please repeat what the user says in your own words. "
  "Use the same language as the user."
)

# Voice option passed in the session configuration.
voice = VoiceOption.SAGE

In [2]:
async def gen_rt_response(filepath, wait=15):
  """Send one audio file to the realtime session and collect what it answers.

  Opens a connection, configures the session with the module-level `voice` and
  `system_msg`, streams the audio at `filepath`, explicitly requests a response
  and then waits a fixed amount of time before disconnecting.

  Args:
    filepath: Path of the audio file handed to `FileAudioSender.send_audio`.
    wait: Seconds to sleep after requesting the response before disconnecting.

  Returns:
    The list that the `capture` listener was subscribed with. The caller joins
    its items with newlines, so they are presumably transcript strings.
  """
  # Filled in by the `capture` listener; used as the transcript for evaluation.
  captured = []
  client = await connect()
  try:
    client.subscribe(capture, captured=captured)

    sender = FileAudioSender(client)

    init = {  # initial update to configure the session
      "type": "session.update",
      "session": {
        # Alternative: {"type": "server_vad"} lets the server detect turns
        # automatically. None means a response is generated only once
        # `generate_response` is sent explicitly.
        "turn_detection": None,
        "voice": voice,
        "instructions": system_msg,
      },
    }

    # wait for everything to be ready. this is oddly needed, from either asyncio or remote
    await asyncio.sleep(0.1)

    await client.enqueue(init)
    await sender.send_audio(filepath)
    await client.enqueue(generate_response)

    await asyncio.sleep(wait)  # wait for the response.
  finally:
    # Always release the connection, even if sending fails or the task is
    # cancelled; otherwise every failed file leaks an open session.
    await client.disconnect()
  return captured


# Jinja2 prompt for the grading call. It is rendered with `correct_answer` and
# `transcript` and asks the model to reply with a JSON object holding
# "generated_answer", "correct_answer" and "match".
# NOTE(review): the instructions mix "address" and "number" while the dataset
# directory is "data/phone" -- confirm the wording matches the dataset in use.
eval_template = Template(
  """compare the correct answer with the generated answer. does the generated answer match the correct answer?
Correct Answer: {{ correct_answer }}
Generated Answer: {{ transcript }}

To do so, extract the address from the generated answer, then extract the number from the correct answer. 
Compare the two addresses. If they match, then the generated answer is correct. 
If they do not match, then the generated answer is incorrect.

output in the following format:
{
  "generated_answer": "extracted generated answer here",
  "correct_answer": "extracted correct answer here",
  "match": true/false
}
"""
)

In [3]:
results = []
for filepath in tqdm(os.listdir(dataset_path)):
  raw_transcripts = await gen_rt_response(f"{dataset_path}/{filepath}")
  correct_answer = filepath.split(".")[0]
  transcript = "\n".join(raw_transcripts)

  prompt = eval_template.render(transcript=transcript, correct_answer=correct_answer)
  eval_result = gpt_call(prompt)
  results.append(json.loads(eval_result))

df = pd.DataFrame(results)

 60%|██████    | 60/100 [19:44<13:03, 19.58s/it]

Rate limited, retrying after 23 seconds


 61%|██████    | 61/100 [20:28<17:29, 26.91s/it]

Rate limited, retrying after 4 seconds
Rate limited, retrying after 2 seconds


 62%|██████▏   | 62/100 [20:55<17:07, 27.04s/it]

Rate limited, retrying after 14 seconds


 64%|██████▍   | 64/100 [21:50<15:58, 26.62s/it]

Rate limited, retrying after 23 seconds


 78%|███████▊  | 78/100 [26:48<07:13, 19.69s/it]

Rate limited, retrying after 12 seconds


 80%|████████  | 80/100 [27:44<07:44, 23.22s/it]

Rate limited, retrying after 18 seconds


 85%|████████▌ | 85/100 [29:41<05:22, 21.51s/it]

Rate limited, retrying after 24 seconds


100%|██████████| 100/100 [34:55<00:00, 20.96s/it]


In [4]:
# Publish the evaluation results to the "Realtime Evaluation" spreadsheet.
# NOTE(review): `gc` is imported from ut.gsdf (it is not the stdlib `gc`
# module); presumably a spreadsheet client -- confirm.
ss = gc.open("Realtime Evaluation")
# The worksheet is looked up using the dataset path string as its name.
ws = get_sheet_by_name(ss, dataset_path)
# Write the results DataFrame `df` into that worksheet.
set_with_dataframe(ws, df)