## Lesson 6: Use your voice

**Lesson objective**: Get voice feedback 

So far we've set up a moderately complex workflow with a human feedback loop. Let's run it through the visualizer to see what it looks like.

In [1]:
import json
import os

from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage
from llama_index.core.base.base_query_engine import BaseQueryEngine
from llama_index.core.workflow import (
    StartEvent,
    StopEvent,
    Workflow,
    step,
    Event,
    Context,
    InputRequiredEvent,
    HumanResponseEvent,
)
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.utils.workflow import draw_all_possible_flows
from llama_parse import LlamaParse, ResultType

In [2]:
from dotenv import load_dotenv

load_dotenv()

openai_api_key = os.environ["OPENAI_API_KEY"]
llama_cloud_api_key = os.environ["LLAMA_CLOUD_API_KEY"]

In [3]:
import nest_asyncio

nest_asyncio.apply()

In [4]:
class ParseFormEvent(Event):
    """Hand the application form from ``set_up`` to ``parse_form``."""

    # value passed on to LlamaParse.load_data() by the parse_form step
    application_form: str

class QueryEvent(Event):
    """One question to ask the resume query engine about a single form field."""

    # question text that ask_question embeds in its query-engine prompt
    query: str
    # the form field this question is about; echoed back in ResponseEvent
    field: str
    
class ResponseEvent(Event):
    """The query engine's answer for a single form field."""

    # the form field the answer belongs to (copied from the QueryEvent)
    field: str
    # answer text returned by the query engine for that field
    response: str

class FeedbackEvent(Event):
    """Human feedback that sends the workflow back to ``generate_questions``."""

    # the human's response, forwarded unchanged by get_feedback
    feedback: str

class GenerateQuestionsEvent(Event):
    """Payload-free signal from ``parse_form`` that the field list is ready."""

    pass

This is exactly the same workflow from lesson 5:

In [5]:
class RAGWorkflow(Workflow):
    """Fill in a job application form from a resume, with a human feedback loop.

    The steps are wired together by the event types they accept and return:

    1. ``set_up``: validate the inputs, build the LLM and the resume query engine.
    2. ``parse_form``: extract the list of form fields that need filling in.
    3. ``generate_questions``: emit one ``QueryEvent`` per field.
    4. ``ask_question``: answer each query against the resume index.
    5. ``fill_in_application``: merge the answers and ask a human for feedback.
    6. ``get_feedback``: stop, or loop back to step 3 carrying the feedback.
    """

    # directory in which the resume index is persisted between runs
    storage_dir: str = "./storage"
    # both attributes are assigned in set_up
    llm: OpenAI
    query_engine: BaseQueryEngine

    @step
    async def set_up(self, ctx: Context, ev: StartEvent) -> ParseFormEvent:
        """Validate the inputs and build the LLM and the resume query engine.

        Raises:
            ValueError: if ``resume_file`` or ``application_form`` is missing
                or empty.
        """
        # Use getattr with a default so that a keyword that was never passed
        # to run() is reported through the ValueError below instead of
        # surfacing as an AttributeError from the event object.
        resume_file = getattr(ev, "resume_file", None)
        application_form = getattr(ev, "application_form", None)

        if not resume_file:
            raise ValueError("No resume file provided")

        if not application_form:
            raise ValueError("No application form provided")

        # define the LLM to work with
        self.llm = OpenAI(
            model="gpt-4.1-nano",
            api_key=openai_api_key,
            api_base=os.environ["OPENAI_API_BASE"],
            temperature=0.5,
        )

        # the same embedding model must be used for loading and for building
        # the index, so configure it once
        embed_model = OpenAIEmbedding(
            model_name="Cohere-embed-v3-english",
            api_key=openai_api_key,
            api_base=os.environ["OPENAI_API_BASE"],
        )

        # ingest the data and set up the query engine
        if os.path.exists(self.storage_dir):
            # the resume document has already been ingested: reload the index
            storage_context = StorageContext.from_defaults(persist_dir=self.storage_dir)
            index = load_index_from_storage(
                storage_context=storage_context,
                embed_model=embed_model,
            )
        else:
            # parse and load the resume document
            documents = LlamaParse(
                api_key=llama_cloud_api_key,
                result_type=ResultType.MD,
                user_prompt="This is a resume, gather related facts together and format it as bullet points with headers",
            ).load_data(resume_file)
            # embed and index the documents, then persist for the next run
            index = VectorStoreIndex.from_documents(
                documents=documents,
                embed_model=embed_model,
            )
            index.storage_context.persist(persist_dir=self.storage_dir)

        # create a query engine
        self.query_engine = index.as_query_engine(llm=self.llm, similarity_top_k=5)

        # pass the application form to a new step to parse it
        return ParseFormEvent(application_form=application_form)

    # form parsing
    @step
    async def parse_form(
        self, ctx: Context, ev: ParseFormEvent
    ) -> GenerateQuestionsEvent:
        """Parse the application form and store its fields in the context.

        The field list is saved under the ``"fields_to_fill"`` key.

        Raises:
            ValueError: if the parsed form yields no fields; without this
                check no query would be sent and the run would only end at
                the workflow timeout.
        """
        parser = LlamaParse(
            api_key=llama_cloud_api_key,
            result_type=ResultType.MD,
            user_prompt="This is a job application form."
            " Create a list of all the fields that need to be filled in."
            " Return a bulleted list of the fields ONLY.",
        )

        # get the LLM to convert the parsed form into JSON
        result = parser.load_data(ev.application_form)[0]
        raw_json = self.llm.complete(
            "This is a parsed form. Convert it into a JSON object containing only the list of fields to be filled in,"
            f" in the form {{ fields: [...] }}. <form>{result.text}</form>. Return JSON ONLY, no markdown."
        )

        # The model may still wrap its answer in a Markdown fence or add a
        # preamble despite the instructions; keep only the outermost {...}.
        # If no braces are found the text is parsed as-is, so json.loads
        # reports the problem exactly as before.
        payload = raw_json.text
        start, end = payload.find("{"), payload.rfind("}")
        if start != -1 and end > start:
            payload = payload[start : end + 1]
        fields = json.loads(payload)["fields"]

        if not fields:
            raise ValueError("No fields to fill in were found in the application form")

        await ctx.store.set("fields_to_fill", fields)

        return GenerateQuestionsEvent()

    # generate questions
    @step
    async def generate_questions(
        self, ctx: Context, ev: GenerateQuestionsEvent | FeedbackEvent
    ) -> QueryEvent:
        """Send one ``QueryEvent`` per form field.

        When triggered by a ``FeedbackEvent`` the human feedback is appended
        to every question. The events are dispatched with ``ctx.send_event``,
        so the step itself returns nothing.
        """
        # get the list of fields to fill in
        fields = await ctx.store.get("fields_to_fill")

        # Store the number of expected answers BEFORE dispatching any query,
        # so fill_in_application can never read a missing or stale count.
        await ctx.store.set("total_fields", len(fields))

        # The feedback text is identical for every field, so build it once.
        feedback_note = ""
        if isinstance(ev, FeedbackEvent):
            feedback_note = f"""
                    \nWe previously got feedback about how we answered the questions.
                    It might not be relevant to this particular field, but here it is:
                    <feedback>{ev.feedback}</feedback>
                """

        # generate one query for each of the fields, and fire them off
        for field in fields:
            question = f"How would you answer this question about the candidate? <field>{field}</field>"
            ctx.send_event(QueryEvent(field=field, query=question + feedback_note))

    @step
    async def ask_question(self, ctx: Context, ev: QueryEvent) -> ResponseEvent:
        """Answer a single field question using the resume query engine."""
        response = self.query_engine.query(
            f"This is a question about the specific resume we have in our database: {ev.query}"
        )
        return ResponseEvent(field=ev.field, response=response.response)

    # Get feedback from the human
    @step
    async def fill_in_application(
        self, ctx: Context, ev: ResponseEvent
    ) -> InputRequiredEvent | None:
        """Collect all answers, draft the filled form and request feedback.

        Returns ``None`` until every expected ``ResponseEvent`` has arrived.
        The drafted form is saved under the ``"filled_form"`` context key.
        """
        # get the total number of fields to wait for
        total_fields = await ctx.store.get("total_fields")

        responses = ctx.collect_events(ev, [ResponseEvent] * total_fields)
        if responses is None:
            return None  # still waiting for the remaining answers

        # we've got all the responses!
        response_list = "\n".join(
            f"Field: {r.field}\nResponse: {r.response}" for r in responses
        )

        result = self.llm.complete(
            f"""
            You are given a list of fields in an application form and responses to
            questions about those fields from a resume. Combine the two into a list of
            fields and succinct, factual answers to fill in those fields.

            <responses>
            {response_list}
            </responses>
        """
        )

        # save the result for later
        await ctx.store.set("filled_form", str(result))

        # Fire off the feedback request
        return InputRequiredEvent(
            prefix="How does this look? Give me any feedback you have on any of the answers.",
            result=result,
        )

    # Accept the feedback when a HumanResponseEvent fires
    @step
    async def get_feedback(
        self, ctx: Context, ev: HumanResponseEvent
    ) -> FeedbackEvent | StopEvent:
        """Ask the LLM whether the human feedback means "done" or "redo".

        Returns a ``StopEvent`` carrying the stored filled form when the
        verdict is OKAY, otherwise a ``FeedbackEvent`` with the raw feedback.
        """
        result = self.llm.complete(
            f"""
            You have received some human feedback on the form-filling task you've done.
            Does everything look good, or is there more work to be done?
            <feedback>
            {ev.response}
            </feedback>
            If everything is fine, respond with just the word 'OKAY'.
            If there's any other feedback, respond with just the word 'FEEDBACK'.
        """
        )

        verdict = result.text.strip()
        print(f"LLM says the verdict was {verdict}")

        # Tolerate different casing, quotes and trailing punctuation around
        # the keyword so that e.g. "'Okay'." still ends the loop.
        if verdict.strip("'\".!").upper() == "OKAY":
            return StopEvent(result=await ctx.store.get("filled_form"))

        return FeedbackEvent(feedback=ev.response)

### Getting voice feedback

Now, just for fun, you'll do one more thing: change the feedback from text feedback to actual words spoken out loud. To do this we'll use a different model from OpenAI called Whisper. LlamaIndex has a built-in way to transcribe audio files into text using Whisper.

Here's a function that takes a file and uses Whisper to return just the text:

In [6]:
import asyncio
from queue import Queue

import gradio as gr
from llama_index.readers.whisper import WhisperReader

In [7]:
def transcribe_speech(filepath: str | None) -> str:
    """Transcribe the audio file at *filepath* to text using Whisper.

    Args:
        filepath: Path of the recorded audio file, or ``None`` when nothing
            was recorded.

    Returns:
        The text of the first document produced by ``WhisperReader``.

    Raises:
        gr.Error: If *filepath* is ``None``.
    """
    if filepath is None:
        # A gr.Warning would only notify the user and then fall through to
        # the reader with filepath=None. Raising gr.Error aborts the callback
        # so nothing is transcribed or stored for an empty recording.
        raise gr.Error("No audio found, please retry.")

    reader = WhisperReader(
        model="whisper-1",
        api_key=openai_api_key,
    )

    documents = reader.load_data(filepath)

    return documents[0].text

In [8]:
def store_transcription(output: str) -> str:
    """Remember *output* as the latest transcription and hand it back.

    The text is written to the module-level ``transcription_value`` name so
    that code outside the callback can read it afterwards.
    """
    global transcription_value

    transcription_value = output
    return transcription_value

In [9]:
# Gradio interface for a single microphone recording. The recording reaches
# the callback as a file path (type="filepath"); the callback transcribes it,
# stores the text via store_transcription, and the returned text is shown in
# the "Transcription" textbox.
mic_transcribe = gr.Interface(
    fn=lambda x: store_transcription(transcribe_speech(x)),
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.Textbox(label="Transcription"),
)

In Gradio, you further define a visual interface containing this microphone input and output, and then launch it:

In [10]:
# Wrap the microphone interface in a one-tab Blocks app so it can be launched
# and closed as a unit.
test_interface = gr.Blocks()

with test_interface:
    gr.TabbedInterface(
        [mic_transcribe],
        ["Transcribe Microphone"]
    )

# Serve locally on port 8000. prevent_thread_lock=True is passed so that
# launch() returns and later cells can run (presumably while the UI stays
# up; confirm against the Gradio docs).
test_interface.launch(
    share=False, 
    server_port=8000, 
    prevent_thread_lock=True
)

* Running on local URL:  http://127.0.0.1:8000
* To create a public link, set `share=True` in `launch()`.




In [None]:
# Show the last stored transcription. In the code shown here the global is
# only created inside store_transcription, so this raises NameError if
# nothing has been recorded and transcribed yet.
print(transcription_value)

In [12]:
# Shut the test UI down; the TranscriptionHandler below launches its own
# interface on the same port (8000).
test_interface.close()

Closing server running on port: 8000


In [None]:
# New! Transcription handler.
class TranscriptionHandler:
    """Collect one spoken answer through a temporary Gradio microphone UI.

    Transcriptions produced by the UI callback are placed on a queue;
    ``get_transcription`` launches the UI, waits for the first queued
    transcription, closes the UI and returns the text.
    """

    def __init__(self, server_port: int = 8000, poll_interval: float = 1.5):
        """Create the handler.

        Args:
            server_port: Local port the Gradio UI is served on.
            poll_interval: Seconds to sleep between checks of the queue.
        """
        # queue holding transcription values produced by the UI callback
        self.transcription_queue = Queue()
        self.interface = None
        self.server_port = server_port
        self.poll_interval = poll_interval

    def store_transcription(self, output):
        """Put *output* on the queue and return it unchanged."""
        self.transcription_queue.put(output)
        return output

    def create_interface(self):
        """Build the microphone UI, remember it on ``self.interface``, return it.

        This is the same interface and transcription logic as the standalone
        version, except that the result is stored in the queue instead of a
        global.
        """
        mic_transcribe = gr.Interface(
            fn=lambda x: self.store_transcription(transcribe_speech(x)),
            inputs=gr.Audio(sources="microphone", type="filepath"),
            outputs=gr.Textbox(label="Transcription"),
        )
        self.interface = gr.Blocks()
        with self.interface:
            gr.TabbedInterface([mic_transcribe], ["Transcribe Microphone"])
        return self.interface

    async def get_transcription(self):
        """Launch the UI, wait for one transcription, close the UI, return it."""
        # create_interface() already stores the interface on self
        interface = self.create_interface()
        interface.launch(
            share=False,
            server_port=self.server_port,
            prevent_thread_lock=True,
        )

        try:
            # poll until the UI callback has put something in the queue
            while self.transcription_queue.empty():
                await asyncio.sleep(self.poll_interval)
            return self.transcription_queue.get()
        finally:
            # Close the server on every exit path, including cancellation or
            # an error while waiting, so the port is free for the next launch.
            interface.close()

In [None]:
# Build the workflow with a 600-second timeout.
workflow = RAGWorkflow(timeout=600, verbose=False)

# Start the run; the keyword arguments are read from the StartEvent in set_up.
handler = workflow.run(
    resume_file="./data/fake_resume.pdf",
    application_form="./data/fake_application_form.pdf",
)

# Watch the event stream; each InputRequiredEvent is a request for human
# feedback, which is answered here by voice.
async for event in handler.stream_events():
    if isinstance(event, InputRequiredEvent):
        # Get transcription (a fresh handler, and so a fresh UI, per request)
        transcription_handler = TranscriptionHandler()
        response = await transcription_handler.get_transcription()

        # feed the spoken feedback back into the workflow (get_feedback step)
        handler.ctx.send_event(HumanResponseEvent(response=response))

# wait for the workflow's final result
response = await handler

print("Agent complete! Here's your final result:")
print(str(response))

### Resources

To learn more about agentic document workflows, you can check out this [article](https://www.llamaindex.ai/blog/introducing-agentic-document-workflows) and these [example implementations](https://github.com/run-llama/llamacloud-demo/tree/main/examples/document_workflows).