# Use your voice

**objective**: Get voice feedback 

So far we've set up a moderately complex workflow with a human feedback loop. Let's run it through the visualizer to see what it looks like.

In [1]:
# Warning control
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os, json
from llama_parse import LlamaParse
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage
)
from llama_index.core.workflow import (
    StartEvent,
    StopEvent,
    Workflow,
    step,
    Event,
    Context,
    InputRequiredEvent,
    HumanResponseEvent
)
from llama_index.utils.workflow import draw_all_possible_flows
import gradio as gr
import asyncio
from queue import Queue
import vosk
import wave
import json as json_lib

import nest_asyncio
nest_asyncio.apply()

from helper import get_gemini_api_key, get_llama_cloud_api_key

llama_cloud_api_key = get_llama_cloud_api_key()
gemini_api_key = get_gemini_api_key()

In [3]:
class ParseFormEvent(Event):
    application_form: str

class QueryEvent(Event):
    query: str

class ResponseEvent(Event):
    response: str

class FeedbackEvent(Event):
    feedback: str

class GenerateQuestionsEvent(Event):
    pass

class RAGWorkflow(Workflow):
    storage_dir = "./storage"
    llm: Gemini
    query_engine: VectorStoreIndex

    @step
    async def set_up(self, ctx: Context, ev: StartEvent) -> ParseFormEvent:

        if not ev.resume_file:
            raise ValueError("No resume file provided")

        if not ev.application_form:
            raise ValueError("No application form provided")

        # give ourselves an LLM to work with
        self.llm = Gemini(model="models/gemma-3-27b-it", api_key=gemini_api_key)

        # ingest our data and set up the query engine
        if os.path.exists(self.storage_dir):
            # we've already ingested our documents
            storage_context = StorageContext.from_defaults(persist_dir=self.storage_dir)
            index = load_index_from_storage(storage_context)
        else:
            # we need to parse and load our documents
            documents = LlamaParse(
                api_key=llama_cloud_api_key,
                base_url=os.getenv("LLAMA_CLOUD_BASE_URL"),
                result_type="markdown",
                content_guideline_instruction="This is a resume, gather related facts together and format it as bullet points with headers"
            ).load_data(ev.resume_file)
            # embed and index the documents
            index = VectorStoreIndex.from_documents(
                documents,
                embed_model=GeminiEmbedding(model_name="models/text-embedding-004", api_key=gemini_api_key)
            )
            index.storage_context.persist(persist_dir=self.storage_dir)

        # either way, create a query engine
        self.query_engine = index.as_query_engine(llm=self.llm, similarity_top_k=5)

        # let's pass our application form to a new step where we parse it
        return ParseFormEvent(application_form=ev.application_form)

    # we've separated the form parsing from the question generation
    @step
    async def parse_form(self, ctx: Context, ev: ParseFormEvent) -> GenerateQuestionsEvent:
        parser = LlamaParse(
            api_key=llama_cloud_api_key,
            base_url=os.getenv("LLAMA_CLOUD_BASE_URL"),
            result_type="markdown",
            content_guideline_instruction="This is a job application form. Create a list of all the fields that need to be filled in.",
            formatting_instruction="Return a bulleted list of the fields ONLY."
        )

        # get the LLM to convert the parsed form into JSON
        result = parser.load_data(ev.application_form)[0]
        raw_json = self.llm.complete(
            f"This is a parsed form. Convert it into a JSON object containing only the list of fields to be filled in, in the form {{ fields: [...] }}. <form>{result.text}</form>. Return JSON ONLY, no markdown.")
        
        # Try to parse JSON, with error handling
        try:
            # Clean the response by removing markdown code blocks if present
            json_text = raw_json.text.strip()
            if json_text.startswith("```json"):
                json_text = json_text[7:]  # Remove ```json
            if json_text.endswith("```"):
                json_text = json_text[:-3]  # Remove ```
            json_text = json_text.strip()
            
            fields = json.loads(json_text)["fields"]
        except json.JSONDecodeError as e:
            print(f"JSON decode error: {e}")
            print(f"Raw response: '{raw_json.text}'")
            # Fallback: extract fields manually or use a default list
            fields = [
                "First Name", "Last Name", "Email", "Phone", "Linkedin", 
                "Project Portfolio", "Degree", "Graduation Date", 
                "Current Job Title", "Current Employer", "Technical Skills",
                "Describe why you're a good fit for this position",
                "Do you have 5 years of experience in React?"
            ]
            print(f"Using fallback fields: {fields}")

        await ctx.set("fields_to_fill", fields)

        return GenerateQuestionsEvent()

    # this step can get triggered either by GenerateQuestionsEvent or a FeedbackEvent
    @step
    async def generate_questions(self, ctx: Context, ev: GenerateQuestionsEvent | FeedbackEvent) -> QueryEvent:

        # get the list of fields to fill in
        fields = await ctx.get("fields_to_fill")

        # generate one query for each of the fields, and fire them off
        for field in fields:
            question = f"How would you answer this question about the candidate? <field>{field}</field>"

            if hasattr(ev,"feedback"):
                question += f"""
                    \nWe previously got feedback about how we answered the questions.
                    It might not be relevant to this particular field, but here it is:
                    <feedback>{ev.feedback}</feedback>
                """

            ctx.send_event(QueryEvent(
                field=field,
                query=question
            ))

        # store the number of fields so we know how many to wait for later
        await ctx.set("total_fields", len(fields))
        return

    @step
    async def ask_question(self, ctx: Context, ev: QueryEvent) -> ResponseEvent:
        print(f"Asking question: {ev.query}")

        response = self.query_engine.query(f"This is a question about the specific resume we have in our database: {ev.query}")

        print(f"Answer was: {str(response)}")

        return ResponseEvent(field=ev.field, response=response.response)

    # we now emit an InputRequiredEvent
    @step
    async def fill_in_application(self, ctx: Context, ev: ResponseEvent) -> InputRequiredEvent:
        # get the total number of fields to wait for
        total_fields = await ctx.get("total_fields")

        responses = ctx.collect_events(ev, [ResponseEvent] * total_fields)
        if responses is None:
            return None # do nothing if there's nothing to do yet

        # we've got all the responses!
        responseList = "\n".join("Field: " + r.field + "\n" + "Response: " + r.response for r in responses)

        result = self.llm.complete(f"""
            You are given a list of fields in an application form and responses to
            questions about those fields from a resume. Combine the two into a list of
            fields and succinct, factual answers to fill in those fields.

            <responses>
            {responseList}
            </responses>
        """)

        # save the result for later
        await ctx.set("filled_form", str(result))

        # Let's get a human in the loop
        return InputRequiredEvent(
            prefix="How does this look? Give me any feedback you have on any of the answers.",
            result=result
        )

    # Accept the feedback.
    @step
    async def get_feedback(self, ctx: Context, ev: HumanResponseEvent) -> FeedbackEvent | StopEvent:

        result = self.llm.complete(f"""
            You have received some human feedback on the form-filling task you've done.
            Does everything look good, or is there more work to be done?
            <feedback>
            {ev.response}
            </feedback>
            If everything is fine, respond with just the word 'OKAY'.
            If there's any other feedback, respond with just the word 'FEEDBACK'.
        """)

        verdict = result.text.strip()

        print(f"LLM says the verdict was {verdict}")
        if (verdict == "OKAY"):
            return StopEvent(result=await ctx.get("filled_form"))
        else:
            return FeedbackEvent(feedback=ev.response)


In [4]:
WORKFLOW_FILE = "workflows/lesson_6.html"
draw_all_possible_flows(RAGWorkflow, filename=WORKFLOW_FILE)

workflows/lesson_6.html


In [5]:
from IPython.display import display, HTML, DisplayHandle
from helper import extract_html_content

html_content = extract_html_content(WORKFLOW_FILE)
display(HTML(html_content), metadata=dict(isolated=True))

Cool! You can see the path all the way to the end and the feedback loop is clear.

## Getting voice feedback

Now, just for fun, you'll do one more thing: change the feedback from text feedback to actual words spoken out loud. To do this we'll use VOSK, a free offline speech recognition toolkit that doesn't require any API keys.

Here's a function that takes a file and uses VOSK to return just the text:

In [6]:
def transcribe_speech(filepath):
    if filepath is None:
        gr.Warning("No audio found, please retry.")
        return ""
    
    try:
        # Download and set up VOSK model if not exists
        import urllib.request
        import zipfile
        
        model_path = "vosk-model-small-en-us-0.15"
        if not os.path.exists(model_path):
            print("Downloading VOSK model...")
            url = "https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip"
            urllib.request.urlretrieve(url, "vosk-model.zip")
            with zipfile.ZipFile("vosk-model.zip", 'r') as zip_ref:
                zip_ref.extractall(".")
            os.remove("vosk-model.zip")
            print("VOSK model downloaded and extracted!")
        
        # Initialize VOSK model
        model = vosk.Model(model_path)
        rec = vosk.KaldiRecognizer(model, 16000)
        
        # Open audio file
        wf = wave.open(filepath, 'rb')
        
        # Check if audio format is supported
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
            print("Audio file must be WAV format mono PCM.")
            return ""
        
        results = []
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                result = json_lib.loads(rec.Result())
                if 'text' in result:
                    results.append(result['text'])
        
        # Get final result
        final_result = json_lib.loads(rec.FinalResult())
        if 'text' in final_result:
            results.append(final_result['text'])
        
        wf.close()
        
        # Combine all results
        transcription = ' '.join(results).strip()
        return transcription if transcription else "No speech detected"
        
    except Exception as e:
        print(f"Error in transcription: {e}")
        return f"Transcription error: {str(e)}"

But before we can use it, you need to capture some audio from your microphone. That involves some extra steps!

First, create a callback function that saves data to a global variable.

In [7]:
def store_transcription(output):
    global transcription_value
    transcription_value = output
    return output

Now use Gradio, which has special widgets that can render inside a notebook, to create an interface for capturing audio from a microphone. When the audio is captured, it calls `transcribe_speech` on the recorded data, and calls `store_transcription` on that.

In [8]:
mic_transcribe = gr.Interface(
    fn=lambda x: store_transcription(transcribe_speech(x)),
    inputs=gr.Audio(sources="microphone",
                    type="filepath"),
    outputs=gr.Textbox(label="Transcription"))

In Gradio, you further define a visual interface containing this microphone input and output, and then launch it:

In [9]:
test_interface = gr.Blocks()
with test_interface:
    gr.TabbedInterface(
        [mic_transcribe],
        ["Transcribe Microphone"]
    )

test_interface.launch(
    share=False, 
    server_port=8000, 
    prevent_thread_lock=True
)

* Running on local URL:  http://127.0.0.1:8000
* To create a public link, set `share=True` in `launch()`.




You can now print out the transcription, which is stored in that global variable you created earlier:

In [11]:
print(transcription_value)




You're going to want to run Gradio again, so it's a good idea to shut down the Gradio interface you were using. 

In [12]:
test_interface.close()

Closing server running on port: 8000


Now you're going to create an entirely new class, a Transcription Handler. 

In [13]:
# New! Transcription handler.
class TranscriptionHandler:

    # we create a queue to hold transcription values
    def __init__(self):
        self.transcription_queue = Queue()
        self.interface = None

    # every time we record something we put it in the queue
    def store_transcription(self, output):
        self.transcription_queue.put(output)
        return output

    # This is the same interface and transcription logic as before
    # except it stores the result in a queue instead of a global
    def create_interface(self):
        mic_transcribe = gr.Interface(
            fn=lambda x: self.store_transcription(transcribe_speech(x)),
            inputs=gr.Audio(sources="microphone", type="filepath"),
            outputs=gr.Textbox(label="Transcription")
        )
        self.interface = gr.Blocks()
        with self.interface:
            gr.TabbedInterface(
                [mic_transcribe],
                ["Transcribe Microphone"]
            )
        return self.interface

    # we launch the transcription interface
    async def get_transcription(self):
        self.interface = self.create_interface()
        self.interface.launch(
            share=False,
            server_port=8000, 
            prevent_thread_lock=True
        )

        # we poll every 1.5 seconds waiting for something to end up in the queue
        while True:
            if not self.transcription_queue.empty():
                result = self.transcription_queue.get()
                if self.interface is not None:
                    self.interface.close()
                return result
            await asyncio.sleep(1.5)


Now you have a transcription handler, you can use it instead of the keyboard input interface when you're getting human input when you run your workflow:

In [14]:
w = RAGWorkflow(timeout=600, verbose=False)

handler = w.run(
    resume_file="./data/fake_resume.pdf",
    application_form="./data/fake_application_form.pdf"
)

async for event in handler.stream_events():
  if isinstance(event, InputRequiredEvent):
      # Get transcription
      transcription_handler = TranscriptionHandler()
      response = await transcription_handler.get_transcription()

      handler.ctx.send_event(
          HumanResponseEvent(
              response=response
          )
      )

response = await handler
print("Agent complete! Here's your final result:")
print(str(response))

Started parsing the file under job_id b6c7d009-c4a5-471c-b2a7-2a910eeaa3bf
Started parsing the file under job_id fe5146b0-839f-4ae5-b4a0-c4f05c763d3f
Asking question: How would you answer this question about the candidate? <field>First Name</field>
Answer was: Sarah
Asking question: How would you answer this question about the candidate? <field>Last Name</field>
Answer was: Chen.
Asking question: How would you answer this question about the candidate? <field>Email</field>
Answer was: sarah.chen@email.com
Asking question: How would you answer this question about the candidate? <field>Phone</field>
Answer was: This information is not available in the provided text.
Asking question: How would you answer this question about the candidate? <field>LinkedIn</field>
Answer was: linkedin.com/in/sarahchen
Asking question: How would you answer this question about the candidate? <field>Project Portfolio</field>
Answer was: The candidate has two projects listed in their portfolio: EcoTrack, a full-

Downloading VOSK model...
VOSK model downloaded and extracted!
Closing server running on port: 8000
LLM says the verdict was FEEDBACK
Asking question: How would you answer this question about the candidate? <field>First Name</field>
                    
We previously got feedback about how we answered the questions.
                    It might not be relevant to this particular field, but here it is:
                    <feedback>No speech detected</feedback>
                
Answer was: Sarah
Asking question: How would you answer this question about the candidate? <field>Last Name</field>
                    
We previously got feedback about how we answered the questions.
                    It might not be relevant to this particular field, but here it is:
                    <feedback>No speech detected</feedback>
                
Answer was: Chen.
Asking question: How would you answer this question about the candidate? <field>Email</field>
                    
We previously got fe

Task was destroyed but it is pending!
task: <Task pending name='Task-165' coro=<_delete_state() running at c:\Users\Welcome\Desktop\ai agent\venv\lib\site-packages\gradio\route_utils.py:935> wait_for=<Future pending cb=[Task.__wakeup()]>>


* Running on local URL:  http://127.0.0.1:8000
* To create a public link, set `share=True` in `launch()`.


Closing server running on port: 8000
LLM says the verdict was FEEDBACK
Asking question: How would you answer this question about the candidate? <field>First Name</field>
                    
We previously got feedback about how we answered the questions.
                    It might not be relevant to this particular field, but here it is:
                    <feedback></feedback>
                
Answer was: Sarah
Asking question: How would you answer this question about the candidate? <field>Last Name</field>
                    
We previously got feedback about how we answered the questions.
                    It might not be relevant to this particular field, but here it is:
                    <feedback></feedback>
                
Answer was: Chen.
Asking question: How would you answer this question about the candidate? <field>Email</field>
                    
We previously got feedback about how we answered the questions.
                    It might not be relevant to this pa

Task was destroyed but it is pending!
task: <Task pending name='Task-196' coro=<_delete_state() running at c:\Users\Welcome\Desktop\ai agent\venv\lib\site-packages\gradio\route_utils.py:935> wait_for=<Future pending cb=[Task.__wakeup()]>>


* To create a public link, set `share=True` in `launch()`.


Closing server running on port: 8000
LLM says the verdict was FEEDBACK
Asking question: How would you answer this question about the candidate? <field>First Name</field>
                    
We previously got feedback about how we answered the questions.
                    It might not be relevant to this particular field, but here it is:
                    <feedback></feedback>
                
Answer was: Sarah
Asking question: How would you answer this question about the candidate? <field>Last Name</field>
                    
We previously got feedback about how we answered the questions.
                    It might not be relevant to this particular field, but here it is:
                    <feedback></feedback>
                
Answer was: Chen.
Asking question: How would you answer this question about the candidate? <field>Email</field>
                    
We previously got feedback about how we answered the questions.
                    It might not be relevant to this pa

Closing server running on port: 8000
LLM says the verdict was FEEDBACK
Asking question: How would you answer this question about the candidate? <field>First Name</field>
                    
We previously got feedback about how we answered the questions.
                    It might not be relevant to this particular field, but here it is:
                    <feedback>No speech detected</feedback>
                
Answer was: Sarah
Asking question: How would you answer this question about the candidate? <field>Last Name</field>
                    
We previously got feedback about how we answered the questions.
                    It might not be relevant to this particular field, but here it is:
                    <feedback>No speech detected</feedback>
                
Answer was: Chen.
Asking question: How would you answer this question about the candidate? <field>Email</field>
                    
We previously got feedback about how we answered the questions.
                   

Task was destroyed but it is pending!
task: <Task pending name='Task-221' coro=<_delete_state() running at c:\Users\Welcome\Desktop\ai agent\venv\lib\site-packages\gradio\route_utils.py:935> wait_for=<Future pending cb=[Task.__wakeup()]>>
Task was destroyed but it is pending!
task: <Task pending name='Task-239' coro=<_delete_state() running at c:\Users\Welcome\Desktop\ai agent\venv\lib\site-packages\gradio\route_utils.py:935> wait_for=<Future pending cb=[Task.__wakeup()]>>


* Running on local URL:  http://127.0.0.1:8000
* To create a public link, set `share=True` in `launch()`.


CancelledError: 

## Resources

To learn more about agentic document workflows, you check this [article](https://www.llamaindex.ai/blog/introducing-agentic-document-workflows) and theses [example implementations](https://github.com/run-llama/llamacloud-demo/tree/main/examples/document_workflows).