In [2]:
import logging
import sys
import llama_index.core
import nest_asyncio

nest_asyncio.apply()
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
llama_index.core.set_global_handler("simple")

from llama_index.core.callbacks import (
    CallbackManager, TokenCountingHandler,
    LlamaDebugHandler, CBEventType, CBEvent
)
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.prompts import PromptTemplate
from llama_index.core import (
    Settings,
    VectorStoreIndex, 
    SummaryIndex,
    StorageContext,
    QueryBundle
)
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexAutoRetriever
from llama_index.core import get_response_synthesizer
from llama_index.core.vector_stores.types import MetadataInfo, VectorStoreInfo
from llama_index.core.tools import QueryEngineTool, ToolMetadata
import tiktoken
import index_utils
import chromadb
import json

In [3]:
# Load models and setup callback handler
Settings.llm = OpenAI(model="gpt-4o-mini")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

# callback setup
token_counter = TokenCountingHandler(
    tokenizer=tiktoken.encoding_for_model("gpt-4o-mini").encode,
    verbose=True
)
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
Settings.callback_manager = CallbackManager([token_counter, llama_debug])

In [4]:
chroma_client = chromadb.PersistentClient(path="./chroma_db")

INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [5]:
chroma_client.list_collections()

[Collection(id=1a8bc10a-6620-41d3-97b4-3eebff9b6404, name=in_10_minutes_this_room_will_explode_ts5_scenes_description),
 Collection(id=358cc788-b408-41a2-84aa-0330aa076966, name=in_10_minutes_this_room_will_explode_transcript_spc_2),
 Collection(id=e4cce165-5ee1-4c94-b8b9-95d55ddc74a7, name=in_10_minutes_this_room_will_explode_transcript_spc_1)]

## Indexing

In [6]:
transcript_index = index_utils.get_transcipt_index(
    transcript_path="./data/audio_transcripts/in_10_minutes_this_room_will_explode.json",
    segs_per_chunk=1
)

reindexing False
Loading index from vector store
**********
Trace: index_construction
**********


In [7]:
scene_index = index_utils.get_scene_index(
    video_descriptions_path="./data/desciptions/in_10_minutes_this_room_will_explode_ts5.json"
)

Creating index from vector store
**********
Trace: index_construction
**********


### Auto retriever

In [24]:
vector_store_info = VectorStoreInfo(
    content_info="Video entertainment content",
    metadata_info=[
        MetadataInfo(
            name="start",
            type="float",
            description=(
                "Start time of a shot in seconds"
            ),
        ),
        MetadataInfo(
            name="end",
            type="float",
            description=(
                "End time of a shot in seconds"
            ),
        ),
    ],
)

In [26]:
scene_retriever = VectorIndexAutoRetriever(
    scene_index, 
    vector_store_info=vector_store_info,
    similarity_top_k=5
)

In [40]:
%%capture
query = "What happened in the first 3 minutes of the video?"
query_bundle = QueryBundle(query)
retrieved_nodes = scene_retriever.retrieve(query_bundle)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:llama_index.core.indices.vector_store.retrievers.auto_retriever.auto_retriever:Using query str: What happened in the video
Using query str: What happened in the video
INFO:llama_index.core.indices.vector_store.retrievers.auto_retriever.auto_retriever:Using filters: [('start', '>=', 0), ('end', '<=', 180)]
Using filters: [('start', '>=', 0), ('end', '<=', 180)]
INFO:llama_index.core.indices.vector_store.retrievers.auto_retriever.auto_retriever:Using top_k: 5
Using top_k: 5
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [42]:
print("Question:", query)
print("Retrieved nodes:")
for idx, node in enumerate(retrieved_nodes):
    # print(node.text)
    print(json.dumps(node.metadata, indent=2))
    print("=="*40)

Question: What happened in the first 3 minutes of the video?
Retrieved nodes:
{
  "start": 74,
  "end": 75,
  "content type": "scene description"
}
{
  "start": 34,
  "end": 35,
  "content type": "scene description"
}
{
  "start": 129,
  "end": 130,
  "content type": "scene description"
}
{
  "start": 104,
  "end": 105,
  "content type": "scene description"
}
{
  "start": 164,
  "end": 165,
  "content type": "scene description"
}


In [47]:
# define response synthesizer
response_synthesizer = get_response_synthesizer(response_mode="tree_summarize")

# assemble query engine
scene_summary_engine = RetrieverQueryEngine(
    retriever=scene_retriever,
    response_synthesizer=response_synthesizer,
)

In [48]:
scene_summary_engine.query(query)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
LLM Prompt Token Usage: 1287
LLM Completion Token Usage: 59
** Messages: **
user: Your goal is to structure the user's query to match the request schema provided below.

<< Structured Request Schema >>
When responding use a markdown code snippet with a JSON object formatted in the following schema:

{
    "title": "VectorStoreQuerySpec",
    "description": "Schema for a structured request for vector store\n(i.e. to be converted to a VectorStoreQuery).\n\nCurrently only used by VectorIndexAutoRetriever.",
    "type": "object",
    "properties": {
        "query": {
            "title": "Query",
            "type": "string"
        },
        "filters": {
            "title": "Filters",
            "type": "array",
            "items": {
                "$ref": "#/definitions/MetadataFilter"
            }
        },
   

Response(response='In the initial moments of the video, two young men are seen on a metal structure, likely part of a game or challenge, displaying excitement and engagement. One is in a white hoodie and the other in a blue sweatshirt, with a digital timer showing a countdown. The atmosphere is energetic and competitive. Shortly after, the scene shifts to a high platform where the same individuals are animatedly participating, with a large digital countdown timer indicating urgency. A camera operator captures the action, contributing to the lively environment. The focus then transitions to a modern studio where a man in a brown leather jacket observes a contestant suspended from a harness, reaching for a target, with a timer displaying the remaining challenge time. The tension and competitive spirit are palpable throughout these scenes.', source_nodes=[NodeWithScore(node=TextNode(id_='39e6cfe1-4afe-4af6-8226-2af6235ec8ee', embedding=None, metadata={'start': 74, 'end': 75, 'content type

## Basic query engines

### Transcript query

In [14]:
transcript_query_engine = transcript_index.as_query_engine(
    similarity_top_k=5, 
    use_async=True
)

In [7]:
test_query = '''what is the prize'''
response = transcript_query_engine.query(test_query)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
Embedding Token Usage: 4
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
LLM Prompt Token Usage: 321
LLM Completion Token Usage: 8
** Messages: **
system: You are an expert Q&A system that is trusted around the world.
Always answer the query using the provided context information, and not prior knowledge.
Some rules to follow:
1. Never directly reference the given context in your answer.
2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines.
user: Context information is below.
---------------------
start: 188.66000366210938
end: 194.66000366210938

 You're gonna have to hurry. Under seven minutes remain, but for your $250,000, you're gonna have to 

### Video query

In [15]:
scene_query_engine = scene_index.as_query_engine(
    similarity_top_k=5, 
    use_async=True
)

In [10]:
query = "Tell me about the winning scene"
response = scene_query_engine.query(query)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
Embedding Token Usage: 6
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
LLM Prompt Token Usage: 556
LLM Completion Token Usage: 68
** Messages: **
system: You are an expert Q&A system that is trusted around the world.
Always answer the query using the provided context information, and not prior knowledge.
Some rules to follow:
1. Never directly reference the given context in your answer.
2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines.
user: Context information is below.
---------------------
start: 526
end: 527
file_path: data/scenes/in_10_minutes_this_room_will_explode_ts5/frame_15794.jpg

The scene captures a tense moment in a game show s

### Basic tools

In [17]:
transcipt_tool = QueryEngineTool(
    query_engine=transcript_query_engine,
    metadata=ToolMetadata(
        name=f"transcript_tool",
        description=(
            '''answer questions related to audio transcripts, conversations, spoken content'''
        )
    )
)

In [18]:
scene_tool = QueryEngineTool(
    query_engine=transcipt_tool,
    metadata=ToolMetadata(
        name=f"scene_tool",
        description=(
            '''answer questions related to scene description content'''
        )
    )
)

## Advanced query engine

In [8]:
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.indices.query.query_transform import HyDEQueryTransform
from llama_index.core.query_engine import TransformQueryEngine

In [9]:
# reranker
rerank_postprocessor = SentenceTransformerRerank(
    model='models/mxbai-rerank-xsmall-v1',
    top_n=5, # number of nodes after re-ranking,
    keep_retrieval_score=False,
    device="cuda:1"
)

  from tqdm.autonotebook import tqdm, trange


INFO:datasets:PyTorch version 2.3.0+cu118 available.
PyTorch version 2.3.0+cu118 available.


In [None]:
# query transform
# hyde = HyDEQueryTransform(include_original=True)

In [17]:
advanced_transcript_query_engine = transcript_index.as_query_engine(
    node_postprocessors=[rerank_postprocessor],
    similarity_top_k=10, # semantic search nodes
    use_async=True
)
# advanced_transcript_query_engine = TransformQueryEngine(
#     transcript_query_engine_rerank, 
# )


advanced_scene_query_engine = scene_index.as_query_engine(
    node_postprocessors=[rerank_postprocessor],
    similarity_top_k=10, # semantic search nodes
    use_async=True
)

summary_query_engine = scene_index.as_query_engine(
    response_mode="tree_summarize",
    node_postprocessors=[rerank_postprocessor],
    similarity_top_k=10, # semantic search nodes
    use_async=True
)

# advanced_scene_query_engine = TransformQueryEngine(scene_query_engine_rerank, hyde)

In [18]:
advanced_transcipt_tool = QueryEngineTool(
    query_engine=advanced_transcript_query_engine,
    metadata=ToolMetadata(
        name=f"transcript_tool",
        description=(
            '''answer questions related to audio transcripts, conversations, spoken content'''
        )
    )
)
advanced_scene_tool = QueryEngineTool(
    query_engine=advanced_scene_query_engine,
    metadata=ToolMetadata(
        name=f"scene_tool",
        description=(
            '''answer questions related to scene description content'''
        )
    )
)

summary_tools = QueryEngineTool(
    query_engine=summary_query_engine,
    metadata=ToolMetadata(
        name=f"summary_tool",
        description=(
            '''summary scenes content, what happened in video'''
        )
    )
)

### Test query

In [19]:
query_1 = "what is the prize"
query_2 = "Describe the winning scene"
query_3 = "What happened in the first 3 minutes of the video?"

In [14]:
response = advanced_transcript_query_engine.query(query_1)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
Embedding Token Usage: 4


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.03it/s]


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
LLM Prompt Token Usage: 355
LLM Completion Token Usage: 8
** Messages: **
system: You are an expert Q&A system that is trusted around the world.
Always answer the query using the provided context information, and not prior knowledge.
Some rules to follow:
1. Never directly reference the given context in your answer.
2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines.
user: Context information is below.
---------------------
start: 389.260009765625
end: 392.260009765625
content type: spoken content

 I actually don't help him. Well, let's see if he's smart enough to win the money.

start: 584.3800048828125
end: 586.3800048828125
content type: spoken content

 And after he won with 19 seconds left,

start: 188.66000366210938
end: 194.66000366210938
content

In [16]:
response.response

'The prize is $250,000.'

In [28]:
response = advanced_scene_query_engine.query(query_2)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Batches: 100%|██████████| 1/1 [00:00<00:00, 26.12it/s]


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
** Messages: **
system: You are an expert Q&A system that is trusted around the world.
Always answer the query using the provided context information, and not prior knowledge.
Some rules to follow:
1. Never directly reference the given context in your answer.
2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines.
user: Context information is below.
---------------------
start: 581
end: 582
content type: scene description
retrieval_score: 0.7426275197976455

The scene captures a high-energy moment in a competitive game setting. In the foreground, a contestant in a blue outfit is poised to hit a large, red button, displaying intense focus and determination. Two other participants flank him, one on the left wearing a gray shirt and the other on the right in a 

In [20]:
response = summary_query_engine.query(query_3)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
Embedding Token Usage: 12


Batches: 100%|██████████| 1/1 [00:00<00:00, 28.48it/s]


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
LLM Prompt Token Usage: 737
LLM Completion Token Usage: 135
** Messages: **
system: You are an expert Q&A system that is trusted around the world.
Always answer the query using the provided context information, and not prior knowledge.
Some rules to follow:
1. Never directly reference the given context in your answer.
2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines.
user: Context information from multiple sources is below.
---------------------
start: 531
end: 532
content type: scene description

The scene captures a moment from a competitive game show or challenge. A young man with short hair is seen in profile, focused and determined, as he stands on a stage with a dark background. A digital timer in the upper left corner shows 01:08, indicating the

## Agents

### React Agent

In [27]:
from llama_index.core.agent import AgentRunner, ReActAgentWorker, ReActAgent

In [24]:
agent_context = '''Your are video analysis expert, you can answer question related to poken content or visual scene descriptions.\
Focus on accurately understanding the context, intent, and nuances of the spoken content and scene description content. \
If a query asks for details not mentioned in the content, indicate that the information is not available.'''

In [44]:
react_agent = ReActAgent.from_tools(
    [transcipt_tool, scene_tool],
    context=agent_context,
    verbose=True
)

ReActChatFormatter.from_context is deprecated, please use `from_defaults` instead.


In [45]:
response = react_agent.chat("What is the prize?")

> Running step 261b4e5e-d31e-4381-ae4b-c0dddf810916. Step input: What is the prize?
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
LLM Prompt Token Usage: 598
LLM Completion Token Usage: 43
** Messages: **
system: You are designed to help with a variety of tasks, from answering questions to providing summaries to other types of analyses.

## Tools

You have access to a wide variety of tools. You are responsible for using the tools in any sequence you deem appropriate to complete the task at hand.
This may require breaking the task into subtasks and using different tools to complete each subtask.

You have access to the following tools:
> Tool Name: transcript_tool
Tool Description: answer questions related to audio transcripts, conversations, spoken content
Tool Args: {"type": "object", "properties": {"input": {"title": "Input", "type": "string"}}, "required": ["i

In [46]:
response = react_agent.chat("")

> Running step b4aef6c7-0cd0-4080-8dee-e47c79a7f2f5. Step input: What timestamp the prize is mentioned?
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
LLM Prompt Token Usage: 621
LLM Completion Token Usage: 45
** Messages: **
system: You are designed to help with a variety of tasks, from answering questions to providing summaries to other types of analyses.

## Tools

You have access to a wide variety of tools. You are responsible for using the tools in any sequence you deem appropriate to complete the task at hand.
This may require breaking the task into subtasks and using different tools to complete each subtask.

You have access to the following tools:
> Tool Name: transcript_tool
Tool Description: answer questions related to audio transcripts, conversations, spoken content
Tool Args: {"type": "object", "properties": {"input": {"title": "Input", "type": "string

In [48]:
response = react_agent.chat("what happended from first 5 minutes")

> Running step 932d4c27-465c-4a22-a33b-afe36c448b16. Step input: what happended from first 5 minutes
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
LLM Prompt Token Usage: 749
LLM Completion Token Usage: 50
** Messages: **
system: You are designed to help with a variety of tasks, from answering questions to providing summaries to other types of analyses.

## Tools

You have access to a wide variety of tools. You are responsible for using the tools in any sequence you deem appropriate to complete the task at hand.
This may require breaking the task into subtasks and using different tools to complete each subtask.

You have access to the following tools:
> Tool Name: transcript_tool
Tool Description: answer questions related to audio transcripts, conversations, spoken content
Tool Args: {"type": "object", "properties": {"input": {"title": "Input", "type": "string"}}

In [52]:
response = react_agent.query("Did he win the prize?")

> Running step 87867bad-292a-4c80-956c-304f7211e604. Step input: Did he win the prize?
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
LLM Prompt Token Usage: 599
LLM Completion Token Usage: 51
** Messages: **
system: You are designed to help with a variety of tasks, from answering questions to providing summaries to other types of analyses.

## Tools

You have access to a wide variety of tools. You are responsible for using the tools in any sequence you deem appropriate to complete the task at hand.
This may require breaking the task into subtasks and using different tools to complete each subtask.

You have access to the following tools:
> Tool Name: transcript_tool
Tool Description: answer questions related to audio transcripts, conversations, spoken content
Tool Args: {"type": "object", "properties": {"input": {"title": "Input", "type": "string"}}, "required": 

In [53]:
response = react_agent.query("Tell me about the winning scene")

> Running step 2ae0af50-99bb-4b4e-a908-47891f51ac33. Step input: Tell me about the winning scene
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
LLM Prompt Token Usage: 599
LLM Completion Token Usage: 40
** Messages: **
system: You are designed to help with a variety of tasks, from answering questions to providing summaries to other types of analyses.

## Tools

You have access to a wide variety of tools. You are responsible for using the tools in any sequence you deem appropriate to complete the task at hand.
This may require breaking the task into subtasks and using different tools to complete each subtask.

You have access to the following tools:
> Tool Name: transcript_tool
Tool Description: answer questions related to audio transcripts, conversations, spoken content
Tool Args: {"type": "object", "properties": {"input": {"title": "Input", "type": "string"}}, "r

### OpenAI agent

In [61]:
from llama_index.agent.openai import OpenAIAgentWorker, OpenAIAgent
from llama_index.core.tools import FunctionTool, QueryEngineTool

In [42]:
openai_agent = OpenAIAgent.from_tools(
    [transcipt_tool, scene_tool],
    system_prompt=agent_context,
    verbose=True
)

In [43]:
openai_agent.query("What is the prize?")

Added user message to memory: What is the prize?
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
LLM Prompt Token Usage: 141
LLM Completion Token Usage: 22
** Messages: **
system: Your are video analysis expert, you can answer question related to poken content or visual scene descriptions.Focus on accurately understanding the context, intent, and nuances of the spoken content and scene description content. If a query asks for details not mentioned in the content, indicate that the information is not available.
user: What is the prize?
**************************************************
** Response: **
assistant: The information about the prize is not available. Please provide more context or details for a more specific inquiry.
**************************************************


**********
Trace: query
    |_agent_step -> 0.91547 seconds
      |_llm -> 0.912122 sec

Response(response='The information about the prize is not available. Please provide more context or details for a more specific inquiry.', source_nodes=[], metadata=None)

### Agent runner

In [33]:
from llama_index.core.agent import FunctionCallingAgentWorker
from llama_index.core.agent import AgentRunner

In [37]:
agent_worker = FunctionCallingAgentWorker.from_tools(
    [transcipt_tool, scene_tool],
    system_prompt=agent_context,
    verbose=True
)

In [38]:
agent = AgentRunner(agent_worker, verbose=True)

In [39]:
agent.query("What is the prize?")

> Running step b40b17e1-1e0d-40aa-9483-a771c969393a. Step input: What is the prize?
Added user message to memory: What is the prize?
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
LLM Prompt Token Usage: 141
LLM Completion Token Usage: 21
** Messages: **
system: Your are video analysis expert, you can answer question related to poken content or visual scene descriptions.Focus on accurately understanding the context, intent, and nuances of the spoken content and scene description content. If a query asks for details not mentioned in the content, indicate that the information is not available.
user: What is the prize?
**************************************************
** Response: **
assistant: The information about the prize is not available. Please provide more context or details for a specific inquiry.
**************************************************


=== LLM 

Response(response='The information about the prize is not available. Please provide more context or details for a specific inquiry.', source_nodes=[], metadata=None)