# Rag for audio and video data

In [None]:
%pip install --quiet langchain-text-splitters langchain-community langgraph 
%pip install -q "langchain[google-genai]"
%pip install -q langchain langchain-google-genai google-generativeai
%pip install -qU langchain-qdrant

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [29]:
# Langsmith api key setup

from dotenv import load_dotenv
import os

load_dotenv()

os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"] = os.getenv("LANGSMITH_API")

In [30]:
# Embedding model and setup

import os

if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = os.getenv("GEMINI_API")

from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")


In [31]:
# Google api key setup

import os
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage

# Securely input and set API key
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = os.getenv("GEMINI_API")

# Initialize Gemini 2.0 Flash model
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    google_api_key=os.environ["GOOGLE_API_KEY"]
)

# Example call
response = llm.invoke([HumanMessage(content="Hello Gemini 2.0!")])
print(response.content)


Hello! It's nice to connect with you. How can I help you today?


### Loading the audio data

In [None]:
# Using whisper model

from faster_whisper import WhisperModel
import os

def transcribe_audio_local(audio_path):
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"Audio file not found: {audio_path}")
    
    print(f"Loading Whisper model: )")
    model = WhisperModel("small", device="cpu", compute_type="int8")  
    # I have chose this config because it fits best for my machine but you can change as per your machine #configuration. larger the model_size , better the accuracy but it will consume more memory and time. Compute type defines the integer accuracy needed during transcription.
    
    print(f"Transcribing: {audio_path}")
    segments, _ = model.transcribe(audio_path)
    
    text = ""
    for segment in segments:
        # Optional: show timestamps
        print(f"[{segment.start:.2f}s - {segment.end:.2f}s]: {segment.text}")
        text += segment.text + " "
    
    return text.strip()


In [None]:
transcript = transcribe_audio_local("f1_rules.mp3") 
print(transcript)


🔍 Loading Whisper model: )
🎙️ Transcribing: f1_rules.mp3
[0.00s - 7.04s]:  An F1 Grand Prix is an absolute festival of excitement and entertainment, taking place
[7.04s - 12.28s]:  over three to four days with all of the warring action culminating in the race itself.
[12.28s - 17.04s]:  A traditional Grand Prix weekend starts much sooner than that though, with free free practice
[17.04s - 21.64s]:  sessions taking place over a couple of days before race day to kick things off.
[21.64s - 25.84s]:  These sessions give the teams and drivers the chance to learn the track, experiment
[25.84s - 29.68s]:  with the conditions and test the different setups and configurations they want to use
[29.68s - 34.16s]:  in qualifying and the race. After those practice sessions are completed, it's
[34.16s - 39.28s]:  time for qualifying. In essence, this is where all the drivers go out and try to
[39.28s - 44.92s]:  set the fastest possible lap times they can across three knockout sessions Q1, Q2
[44.92s

In [32]:
# Setting up the Quadrant vector and client

from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

client = QdrantClient(":memory:")

# Create the collection before using it
if not client.collection_exists(collection_name="test"):
    client.create_collection(
        collection_name="test",
        vectors_config={"size": embeddings.embed_query("test").__len__(), "distance": "Cosine"}
    )

vector_store = QdrantVectorStore(
    client=client,
    collection_name="test",
    embedding=embeddings,
)


In [None]:
# Text splitter

from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100) # You can adjust this sizes as per your input.
chunks = splitter.split_text(transcript)

print(f"Split into {len(chunks)} chunks")

✅ Split into 2 chunks


In [60]:
# Add chunks to quadrant

vector_store.add_texts(chunks) 
retriever = vector_store.as_retriever(search_kwargs={"k": 3}) # k value determines the number of chunks that will be displayed

### Taking the example of two different queries to test the model accuracy based on semantic chunks and gemini llms

In [72]:
query1 = "What is the purpose of the practice sessions held before the main Grand Prix race?"

# Retrieve top-k similar chunks
results = vector_store.similarity_search(query1, k=3)  # It will return top three most similar docs

# Inspect results
for i, doc in enumerate(results, 1):
    print(f"\nResult {i}:")
    print(doc.metadata)
    print(doc.page_content[:500], "...")


Result 1:
{'_id': '5a21a1966a934ad296ffd10cf01df93b', '_collection_name': 'test'}
An F1 Grand Prix is an absolute festival of excitement and entertainment, taking place  over three to four days with all of the warring action culminating in the race itself.  A traditional Grand Prix weekend starts much sooner than that though, with free free practice  sessions taking place over a couple of days before race day to kick things off.  These sessions give the teams and drivers the chance to learn the track, experiment  with the conditions and test the different setups and ...

Result 2:
{'_id': '22243bc162894cf18d9a9da77868de2f', '_collection_name': 'test'}
the chance to learn the track, experiment  with the conditions and test the different setups and configurations they want to use  in qualifying and the race. After those practice sessions are completed, it's  time for qualifying. In essence, this is where all the drivers go out and try to  set the fastest possible lap times they can acro

In [79]:
query2 = "How is the total number of laps in an F1 race determined?"

# Retrieve top-k similar chunks
results = vector_store.similarity_search(query2, k=3)  # It will return top three most similar docs

# Inspect results
for i, doc in enumerate(results, 1):
    print(f"\nResult {i}:")
    print(doc.page_content[:500], "...")


Result 1:
of an F1 race decided? It's a case of simple  maths. A Grand Prix distance is 305 kilometers so divide that by the  length of a single lap and round it up to a whole number and hey presto  that's the number of laps for the race. The only exception is if any  stoppages in the race cause it to run over a two-hour time limit. On the final  lap, the first car to cross the finish line and take the checkered flag wins.  Championship points are awarded for all drivers in the top 10 at the end of  the ...

Result 2:
to survive all knockouts starts the race from the very  front of the grid or pole position. Behind them are the other drivers ordered  from qualifying lap times set in the rest of the starting positions for the  race. The cars use a formation lap to line up in their starting or grid  positions on a start finish straight and wait for the lights to go out and the  race begins. So how is the duration of an F1 race decided? It's a case of simple  maths. A Grand Prix distance

### Using Gemini to answer the question based on the given context

In [None]:
# with query1

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage

llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-lite")

# Combine the context
context = "\n\n".join(doc.page_content for doc in results)

prompt = f"""You are a helpful assistant.

Based on the following context from an audio transcription, answer the user's question.

Context:
{context}

Question: {query1}
"""

response = llm.invoke([HumanMessage(content=prompt)])
print("\n Gemini Answer:\n", response.content)



 Gemini Answer:
 The purpose of the practice sessions is for the teams and drivers to:

*   Learn the track.
*   Experiment with the conditions.
*   Test different setups and configurations they want to use in qualifying and the race.


In [80]:
# with query 2

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage

llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-lite")

# Combine the context
context = "\n\n".join(doc.page_content for doc in results)

prompt = f"""You are a helpful assistant.

Based on the following context from an audio transcription, answer the user's question.

Context:
{context}

Question: {query2}
"""

response = llm.invoke([HumanMessage(content=prompt)])
print("\n Gemini Answer:\n", response.content)



 Gemini Answer:
 The total number of laps in an F1 race is determined by dividing the Grand Prix distance (305 kilometers) by the length of a single lap and rounding up to a whole number. However, if the race is stopped and runs over a two-hour time limit, this can affect the number of laps.


##### 🤷‍♀️Note the difference between the two , semantic search picks up the ouput from pre existing array based on similarity no makeup while gemini use some makeup and also re-structure the output to make it more readable and understandable .

# Similarily for the video input , we can use the following pipeline:


In [None]:
import subprocess
import os

def extract_audio_from_video(video_path: str, output_audio_path: str = "audio_from_video.wav"):
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"Video file not found: {video_path}")
    
    command = [
        "ffmpeg",
        "-y",  # overwrite output if exists
        "-i", video_path,
        "-ac", "1",           # mono
        "-ar", "16000",       # 16 kHz
        output_audio_path
    ]
    
    try:
        result = subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        print("Audio extracted successfully.")
    except subprocess.CalledProcessError as e:
        print("FFmpeg error:")
        print(e.stderr)
        raise

    return output_audio_path


In [23]:
def video_rag_pipeline(video_path: str, query: str):
    # Step 1: Extract audio
    audio_path = extract_audio_from_video(video_path)

    # Step 2: Transcribe audio
    transcript = transcribe_audio_local(audio_path)

    # Step 3: Chunk
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_text(transcript)

    # Step 4: Vector store
    vector_store.add_texts(chunks)

    # Step 5: Semantic search + RAG
    retriever = vector_store.as_retriever(search_kwargs={"k": 3})
    retrieved_docs = retriever.invoke(query)
    semantic_chunks = [doc.page_content for doc in retrieved_docs]

    context = "\n\n".join(semantic_chunks)
    prompt = f"""
You are a helpful assistant. Based on the following context from a video transcription, answer the user's question.

Context:
{context}

Question:
{query}
"""
    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-lite", temperature=0.3)
    gemini_response = llm.invoke(prompt).content

    return {
        "semantic_chunks": semantic_chunks,
        "gemini_answer": gemini_response
    }


In [35]:
query= "what does he talking about?"
result = video_rag_pipeline("video_data.mp4", query) 


[✔] Audio extracted successfully.
🔍 Loading Whisper model: )
🎙️ Transcribing: audio_from_video.wav
[0.00s - 18.14s]:  Wow, what an audience, but if I'm being honest, I don't care what you think of my talk.
[18.14s - 19.14s]:  I don't.
[19.14s - 23.28s]:  I care what the internet thinks of my talk, because they're the ones who get it seen
[23.28s - 24.52s]:  and get it shared.
[24.52s - 26.46s]:  And I think that's where most people get it wrong.
[26.46s - 34.56s]:  We're talking to you here, instead of talking to you, random person, scrolling Facebook.
[34.56s - 36.26s]:  Thanks for the click.
[36.26s - 41.62s]:  You see, back in 2009, we all had these weird little things called attention spans.
[41.62s - 43.38s]:  Yeah, they're gone.
[43.38s - 44.38s]:  They're gone.
[44.38s - 45.38s]:  We killed them.
[45.38s - 46.38s]:  They're dead.
[46.38s - 50.22s]:  I'm trying to think of the last time I watched an 18-minute TED Talk.
[50.22s - 52.94s]:  It's been years, literally years.
[52.94s

In [38]:
# Showing the output

print("Semantic Chunks:\n", "\n\n".join(result["semantic_chunks"]))
print("\nGemini Answer:\n",result["gemini_answer"])

Semantic Chunks:
 Wow, what an audience, but if I'm being honest, I don't care what you think of my talk.  I don't.  I care what the internet thinks of my talk, because they're the ones who get it seen  and get it shared.  And I think that's where most people get it wrong.  We're talking to you here, instead of talking to you, random person, scrolling Facebook.  Thanks for the click.  You see, back in 2009, we all had these weird little things called attention spans.  Yeah, they're gone.  They're gone.  We killed

Wow, what an audience, but if I'm being honest, I don't care what you think of my talk.  I don't.  I care what the internet thinks of my talk, because they're the ones who get it seen  and get it shared.  And I think that's where most people get it wrong.  We're talking to you here, instead of talking to you, random person, scrolling Facebook.  Thanks for the click.  You see, back in 2009, we all had these weird little things called attention spans.  Yeah, they're gone.  They

## End