In [1]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from langchain_classic.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate

  from .autonotebook import tqdm as notebook_tqdm


## Step 1a - Document Ingestion

In [20]:
try:
    
    transcript = YouTubeTranscriptApi().fetch(video_id="K-Q4r37XgVw", languages=['en'])

    # transcript = " ".join(chunk["text"] for chunk in transcript)
    
    print(transcript)

except TranscriptsDisabled:
    print("No captions available for this video.")

FetchedTranscript(snippets=[FetchedTranscriptSnippet(text='And you said India has one of the top', start=0.08, duration=3.68), FetchedTranscriptSnippet(text="two agencies in the world. There's RAW", start=1.839, duration=3.44), FetchedTranscriptSnippet(text="and there's the intelligence bureau", start=3.76, duration=3.28), FetchedTranscriptSnippet(text='which is IB. Why did you say so?', start=5.279, duration=3.521), FetchedTranscriptSnippet(text='>> Indian intelligence is so good. They are', start=7.04, duration=4.0), FetchedTranscriptSnippet(text='very effective at being able to identify', start=8.8, duration=3.839), FetchedTranscriptSnippet(text='terrorist activities, threats to the', start=11.04, duration=4.599), FetchedTranscriptSnippet(text='homeland.', start=12.639, duration=3.0), FetchedTranscriptSnippet(text="And Pakistan's ISI uses terrorist", start=15.839, duration=5.121), FetchedTranscriptSnippet(text="tactics in India. They're blowing things", start=18.88, duration=5.68), 

In [24]:
transcript = " ".join([item.text for item in transcript])

print(transcript)

And you said India has one of the top two agencies in the world. There's RAW and there's the intelligence bureau which is IB. Why did you say so? >> Indian intelligence is so good. They are very effective at being able to identify terrorist activities, threats to the homeland. And Pakistan's ISI uses terrorist tactics in India. They're blowing things up. They're killing people. >> Terrorists targeted tourists in Kashmir's behelgam. Pakistan harbors terrorism, hides terrorists including Osama bin Laden. >> Osama bin Laden. Osama bin Laden. Osama bin Laden. Indian Pakistan. >> What's the advantage for America or CIA to continue supporting Pakistan? >> We have spies that can tell us about India. >> Why do you need intelligence against India at this point? >> Well, India is not going to tell us everything either. So, how do we know when India is lying? How do we know when India is telling the truth? We need spies in India. Well, who's the best at getting spies in India? Pakistan. You're te

## Step 1b - Indexing (Text Splitting)

In [25]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.create_documents([transcript])

In [26]:
len(chunks)

180

In [27]:
chunks[0]

Document(metadata={}, page_content="And you said India has one of the top two agencies in the world. There's RAW and there's the intelligence bureau which is IB. Why did you say so? >> Indian intelligence is so good. They are very effective at being able to identify terrorist activities, threats to the homeland. And Pakistan's ISI uses terrorist tactics in India. They're blowing things up. They're killing people. >> Terrorists targeted tourists in Kashmir's behelgam. Pakistan harbors terrorism, hides terrorists including Osama bin Laden. >> Osama bin Laden. Osama bin Laden. Osama bin Laden. Indian Pakistan. >> What's the advantage for America or CIA to continue supporting Pakistan? >> We have spies that can tell us about India. >> Why do you need intelligence against India at this point? >> Well, India is not going to tell us everything either. So, how do we know when India is lying? How do we know when India is telling the truth? We need spies in India. Well, who's the best at getting

## 1c & 1d - Indexing (Embedding Generation and Storing in Vector Store)

In [29]:
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
vector_store = FAISS.from_documents(chunks, embeddings)

In [30]:
vector_store.index_to_docstore_id

{0: 'e46190fd-a2fa-4ce3-8555-bf799e06b052',
 1: '1da8f7ee-7116-4c29-81cc-7218eca55439',
 2: '6bcd9beb-6a51-4d84-93b1-b8db0238ae51',
 3: 'da594c0e-1d6c-4e48-9686-b90dfaae1879',
 4: '730a1e1e-edc5-4b00-8705-abcd0b4ed560',
 5: '237a99ac-fcf5-4a1f-97a8-2fbc887f78a0',
 6: '6565cd7b-88c6-403b-9022-68367348d7e9',
 7: '04ab4d4f-d4dd-4ed3-9ca1-c18c04115659',
 8: '53d598ee-338a-4b42-ac07-64e2ea442704',
 9: 'e81efee6-9454-46d5-8161-f17787887905',
 10: '597108af-bf6f-4ded-806b-96e0f910d583',
 11: '612a4219-9aa0-4dc0-97bf-d90db505f27f',
 12: 'd2c7a198-0d29-48ac-b7d6-45a4fdcf2802',
 13: 'd83c8dce-0a73-4e27-9b8e-341f0374fbac',
 14: '59b7a136-fae6-41f7-8dc7-27471a750f8a',
 15: '1915fbf0-75bb-4635-ae0f-4eb61cb8db95',
 16: '58ebe5e4-078a-40ab-9abc-0a86d89ae5ab',
 17: 'd700a621-e0e3-4dcf-b969-75698c5bfe27',
 18: 'dbd24150-7ead-42cb-9f8e-8a9afec534da',
 19: '3c9af87f-997b-47aa-bb60-42e157442ca1',
 20: 'ce6d6697-1697-4d9c-8079-6c07cf6cdb6a',
 21: 'd5ede154-39b0-4b7d-8aaf-dccc9a8e35c3',
 22: 'd1522bbb-e0c6-

In [31]:
vector_store.get_by_ids(['e46190fd-a2fa-4ce3-8555-bf799e06b052'])

[Document(id='e46190fd-a2fa-4ce3-8555-bf799e06b052', metadata={}, page_content="And you said India has one of the top two agencies in the world. There's RAW and there's the intelligence bureau which is IB. Why did you say so? >> Indian intelligence is so good. They are very effective at being able to identify terrorist activities, threats to the homeland. And Pakistan's ISI uses terrorist tactics in India. They're blowing things up. They're killing people. >> Terrorists targeted tourists in Kashmir's behelgam. Pakistan harbors terrorism, hides terrorists including Osama bin Laden. >> Osama bin Laden. Osama bin Laden. Osama bin Laden. Indian Pakistan. >> What's the advantage for America or CIA to continue supporting Pakistan? >> We have spies that can tell us about India. >> Why do you need intelligence against India at this point? >> Well, India is not going to tell us everything either. So, how do we know when India is lying? How do we know when India is telling the truth? We need spi

## Step 2 - Retrieval

In [32]:
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 4})

In [33]:
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000021F4907B9B0>, search_kwargs={'k': 4})

In [35]:
retriever.invoke('What are best intelligence agency of India?')

[Document(id='d64c49d4-b083-4075-8daa-bc87b0743354', metadata={}, page_content="the president. But it's only really for human intelligence. NSA does signals intelligence. DoD or DIA does defense military intelligence. And we have all these different intelligence groups that do all sorts of different things. Inside India, RAW doesn't really only serve one purpose. It serves multiple purposes. It has paramilitary activities. CIA also has paramilitary activities, but CIA's paramilitary activities is small. RAW's paramilitary are comparatively much larger. RAW also handles more than just humans. They handle SIGN. They handle open source intelligence. They don't serve as the only aggregator of intelligence around all of India. Whereas the United States CIA is the central aggregator for all intelligence. CIA is what produces a daily report for the president. Um I don't know if RA of RAW of RAW creates a central until a central daily report for um for the prime minister. I don't I don't think

## Step 3 - Augmentation

In [36]:
llm = ChatGoogleGenerativeAI(model='gemini-2.5-flash', temperature=0.2)

In [38]:
prompt = PromptTemplate(
    template="""
        You are a helpful assistant. Answer ONLY from the provided transcript context. If the context is sufficient, just say you don't know.

        {context}
        question: {question}
    """,
    input_variables = ['context', 'question'] 
)

In [39]:
question = "Is the topic of India vs Pakistan war discussed in this video? If yes then what  was discussed"
retrieved_docs = retriever.invoke(question)

In [43]:
context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
context_text

'that are being used by all across Islam including Islamic fundamentalism. But India refuses to use fundamentalist tactics because it identifies as a na as a nation first. That\'s also why India is the preferred nation over Pakistan worldwide. And I understand that\'s going to make Pakistanis pissed, but the truth is the truth. People see the value of India. They don\'t see the value of Pakistan. People understand India is a commercial growing entrepreneurial manufacturing hub, tech hub of the world. Pakistan is not. Pakistan is just a strategic geographic ally that sits between the east and the west. You go back to the global war on terror. Pakistan harbors terrorism. It it hides terrorists including Osama bin Laden. It lies to its allies. It\'s a complicated, difficult place that also has nuclear weapons. Right? India is trying to become first world. India is trying to establish and grow itself as a competitor to the Chinese and to the United States by making itself an ally to the\n\

In [41]:
final_prompt = prompt.invoke({'context': context_text, 'question': question})

## Step - 4 Generation

In [45]:
answer = llm .invoke(final_prompt)
print(answer.content)

Yes, the topic of conflict between India and Pakistan is discussed.

It is mentioned that Pakistan uses terrorist tactics against India, which is described as problematic and illegal. The transcript also discusses how external countries like Russia, China, and the United States might manipulate intelligence or propaganda to "fuel enough conflict back and forth" and "continue this internal conflict between your two countries" so that their resources are used against each other.


## Building a chain

In [46]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [47]:
def format_docs(retrieved_docs):
    context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
    return context_text

In [48]:
parallel_chain = RunnableParallel({
    'context': retriever | RunnableLambda(format_docs),
    'question': RunnablePassthrough()
})

In [49]:
parallel_chain.invoke('What are best 2 intelligence agencies of India?')

{'context': "the president. But it's only really for human intelligence. NSA does signals intelligence. DoD or DIA does defense military intelligence. And we have all these different intelligence groups that do all sorts of different things. Inside India, RAW doesn't really only serve one purpose. It serves multiple purposes. It has paramilitary activities. CIA also has paramilitary activities, but CIA's paramilitary activities is small. RAW's paramilitary are comparatively much larger. RAW also handles more than just humans. They handle SIGN. They handle open source intelligence. They don't serve as the only aggregator of intelligence around all of India. Whereas the United States CIA is the central aggregator for all intelligence. CIA is what produces a daily report for the president. Um I don't know if RA of RAW of RAW creates a central until a central daily report for um for the prime minister. I don't I don't think they do but I don't know. So those are kind of some of the differe

In [50]:
parser = StrOutputParser()

In [51]:
main_chain = parallel_chain | prompt | llm | parser

In [56]:
answer = main_chain.invoke('List all intelligence agencies of India?')

In [57]:
answer

'Based on the provided transcript, the intelligence agencies of India mentioned are:\n\n*   RAW\n*   The Intelligence Bureau (IB)'