## Setup

Required libs:
* youtube-transcript-api for extracting transcripts from YouTube videos.
* faiss-cpu for efficient similarity search.
* langchain and langchain-community for text processing and language models.
* ibm-watsonx-ai and langchain_ibm for integrating IBM Watson services.

Install required libs:

In [None]:
!pip install youtube-transcript-api==0.6.2
!pip install faiss-cpu==1.8.0
!pip install langchain==0.2.6 | tail -n 1
!pip install langchain-community==0.2.6 | tail -n 1
!pip install ibm-watsonx-ai==1.0.10 | tail -n 1
!pip install langchain_ibm==0.1.8 | tail -n 1

Import required libs:

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi
from langchain.text_splitter import RecursiveCharacterTextSplitter
from ibm_watsonx_ai.foundation_models.utils.enums import ModelTypes
from ibm_watsonx_ai import APIClient
from ibm_watsonx_ai import Credentials
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
from ibm_watsonx_ai.foundation_models.utils.enums import DecodingMethods
from langchain_ibm import WatsonxLLM
from ibm_watsonx_ai.foundation_models.utils import get_embedding_model_specs
from langchain_ibm import WatsonxEmbeddings
from ibm_watsonx_ai.foundation_models.utils.enums import EmbeddingTypes
from langchain_community.vectorstores import FAISS
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import re

# Optional: silence warnings emitted by the code in this notebook.
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing; used as a stand-in for ``warnings.warn``."""
    return None


# Ignore everything that goes through the warnings filters, and swap out
# ``warnings.warn`` itself so direct calls become no-ops as well.
warnings.filterwarnings('ignore')
warnings.warn = warn

## YouTube transcription

* Manual transcripts are preferred over automatically generated ones.
* A typical YouTube URL format is: https://www.youtube.com/watch?v=VIDEO_ID
* The transcript is represented as a list of dictionaries. Each dictionary contains:
    * text: The spoken content.
    * start: The starting time of the segment in seconds.
    * duration: The duration of the segment in seconds.

In [None]:
def get_video_id(url):
    """Extract the 11-character video ID from a YouTube URL.

    Supported forms (the ID is 11 characters from ``[a-zA-Z0-9_-]``):

    * ``https://www.youtube.com/watch?v=ID`` (the original, canonical form)
    * watch URLs on other YouTube hosts (``m.``, ``music.``, no ``www``),
      with ``http``/no scheme, and with ``v=`` anywhere in the query string
    * short links: ``https://youtu.be/ID``
    * path-style links: ``/embed/ID``, ``/shorts/ID``, ``/live/ID``, ``/v/ID``

    Args:
        url: The URL to inspect.

    Returns:
        The video ID as a string, or ``None`` when ``url`` is not a string
        or no supported YouTube URL is found in it.
    """
    if not isinstance(url, str):
        return None

    video_id = r'([a-zA-Z0-9_-]{11})'
    # Lookbehind so that e.g. "notyoutube.com" is not mistaken for youtube.com
    host_start = r'(?<![\w.-])(?:https?://)?'
    patterns = (
        # Canonical form, tried first so every URL accepted before this
        # function was generalized still yields exactly the same ID.
        r'https://www\.youtube\.com/watch\?v=' + video_id,
        # Watch URL on any YouTube host, "v" anywhere in the query string
        host_start + r'(?:[\w-]+\.)*youtube\.com/watch\?(?:[^#\s]*&)?v=' + video_id,
        # Short share links
        host_start + r'youtu\.be/' + video_id,
        # Path-style links (embeds, shorts, live streams, legacy /v/)
        host_start
        + r'(?:[\w-]+\.)*youtube(?:-nocookie)?\.com/(?:embed|shorts|live|v)/'
        + video_id,
    )

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)

    return None

def get_transcript(url):
    """Fetch the transcript of a YouTube video, preferring manual captions.

    Selection rule (same as before): if any manually created transcript is
    listed, the last one listed is used; otherwise the first auto-generated
    transcript is used. Only the selected transcript is fetched, so the
    function makes a single download instead of one per manual transcript.

    Args:
        url: A YouTube video URL understood by ``get_video_id``.

    Returns:
        The result of ``fetch()`` on the selected transcript, or an empty
        list if the video lists no transcripts at all.

    Raises:
        ValueError: If no video ID can be extracted from ``url``.
    """
    video_id = get_video_id(url)
    if video_id is None:
        # Fail early with a clear message instead of handing None to the API
        raise ValueError(f"Could not extract a YouTube video ID from URL: {url!r}")

    manual = None
    generated = None
    # Lists the transcripts available for the given YouTube video
    for candidate in YouTubeTranscriptApi.list_transcripts(video_id):
        if candidate.is_generated:
            # Keep only the first auto-generated transcript as a fallback
            if generated is None:
                generated = candidate
        else:
            # A manually created transcript always takes precedence
            manual = candidate

    chosen = manual if manual is not None else generated
    if chosen is None:
        return []

    return chosen.fetch()

# Retrieve the transcript for the specified YouTube video URL
# (the trailing "&t=24s" does not matter: only the ID after "?v=" is used)
transcript = get_transcript("https://www.youtube.com/watch?v=kEOCrtkLvEo&t=24s")

# Display the first 10 entries of the transcript
# Each entry is a dictionary containing 'text', 'start', and 'duration'
transcript[:10]

Process the fetched transcript into readable content

In [None]:
def process(transcript):
    """Flatten transcript segments into one newline-separated string.

    Each segment that has both a ``'text'`` and a ``'start'`` key contributes
    one line of the form ``"Text: <text> Start: <start>\\n"``.

    Args:
        transcript: An iterable of transcript segments (dict-like objects).

    Returns:
        The concatenated lines, or an empty string if no segment is usable.
    """
    lines = []

    for segment in transcript:
        try:
            lines.append(f"Text: {segment['text']} Start: {segment['start']}\n")
        except (KeyError, TypeError):
            # Malformed entry (missing key or not subscriptable by key): skip
            # it, but let unrelated errors such as KeyboardInterrupt propagate.
            continue

    # Join once at the end instead of growing a string inside the loop
    return "".join(lines)

# Flatten the fetched transcript into "Text: ... Start: ..." lines
processed_transcript = process(transcript)
processed_transcript[:100] # Display the first 100 characters of the processed transcript

## Chunking the transcript

In [None]:
# Split the processed transcript into small overlapping chunks; these chunks
# are what gets embedded and indexed later in the notebook
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,  # Maximum chunk size of 200 characters
    chunk_overlap=20  # Overlap of 20 characters between chunks
)

chunks = text_splitter.split_text(processed_transcript)
chunks[:10]  # Display the first 10 chunks

Defining the model

In [None]:
# Define the model ID for the Granite 8B Instruct Generation 3 model
model_id = "ibm/granite-3-8b-instruct"

# Set up the credentials needed to access the IBM Watson services
# NOTE(review): only the service URL is supplied here (no API key or token) —
# presumably the hosting environment provides authentication; confirm before
# running this notebook elsewhere
credentials = Credentials(
    url = "https://us-south.ml.cloud.ibm.com",
)

# Initialize the API client with the given credentials
# (the client object is not referenced again in the cells shown here)
client = APIClient(credentials)

# Define the project ID for organizing tasks within IBM Watson services
project_id = "skills-network"

parameters = {
    # Specifies the decoding method as greedy decoding
    # This means the model always chooses the most probable next token
    GenParams.DECODING_METHOD: DecodingMethods.GREEDY,

    # Sets the minimum number of new tokens to generate to 1
    # The model will always produce at least this many tokens
    GenParams.MIN_NEW_TOKENS: 1,

    # Sets the maximum number of new tokens to generate to 500
    # The model will stop generating after reaching this limit
    GenParams.MAX_NEW_TOKENS: 500,

    # Defines sequences that will cause the generation to stop
    # In this case, generation will stop when encountering two consecutive newlines
    # (i.e. at the first blank line in the generated text)
    GenParams.STOP_SEQUENCES: ["\n\n"],
}

watsonx_granite = WatsonxLLM(
    # Specifies the ID of the model to be used
    # This is the model identifier string defined at the top of this cell
    model_id=model_id,

    # The URL endpoint for the Watson service
    # This is read back from the Credentials object created above
    url=credentials.get("url"),

    # The ID of the project in which this LLM instance will operate
    # This helps in organizing and managing different LLM instances
    project_id=project_id,

    # A dictionary of parameters that configure the behavior of the LLM
    # This includes settings like decoding method, token limits, and stop sequences
    params=parameters
)

Embedding

In [None]:
# Fetch specifications for available embedding models from the Watson service
# NOTE: the return value is not assigned to anything here
get_embedding_model_specs(credentials.get('url'))

# Part 1: Create Embedding Model
# Set up the WatsonxEmbeddings object
embeddings = WatsonxEmbeddings(
    # Specifies the ID of the embedding model to be used
    # In this case, it's using the IBM SLATE 30M English model
    model_id=EmbeddingTypes.IBM_SLATE_30M_ENG.value,

    # The URL endpoint for the Watson service
    # This is read from the Credentials object by key
    url=credentials["url"],

    # The ID of the project in which this embedding model will operate
    # This helps in organizing and managing different model instances
    project_id=project_id
)

## Implementing FAISS

FAISS stands for Facebook AI Similarity Search.

Convert the transcript chunks into embeddings and store them in a FAISS index.

In [None]:
# Create a FAISS index from the text chunks using the specified embeddings
# (`chunks` are the transcript pieces, `embeddings` the WatsonxEmbeddings model)
faiss_index = FAISS.from_texts(chunks, embeddings)

### Explanation

In this example, we asked the question "Which company were they talking about?" The similarity search algorithm successfully retrieved relevant chunks of text from the transcript.

1. **Identifying the Company**: The first result directly answers our query. It mentions "IBM" twice:
   - "she's recently placed in IBM"
   - "interview experience of IBM"
   
   This clearly indicates that the company being discussed is IBM.

2. **How We Fetched It**:
   - The FAISS index we created earlier contains vector representations (embeddings) of all the text chunks from the transcript.
   - When we input our query, it's also converted into a vector using the same embedding model.
   - The `similarity_search` function then finds the chunks whose vector representations are most similar to our query vector.
   - We requested the top 3 most similar chunks (`k=3`), which are returned and printed.

3. **Context**: The other results provide additional context about the interview process, mentioning details like:
   - There were multiple interviewers, including a senior software developer.
   - There was a technical round in the interview process.

This demonstrates the power of semantic search: even though our query didn't use the exact words found in the transcript, the system was able to understand the intent of the question and retrieve relevant information, successfully identifying IBM as the company being discussed.


## Summarizing with LangChain

Define a prompt template

In [None]:
# Define the prompt template for summarizing the transcript
prompt = PromptTemplate(
    # Specify the input variables that will be used in the template
    input_variables=["transcript"],

    # Define the actual template string; {transcript} is filled in at call time
    template="""
Summarize the following YouTube video transcript in terms of paragraph:

{transcript}

Your summary should have concise summary in terms of paragraph. Ignore any timestamps.
"""
)

# Instantiate LLMChain with the refined prompt and LLM
summarise_chain = LLMChain(
    llm=watsonx_granite,  # The language model to be used (in this case, watsonx_granite)
    prompt=prompt,        # The PromptTemplate we defined earlier
    verbose=True          # Enable verbose mode for detailed output
)

# Bare expression: shows the chain object when it is the last line of a
# notebook cell; it has no other effect
summarise_chain

# Pass the processed transcript to the LLMChain for summarization
# (the whole processed transcript is sent, not the individual chunks)
summary = summarise_chain.predict(transcript=processed_transcript)

# Print the generated summary
print(summary)

## Retrieve relevant context and generate answers

### Explanation 

1. **User query**: 
   - A user asks a question about a specific YouTube video or a topic covered in multiple videos.

2. **Query embedding**: 
   - The user's question is converted into a vector embedding using the same model used for processing video transcripts (e.g., IBM SLATE-30M).

3. **Similarity search**: 
   - The query embedding is compared to the embeddings of transcript chunks stored in your vector database (FAISS index). 
   - These transcript chunks represent segments of the YouTube video.

4. **Retrieval**: 
   - The system retrieves the most similar transcript chunks based on the similarity scores. 
   - These chunks are likely to contain information relevant to the user's question.

5. **Context assembly**: 
   - The retrieved transcript chunks are combined to form the relevant context. 
   - This context might include portions of transcripts from the YouTube video, depending on the query and available content.

In [None]:
def retrieve(query, k=7):
    """Return the transcript chunks most similar to ``query``.

    Args:
        query: The text to search for in the FAISS index.
        k: How many of the most similar chunks to return. Defaults to 7,
            the value that was previously hard-coded.

    Returns:
        Whatever ``faiss_index.similarity_search`` returns for the query.
    """
    # Perform a similarity search on the module-level FAISS index
    return faiss_index.similarity_search(query, k=k)

# Define the template for question answering
# {context} and {question} are placeholders filled in when the chain runs
qa_template = """
You are an expert assistant providing detailed answers based on the following video content.

Relevant Video Context: {context}

Based on the above context, please answer the following question:
Question: {question}
"""

# Create the PromptTemplate object
prompt_template = PromptTemplate(
    input_variables=["context", "question"],  # Variables to be filled dynamically
    template=qa_template
)

# Create a Question-Answering LLM Chain
QAChain = LLMChain(
    llm=watsonx_granite,     # The language model to be used (watsonx_granite in this case)
    prompt=prompt_template,  # The PromptTemplate we defined earlier for structuring the input
    verbose=True             # Enable verbose mode for detailed output
)

# Bare expression: shows the chain object when it is the last line of a
# notebook cell; it has no other effect
QAChain

def generate_answer(question):
    """Answer ``question`` using the transcript chunks most relevant to it.

    Args:
        question: The user's question about the video.

    Returns:
        The text generated by ``QAChain`` for the assembled prompt.
    """
    # Retrieve relevant context based on the question
    documents = retrieve(question)

    # Hand the model the chunk text itself. Interpolating the raw list would
    # put the Python repr of the Document objects (class name, metadata,
    # escaped newlines) into the prompt instead of the transcript content.
    context = "\n".join(doc.page_content for doc in documents)

    # Use the QAChain to generate an answer based on the context and question
    return QAChain.predict(context=context, question=question)

# Input question (We can ask: Is there a coding round?)
# input() blocks until the user types a question and presses Enter
question = input("Enter your question: ")

# Generate the answer and print it
answer = generate_answer(question)
print(answer)