In [None]:
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import T5Tokenizer, T5ForConditionalGeneration
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk

In [None]:
# Tokenizer data required by sent_tokenize / word_tokenize below.
# Older NLTK releases read the "punkt" package, while newer releases look up
# "punkt_tab" instead; requesting both keeps the tokenizers usable on either.
nltk.download("punkt")
nltk.download("punkt_tab")
# Fetch the YouTube transcript
def fetch_transcript(video_url: str) -> str:
    """
    Fetches the transcript of a YouTube video using its URL.

    Recognised URL shapes are ``youtube.com/watch?v=<id>``, the path-style
    links ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>`` and ``/v/<id>`` on
    ``youtube.com`` (or any of its subdomains), and ``youtu.be/<id>`` short
    links. Extra query parameters, fragments and trailing slashes are ignored.

    Args:
        video_url (str): The YouTube video URL.

    Returns:
        str: Concatenated transcript text, or a message beginning with
        "Error" when the URL is not recognised or the fetch fails.
    """
    # Function-scope import keeps the block self-contained (stdlib only).
    from urllib.parse import parse_qs, urlparse

    try:
        url = video_url.strip()
        # urlparse only populates the host when a scheme is present, so accept
        # scheme-less input such as "youtu.be/<id>" by assuming https.
        if "://" not in url:
            url = "https://" + url.lstrip("/")
        parsed = urlparse(url)
        host = (parsed.hostname or "").lower()
        # Dropping empty segments makes a trailing slash harmless.
        segments = [part for part in parsed.path.split("/") if part]

        # Extract video ID
        video_id = ""
        if host == "youtu.be" or host.endswith(".youtu.be"):
            if segments:
                video_id = segments[0]
        elif host == "youtube.com" or host.endswith(".youtube.com"):
            query_ids = parse_qs(parsed.query).get("v")
            if query_ids:
                video_id = query_ids[0]
            elif len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
                video_id = segments[1]

        # Covers both unknown hosts and YouTube URLs that carry no video ID
        # (home page, playlist, channel, ...); never query the API without one.
        if not video_id:
            return "Error: Invalid YouTube URL."

        # Fetch transcript
        # NOTE(review): newer youtube-transcript-api releases appear to replace
        # the static get_transcript() with an instance-level fetch() API --
        # confirm against the installed version.
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        return " ".join(item["text"] for item in transcript)
    except Exception as e:
        return f"Error fetching transcript: {e}"


# Chunk transcript into sentences
def split_into_chunks(transcript: str) -> list:
    """
    Breaks the transcript into sentence-level chunks for retrieval.

    Sentence boundaries are determined by NLTK's ``sent_tokenize``.

    Args:
        transcript (str): The full transcript text.

    Returns:
        list: The sentences produced by ``sent_tokenize`` for the transcript.
    """
    sentences = sent_tokenize(transcript)
    return sentences


# Retrieve the most relevant chunks using BM25
def retrieve_relevant_chunks(chunks: list, question: str, top_n: int = 3) -> str:
    """
    Retrieves the most relevant transcript chunks using BM25.

    Chunks and question are lower-cased and word-tokenized, every chunk is
    scored against the question, and the best ``top_n`` chunks are joined
    with single spaces in descending score order.

    Args:
        chunks (list): List of transcript sentences.
        question (str): The user's question.
        top_n (int): Number of top relevant chunks to retrieve. Zero or a
            negative value requests nothing and yields an empty string.

    Returns:
        str: Concatenated relevant chunks, or a message beginning with
        "Error retrieving chunks:" if retrieval fails or ``chunks`` is empty.
    """
    # Function-scope import keeps the block self-contained (stdlib only).
    import heapq

    try:
        # A negative top_n used as a slice bound would silently mean
        # "everything except the last N"; treat it as "nothing requested".
        if top_n <= 0:
            return ""
        # BM25 cannot be built over an empty corpus; report that explicitly
        # instead of surfacing an opaque arithmetic error from the library.
        if not chunks:
            return "Error retrieving chunks: no transcript chunks to search."

        tokenized_chunks = [word_tokenize(chunk.lower()) for chunk in chunks]
        bm25 = BM25Okapi(tokenized_chunks)
        scores = bm25.get_scores(word_tokenize(question.lower()))

        # Get top-n chunks based on scores (same ordering as a full
        # descending sort followed by a [:top_n] slice).
        best_indices = heapq.nlargest(
            top_n, range(len(scores)), key=lambda idx: scores[idx]
        )
        return " ".join(chunks[idx] for idx in best_indices)
    except Exception as e:
        return f"Error retrieving chunks: {e}"


# Generate an answer using T5
def answer_question_with_t5(context: str, question: str, model, tokenizer) -> str:
    """
    Produces an answer to a question from the given context with a T5 model.

    The prompt has the form ``question: <question> context: <context>``. It is
    encoded with truncation at ``max_length=512``, passed to
    ``model.generate`` (4 beams, early stopping, ``max_length=100``), and the
    first generated sequence is decoded with special tokens skipped.

    Args:
        context (str): The relevant context.
        question (str): The question.
        model: Pre-trained T5 model.
        tokenizer: Tokenizer for the T5 model.

    Returns:
        str: Generated answer, or a message beginning with
        "Error generating answer:" if any step raises.
    """
    encode_options = {"return_tensors": "pt", "max_length": 512, "truncation": True}
    generate_options = {"max_length": 100, "num_beams": 4, "early_stopping": True}

    try:
        prompt = f"question: {question} context: {context}"
        token_ids = tokenizer.encode(prompt, **encode_options)

        # Only the first returned sequence is decoded.
        generated = model.generate(token_ids, **generate_options)
        return tokenizer.decode(generated[0], skip_special_tokens=True)
    except Exception as err:
        return f"Error generating answer: {err}"


if __name__ == "__main__":
    # Demo pipeline: load T5, fetch a transcript, retrieve the sentences most
    # relevant to the question, and print the generated answer.

    # Load T5 model and tokenizer
    model_name = "t5-large"  # You can use "t5-base" for a lighter model
    try:
        tokenizer = T5Tokenizer.from_pretrained(model_name)
        model = T5ForConditionalGeneration.from_pretrained(model_name)
    except Exception as e:
        print(f"Error loading model/tokenizer: {e}")
        # Stop with a non-zero status; the builtin SystemExit does not depend
        # on the interactive exit() helper being available.
        raise SystemExit(1)
    print("Model and tokenizer loaded successfully.")

    # Example usage
    video_url = "https://youtu.be/6lt5WzBZN4Q"  # Replace with a real YouTube video URL
    question = "What is this video about"

    # Fetch and process transcript
    transcript = fetch_transcript(video_url)
    # fetch_transcript reports failure with a message that begins with
    # "Error". Checking the prefix (rather than searching the whole text)
    # avoids rejecting a transcript that merely contains the word "Error".
    if transcript.startswith("Error"):
        print(transcript)
    else:
        # Split transcript into chunks
        chunks = split_into_chunks(transcript)

        # Retrieve relevant chunks
        relevant_context = retrieve_relevant_chunks(chunks, question, top_n=3)
        # Same prefix convention: retrieval errors begin with "Error".
        if relevant_context.startswith("Error"):
            print(relevant_context)
        else:
            # Generate answer
            answer = answer_question_with_t5(relevant_context, question, model, tokenizer)
            print("\nQuestion:", question)
            print("Answer:", answer)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Model and tokenizer loaded successfully.

Question: What is this video about
Answer: a long time ago an old man lived in a small village in the mountains in the middle of China
