# YouTube Video Q&A with LangChain, FAISS and LangSmith

In [36]:
!pip install langchain langchain-community openai youtube-transcript-api langsmith
!pip install tiktoken
!pip install faiss-cpu




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip




## Imports

In [37]:
import os
import re

import requests
from bs4 import BeautifulSoup
from langchain.callbacks import LangChainTracer
from langchain.chains import RetrievalQA
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import FAISS
from langsmith import Client
from youtube_transcript_api import YouTubeTranscriptApi

## Prework

In [38]:
# Is this workbook running on Google Colab?
# Detect it automatically instead of relying on a hand-edited flag: the
# `google.colab` package is only importable inside a Colab runtime.
try:
    import google.colab  # noqa: F401

    COLAB = True
except ImportError:
    COLAB = False

### Helper Methods

In [39]:
def get_youtube_id(video_url):
    """
    Extracts the video ID from a YouTube URL.

    Supported URL shapes:
        * ``https://www.youtube.com/watch?v=<id>`` (with or without extra parameters)
        * ``https://youtu.be/<id>``
        * ``https://www.youtube.com/embed/<id>``, ``/shorts/<id>`` and ``/live/<id>``

    Args:
        video_url (str): The URL of the YouTube video.

    Returns:
        str: The extracted video ID.

    Raises:
        ValueError: If no video ID can be found in ``video_url``.
    """

    # Tried in order; the ID stops at the next query separator or fragment
    # so values such as "#t=30" never leak into the ID.
    patterns = (
        r"(?:^|[?&])v=([^&#]+)",
        r"youtu\.be/([^?&#/]+)",
        r"/(?:embed|shorts|live)/([^?&#/]+)",
    )

    for pattern in patterns:
        match = re.search(pattern, video_url)
        if match:
            return match.group(1)

    raise ValueError(f"Could not extract a YouTube video ID from URL: {video_url!r}")

In [40]:
def get_youtube_title(video_url):
    """
    Extracts the title of a YouTube video.

    Downloads the video page and reads its ``<title>`` element, dropping the
    trailing " - YouTube" suffix when it is present.

    Args:
        video_url (str): The URL of the YouTube video.

    Returns:
        str: The title of the YouTube video.

    Raises:
        requests.RequestException: If the page cannot be downloaded
            (network error, timeout or HTTP error status).
        ValueError: If the downloaded page contains no ``<title>`` element.
    """

    # A timeout keeps the cell from hanging indefinitely on a stalled connection.
    response = requests.get(video_url, timeout=10)
    response.raise_for_status()

    # Name the parser explicitly: without it BeautifulSoup warns and picks
    # whichever parser happens to be installed.
    soup = BeautifulSoup(response.text, "html.parser")
    title_tag = soup.find("title")
    if title_tag is None:
        raise ValueError(f"No <title> element found for URL: {video_url!r}")

    # get_text() returns the decoded text; str(tag) would re-escape HTML
    # entities (e.g. "&" -> "&amp;") and leak them into the title.
    title = title_tag.get_text().strip()

    suffix = " - YouTube"
    if title.endswith(suffix):
        title = title[: -len(suffix)]
    return title

### Get Transcript and Title

In [41]:
# YouTube watch URL of the video whose title and transcript are fetched below.
url = 'https://www.youtube.com/watch?v=XEzRZ35urlk'

In [42]:
# Sanity check: show the parsed title and video ID for the configured URL, one per line.
print(get_youtube_title(url), get_youtube_id(url), sep="\n")

Google Keynote (Google I/O ‘24)
XEzRZ35urlk


In [43]:
video_title = get_youtube_title(url)

# youtube-transcript-api 1.x replaced the static `get_transcript` helper with an
# instance-based `fetch` API and later removed the helper, so support both.
_video_id = get_youtube_id(url)
if hasattr(YouTubeTranscriptApi, "get_transcript"):
    raw_transcript = YouTubeTranscriptApi.get_transcript(_video_id)
else:
    # to_raw_data() converts the fetched transcript back into the list of
    # {'text', 'start', 'duration'} dicts that the rest of the notebook expects.
    raw_transcript = YouTubeTranscriptApi().fetch(_video_id).to_raw_data()

In [44]:
# Preview the first five transcript entries.
raw_transcript[:5]

[{'text': '[Cheers and Applause].\n>>WOMAN: Google’s ambitions in\xa0',
  'start': 0.0,
  'duration': 1.52},
 {'text': 'artificial intelligence.\n>>MAN: Google launches Gemini,\xa0',
  'start': 1.52,
  'duration': 2.36},
 {'text': "the generative AI.\n>> And it's completely changing\xa0",
  'start': 3.88,
  'duration': 2.6},
 {'text': 'the way we work.\n>> You know, a lot has happened\xa0',
  'start': 6.48,
  'duration': 3.28},
 {'text': 'in a year.\nThere have been new beginnings.\xa0',
  'start': 9.76,
  'duration': 6.0}]

In [45]:
# Concatenate the text of every transcript entry into one space-separated string.
transcript = " ".join(entry['text'] for entry in raw_transcript)

## LLM and Embeddings

In [46]:
# Load the API keys from Colab's secret store, or from a local .env file /
# the process environment when running outside Colab.
if COLAB:
    from google.colab import userdata

    OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
    LANGCHAIN_API_KEY = userdata.get('LANGCHAIN_API_KEY')
else:
    from dotenv import load_dotenv, find_dotenv

    _ = load_dotenv(find_dotenv())
    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
    LANGCHAIN_API_KEY = os.getenv('LANGCHAIN_API_KEY')

# Fail fast with a readable message: a missing key is None, and assigning None
# into os.environ later only produces an opaque "str expected" TypeError.
_missing_keys = [
    key_name
    for key_name, key_value in (
        ("OPENAI_API_KEY", OPENAI_API_KEY),
        ("LANGCHAIN_API_KEY", LANGCHAIN_API_KEY),
    )
    if not key_value
]
if _missing_keys:
    raise RuntimeError(
        "Missing required API key(s): " + ", ".join(_missing_keys)
        + ". Define them as Colab secrets or in a .env file / environment variables."
    )

In [47]:
# Configure LangSmith through environment variables: API key, project name and tracing switch.
os.environ.update({
    "LANGCHAIN_API_KEY": LANGCHAIN_API_KEY,
    "LANGCHAIN_PROJECT": 'youtube-project',
    'LANGCHAIN_TRACING_V2': 'true',  # enables tracing
})

### Helper Methods

In [48]:
def clear_text(raw_text):
    """
    Cleans the input text by removing unwanted characters and formatting.

    Steps, in order:
        1. Every run of whitespace (newlines, tabs, non-breaking spaces) becomes one space.
        2. Bracketed annotations such as "[Applause]" are removed.
        3. Speaker markers (">>" optionally followed by a short label ending in ":") are removed.
        4. Repeated spaces left behind by the removals are collapsed.
        5. Runs of space-separated full stops (". . .") are collapsed to a single stop.
        6. Leading and trailing spaces are stripped.

    Args:
        raw_text (str): The input text to be cleaned.

    Returns:
        str: The cleaned text.
    """

    # Normalise whitespace first so a bracketed annotation that is split across
    # a line break ("[Cheers and\nApplause]") is still matched in the next step.
    text = re.sub(r"\s+", " ", raw_text)

    # Remove bracketed content
    text = re.sub(r"\[.*?\]", "", text)

    # Remove speaker indicators. The label part is a heuristic: it is limited to
    # a short run without sentence punctuation, so a colon that appears later in
    # the spoken text is never swallowed; a bare ">>" is removed on its own.
    text = re.sub(r">>\s*(?:[^\s:>.,!?][^:>.,!?]{0,30}:)?", "", text)

    # Collapse every run of spaces (a single str.replace pass would leave
    # double spaces behind for runs of three or more)
    text = re.sub(r" {2,}", " ", text)

    # Collapse doubled (or longer) space-separated stops: ". . ." -> "."
    text = re.sub(r"\.(?: \.)+", ".", text)

    # Remove leading and trailing spaces
    return text.strip()

In [49]:
def select_timestamps(sources, threshold=100):
    """
    Selects relevant timestamps from a list of source documents.

    Args:
        sources (list): A list of source documents, each containing a metadata dictionary with a "timestamp" key.
        threshold (int | float): Minimum distance, in the same unit as the timestamps, that a
            timestamp must have from the previously kept one. Defaults to 100.

    Returns:
        list: The selected timestamps as integers, deduplicated and sorted in ascending order.
        A timestamp closer than ``threshold`` to the previously kept timestamp is dropped
        to avoid redundancy. An empty ``sources`` list yields an empty list.
    """

    # Truncate to whole numbers, deduplicate through a set, then sort ascending.
    candidates = sorted({int(source.metadata["timestamp"]) for source in sources})

    selected = []
    for candidate in candidates:
        # The first timestamp is always kept; later ones must be at least
        # `threshold` away from the most recently kept timestamp.
        if not selected or candidate - selected[-1] >= threshold:
            selected.append(candidate)

    return selected

In [50]:
def prompt(title, question_text):
    """
    Builds the prompt text sent to the language model for a question about a YouTube video.

    Args:
        title (str): The title of the YouTube video.
        question_text (str): The question to be answered.

    Returns:
        str: The instruction block followed by the question, with the video title
        and the question interpolated.
    """

    # Every text line of the prompt carries the same fixed left padding.
    pad = " " * 14
    pieces = [
        "",
        f"{pad}You are a helpful and informative AI assistant. You are given a transcript of a video '{title}' and asked to answer questions about it.",
        f"{pad}If you don't know the answer, just say 'I don't know'.",
        "",
        f"{pad}Question: {question_text}",
        pad,
    ]
    return "\n".join(pieces)

In [51]:
def ask_question_with_timestamp(title, question_text):
    """
    Asks the retrieval QA chain a question about the video and attaches source timestamps.

    Args:
        title (str): The title of the YouTube video, used to build the prompt.
        question_text (str): The question to be answered.

    Returns:
        dict: ``{"answer": <cleaned answer text>, "timestamps": <list of ints or None>}``.
        ``timestamps`` is None when the answer says the model does not know.
    """

    # Run the query to get the response and source documents
    result = qa_chain.invoke(input=prompt(title, question_text), output_key="result")
    answer_text = clear_text(result["result"])
    sources = result["source_documents"]

    # The prompt asks the model to reply 'I don't know' without a trailing period,
    # so match the phrase without the period, ignoring case, and tolerate a
    # typographic apostrophe in the reply.
    normalized_answer = answer_text.lower().replace("\u2019", "'")

    timestamps = None
    if "i don't know" not in normalized_answer:
        timestamps = select_timestamps(sources)

    # Return the answer together with its timestamp information
    return {"answer": answer_text, "timestamps": timestamps}

### LangSmith

In [52]:
# Initialize the LangSmith client and a LangChain tracer bound to it.
# Client() is created without explicit arguments here.
# NOTE(review): presumably it picks up the LANGCHAIN_* environment variables — confirm.
client = Client()
tracer = LangChainTracer(client=client)

### FAISS Embedding

In [53]:
# Pair every cleaned transcript snippet with its start time.
# Snippets with no letters or digits left after cleaning (e.g. a lone
# "[Applause]") are skipped so blank documents are not embedded and retrieved.
chunks_with_metadata = []
for chunk in raw_transcript:
    text = clear_text(chunk['text'])
    if not any(character.isalnum() for character in text):
        continue
    chunks_with_metadata.append({
        "content": text,
        "timestamp": chunk["start"]
    })

In [54]:
# Embed every chunk and index it in FAISS, keeping the chunk's timestamp as metadata.
embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

texts = []
metadata = []
for entry in chunks_with_metadata:
    texts.append(entry["content"])
    metadata.append({"timestamp": entry["timestamp"]})

vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadata)

### Model with Memory

In [55]:
# Language model used to answer the questions (temperature 0)
llm = OpenAI(
    model_name="gpt-3.5-turbo-instruct",
    temperature=0,
    api_key=OPENAI_API_KEY,
)

# Windowed chat memory: keeps the last k=3 exchanges and stores the chain's 'result' output
conversational_memory = ConversationBufferWindowMemory(
    k=3,
    memory_key='youtube_project_history',
    output_key='result',
    return_messages=True,
)

# RetrievalQA chain over the FAISS retriever; the tracer callback logs runs to LangSmith
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    memory=conversational_memory,
    return_source_documents=True,
    callbacks=[tracer],
)

  conversational_memory = ConversationBufferWindowMemory(


In [63]:
# Example: ask one question about the video and print the answer with its timestamps
question = "What are the key points of the video"
answer = ask_question_with_timestamp(title=video_title, question_text=question)
print(answer)

{'answer': 'The video is about Google I/O, a conference where Google presents new technologies and updates. The speaker mentions AI Overviews, a feature that allows users to get an overview of AI in just seconds. The speaker also mentions that with AI Overviews, Google does the work for you.', 'timestamps': [116, 2663, 3086]}
