# Project

In [48]:
!pip install langchain langchain-community langchain_openai youtube-transcript-api langsmith faiss-cpu



You should consider upgrading via the 'C:\CODE2ORDER\Python\_week8\project\venv\Scripts\python.exe -m pip install --upgrade pip' command.


## Imports

In [112]:
import ast
import os
import re
import requests
from bs4 import BeautifulSoup
from langchain.callbacks import LangChainTracer, StdOutCallbackHandler
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain_core.prompts import PromptTemplate
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langsmith import Client
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled

## Prework

In [50]:
# Is this workbook running on Google colab?
COLAB = 'google.colab' in str(get_ipython())

### Helper Methods

In [51]:
def get_video_title(video_id):
    """
    Extracts the title of a YouTube video.

    Args:
        video_url (str): The URL of the YouTube video.

    Returns:
        str: The title of the YouTube video.
    """
    soup = BeautifulSoup(requests.get(f"https://www.youtube.com/watch?v={video_id}").text, 'html.parser')
    return soup.title.string.replace(' - YouTube', '').strip()

In [52]:
def clear_text(raw_text):
    """
    Cleans the input text by removing unwanted characters and formatting.

    Args:
        raw_text (str): The input text to be cleaned.

    Returns:
        str: The cleaned text.
    """

    # Remove bracketed content using regex
    raw_text = re.sub(r"\[.*?\]", "", raw_text)

    # Replace newline and non-breaking space characters with spaces
    raw_text = raw_text.replace("\n", " ").replace("\xa0", " ")

    # Remove speaker indicators using regex
    raw_text = re.sub(r">>.+?:", "", raw_text)

    # Remove all double Spaces
    raw_text = raw_text.replace("  ", " ")

    # Remove doubled stops
    raw_text = raw_text.replace(". . ", ". ")

    # Remove leading and trailing spaces
    return raw_text.strip()

### Get Transcript and Title

In [53]:
id = 'XEzRZ35urlk'
chatter = 'Fabian'
memory_key = f"youtube:{id}#{chatter}:memory"

In [54]:
video_title = get_video_title(id)
languages = ['en', 'de', 'es', 'pt']
try:
    raw_transcript = YouTubeTranscriptApi.get_transcript(id, languages=languages)
except TranscriptsDisabled:
    proxies = {'http': 'http://94.186.213.73:7212',
               'https': 'http://94.186.213.73:7212'}
    raw_transcript = YouTubeTranscriptApi.get_transcript(id, languages=languages, proxies=proxies)

In [55]:
raw_transcript[0:5]

[{'text': '[Cheers and Applause].\n>>WOMAN: Google’s ambitions in\xa0',
  'start': 0.0,
  'duration': 1.52},
 {'text': 'artificial intelligence.\n>>MAN: Google launches Gemini,\xa0',
  'start': 1.52,
  'duration': 2.36},
 {'text': "the generative AI.\n>> And it's completely changing\xa0",
  'start': 3.88,
  'duration': 2.6},
 {'text': 'the way we work.\n>> You know, a lot has happened\xa0',
  'start': 6.48,
  'duration': 3.28},
 {'text': 'in a year.\nThere have been new beginnings.\xa0',
  'start': 9.76,
  'duration': 6.0}]

In [56]:
# Combine text parts and clean the text to get a brief overview what we have
transcript = ' '.join([clear_text(entry['text']) for entry in raw_transcript])
transcript



## LLM and Embeddings

In [57]:
if COLAB:
    from google.colab import userdata

    OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
    LANGCHAIN_API_KEY = userdata.get('LANGCHAIN_API_KEY')
else:
    from dotenv import load_dotenv, find_dotenv

    _ = load_dotenv(find_dotenv())
    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
    LANGCHAIN_API_KEY = os.getenv('LANGCHAIN_API_KEY')

In [58]:
os.environ['LANGCHAIN_API_KEY'] = LANGCHAIN_API_KEY
os.environ['LANGCHAIN_PROJECT'] = 'youtube-project-chat'
os.environ['LANGCHAIN_TRACING_V2'] = 'true'  # enables tracing
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

### Prompt definition

In [59]:
template = """
You are a helpful and informative AI assistant that is good at remembering previous turns in the conversation to give helpful and relevant answers.
You are given a transcript of the video called "{title}".
{chatter} is asking questions. Please answer the following question, which comes after 'Question:'.
If the question cannot be answered using the information provided, answer with "Sorry {chatter}, I don't know".

Question: {question_text}
"""

PROMPT = PromptTemplate(
    input_variables=['title', 'chatter', 'question_text'],
    template=template
)

In [60]:
template = """
You are a helpful and informative AI assistant. You are given a transcript of the video called "{title}".
Please create 3 short interesting questions
about the video a user might ask for. Return those 3 questions in a array like:
['What is the video about?', 'Can you give me more information about Veo?', 'Are there any news about Android?']
"""

EXAMPLE_PROMPT = PromptTemplate(
    input_variables=['title'],
    template=template
)

### Helper Methods

In [61]:
def select_timestamps(sources):
    """
    Selects relevant timestamps from a list of source documents.

    Args:
        sources (list): A list of source documents, each containing a metadata dictionary with a "timestamp" key.

    Returns:
        list: A list of selected timestamps, sorted in ascending order and deduplicated.
        Timestamps that are too close to the previous timestamp are removed to avoid redundancy.
    """

    timestamps = [int(source.metadata['timestamp']) for source in sources]
    timestamps = sorted(list(set(timestamps)))  # Deduplicate timestamps and sort it

    # now remove timestamps which are too close to the timestamp before
    result = []
    threshold = 200
    last_number = None  # Initialize to None to avoid skipping the first element

    for number in timestamps:
        if last_number is None or number - last_number >= threshold:
            result.append(number)
            last_number = number  # Update last_number for the next iteration

    return result

In [117]:
def ask_question_with_timestamp(prompt_text):
    """Asks a question to the QA chain, incorporating relevant context and metadata.
    Retrieves relevant context from the vectorstore based on the question,
    formats it with metadata for the prompt, and invokes the QA chain to get the answer.

    Args:
        prompt_text (str): The question to ask.

    Returns:
        dict: A dictionary containing the answer text and a list of timestamps.
    """

    # Run the query to get the response and source documents
    chat_history = qa_chain.memory.chat_memory.messages
    result = qa_chain({'question': prompt_text, 'chat_history': chat_history})
    answer_text = clear_text(result['answer'])
    sources = result['source_documents']

    # define timestamps
    timestamps = None
    if "I don't know." not in answer_text:
        timestamps = select_timestamps(sources)

    # Append timestamp information to the answer
    return {'answer': answer_text, 'timestamps': timestamps}

### Langsmith

In [63]:
# Initialize LangSmith client and tracer
client = Client()
tracer = LangChainTracer(client=client)

### FAISS Embedding

Create Embeddings manually with a maximum chunk size to maintain the metadata of the starting time of the context

In [64]:
# Initialize the list to hold the chunks with metadata and the variables for current chunk
chunks_with_metadata = []
current_text = ''
current_start = None

# Maximum length for each chunk
max_chunk_length = 1000

# Iterate over each entry in raw_transcript
for entry in raw_transcript:
    # Set the start time for the first entry in the current chunk
    if current_start is None:
        current_start = entry['start']

    # Check if adding the current text would exceed the max_chunk_length
    if len(current_text) + len(entry['text']) + 1 > max_chunk_length:
        # If it does, save the current chunk and reset the variables
        chunks_with_metadata.append({'content': clear_text(current_text), 'timestamp': current_start})
        current_text = ''
        current_start = entry['start']

    # Add the current text to the chunk with a space
    current_text += entry['text'] + ' '

# After the loop, ensure any remaining text is added as a final chunk
if current_text:
    chunks_with_metadata.append({'content': clear_text(current_text), 'timestamp': current_start})

# Print the average length of the generated chunks
average_length = sum(len(entry['content']) for entry in chunks_with_metadata) / len(chunks_with_metadata)
print('Average length:', int(average_length))
print('Chunks:', len(chunks_with_metadata))

Average length: 911
Chunks: 93


In [101]:
# Generate embeddings for each chunk
embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
texts = [chunk['content'] for chunk in chunks_with_metadata]
metadata = [{'timestamp': chunk['timestamp']} for chunk in chunks_with_metadata]

vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadata)

### Chains

**qa_chain**: Used for the user chat experience. Initialized with a ConversationBuffer to preserver chat history

**example_chain**: User to create example questions based on the transcript, where no memory is needed.  

In [118]:
# Initialize the language model
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0.2, n=3)

# Create a handler instance
handler = StdOutCallbackHandler()

# Set up chat memory and save 3 messages
conversational_memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=3,
    return_messages=True,
    output_key='answer'
)

retriever = vectorstore.as_retriever(search_type='similarity', search_kwargs={'k': 4})

# ConversationalRetrievalChain chain with vectorstore, memory and tracer for LangSmith logging
# Used for chat
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    memory=conversational_memory,
    callbacks=[tracer]
)

# RetrievalQA chain with vectorstore and tracer for LangSmith logging
# Used for initial example creation
example_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
    callbacks=[tracer, handler]
)

In [119]:
# Generate Example questions
prompt_text = EXAMPLE_PROMPT.format(title=video_title)
result = example_chain.invoke(input=prompt_text, output_key='result')
example_questions = result['result']
ast.literal_eval(example_questions)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


['How is AI transforming Google products across Gemini, Search, and Workspace?',
 'What advancements are being made to make Android the best place to experience Google AI?',
 'How is multi-step reasoning in Google Search making complex questions easier to answer?']

In [120]:
# Example question
examples = [
    'What is the video about?',
    'Anything new in android?',
    'What is the name of the image generation model?',
    'Can you tell me more about Imagen 3?',
    'What is Veo?',
    'Can you tell me more about it?'
]

for example in examples:
    prompt = PROMPT.format(title=video_title, chatter=chatter, question_text=example)
    answer = ask_question_with_timestamp(prompt)
    print(example, answer)

What is the video about? {'answer': 'The video is about the advancements in Google Search, specifically introducing the ability to ask questions with video in Google Search.', 'timestamps': [2231, 2985, 5041]}
Anything new in android? {'answer': 'Some of the new features introduced in Android include AI-powered search, Gemini becoming a new AI assistant on Android, harnessing on-device AI for faster experiences while keeping data private, and Gemini Nano with Multimodality for understanding the world through sights, sounds, and spoken language. These features aim to enhance the overall smartphone experience by integrating Google AI directly into the operating system.', 'timestamps': [4752, 5174, 5421]}
What is the name of the image generation model? {'answer': 'The name of the image generation model mentioned in the video "Google Keynote (Google I/O ‘24)" is Imagen 3.', 'timestamps': [1763, 2110]}
Can you tell me more about Imagen 3? {'answer': 'Imagen 3 is a high-quality image generat

In [121]:
# Check if the history is created
print(len(conversational_memory.chat_memory.messages))
print(conversational_memory.chat_memory.messages)

12
[HumanMessage(content='\nYou are a helpful and informative AI assistant that is good at remembering previous turns in the conversation to give helpful and relevant answers.\nYou are given a transcript of the video called "Google Keynote (Google I/O ‘24)".\nFabian is asking questions. Please answer the following question, which comes after \'Question:\'.\nIf the question cannot be answered using the information provided, answer with "Sorry Fabian, I don\'t know".\n\nQuestion: What is the video about?\n', additional_kwargs={}, response_metadata={}), AIMessage(content='The video is about the advancements in Google Search, specifically introducing the ability to ask questions with video in Google Search.', additional_kwargs={}, response_metadata={}), HumanMessage(content='\nYou are a helpful and informative AI assistant that is good at remembering previous turns in the conversation to give helpful and relevant answers.\nYou are given a transcript of the video called "Google Keynote (Goo