# Project

In [2]:
!pip install langchain langchain-community langchain_openai youtube-transcript-api langsmith faiss-cpu




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Imports

In [55]:
import os
import re

import requests
from bs4 import BeautifulSoup
from langchain.callbacks import LangChainTracer
from langchain.chains import RetrievalQA
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.embeddings import OpenAIEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langsmith import Client
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled

## Prework

In [12]:
# Is this workbook running on Google colab?
COLAB = 'google.colab' in str(get_ipython())

### Helper Methods

In [4]:
def get_video_title(video_id):
    """
    Extracts the title of a YouTube video.

    Args:
        video_url (str): The URL of the YouTube video.

    Returns:
        str: The title of the YouTube video.
    """
    soup = BeautifulSoup(requests.get(f"https://www.youtube.com/watch?v={video_id}").text, 'html.parser')
    return soup.title.string.replace(" - YouTube", "").strip()

### Get Transcript and Title

In [5]:
id = 'XEzRZ35urlk'
#id = '6nJX5tbZzDo'
chatter = 'Fabian'

In [6]:
video_title = get_video_title(id)
languages = ['en', 'de', 'es', 'pt']
try:
    raw_transcript = YouTubeTranscriptApi.get_transcript(id, languages=languages)
except TranscriptsDisabled:
    proxies = {'http': 'http://94.186.213.73:7212',
               'https': 'http://94.186.213.73:7212'}
    raw_transcript = YouTubeTranscriptApi.get_transcript(id, languages=languages, proxies=proxies)

In [7]:
raw_transcript[0:5]

[{'text': '[Cheers and Applause].\n>>WOMAN: Google’s ambitions in\xa0',
  'start': 0.0,
  'duration': 1.52},
 {'text': 'artificial intelligence.\n>>MAN: Google launches Gemini,\xa0',
  'start': 1.52,
  'duration': 2.36},
 {'text': "the generative AI.\n>> And it's completely changing\xa0",
  'start': 3.88,
  'duration': 2.6},
 {'text': 'the way we work.\n>> You know, a lot has happened\xa0',
  'start': 6.48,
  'duration': 3.28},
 {'text': 'in a year.\nThere have been new beginnings.\xa0',
  'start': 9.76,
  'duration': 6.0}]

In [8]:
transcript = " ".join([entry['text'] for entry in raw_transcript])

In [9]:
transcript



## LLM and Embeddings

In [13]:
if COLAB:
    from google.colab import userdata

    OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
    LANGCHAIN_API_KEY = userdata.get('LANGCHAIN_API_KEY')
else:
    from dotenv import load_dotenv, find_dotenv

    _ = load_dotenv(find_dotenv())
    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
    LANGCHAIN_API_KEY = os.getenv('LANGCHAIN_API_KEY')

In [14]:
os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY
os.environ["LANGCHAIN_PROJECT"] = 'youtube-project'
os.environ['LANGCHAIN_TRACING_V2'] = 'true'  # enables tracing

os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

### Helper Methods

In [15]:
def clear_text(raw_text):
    """
    Cleans the input text by removing unwanted characters and formatting.

    Args:
        raw_text (str): The input text to be cleaned.

    Returns:
        str: The cleaned text.
    """

    # Remove bracketed content using regex
    raw_text = re.sub(r"\[.*?\]", "", raw_text)

    # Replace newline and non-breaking space characters with spaces
    raw_text = raw_text.replace("\n", " ").replace("\xa0", " ")

    # Remove speaker indicators using regex
    raw_text = re.sub(r">>.+?:", "", raw_text)

    # Remove all double Spaces
    raw_text = raw_text.replace("  ", " ")

    # Remove doubled stops
    raw_text = raw_text.replace(". . ", ". ")

    # Remove leading and trailing spaces
    return raw_text.strip()

In [16]:
def select_timestamps(sources):
    """
    Selects relevant timestamps from a list of source documents.

    Args:
        sources (list): A list of source documents, each containing a metadata dictionary with a "timestamp" key.

    Returns:
        list: A list of selected timestamps, sorted in ascending order and deduplicated.
        Timestamps that are too close to the previous timestamp are removed to avoid redundancy.
    """

    timestamps = [int(source.metadata["timestamp"]) for source in sources]
    timestamps = sorted(list(set(timestamps)))  # Deduplicate timestamps and sort it

    # now remove timestamps which are too close to the timestamp before
    result = []
    threshold = 100
    last_number = None  # Initialize to None to avoid skipping the first element

    for number in timestamps:
        if last_number is None or number - last_number >= threshold:
            result.append(number)
            last_number = number  # Update last_number for the next iteration

    return result

In [47]:
def prompt(title, chatter, question_text):
    """
    Generates a prompt for a language model to answer questions about a YouTube video.

    Args:
        title (str): The title of the YouTube video.
        question_text (str): The question to be answered.

    Returns:
        str: A formatted prompt string containing the video title and question.
    """

    return f"""
              You are a helpful and informative AI assistant. You are given a transcript of the video with the name "{title}". 
              {chatter} is asking questions. Please answer the following question, which comes after 'Question:'.
              If the question cannot be answered using the information provided answer with "I'm sorry {chatter}, I don't know".
              
              Here are some examples, how you can answer the following question.
              Examples:
              What is the name of the Video?
              You: {title}
      
              What's my name?
              You: Your name is {chatter}.

              Question: {question_text}
              """

In [38]:
def ask_question_with_timestamp(title, chatter, question_text):
    # Run the query to get the response and source documents
    result = qa_chain.invoke(input=prompt(title, chatter, question_text), output_key="result")
    answer_text = clear_text(result["result"])
    sources = result["source_documents"]

    # define timestamps
    timestamps = None
    if "I don't know." not in answer_text:
        timestamps = select_timestamps(sources)

    # Append timestamp information to the answer
    return {"answer": answer_text, "timestamps": timestamps}

### Langsmith

In [19]:
# Initialize LangSmith client and tracer
client = Client()
tracer = LangChainTracer(client=client)

### FAISS Embedding

In [20]:
# Initialize the list to hold the chunks with metadata and the variables for current chunk
chunks_with_metadata = []
current_text = ""
current_start = None

# Maximum length for each chunk
max_chunk_length = 1000

# Iterate over each entry in raw_transcript
for entry in raw_transcript:
    # Set the start time for the first entry in the current chunk
    if current_start is None:
        current_start = entry['start']

    # Check if adding the current text would exceed the max_chunk_length
    if len(current_text) + len(entry['text']) + 1 > max_chunk_length:
        # If it does, save the current chunk and reset the variables
        chunks_with_metadata.append({'content': clear_text(current_text), 'timestamp': current_start})
        current_text = ""
        current_start = entry['start']

    # Add the current text to the chunk with a space
    current_text += entry['text'] + " "

# After the loop, ensure any remaining text is added as a final chunk
if current_text:
    chunks_with_metadata.append({'content': clear_text(current_text), 'timestamp': current_start})

# Print the average length of the generated chunks
average_length = sum(len(entry['content']) for entry in chunks_with_metadata) / len(chunks_with_metadata)
print("Average length:", int(average_length))
print("Chunks:", len(chunks_with_metadata))

Average length: 911
Chunks: 93


In [28]:
# Generate embeddings for each chunk
embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
texts = [chunk["content"] for chunk in chunks_with_metadata]
metadata = [{"timestamp": chunk["timestamp"]} for chunk in chunks_with_metadata]

vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadata)

### Model with Memory

In [39]:
# Initialize the language model
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Set up chat memory
conversational_memory = ConversationBufferWindowMemory(
    memory_key='youtube_project_history',
    k=3,
    return_messages=True,
    output_key='result'
)

# Set up the RetrievalQA chain with vectorstore and tracer for LangSmith logging
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    memory=conversational_memory,
    retriever=vectorstore.as_retriever(),
    return_source_documents=True,
    callbacks=[tracer]
)

In [53]:
# Example question
question = "Can you give me a summarize of the video"
question = "What is the name of the Video?"
answer = ask_question_with_timestamp(video_title, chatter, question)
print(answer)

{'answer': "I'm sorry Fabian, I don't know.", 'timestamps': None}
