# Project

In [80]:
!pip install bert_score evaluate faiss-cpu giskard langchain langchain-community langchain_openai langsmith  youtube-transcript-api

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


## Imports

In [29]:
import ast
import giskard
import json
import os
import re
import requests

from bs4 import BeautifulSoup
from langchain.callbacks import LangChainTracer, StdOutCallbackHandler
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.evaluation.qa import QAEvalChain
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langsmith import Client
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled

## Prework

In [5]:
# Is this workbook running on Google colab?
COLAB = 'google.colab' in str(get_ipython())

### Helper Methods

In [6]:
def get_video_title(video_id):
    """
    Extracts the title of a YouTube video.

    Args:
        video_url (str): The URL of the YouTube video.

    Returns:
        str: The title of the YouTube video.
    """
    soup = BeautifulSoup(requests.get(f"https://www.youtube.com/watch?v={video_id}").text, 'html.parser')
    return soup.title.string.replace(' - YouTube', '').strip()

In [82]:
def clear_text(raw_text):
    """
    Cleans the input text by removing unwanted characters and formatting.

    Args:
        raw_text (str): The input text to be cleaned.

    Returns:
        str: The cleaned text.
    """

    # Remove bracketed content and replace unwanted characters with spaces
    raw_text = re.sub(r"\[.*?\]|\n|\r|\xa0|\x08", " ", raw_text)

    # Remove speaker indicators using regex
    raw_text = re.sub(r">>.+?:", "", raw_text)

    # Remove all double Spaces
    raw_text = raw_text.replace("  ", " ")

    # Remove doubled stops
    raw_text = raw_text.replace(". . ", ". ")

    # Remove leading and trailing spaces
    return raw_text.strip()

### Get Transcript and Title

In [8]:
id = 'XEzRZ35urlk'
chatter = 'Fabian'
memory_key = f"youtube:{id}#{chatter}:memory"

In [9]:
video_title = get_video_title(id)
languages = ['en', 'de', 'es', 'pt']
try:
    raw_transcript = YouTubeTranscriptApi.get_transcript(id, languages=languages)
except TranscriptsDisabled:
    proxies = {'http': 'http://94.186.213.73:7212',
               'https': 'http://94.186.213.73:7212'}
    raw_transcript = YouTubeTranscriptApi.get_transcript(id, languages=languages, proxies=proxies)

In [10]:
raw_transcript[0:5]

[{'text': '[Cheers and Applause].\n>>WOMAN: Google’s ambitions in\xa0',
  'start': 0.0,
  'duration': 1.52},
 {'text': 'artificial intelligence.\n>>MAN: Google launches Gemini,\xa0',
  'start': 1.52,
  'duration': 2.36},
 {'text': "the generative AI.\n>> And it's completely changing\xa0",
  'start': 3.88,
  'duration': 2.6},
 {'text': 'the way we work.\n>> You know, a lot has happened\xa0',
  'start': 6.48,
  'duration': 3.28},
 {'text': 'in a year.\nThere have been new beginnings.\xa0',
  'start': 9.76,
  'duration': 6.0}]

In [83]:
# Combine text parts and clean the text to get a brief overview what we have
transcript = ' '.join([clear_text(entry['text']) for entry in raw_transcript])
transcript



## LLM and Embeddings

In [12]:
if COLAB:
    from google.colab import userdata

    OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
    LANGCHAIN_API_KEY = userdata.get('LANGCHAIN_API_KEY')
else:
    from dotenv import load_dotenv, find_dotenv

    _ = load_dotenv(find_dotenv())
    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
    LANGCHAIN_API_KEY = os.getenv('LANGCHAIN_API_KEY')

In [13]:
os.environ['LANGCHAIN_API_KEY'] = LANGCHAIN_API_KEY
os.environ['LANGCHAIN_PROJECT'] = 'youtube-project-chat'
os.environ['LANGCHAIN_TRACING_V2'] = 'true'  # enables tracing
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

### Prompt definition

In [14]:
template = """
You are a helpful and informative AI assistant that is good at remembering previous turns in the conversation to give helpful and relevant answers.
You are given a transcript of the video called "{title}".
{chatter} is asking questions. Please answer the following question, which comes after 'Question:'.
If the question cannot be answered using the information provided, answer with "Sorry {chatter}, I don't know".

Question: {question_text}
"""

PROMPT = PromptTemplate(
    input_variables=['title', 'chatter', 'question_text'],
    template=template
)

In [15]:
template = """
You are a helpful and informative AI assistant. You are given a transcript of the video called "{title}".
Please create 3 short interesting questions
about the video a user might ask for. Return those 3 questions in a array like:
['What is the video about?', 'Can you give me more information about Veo?', 'Are there any news about Android?']
"""

EXAMPLE_PROMPT = PromptTemplate(
    input_variables=['title'],
    template=template
)

### Helper Methods

In [16]:
def select_timestamps(sources):
    """
    Selects relevant timestamps from a list of source documents.

    Args:
        sources (list): A list of source documents, each containing a metadata dictionary with a "timestamp" key.

    Returns:
        list: A list of selected timestamps, sorted in ascending order and deduplicated.
        Timestamps that are too close to the previous timestamp are removed to avoid redundancy.
    """

    timestamps = [int(source.metadata['timestamp']) for source in sources]
    timestamps = sorted(list(set(timestamps)))  # Deduplicate timestamps and sort it

    # now remove timestamps which are too close to the timestamp before
    result = []
    threshold = 200
    last_number = None  # Initialize to None to avoid skipping the first element

    for number in timestamps:
        if last_number is None or number - last_number >= threshold:
            result.append(number)
            last_number = number  # Update last_number for the next iteration

    return result

In [17]:
def ask_question_with_timestamp(prompt_text):
    """Asks a question to the QA chain, incorporating relevant context and metadata.
    Retrieves relevant context from the vectorstore based on the question,
    formats it with metadata for the prompt, and invokes the QA chain to get the answer.

    Args:
        prompt_text (str): The question to ask.

    Returns:
        dict: A dictionary containing the answer text and a list of timestamps.
    """

    # Run the query to get the response and source documents
    chat_history = qa_chain.memory.chat_memory.messages
    result = qa_chain({'question': prompt_text, 'chat_history': chat_history})
    answer_text = clear_text(result['answer'])
    sources = result['source_documents']

    # define timestamps
    timestamps = None
    if "I don't know." not in answer_text:
        timestamps = select_timestamps(sources)

    # Append timestamp information to the answer
    return {'answer': answer_text, 'timestamps': timestamps}

### Langsmith

In [18]:
# Initialize LangSmith client and tracer
client = Client()
tracer = LangChainTracer(client=client)

### FAISS Embedding

Create Embeddings manually with a maximum chunk size to maintain the metadata of the starting time of the context

In [19]:
# Initialize the list to hold the chunks with metadata and the variables for current chunk
chunks_with_metadata = []
current_text = ''
current_start = None

# Maximum length for each chunk
max_chunk_length = 1000

# Iterate over each entry in raw_transcript
for entry in raw_transcript:
    # Set the start time for the first entry in the current chunk
    if current_start is None:
        current_start = entry['start']

    # Check if adding the current text would exceed the max_chunk_length
    if len(current_text) + len(entry['text']) + 1 > max_chunk_length:
        # If it does, save the current chunk and reset the variables
        chunks_with_metadata.append({'content': clear_text(current_text), 'timestamp': current_start})
        current_text = ''
        current_start = entry['start']

    # Add the current text to the chunk with a space
    current_text += entry['text'] + ' '

# After the loop, ensure any remaining text is added as a final chunk
if current_text:
    chunks_with_metadata.append({'content': clear_text(current_text), 'timestamp': current_start})

# Print the average length of the generated chunks
average_length = sum(len(entry['content']) for entry in chunks_with_metadata) / len(chunks_with_metadata)
print('Average length:', int(average_length))
print('Chunks:', len(chunks_with_metadata))

Average length: 911
Chunks: 93


In [20]:
# Generate embeddings for each chunk
embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
texts = [chunk['content'] for chunk in chunks_with_metadata]
metadata = [{'timestamp': chunk['timestamp']} for chunk in chunks_with_metadata]

vectorstore = FAISS.from_texts(texts, embeddings, metadatas=metadata)

  embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)


### Chains

**qa_chain**: Used for the user chat experience. Initialized with a ConversationBuffer to preserver chat history

**example_chain**: User to create example questions based on the transcript, where no memory is needed.  

In [21]:
# Initialize the language model
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0.2, n=3)

# Create a handler instance
handler = StdOutCallbackHandler()

# Set up chat memory and save 3 messages
conversational_memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=3,
    return_messages=True,
    output_key='answer'
)

retriever = vectorstore.as_retriever(search_type='similarity', search_kwargs={'k': 4})

# ConversationalRetrievalChain chain with vectorstore, memory and tracer for LangSmith logging
# Used for chat
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    memory=conversational_memory,
    callbacks=[tracer]
)

# RetrievalQA chain with vectorstore and tracer for LangSmith logging
# Used for initial example creation
example_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
    callbacks=[tracer, handler]
)

  conversational_memory = ConversationBufferWindowMemory(


## Testing

Generate outputs for both models

In [22]:
# Generate Example questions
prompt_text = EXAMPLE_PROMPT.format(title=video_title)
result = example_chain.invoke(input=prompt_text, output_key='result')
example_questions = result['result']
ast.literal_eval(example_questions)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


['How is AI transforming Google products across Gemini, Search, and Workspace?',
 'What advancements are being made to make Android the best place to experience Google AI?',
 'How is Google reimagining Google Search in the Gemini era with agentive capabilities?']

In [23]:
# Example question
examples = [
    'What is the video about?',
    'Anything new in android?',
    'What is the name of the image generation model?',
    'Can you tell me more about Imagen 3?',
    'What is Veo?',
    'Can you tell me more about it?'
]

for example in examples:
    prompt = PROMPT.format(title=video_title, chatter=chatter, question_text=example)
    answer = ask_question_with_timestamp(prompt)
    print(example, answer)

  result = qa_chain({'question': prompt_text, 'chat_history': chat_history})


What is the video about? {'answer': 'The video is about the advancements in Google Search, specifically introducing new features like asking questions with video directly in Google Search, using AI to understand videos, and providing context-aware search results.', 'timestamps': [2231, 2985, 5041]}
Anything new in android? {'answer': "During the Google Keynote at Google I/O '24, several new features were introduced in Android. Some of the key features include: 1. AI-powered search: Google is putting AI-powered search right at your fingertips, creating new ways to get the answers you need directly on your Android device. 2. Gemini becoming your new AI assistant: Gemini is being integrated as your new AI assistant on Android, available to help you anytime you need assistance. 3. On-device AI experiences: Google is harnessing on-device AI to unlock new experiences that work quickly while keeping your sensitive data private. 4. Gemini Nano with Multimodality: Google is introducing Gemini N

## Evaluating

In [25]:
# Open and read the JSON file with examples
with open('test_files/examples.json', 'r', encoding='utf-8') as file:
    examples = json.load(file)

In [26]:
# Split examples into "predictions" and "questions" using list comprehensions
predictions = [{"result": example["result"]} for example in examples]
questions = [{"query": example["query"], "answer": example["answer"]} for example in examples]

In [27]:
qa_eval_chain = QAEvalChain.from_llm(llm)

# Run the evaluation with both examples and predictions
eval_results = qa_eval_chain.evaluate(questions, predictions)

# Output the evaluation results
for idx, result in enumerate(eval_results):
    color = '\033[92m' if result['results'] == 'CORRECT' else '\033[91m'
    print(f"Question {idx + 1}:")
    print(f"  Reference Answer: {examples[idx]['answer']}")
    print(f"  Generated Answer: {predictions[idx]['result']}")
    print(f"  Evaluation Result: {color + result['results']}\033[0m\n")

Question 1:
  Reference Answer: 
The video covers Google's progress in artificial intelligence, focusing on their work with the Gemini AI model. It showcases Gemini's ability to handle different types of data—like text, images, and code—and how it's used in Google products like Search, Photos, and Workspace.
  Generated Answer: The video is about Google's advancements in artificial intelligence, particularly their work on the Gemini AI model. It highlights Gemini's multimodal capabilities, its ability to reason across different types of data like text, images, and code, and its use in various Google products like Search, Photos, and Workspace.
  Evaluation Result: [92mCORRECT[0m

Question 2:
  Reference Answer: Gemini is the AI model from Google, which can handle texts, files, images and code.
  Generated Answer: Gemini is Google's most advanced AI model. It's designed to be multimodal, meaning it can understand and reason across different types of information like text, images, vide

In [71]:
def model_predict(df: pd.DataFrame):
    """Wraps the LLM call in a simple Python function.

    The function takes a pandas.DataFrame containing the input variables needed
    by your model, and must return a list of the outputs (one for each row).
    """
    return [qa_chain.invoke({"question": question}) for question in df["question"]]

In [None]:
def model_predict2(df: pd.DataFrame) -> list:
    """
    Wraps the LLM call in a simple Python function.

    Parameters:
        df (pd.DataFrame): DataFrame with a column 'question' containing queries.

    Returns:
        List of model responses, one for each question.
    """

    return df["question"].apply(lambda question: qa_chain.invoke({"question": question})).tolist()

In [72]:
giskard_model = giskard.Model(
    model=model_predict,
    model_type='text_generation',
    name="Fabi's Tube Bot",
    description="This model answers any question about the provided transcript of a youtube video",
    feature_names=["question"],
)

INFO:giskard.models.automodel:Your 'prediction_function' is successfully wrapped by Giskard's 'PredictionFunctionModel' wrapper class.


In [75]:
import pandas as pd

giskard_questions = pd.DataFrame(examples)[['query']].rename(columns={'query': 'question'})
giskard_dataset = giskard.Dataset(giskard_questions, target=None)

print(giskard_model.predict(giskard_dataset, verbose=False).prediction)

INFO:giskard.datasets.base:Your 'pandas.DataFrame' is successfully wrapped by Giskard's 'Dataset' wrapper class.
INFO:giskard.datasets.base:Casting dataframe columns from {'question': 'object'} to {'question': 'object'}
INFO:giskard.utils.logging_utils:Predicted dataset with shape (8, 1) executed in 0:00:19.931203


[{'question': 'What is the video about?', 'chat_history': [HumanMessage(content='What are some of the new features in Google Photos powered by Gemini?', additional_kwargs={}, response_metadata={}), AIMessage(content='Some of the new features in Google Photos powered by Gemini include making it easier to search for specific photos or videos, such as searching for a license plate number when needed. Gemini enhances the search capabilities within Google Photos, allowing users to find their important memories more efficiently.', additional_kwargs={}, response_metadata={}), HumanMessage(content='What is long context in Gemini?', additional_kwargs={}, response_metadata={}), AIMessage(content='Long context in Gemini refers to the ability of the Gemini model to process and understand a large amount of information, such as hundreds of pages of text, hours of audio, a full hour of video, or even entire code repositories. With long context capabilities, Gemini can analyze and make connections acr

In [77]:
report = giskard.scan(giskard_model, giskard_dataset, only="hallucination", verbose=False)
display(report)

In [81]:
full_report = giskard.scan(giskard_model, giskard_dataset, verbose=False)
display(full_report)

INFO:giskard.scanner.logger:Running detectors: ['LLMBasicSycophancyDetector', 'LLMCharsInjectionDetector', 'LLMHarmfulContentDetector', 'LLMImplausibleOutputDetector', 'LLMInformationDisclosureDetector', 'LLMOutputFormattingDetector', 'LLMPromptInjectionDetector', 'LLMStereotypesDetector', 'LLMFaithfulnessDetector']
INFO:giskard.datasets.base:Casting dataframe columns from {'question': 'object'} to {'question': 'object'}
INFO:giskard.utils.logging_utils:Predicted dataset with shape (9, 1) executed in 0:00:11.335453
INFO:giskard.datasets.base:Casting dataframe columns from {'question': 'object'} to {'question': 'object'}
INFO:giskard.utils.logging_utils:Predicted dataset with shape (9, 1) executed in 0:00:14.527318
INFO:giskard.datasets.base:Casting dataframe columns from {'question': 'object'} to {'question': 'object'}
INFO:giskard.utils.logging_utils:Predicted dataset with shape (8, 1) executed in 0:00:00.015037
INFO:giskard.datasets.base:Casting dataframe columns from {'question': 'o

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

INFO:giskard.scanner.logger:LLMCharsInjectionDetector: Tested `question` for special char injection `\r`	Fail rate = 1.000	Vulnerable = True
INFO:giskard.datasets.base:Casting dataframe columns from {'question': 'object'} to {'question': 'object'}
INFO:giskard.datasets.base:Casting dataframe columns from {'question': 'object'} to {'question': 'object'}
INFO:giskard.datasets.base:Casting dataframe columns from {'question': 'object'} to {'question': 'object'}
INFO:giskard.datasets.base:Casting dataframe columns from {'question': 'object'} to {'question': 'object'}
INFO:giskard.utils.logging_utils:Predicted dataset with shape (1, 1) executed in 0:00:01.833089
INFO:giskard.datasets.base:Casting dataframe columns from {'question': 'object'} to {'question': 'object'}
INFO:giskard.datasets.base:Casting dataframe columns from {'question': 'object'} to {'question': 'object'}
INFO:giskard.datasets.base:Casting dataframe columns from {'question': 'object'} to {'question': 'object'}
INFO:giskard.d