# Question and Answer over a YouTube PlayList!


## Introduction

### Collect Data
Download the audio of the YouTube playlist videos and store it under ```data/```.

In [None]:
!pip install pytube
!pip install sentencepiece
from pytube import YouTube, Playlist
playlist_url = "https://www.youtube.com/playlist?list=PLeBO0zY8b9XkbJgjfxGlCVCGc0ve8wQj2"
playlist = Playlist(playlist_url)
# The 40:41 slice selects a single playlist entry (index 40).
for url in playlist.video_urls[40:41]:
    # first() returns None when the query matches nothing, which avoids the
    # IndexError that indexing an empty result list would raise.
    audio_stream = YouTube(url).streams.filter(only_audio=True).first()
    if audio_stream is None:
        print(f"No audio-only stream found for {url}; skipping.")
        continue
    audio_stream.download("data/")
    

### Setup Model
We will be using ```whisper-large-v3``` to transcribe each episode. The model is downloaded from the Hugging Face Hub, and a pipeline is used to extract the transcription.

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


# Use the first CUDA device with half precision when a GPU is available;
# otherwise run on the CPU in full precision.
has_cuda = torch.cuda.is_available()
device = "cuda:0" if has_cuda else "cpu"
torch_dtype = torch.float16 if has_cuda else torch.float32

model_id = "openai/whisper-large-v3"

# Load the speech-to-text model weights and move them to the chosen device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor supplies both the tokenizer and the audio feature extractor.
processor = AutoProcessor.from_pretrained(model_id)

# Pipeline options: audio is processed in 30-second chunks, 16 per batch,
# and timestamps are returned with the text.
asr_options = dict(
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)
pipe = pipeline("automatic-speech-recognition", **asr_options)



#### Setup helper function to generate data

In [None]:
import os
def generate_transcripts(pipe, data_path):
    """Transcribe every non-transcript file in ``data_path`` and save the text next to it.

    Args:
        pipe: Callable invoked as ``pipe(path, generate_kwargs={"language": "english"})``
            that returns a mapping with a ``"text"`` entry.
        data_path: Directory to scan. Only its direct entries are visited.

    For each input file ``<name>`` the transcript is written to ``<name>.txt``
    in the same directory, overwriting any existing file of that name.
    """
    for filename in os.listdir(data_path):
        source_path = os.path.join(data_path, filename)
        # Skip sub-directories and transcripts written by an earlier run.
        # Only the file extension is inspected, so a directory or media file
        # whose name merely contains "txt" is still transcribed.
        if not os.path.isfile(source_path) or filename.endswith(".txt"):
            continue
        result = pipe(source_path, generate_kwargs={"language": "english"})
        with open(f"{source_path}.txt", "w", encoding="utf8", errors="ignore") as transcript_file:
            # NOTE(review): the first 20 characters of the transcript are dropped,
            # presumably to strip a fixed intro -- confirm this is still wanted.
            transcript_file.write(result["text"][20:])


In [None]:
# Transcribe every file in the "data" directory with the Whisper pipeline built above;
# each transcript is written next to its source file as "<name>.txt".
generate_transcripts(pipe, "data")

### Generate Langchain documents

In [None]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, TextLoader
# Options forwarded to TextLoader for every matched file. Per the LangChain docs,
# autodetect_encoding makes the loader try to detect the encoding when the
# default decoding fails.
text_loader_kwargs = {"autodetect_encoding": True}

# Load the transcripts as plain text. Without loader_cls/loader_kwargs the
# options above (and the imported TextLoader) would never be used.
loader = DirectoryLoader(
    "data/",
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs=text_loader_kwargs,
)
raw_docs = loader.load()

# Split each transcript into ~1000-character chunks that overlap by 150 characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
docs = text_splitter.split_documents(raw_docs)

### Setup vector store to store all transcribed documents

In [3]:

# Run the following if not run already
# !pip install langchain-chroma
# !pip install langchainhub

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
import  os
from dotenv import load_dotenv

# Load variables from a local .env file (presumably the OpenAI API key -- confirm).
load_dotenv()

persist_dir = "./chroma_db"
embedding_model = OpenAIEmbeddings()

# Build the index from `docs` only when nothing has been persisted yet;
# otherwise re-open the existing on-disk collection.
if not os.path.isdir(persist_dir):
    vectorstore = Chroma.from_documents(
        documents=docs,
        embedding=embedding_model,
        persist_directory=persist_dir,
    )
else:
    vectorstore = Chroma(
        persist_directory=persist_dir,
        embedding_function=embedding_model,
    )


3.45.3 2024-04-15 13:34:05 8653b758870e6ef0c98d46b3ace27849054af85da891eb121e9aaa537f1e8355 (64-bit)


  warn_deprecated(


### Setup RAG chain using OpenAI API

In [None]:
from operator import itemgetter


from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

# Chat model that generates the answers.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

# Retrieve relevant transcript chunks from the vector store and generate answers from them.
retriever = vectorstore.as_retriever()
# RAG prompt template pulled from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Concatenate the ``page_content`` of every document, separated by a blank line."""
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)

# Build the chain so that its output holds both the model response AND the
# retrieved context; the context is needed later for the evaluations.

# Stage 1: look up context for the incoming question and pass the question through.
retrieval_stage = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# Stage 2: re-assigns "context" to itself. Being a Runnable, it also lets the
# two dict literals be composed with `|` instead of being merged as plain dicts.
carry_context_stage = RunnablePassthrough().assign(context=itemgetter("context"))

# Stage 3: generate the answer and keep the context alongside it.
answer_stage = {"response": prompt | llm, "context": itemgetter("context")}

rag_chain = retrieval_stage | carry_context_stage | answer_stage

### Test Results

In [None]:
# Ask a sample question. The chain returns a dict with the chat message under
# "response" and the formatted retrieved text under "context".
result = rag_chain.invoke("Who is Pat Mcafee?")
# Print the message text (.content) rather than the message object's repr.
print(f"Response: {result['response'].content}")
print(f"Context: {result['context']}")

## Creating Evaluations

We will be using ```Ragas``` as our RAG evaluation framework. For this, we will create a new set of documents for evaluation.

In [None]:
# Creating Evaluations

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
# Imported here because this cell's own imports do not include TextLoader.
from langchain_community.document_loaders import TextLoader

# Options forwarded to TextLoader for every matched file. Per the LangChain docs,
# autodetect_encoding makes the loader try to detect the encoding when the
# default decoding fails.
text_loader_kwargs = {"autodetect_encoding": True}

# Load the transcripts as plain text. Without loader_cls/loader_kwargs the
# options above would never be used.
loader = DirectoryLoader(
    "data/",
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs=text_loader_kwargs,
)
raw_docs = loader.load()

# The evaluation chunks use a larger overlap (200) than the chunks that were indexed.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents(raw_docs)

In [7]:
# Get random samples for evals
import numpy as np

# Draw up to 10 distinct chunk indices from the whole range [0, len(docs)).
# randint's upper bound is exclusive, so `len(docs)-1` could never select the
# final chunks, and it sampled with replacement (duplicates possible).
# A seeded generator makes the sample reproducible across runs.
SAMPLE_SEED = 42
rng = np.random.default_rng(SAMPLE_SEED)
docs_idx = rng.choice(len(docs), size=min(10, len(docs)), replace=False)

filtered_docs = [docs[i] for i in docs_idx]

### Setup the Test set generator

In [16]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
# Models handed to the Ragas test set generator: one as the generator LLM,
# one as the critic LLM, plus an embedding model.
generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
critic_llm = ChatOpenAI(model="gpt-4")
testset_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, testset_embeddings)

# Share of each question type in the generated test set.
question_mix = {simple: 0.5, reasoning: 0.25, multi_context: 0.25}

# Generate half as many test questions as there are sampled chunks.
testset = generator.generate_with_langchain_docs(
    filtered_docs,
    test_size=len(docs_idx) // 2,
    distributions=question_mix,
)
test_set_df = testset.to_pandas()

Filename and doc_id are the same for all nodes.                 
Generating: 100%|██████████| 5/5 [00:55<00:00, 11.15s/it]


### Generate dataset for evaluation

In [40]:
from datasets import Dataset
# Run the RAG chain once per generated question, in order.
chain_outputs = [rag_chain.invoke(question) for question in test_set_df.question]

# Answer text of each response message.
answers = [output['response'].content for output in chain_outputs]
# Each entry is a one-element list holding the chain's formatted context string.
# NOTE(review): all retrieved chunks are joined into that single string -- confirm
# this is the granularity the context metrics should see.
contexts = [[output['context']] for output in chain_outputs]

# Assemble the columns used for evaluation.
dataset_columns = {
    "question": test_set_df.question,
    "answer": answers,
    "contexts": contexts,
    "ground_truth": test_set_df.ground_truth,
}
response_dataset = Dataset.from_dict(dataset_columns)

### Time to run evaluations!

In [42]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_correctness
)
from ragas import evaluate

# Metrics to compute for every row of the response dataset.
metrics = [answer_relevancy, faithfulness, context_recall, context_precision, answer_correctness]

# Score the RAG answers against the generated test set.
results = evaluate(response_dataset, metrics=metrics)

### Evaluate results

In [None]:
# Print the evaluation result object returned by evaluate().
print(f"Metrics: {results}")

In [45]:
# Convert the evaluation results to a pandas DataFrame for inspection.
results_df = results.to_pandas()
# Bare last expression: the notebook renders the DataFrame as the cell output.
results_df