In [None]:
pip install openai langchain chromadb pypdf sentence-transformers yt_dlp pydub librosa

In [5]:
# Step 1 - load documents
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
import openai

urls = ["https://youtu.be/ZV-fcq-DIuE"]

# Directory to save audio files
save_dir = "yt-audios/"

# Setting the openai API key for the WhisperParser
api_key = "<YOUR API KEY HERE>"
openai.api_key = api_key

loader = GenericLoader(YoutubeAudioLoader(urls, save_dir), OpenAIWhisperParser())
documents = loader.load()

[youtube] Extracting URL: https://youtu.be/ZV-fcq-DIuE
[youtube] ZV-fcq-DIuE: Downloading webpage
[youtube] ZV-fcq-DIuE: Downloading ios player API JSON
[youtube] ZV-fcq-DIuE: Downloading android player API JSON
[youtube] ZV-fcq-DIuE: Downloading m3u8 information
[info] ZV-fcq-DIuE: Downloading 1 format(s): 140
[download] yt-audios//World Cup ｜ India's mid-tournament review ft. Harsha Bhogle.m4a has already been downloaded
[download] 100% of    4.09MiB
[ExtractAudio] Not converting audio yt-audios//World Cup ｜ India's mid-tournament review ft. Harsha Bhogle.m4a; file is already in target format m4a
Transcribing part 1!


In [6]:
# Step 2 - split documnets into chunks
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = text_splitter.split_documents(documents)

In [7]:
# Step 3 - load documents into Chroma using the open-source embedding function
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
db = Chroma.from_documents(docs, embedding_function)

In [8]:
# Step 4 - define the LLM model

import datetime
current_date = datetime.datetime.now().date()
if current_date < datetime.date(2023, 9, 2):
    llm_name = "gpt-3.5-turbo-0301"
else:
    llm_name = "gpt-3.5-turbo"

from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name=llm_name, 
                 temperature=0, 
                 openai_api_key=api_key)

In [9]:
# Step 5 - create the chain with the LLM model and the database
from langchain.chains import RetrievalQA

retrievalQA = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=db.as_retriever(), 
    verbose=True
)

In [14]:
# Step 6 - ask the question and display the response!
from IPython.display import display, Markdown

prompt = "Tell me how many matches have India won in the 2023 cricket world cup. Also, list the top 3 players based on given context"
response = retrievalQA.run(prompt)

display(Markdown(response))

Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


Based on the given context, it is mentioned that India has won five matches in the 2023 cricket world cup. 

The top three players based on the given context are:

1. Rohit Sharma: Rohit is mentioned as the captain who is setting the platform for the team's batting and keeping the atmosphere right.
2. Virat Kohli: Virat is praised for his contributions in the last two chases and his ability to carry the team through in successive games.
3. Jasprit Bumrah: Bumrah is highlighted as the catalyst for the bowling unit, delivering excellent first spells and maintaining low economy rates.