In [None]:
# Create a virtual environment in ./env using the stdlib venv module.
!python3 -m venv env
# NOTE(review): each `!` command runs in its own child shell, and this executes
# the activate script with bash rather than sourcing it, so the activation is
# not expected to carry over to the notebook kernel or to later cells — confirm
# whether this line is still needed.
!bash env/bin/activate

In [None]:
# Install the pinned third-party dependencies used by this notebook.
# --user installs into the per-user site-packages directory and
# --break-system-packages lets pip proceed on an externally managed Python.
# NOTE(review): %pip targets the kernel's interpreter, which is presumably not
# the ./env virtual environment created earlier — confirm the intended target.
%pip install --user --break-system-packages langchain==0.0.292
%pip install --user --break-system-packages yt_dlp==2023.7.6
%pip install --user --break-system-packages tiktoken==0.5.1
%pip install --user --break-system-packages openai==0.28.0

In [None]:
#Import OS package
import os

#Import glob
import glob

#Import the openai package
import openai

#Import the yt_dlp as youtube_dl
import yt_dlp as youtube_dl

#Import DownloadError from yt_dlp
from yt_dlp import DownloadError

In [None]:
# Read the OpenAI API key from the environment (None when the variable is unset)
openai_api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
import shutil

# YouTube video whose audio track will be downloaded
youtube_url = "https://www.youtube.com/watch?v=tLaLfaIJf-Y"

# Directory that receives the extracted audio file
output_dir = "./audios"

# Resolve ffmpeg from PATH; shutil.which returns None when it is not found,
# in which case "ffmpeg_location" below is passed as None.
ffmpeg_path = shutil.which("ffmpeg")

# Options handed to yt_dlp.YoutubeDL:
#   - request the best available audio stream
#   - convert it to a 192 kbps mp3 via the FFmpegExtractAudio post-processor
#   - name the output after the video title inside output_dir
ydl_config = {
    "format": "bestaudio/best",
    "postprocessors": [
        {
            "key": "FFmpegExtractAudio",
            "preferredcodec": "mp3",
            "preferredquality": "192",
        }
    ],
    "outtmpl": os.path.join(output_dir, "%(title)s.%(ext)s"),
    "verbose": True,
    "ffmpeg_location": ffmpeg_path
}

# Create the output directory if needed. exist_ok=True avoids the
# check-then-create race and matches how the transcript directory is created.
os.makedirs(output_dir, exist_ok=True)

# Print a message indicating that the video is being downloaded
print(f"Downloading audio from {youtube_url}")

# Attempt the download a bounded number of times. A DownloadError on any
# attempt but the last triggers a retry; the last failure is re-raised so the
# cell still stops with DownloadError, as before.
max_attempts = 2
for attempt in range(1, max_attempts + 1):
    try:
        with youtube_dl.YoutubeDL(ydl_config) as ydl:
            ydl.download([youtube_url])
        break
    except DownloadError:
        if attempt == max_attempts:
            raise
        print(f"Download failed (attempt {attempt} of {max_attempts}); retrying...")

In [None]:
# Find the audio files in the output directory
audio_files = glob.glob(os.path.join(output_dir, "*.mp3"))

# Stop with an actionable message instead of a bare IndexError when the
# download/extraction step produced no mp3 file.
if not audio_files:
    raise FileNotFoundError(
        f"No .mp3 files found in {output_dir!r}; run the download cell first "
        "and check that ffmpeg is installed."
    )

# Select the first audio file (glob does not guarantee any particular order)
audio_filename = audio_files[0]

print(audio_filename)

In [None]:
# Inputs for the transcription request
audio_file = audio_filename
model = "whisper-1"

# Transcribe the audio file to text using the OpenAI API
# NOTE(review): the hosted transcription endpoint documents an upload size
# limit (25 MB at the time of writing) — confirm, since long videos encoded at
# 192 kbps may exceed it.
print("Converting audio to text...")

# Open in binary mode; the file handle is passed straight to the API client
with open(audio_file, "rb") as audio:
    response = openai.Audio.transcribe(model, audio)

# Extract the transcription text from the response
transcript = response["text"]

In [None]:
# Destination file for the saved transcript
output_file = "./transcripts/output.txt"

# Persist the transcript only when a destination has been configured
if output_file is not None:
    # Make sure the parent directory is present before opening the file
    transcript_dir = os.path.dirname(output_file)
    os.makedirs(transcript_dir, exist_ok=True)
    # Overwrite any previous transcript with the new text
    with open(output_file, "w") as file:
        file.write(transcript)

# Echo the transcription
print(transcript)

In [None]:
# TextLoader comes from langchain's document_loaders module
from langchain.document_loaders import TextLoader

# Point a loader at the saved transcript file
loader = TextLoader(file_path="./transcripts/output.txt")

# Read the file; the loaded documents are kept in `docs` for later cells
docs = loader.load()

In [None]:
# Display the first element returned by loader.load() as the cell output
docs[0]

In [None]:
# Import the tiktoken package
import tiktoken

In [None]:
# Import the RetrievalQA class from the langchain.chains module
from langchain.chains import RetrievalQA

# Import the ChatOpenAI class from the langchain.chat_models module
from langchain.chat_models import ChatOpenAI

# Import the DocArrayInMemorySearch class from the langchain.vectorstores module
from langchain.vectorstores import DocArrayInMemorySearch

# Import the OpenAIEmbeddings class from the langchain.embeddings module
from langchain.embeddings import OpenAIEmbeddings

In [None]:
# Build an in-memory DocArray vector store from the loaded documents,
# using OpenAIEmbeddings as the embedding backend
db = DocArrayInMemorySearch.from_documents(
    documents=docs,
    embedding=OpenAIEmbeddings(),
)

In [None]:
# Chat model configured with temperature 0.0
llm = ChatOpenAI(temperature=0.0)

# Expose the vector store through the retriever interface
retriever = db.as_retriever()

In [None]:
# Assemble a RetrievalQA chain of type "stuff" from the chat model and the
# retriever, with verbose output enabled
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=True,
)

In [None]:
# Question to put to the QA chain
query = "Who is best suited to take this course?"

# Run the question through the chain and show what it returns
response = qa_stuff.run(query)
print(response)

In [None]:
# Delete the downloaded audio file now that it has been transcribed.
# The existence check keeps this cell safe to re-run after the file is gone.
if os.path.exists(audio_file):
    os.remove(audio_file)