In [25]:
import os
import textwrap
from dotenv import load_dotenv

# Load settings from a .env file (if one is present) before reading them below.
load_dotenv()

# OpenAI credential taken from the environment; None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Source video whose audio is transcribed and queried later in the notebook.
YOUTUBE_VIDEO = "https://youtu.be/sVC4_yK0hr8?feature=shared"

Setting up the model

In [2]:
from langchain_openai.chat_models import ChatOpenAI

# Chat model used by the question-answering chain further down.
model = ChatOpenAI(
    model="gpt-3.5-turbo",
    openai_api_key=OPENAI_API_KEY,
)


Add a LangChain output parser so that the chain returns a plain string instead of the `AIMessage` object that the chat model returns.

In [None]:
from langchain_core.output_parsers import StrOutputParser

# Output parser used as the last step of the chain, after the model.
parser = StrOutputParser()

In [None]:
from langchain.prompts import ChatPromptTemplate

# Prompt text with two placeholders, {context} and {question}. The model is
# instructed to reply "I don't know" when it cannot answer.
template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""
# Build the chat prompt object from the template string above.
prompt = ChatPromptTemplate.from_template(template)

Transcribing the YouTube video

In [13]:
import tempfile
import whisper
from pytube import YouTube


# Transcribe the video's audio once and cache the text on disk. Later runs find
# the cached file and skip the download and transcription entirely.
TRANSCRIPTION_FILE = "short_transcription.txt"

if not os.path.exists(TRANSCRIPTION_FILE):
    youtube = YouTube(YOUTUBE_VIDEO)
    audio = youtube.streams.filter(only_audio=True).first()
    if audio is None:
        # No stream matched the audio-only filter. Fail here with a clear
        # message instead of an AttributeError on `.download()` below.
        raise RuntimeError(f"No audio-only stream found for {YOUTUBE_VIDEO}")

    # Let's load the base model. This is not the most accurate
    # model but it's fast.
    whisper_model = whisper.load_model("base")

    with tempfile.TemporaryDirectory() as tmpdir:
        # The downloaded audio only exists inside the temporary directory, so
        # the transcription must finish before the directory is removed.
        audio_path = audio.download(output_path=tmpdir)
        transcription = whisper_model.transcribe(audio_path, fp16=False)["text"].strip()

    # The text is already in memory, so it is written after the temporary
    # directory has been cleaned up. A separate name is used for the handle so
    # the audio path is not shadowed.
    with open(TRANSCRIPTION_FILE, "w") as transcript_file:
        transcript_file.write(transcription)

##### Splitting the transcription file into smaller chunks

In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader

# Load the cached transcription from disk.
loader = TextLoader("short_transcription.txt")
text_documents = loader.load()

# Break the transcription into smaller chunks (size 1000, overlap 20).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=1000,
)
documents = text_splitter.split_documents(text_documents)

In [15]:
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_pinecone import PineconeVectorStore

# Embedding model for the transcript chunks. The API key is passed explicitly,
# the same way the chat model is configured, rather than relying on the
# environment variable being picked up implicitly.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Name of the Pinecone index that stores the chunk vectors.
index_name = "rag-application"

# NOTE(review): presumably this uploads `documents` on every execution and
# reads the Pinecone credentials from the environment — confirm whether
# re-running the cell duplicates vectors in the index.
pinecone = PineconeVectorStore.from_documents(
    documents,
    embeddings,
    index_name=index_name,
)

In [26]:
# First step of the chain: the incoming question goes to the retriever to
# produce {context}, and is also passed through unchanged as {question}.
retrieval_inputs = {
    "context": pinecone.as_retriever(),
    "question": RunnablePassthrough(),
}
chain = retrieval_inputs | prompt | model | parser

response = chain.invoke("What's going on with shrimp industry in India?")

# Re-flow the answer to 80 columns for display.
formatted_response = textwrap.fill(response, width=80)
print(formatted_response)


The shrimp industry in India is booming and has become one of the leading
exporters of shrimp. However, it has raised concerns about labor abuse,
environmental damage, and the use of antibiotics in shrimp production.


Translate the response to Marathi

In [None]:
# NOTE(review): this cell is disabled; everything below is commented out.
# If re-enabled as written, the whole {"question", "language"} dict would be
# handed to `chain`, whereas `chain` is invoked with a plain question string
# earlier in the notebook. Presumably "answer" should be
# `itemgetter("question") | chain` — confirm before uncommenting.

# from operator import itemgetter

# translation_prompt = ChatPromptTemplate.from_template(
#     "Translate {answer} to {language}"
# )

# translation_chain = (
#     {"answer": chain, "language": itemgetter("language")} 
#     | translation_prompt 
#     | model 
#     | parser
# )

# #Answering in Marathi
# translation_chain.invoke(
#     {   "question":"Summarize",
#         "language": "Marathi"
#     }
# )