In [None]:
import getpass
import os
import pickle

import langchain
from langchain import HuggingFaceHub
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

In [None]:
# Load the Hugging Face API token without hardcoding it in the notebook.
# A HUGGINGFACEHUB_API_TOKEN that is already set in the environment is left
# untouched; otherwise the token is requested via getpass, which does not echo
# the typed value.
if not os.environ.get('HUGGINGFACEHUB_API_TOKEN'):
    os.environ['HUGGINGFACEHUB_API_TOKEN'] = getpass.getpass("Hugging Face API token: ")

In [None]:
# Keyword arguments handed to HuggingFaceHub as `model_kwargs`.
generation_kwargs = {"temperature": 0.8, "max_length": 1000}

# LLM wrapper for the Mixtral instruct repository on the Hugging Face Hub.
llm = HuggingFaceHub(
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    model_kwargs=generation_kwargs,
)

In [None]:
# Article(s) to index; add more URLs to this list to widen the corpus.
urls = [
    "https://www.aljazeera.com/news/2024/3/29/what-is-openais-sora-text-to-video-generator"
]
loaders = UnstructuredURLLoader(urls=urls)
data = loaders.load()

# UnstructuredURLLoader skips URLs it fails to fetch by default
# (continue_on_failure=True), so a network or parsing error produces an empty
# list instead of an exception. Fail here with a clear message rather than
# later when the vector index is built from zero documents.
if not data:
    raise RuntimeError(
        "No documents were loaded from the configured URL(s); "
        "check the URL and network access."
    )

In [None]:
# Splitter settings: chunk size and the overlap between consecutive chunks.
splitter_config = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_config)

# `data` is a list of documents, so split_documents (rather than split_text)
# is used to produce the chunks directly.
docs = text_splitter.split_documents(data)

In [None]:
# Embedding model for the chunks: HuggingFaceEmbeddings with its default settings.
embeddings = HuggingFaceEmbeddings()

# Build the FAISS vector index from the chunked documents and the embedding model.
# NOTE: despite its name, `vectorindex_openai` is built with Hugging Face
# embeddings; the name is kept because later cells reference it.
vectorindex_openai = FAISS.from_documents(documents=docs, embedding=embeddings)

In [None]:
# Store the vector index on local disk, then read it back into `vectorIndex`.
# NOTE(security): pickle.load can execute arbitrary code contained in the file.
# Only load pickle files produced by this notebook, never ones from an
# untrusted source.
file_path = "vector_index.pkl"
with open(file_path, "wb") as f:
    pickle.dump(vectorindex_openai, f)

# Open unconditionally: if the file is missing, open() raises FileNotFoundError
# right here instead of leaving `vectorIndex` undefined and failing in a later
# cell with a NameError.
with open(file_path, "rb") as f:
    vectorIndex = pickle.load(f)

In [None]:
# Retriever view over the reloaded vector index.
retriever = vectorIndex.as_retriever()

# Retrieval QA chain combining the LLM with the retriever; the source
# documents are requested alongside the answer.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

In [None]:
query = "Is that possible to create 30 minutes video using SORA?"

# Turn on LangChain's global debug flag before running the chain.
langchain.debug = True

# Calling the chain object directly (Chain.__call__) is deprecated in
# LangChain 0.1+; invoke() takes the same inputs and the same
# return_only_outputs flag.
result = chain.invoke({"query": query}, return_only_outputs=True)

In [None]:
text = result['result']

# Locate the "Answer:" marker. str.find returns -1 when the marker is absent;
# in that case fall back to the start of the text instead of slicing from the
# last character.
answer_start = text.find("Answer:")
if answer_start == -1:
    answer_start = 0

# The answer runs from the marker (inclusive) up to the first blank line.
# If there is no blank line, take everything to the end of the text; using the
# raw -1 as a slice end would silently drop the final character.
answer_end = text.find("\n\n", answer_start)
if answer_end == -1:
    answer_end = len(text)

answer_text = text[answer_start:answer_end]

# Print the extracted text
print(answer_text)