In [1]:
import os
import time
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_community.document_loaders import UnstructuredURLLoader, SeleniumURLLoader, PlaywrightURLLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_community.embeddings import HuggingFaceInstructEmbeddings
# from InstructorEmbedding import INSTRUCTOR
from langchain_community.vectorstores import FAISS
from dotenv import load_dotenv


USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
# Configuration cell: reads the Azure OpenAI credentials from the environment
# (.env is loaded first) and builds the chat model and the embedding model
# used by the rest of the notebook.
load_dotenv()
azure_endpoint = os.getenv("ENDPOINT_URL")
azure_key = os.getenv("API_KEY")

# LLM deployment
deployment_name = "gpt-4o-mini"
api_version = "2024-05-01-preview"

llm = AzureChatOpenAI(
    azure_endpoint = azure_endpoint,
    api_key = azure_key,
    azure_deployment = deployment_name,
    model_name = deployment_name,
    api_version = api_version,
    max_tokens = 300,
    temperature = 0.3
)

# Embedding deployment
# Alternative (disabled): local HuggingFace Instructor embeddings.
# model_name = "hkunlp/instructor-large"
# model_kwargs = {'device': 'cpu'}
# encode_kwargs = {'normalize_embeddings': True}
# embeddings = HuggingFaceInstructEmbeddings(
#     model_name=model_name,
#     model_kwargs=model_kwargs,
#     encode_kwargs=encode_kwargs
# )

embedding_deployment_name = "text-embedding-3-small"
# Plain string: a trailing comma here would turn the value into a 1-tuple.
embedding_api_version = "2024-12-01-preview"

embeddings = AzureOpenAIEmbeddings(
    azure_endpoint = azure_endpoint,
    api_key = azure_key,
    azure_deployment = embedding_deployment_name,
    # Pass the version explicitly so the embedding client uses the pinned
    # API version instead of whatever the library/environment defaults to.
    api_version = embedding_api_version
)


In [3]:
# Ingestion cell: download each source page, split the text into chunks,
# embed the chunks and persist the resulting FAISS index to disk.

# url = "https://finance.yahoo.com/m/e98645b8-09d9-3220-b404-3e559b2659db/7-stocks-burn-a-2-5-trillion.html"
url = "https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_percentage_error.html"
urls = [url]

# Alternative loaders (disabled):
# loader = UnstructuredURLLoader(urls=urls)
# loader = SeleniumURLLoader(
#     urls=urls,
#     headless=True
# )
# loader = PlaywrightURLLoader(urls=urls, remove_selectors=["header", "footer"])
# data = await loader.load()

# Load one URL at a time so a single failing page does not discard the others.
# `data` always starts as a fresh list, so a failed load can never fall through
# to a stale value left in the kernel by an earlier run.
data = []
for position, page_url in enumerate(urls):
    if position > 0:
        # Pause between consecutive requests to be polite to the remote host.
        time.sleep(2)
    try:
        loader = WebBaseLoader(web_paths=[page_url])
        page_docs = loader.load()
    except Exception as e:
        print(f"Error loading data: {e}")
        continue
    data.extend(page_docs)
    # Print a short summary rather than the full page text.
    print(f"Loaded {len(page_docs)} document(s) from {page_url}")

if not data:
    raise RuntimeError("No documents could be loaded from the configured URLs.")

r_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", ","],
    chunk_size = 500,
    chunk_overlap = 0
)
docs = r_splitter.split_documents(data)

# Create the embeddings and save them to a FAISS index
vectorstore = FAISS.from_documents(docs, embeddings)

vectorstore.save_local("faiss_vectorstore")



In [4]:
# Reload the persisted FAISS index and build the question-answering chain.
#
# NOTE: allow_dangerous_deserialization=True lets load_local deserialize a
# pickle file stored next to the index. Only enable it for an index directory
# you created yourself; never for files from an untrusted source.
try:
    faiss_vectorstore = FAISS.load_local(
        "faiss_vectorstore",
        embeddings,
        allow_dangerous_deserialization=True
    )
except Exception as e:
    print(f"Error loading FAISS vectorstore: {e}")
    # Re-raise: without a vector store the chain below cannot be built, and
    # continuing would either hit a NameError or silently reuse a stale store
    # left in the kernel by an earlier run.
    raise

chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=faiss_vectorstore.as_retriever())

In [5]:
# Ask the retrieval chain a question and show the generated answer.
query = "What is MAPE? When should I use it? How should I tell if the model is good or bad?"
print("Query:", query)

# The chain is invoked with the question under the "question" key;
# return_only_outputs=True keeps the inputs out of the returned mapping.
chain_inputs = {"question": query}
result = chain.invoke(chain_inputs, return_only_outputs=True)

answer_text = result["answer"]
print("Answer:", answer_text)

Query: What is MAPE? When should I use it? How should I tell if the model is good or bad?
Answer: MAPE stands for Mean Absolute Percentage Error. It is a regression loss metric that measures the accuracy of a forecasting method. The output of MAPE is a non-negative floating point, with the best value being 0.0. However, it is important to note that bad predictions can lead to arbitrarily large MAPE values, especially if some true values are very close to zero. MAPE is particularly useful for understanding the percentage error in predictions.

To determine if a model is good or bad using MAPE, you should look for lower values, as a lower MAPE indicates better predictive accuracy. However, keep in mind that MAPE can be misleading if the actual values are close to zero.


