### **Load Environment variables from .env file**

In [1]:
import os

from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.llms import AzureOpenAI
from langchain.vectorstores import FAISS



In [2]:
# Load the key/value pairs from the .env file into the process environment.
load_dotenv()

# Settings for the chat/completion deployment. os.getenv returns None for any
# variable that is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_DEPLOYMENT_ENDPOINT = os.getenv("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_DEPLOYMENT_NAME = os.getenv("OPENAI_DEPLOYMENT_NAME")
OPENAI_MODEL_NAME = os.getenv("OPENAI_MODEL_NAME")
OPENAI_DEPLOYMENT_VERSION = os.getenv("OPENAI_DEPLOYMENT_VERSION")

# Settings for the embedding deployment.
OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.getenv("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_MODEL_NAME = os.getenv("OPENAI_ADA_EMBEDDING_MODEL_NAME")

# Use the API version configured in .env so the setting is actually honoured;
# fall back to the previously hard-coded version when it is not set.
client = AzureOpenAI(
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
    api_key=OPENAI_API_KEY,
    api_version=OPENAI_DEPLOYMENT_VERSION or "2023-05-15",
)

In [3]:
# Embedding client for the Azure OpenAI embedding deployment. The same object
# is used to build the FAISS index and to load it again for querying.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
    deployment=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME,
    model=OPENAI_ADA_EMBEDDING_MODEL_NAME,
    # Number of texts sent per embedding request. Presumably kept at 1 to
    # satisfy a per-request limit of the Azure deployment; confirm before raising.
    chunk_size=1,
)


#### **Run ONLY ONCE to create the embeddings - Split text into chunks** 

The document will be split into chunks (by default, up to 4000 characters each, with a 200-character overlap between consecutive chunks).

In [4]:
# Path of the source PDF that will be indexed.
fileName = "./data/fabric-data-engineering.pdf"
loader = PyPDFLoader(fileName)
# Load the PDF and split its text into chunks using the loader's default splitter.
pages = loader.load_and_split()
# NOTE(review): `pages` holds the chunks produced by the split, so this count
# may differ from the PDF's actual page count.
print("Number of pages: ", len(pages))

Number of pages:  292


#### **Run ONLY ONCE to create the embeddings - Create embeddings and save to FAISS**

FAISS (Facebook AI Similarity Search) is a simple free vector store (developed by Meta).
The `from_documents` method calculates the vector embedding of each chunk and stores the vectors in a FAISS index.

In [5]:
# Directory that holds the persisted FAISS index.
FAISS_INDEX_DIR = "./dbs/documentation/faiss_index"

# Building the index embeds every chunk through the embedding deployment, so it
# is only done when no saved index exists yet. This keeps "Restart & Run All"
# cheap. Delete the index directory to force a rebuild (for example after
# changing the source PDF).
if os.path.exists(os.path.join(FAISS_INDEX_DIR, "index.faiss")):
    # reuse the index created by an earlier run
    db = FAISS.load_local(FAISS_INDEX_DIR, embeddings)
else:
    db = FAISS.from_documents(documents=pages, embedding=embeddings)
    # save the FAISS index to disk
    db.save_local(FAISS_INDEX_DIR)

In [6]:
# Chat model used by the retrieval chain to answer questions.
# The API version comes from .env (OPENAI_DEPLOYMENT_VERSION); the previously
# hard-coded version is kept as a fallback when the variable is not set.
llm = AzureChatOpenAI(
    deployment_name=OPENAI_DEPLOYMENT_NAME,
    model_name=OPENAI_MODEL_NAME,
    api_version=OPENAI_DEPLOYMENT_VERSION or "2023-05-15",
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
)

#### **Initialize retrieval API WITH your data**

In [7]:
# load the vector store to memory
# Read the persisted FAISS index from disk into memory.
vectorStore = FAISS.load_local(
    "./dbs/documentation/faiss_index",
    embeddings,
)

# Similarity search that returns the 2 most similar vectors/documents.
retriever = vectorStore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 2},
)

# Question-answering chain over the retrieved documents. With
# return_source_documents=False the source documents are not included in the result.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=False,
)

#### **Ask questions**

In [8]:
# Sizing question: the chain is called last so its result is shown as the cell output.
qa(
    {
        "query": "Can I use PowerBI datamart for 110 GB data volume? Explain which other options are available for data volumes of 110 GB?"
    }
)

{'query': 'Can I use PowerBI datamart for 110 GB data volume? Explain which other options are available for data volumes of 110 GB?',
 'result': 'No, according to the information provided, the Power BI Datamart is designed for data volumes up to 100 GB. For a data volume of 110 GB, you may consider using a data warehouse in Microsoft Fabric, which supports unlimited data volume for structured data and is suitable for SQL engineers and data warehouse developers. Alternatively, you could also consider creating a lakehouse in Microsoft Fabric, which supports unstructured, semi-structured, and structured data and is suitable for data engineers and data scientists.'}

In [9]:
# How-to question: the chain is called last so its result is shown as the cell output.
qa(
    {
        "query": "What are the steps to load a CSV file to a delta table in Microsoft Fabric?"
    }
)

{'query': 'What are the steps to load a CSV file to a delta table in Microsoft Fabric?',
 'result': "To load a CSV file to a Delta table in Microsoft Fabric, you can follow these general steps based on the provided context:\n\n1. In Microsoft Fabric, select the Synapse Data Engineering experience.\n2. Ensure that you are in the desired workspace or select/create one.\n3. Select the Lakehouse icon under the New section on the main page.\n4. Upload the CSV file to the Lakehouse.\n5. Convert the uploaded CSV file to a Delta table.\n6. Generate a dataset and create a Power BI report.\n\nPlease note that the specific steps may vary based on the current UI and functionality of Microsoft Fabric, as it is mentioned to be in PREVIEW and subject to substantial modifications before release. For the most accurate and up-to-date instructions, it's best to refer to the official documentation or tutorials provided by Microsoft."}