# RAG with Azure Cosmos DB for Mongo vcore

#### IMPORTANT!! Embeddings Creation - Run this only once !!!
You only need to run this once to create the embeddings and save them to Azure Cosmos DB.  

In [None]:
from dotenv import load_dotenv
import pandas as pd
from IPython.display import display, HTML, JSON, Markdown
import os

# Configure environment variables
load_dotenv()

# Configure OpenAI API
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_GPT4_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_GPT4_DEPLOYMENT_NAME")
AZURE_OPENAI_EMBEDDINGS_ADA_DEPLOYMENT_NAME  = os.getenv("AZURE_OPENAI_EMBEDDINGS_ADA_DEPLOYMENT_NAME")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")

#configure Cosmos 
COSMOS_MONGO_CONNECTION_STRING = os.getenv("COSMOS_MONGO_CONNECTION_STRING")
COSMOS_INDEX_NAME = os.getenv("COSMOS_INDEX_NAME")
COSMOS_DBNAME = os.getenv("COSMOS_DBNAME")
COSMOS_COLLECTION_NAME = os.getenv("COSMOS_COLLECTION_NAME")

In [2]:
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter

from langchain_openai import AzureOpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from tenacity import retry, wait_random_exponential, stop_after_attempt

In [3]:
embeddingmodel = AzureOpenAIEmbeddings(
    deployment=AZURE_OPENAI_ENDPOINT,
    model=AZURE_OPENAI_EMBEDDINGS_ADA_DEPLOYMENT_NAME,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key = AZURE_OPENAI_API_KEY,
    chunk_size = 1)

In [4]:
#we use the tenacity library to create delays and retries when calling openAI embeddings to avoid hitting throttling limits
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def calc_embeddings(text):
    deployment = AZURE_OPENAI_EMBEDDINGS_ADA_DEPLOYMENT_NAME
    # replace newlines, which can negatively affect performance.
    txt = text.replace("\n", " ")
    return embeddingmodel.embed_query(txt)

In [5]:
# splitting into 1000 char long chunks with 30 char overlap
# split ["\n\n", "\n", " ", ""]
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=30,
)

documentName = "moby dick book"
fileName = "../data/moby dick.pdf"
loader = PyPDFLoader(fileName)
pages = loader.load_and_split(text_splitter=splitter)
print("Number of pages: ", len(pages))

Number of pages:  1446


In [6]:
#save all the pages into a pandas dataframe
df = pd.DataFrame(columns=['document_name', 'content', 'embedding'])
for page in pages:
    df.loc[len(df.index)] = [documentName, page.page_content, ""]  
df.head()

Unnamed: 0,document_name,content,embedding
0,moby dick book,The Project Gutenberg eBook of Moby-Dick; or T...,
1,moby dick book,CHAPTER 1. Loomings. \n \nCHAPTER 2. The Carpe...,
2,moby dick book,CHAPTER 11. Nightgown. \n \nCHAPTER 12. Biogra...,
3,moby dick book,CHAPTER 41. Moby Dick. \n \nCHAPTER 42. The Wh...,
4,moby dick book,CHAPTER 68. The Blanket. \n \nCHAPTER 69. The ...,


In [7]:
from pymongo import MongoClient
from langchain_community.vectorstores.azure_cosmos_db import (
    AzureCosmosDBVectorSearch,
    CosmosDBSimilarityType,
)

client: MongoClient = MongoClient(COSMOS_MONGO_CONNECTION_STRING)

# Create database if it doesn't exist
db = client[COSMOS_DBNAME]
if COSMOS_DBNAME not in client.list_database_names():
    # Create a database with 400 RU throughput that can be shared across
    # the DB's collections
    db = client[COSMOS_DBNAME]
    print("Created db '{}'.\n".format(COSMOS_DBNAME))
else:
    print("Using database: '{}'.\n".format(COSMOS_DBNAME))

# Create collection if it doesn't exist
collection = db[COSMOS_COLLECTION_NAME]
if COSMOS_COLLECTION_NAME not in db.list_collection_names():
    # Creates a unsharded collection that uses the DBs shared throughput
    collection = db[COSMOS_COLLECTION_NAME]
    print("Will create collection '{}'.\n".format(COSMOS_COLLECTION_NAME))
else:
    print("Using collection: '{}'.\n".format(COSMOS_COLLECTION_NAME))

collection = client[COSMOS_DBNAME][COSMOS_COLLECTION_NAME]

  client: MongoClient = MongoClient(COSMOS_MONGO_CONNECTION_STRING)


Using database: 'embed-test-db'.

Will create collection 'embed-pdf-collection'.



In [8]:
# create and index the vector store and insert the content
vectorstore = AzureCosmosDBVectorSearch.from_documents(
    pages,
    embeddingmodel,
    collection=collection,
    index_name=COSMOS_INDEX_NAME,
)

num_lists = 100
dimensions = 1536
similarity_algorithm = CosmosDBSimilarityType.COS

vectorstore.create_index(num_lists, dimensions, similarity_algorithm)

NotFoundError: Error code: 404 - {'error': {'code': '404', 'message': 'Resource not found'}}