# CosmosDB for NoSQL RAG
<img src = "./cosmosdbrag.png">


### Installing important packages and libraries

In [None]:
%pip install azure-cosmos
%pip install openai

In [1]:
from azure.cosmos import CosmosClient, PartitionKey, exceptions
import os
import json
from dotenv import load_dotenv

load_dotenv()

True

### Creating a connection to CosmosDB via connection string
and creating a database if it doesn't exist

In [None]:
cosmosdb_connection_string = os.getenv("COSMOSDB_CONNECTION_STRING")

client = CosmosClient.from_connection_string(cosmosdb_connection_string)
database_name = os.getenv("DATABASE_NAME")

database = client.create_database_if_not_exists(id=database_name)


### Defining the vector embedding policy 

In [3]:
pk = "/category"

vector_embedding_policy = {
    "vectorEmbeddings": [
        {
            "path":"/vector",
            "dataType":"float32",
            "distanceFunction":"cosine",
            "dimensions":1536
        }
    ]
}

### Creating a vector index with diskANN algorithm


In [4]:
indexing_policy = {
    "vectorIndexes": [
        {
            "path":"/vector",
            "type":"diskANN"
        }

    ]
}

### Creating container inside of the database

In [5]:
try:
    container_name = os.getenv("CONTAINER_NAME")
    
    container = database.create_container_if_not_exists(
        id=container_name,
        partition_key=PartitionKey(path=pk),
        indexing_policy=indexing_policy,
        vector_embedding_policy=vector_embedding_policy
        
    )
    
except Exception as e:
    print(e)

### Creating Azure OpenAI Client

In [6]:
from openai import AzureOpenAI

azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")  
azure_openai_key = os.getenv("AZURE_OPENAI_KEY")

azure_openai_client = AzureOpenAI(
    api_key=azure_openai_key,
    api_version="2024-02-15-preview",
    azure_endpoint=azure_openai_endpoint
)

### Creating Embedding Generation Function
embedding engine to be used: text-embedding-ada-002 
<br>
vector dimensions: 1536

In [7]:
def generate_embeddings(client, text):
    embedding_model = os.getenv("EMBEDDING_ENGINE")
    
    response = client.embeddings.create(
        input=text,
        model = embedding_model
    )
    
    embeddings=response.model_dump()
    return embeddings['data'][0]['embedding']
    

### Loading food dataset
the food dataset is stored in `"./fooditems.json"`
<br>
we will generate vector embedding for the `/description` field of each food object and store it in a new field `/vector`

In [8]:
import json
import uuid

file_path = "./food_items.json"

with open(file_path) as f:
    data = json.load(f)
    

    
for obj in data:
    guid = str(uuid.uuid4())
    vector_embeddings = generate_embeddings(azure_openai_client, obj['description'])
    obj['vector'] = vector_embeddings
    obj['id']=guid
    container.upsert_item(obj)
    
    
with open("./new_dataset.json", 'w') as f:
    json.dump(data, f)
    


### Generating vector embeddings for the user query

In [26]:
user_query="are pizzas available? i am lactose intolerant"
user_embeddings = generate_embeddings(azure_openai_client, user_query)
print(user_embeddings)


[0.0062963166274130344, -0.02462921105325222, -0.006452283356338739, -0.0318443737924099, -0.02235073782503605, 0.006906622089445591, -0.019570456817746162, -0.027382366359233856, -0.007188040297478437, -0.018770279362797737, 0.0448370985686779, 0.008388307876884937, -0.02460208535194397, -0.004194153938442469, -0.010015788488090038, 0.004180591553449631, 0.02488689497113228, -0.002286949660629034, 0.006835419684648514, -0.022771170362830162, 0.00813740398734808, -0.002609055256471038, 0.022160863503813744, -0.013447060249745846, 0.005268969107419252, 0.016247684136033058, 0.01056506298482418, -0.0022310051135718822, -0.006604859605431557, 0.007649159990251064, 0.03650981932878494, 0.024235902354121208, -0.031464628875255585, -0.0018970323726534843, -0.009385139681398869, -0.01068034302443266, -0.0017546277958899736, -0.0009993750136345625, 0.018621092662215233, -0.015298319980502129, 0.009005393832921982, -0.011894172988831997, 0.007981437258422375, -0.021631933748722076, -0.036482695

### Sending a query to database with filtering based upon VectorDistance

In [27]:
queryText = f""" SELECT TOP 5 c.category, c.name, c.description, c.price, VectorDistance(c.vector, {user_embeddings}) AS SimilarityScore
FROM c
ORDER BY VectorDistance(c.vector, {user_embeddings})"""
results = container.query_items(
    query=queryText,
    enable_cross_partition_query=True
)
dishes = []

for item in results:
    print(item)
    dishes.append(item)
    


{'category': 'Pizza', 'name': 'Cheese Pizza', 'description': 'Hot and delicious cheese pizza made with our special tomato sauce and a generous topping of classic Italian mozzarella cheese.  ', 'price': '5.99 USD', 'SimilarityScore': 0.812584800127846}
{'category': 'Pizza', 'name': 'Double Slice Cheese Pizza', 'description': 'Two slices of cheese pizza made with vine-ripened tomato sauce, mozzarella cheese, and provolone cheese.', 'price': '5.99 USD', 'SimilarityScore': 0.8038828463901069}
{'category': 'Pizza', 'name': 'Double Slice Pepperoni Pizza', 'description': 'Two slices of pepperoni pizza made with vine-ripened tomato sauce, mozzarella cheese, and provolone cheese.', 'price': '5.99 USD', 'SimilarityScore': 0.7998702453125702}
{'category': 'Lunch', 'name': 'Italian', 'description': "Tomato basil focaccia, salami, ham, and cheese provolone (feta dressing). Take a,oad off and relax sip-n-snack at J'ti'z.", 'price': '7.29 USD', 'SimilarityScore': 0.7876641884634044}
{'category': 'Sal

### Sending call to our GPT engine for summarisation 

In [28]:
system_message = f"""You are meant to behave as a RAG chatbot that derives its context from a database of food items stored in azure cosmosDB for noSQL API.
please asnwer strictly from the context from the database provided and if you dont have an answer please politely say so. dont include any extra 
information that is not in the context and dont include links as well.
the context passed to you will be in the form of a pythonic list with each object in the list containing details of a food item and
having structure as follows:

 "category": "the category of the food item like smoothies, burgers, etc",
 "name": "the name of the food item",
 "description": "the description of the food item",
"price": "the price of the food item in USD",


the pythonic list contains best 5 matches to the user query based on cosine similarity of the embeddings of the user query and the food item descriptions.
please structure your answers in a very professional manner and in such a way that the user does not get to know that its RAG working under the hood
and its as if they are talking to a human."""

user_message = f""" the user query is: {user_query}
the context is : {dishes}"""

chat_completions_response = azure_openai_client.chat.completions.create(
    model = os.getenv("GPT_ENGINE"),
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message}
    ],
    temperature=0.7
)

print(chat_completions_response.choices[0].message.content)
    

Yes, we have pizzas available. However, please note that our pizzas such as the "Cheese Pizza" and "Double Slice Cheese Pizza" contain mozzarella cheese, and our "Double Slice Pepperoni Pizza" contains mozzarella and provolone cheese which are dairy products. Since you mentioned you are lactose intolerant, consuming these might cause discomfort. 

You might want to consider our other dishes such as the "Italian" from our Lunch category which is a Tomato basil focaccia with salami, ham, and cheese provolone. However, this also contains cheese. 

Alternatively, we have a "Club Salad" made of Romaine lettuce topped with crispy bacon, Canadian bacon, Roma tomatoes, whole milk mozzarella, and mild cheddar cheese. This too has cheese in it.

Please confirm with your server if they can make any of these dishes without cheese before ordering.
