In [1]:
# import pandas as pd
from datetime import datetime, timedelta
from pymongo import MongoClient
import json
import os
from dotenv import load_dotenv

In [None]:
mongo_client= MongoClient(os.getenv("MONGO_CONNECTION_STRING_DISKANN"))
db = mongo_client['filtering_on_diskann']

# Create collection if it doesn't exist
COLLECTION_NAME = "filtering"

collection = db[COLLECTION_NAME]

if COLLECTION_NAME not in db.list_collection_names():
    db.create_collection(COLLECTION_NAME)
    print("Created collection '{}'.\n".format(COLLECTION_NAME))
else:
    print("Using collection: '{}'.\n".format(COLLECTION_NAME))

In [15]:
load_dotenv("variables.env", override=True)

AOAI_KEY = os.getenv("AOAI_KEY")
AOAI_ENDPOINT =  os.getenv("AOAI_ENDPOINT")
API_VERSION =  os.getenv("API_VERSION")
AOAI_EMBEDDING_DEPLOYMENT_MODEL = os.getenv("AOAI_EMBEDDING_DEPLOYMENT_MODEL")

In [24]:
from openai import AzureOpenAI
client = AzureOpenAI(
  azure_endpoint= AOAI_ENDPOINT,
  api_key=AOAI_KEY,  
  api_version="2023-05-15"
)

In [25]:
def generate_embedding(text):
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=text
    )
    return response.data[0].embedding

In [216]:
db.command({
  'createIndexes': 'filtering',
  'indexes': [
    {
      'name': 'filter',
      'key': {
        "Embedding": "cosmosSearch"
      },
      'cosmosSearchOptions': {
        'kind': 'vector-diskann',
        'similarity': 'COS',
        'dimensions': 1536, 
        'maxDegree': 32,
        'lBuild': 50
      }, 
    }
  ]
})

{'raw': {'defaultShard': {'numIndexesBefore': 1,
   'numIndexesAfter': 2,
   'createdCollectionAutomatically': False,
   'ok': 1}},
 'ok': 1}

In [208]:
print(collection.index_information())

{'_id_': {'v': 2, 'key': [('_id', 1)]}, 'filter': {'v': 2, 'key': [('Embedding', 'cosmosSearch')], 'cosmosSearchOptions': SON([('kind', 'vector-diskann'), ('numLists', 1), ('similarity', 'COS'), ('dimensions', 1536)])}}


In [217]:
collection.delete_many({})
with open("data_w_embedding.json", 'r') as file:
    data = json.load(file)

collection.insert_many(data)

InsertManyResult([ObjectId('679c2404084240d19a301a4e'), ObjectId('679c2404084240d19a301a4f'), ObjectId('679c2404084240d19a301a50'), ObjectId('679c2404084240d19a301a51'), ObjectId('679c2404084240d19a301a52'), ObjectId('679c2404084240d19a301a53'), ObjectId('679c2404084240d19a301a54'), ObjectId('679c2404084240d19a301a55'), ObjectId('679c2404084240d19a301a56'), ObjectId('679c2404084240d19a301a57'), ObjectId('679c2404084240d19a301a58'), ObjectId('679c2404084240d19a301a59'), ObjectId('679c2404084240d19a301a5a'), ObjectId('679c2404084240d19a301a5b'), ObjectId('679c2404084240d19a301a5c'), ObjectId('679c2404084240d19a301a5d'), ObjectId('679c2404084240d19a301a5e'), ObjectId('679c2404084240d19a301a5f'), ObjectId('679c2404084240d19a301a60'), ObjectId('679c2404084240d19a301a61'), ObjectId('679c2404084240d19a301a62'), ObjectId('679c2404084240d19a301a63'), ObjectId('679c2404084240d19a301a64'), ObjectId('679c2404084240d19a301a65'), ObjectId('679c2404084240d19a301a66'), ObjectId('679c2404084240d19a301a

In [224]:
collection.create_index('name')


'name_1'

In [231]:
collection.create_index("is_open")

'is_open_1'

In [256]:
def vector_search(query_text, regex, num_results=5):
    # Generate the embedding for the query text
    query_embedding = generate_embedding(query_text)

    pipeline = [
    {
        "$search": {
            "cosmosSearch": {
                "path": "Embedding",
                "vector": query_embedding,  
                "k": num_results,
                "filter": {
                    "name": {"$regex": regex, "$options": "i"}
                }  
            }
        }
     }
]
    # Execute the aggregation pipeline in Cosmos DB
    results = list(collection.aggregate(pipeline))
    for r in results:
        print(f"- **ID**: {r.get('ID', 'N/A')}")
        print(f"- **Name**: {r.get('name', 'N/A')}")
        print(f"- **Description**: {r.get('description', '(No description provided)')}\n")


In [257]:
regex="p[riv]+[aieuo]*t[e]*"
query = "pvate garden"
vector_search(query, regex)

- **ID**: 1041934
- **Name**: Private apt in Berkeley, MTN Views!
- **Description**: 

- **ID**: 756594
- **Name**: Private lower level 2 bedroom suite
- **Description**: NOT 420 (marijuana) friendly. Our private ground level suite has 2 bedrooms, 1 bathroom, access to a coffee bar (kitchenette) and a living room area. It is roomy, comfortable and clean.  We live upstairs and rent the finished basement.  Common areas include the back door, back yard and laundry room.  Our house is in an upscale neighborhood, convenient location close to public transportation and off-street parking. the ceilings are low, so extra tall people (over 6'1") will be inconvenienced.

- **ID**: 915016
- **Name**: Cozy Capitol Hill Accommodation
- **Description**: Private entrance into a cozy place with a  fully equipped kitchen and all the conveniences of home.  We have lived in this place for two months, so everything is there for your longer stay. Clean and comfy with private laundry and big closet. Good wif