In [None]:
%pip install psycopg2
%pip install pymongo
%pip install redis
%pip install openai
%pip install python-dotenv
# %pip install langchain

### 1. PostgreSQL

#### Pgvector extension on Azure Cosmos DB for PostgreSQL

- Database client for accessing Postgre SQL (GUI)
- https://www.dbvis.com/

1. How to configure vector extension

    ```postgresql
    SELECT CREATE_EXTENSION('vector');
    ```

    To disable an extension use drop_extension()

1. pgvector introduces 3 new operators that can be used to calculate similarity:

    | Operator   | Description           |
    |:----------:|:---------------------:|
    |<->	     |Euclidean distance     |
    |<#>	     |negative inner product |
    |<=>	     |cosine distance        |

In [27]:
import os
from dotenv import load_dotenv

dotenv_path = os.path.join('.', '.env')
load_dotenv(dotenv_path, override=True)

True

#### Using native wiring API for postgreSQL

In [17]:
import os
import psycopg2

# Connect to your PostgreSQL database
conn = psycopg2.connect(
    host=os.getenv("POSTGRE_HOST"),
    database=os.getenv("POSTGRE_DB"),
    user=os.getenv("POSTGRE_USER"),
    password=os.getenv("POSTGRE_PASSWD")
)

# Create a cursor object
cur = conn.cursor()

# Execute the first query to create the table
cur.execute("""
    CREATE TABLE tblvector(
        id bigserial PRIMARY KEY,
        embedding vector(3)
    );
""")

# Execute the second query to insert data into the table
cur.execute("""
    INSERT INTO tblvector (id, embedding) VALUES (1, '[1,2,3]'), (2, '[4,5,6]'), (3, '[5,4,6]'), (4, '[3,5,7]'), (5, '[7,8,9]');
""")

# Execute the third query to insert or update data in the table
cur.execute("""
    INSERT INTO tblvector (id, embedding) VALUES (1, '[1,2,3]'), (2, '[4,5,6]')
    ON CONFLICT (id) DO UPDATE SET embedding = EXCLUDED.embedding;
""")

# Execute the fourth query to delete data from the table
# cur.execute("""
#    DELETE FROM tblvector WHERE id = 1;
# """)

# Execute the fifth query to select data from the table
cur.execute("""
    SELECT * FROM tblvector 
    ORDER BY embedding <-> '[3,1,2]' 
    LIMIT 5;
""")

# Fetch and print the results of the SELECT query
results = cur.fetchall()
for row in results:
    print(row)

# Close the cursor and connection
cur.close()
conn.close()


(1, '[1,2,3]')
(3, '[5,4,6]')
(2, '[4,5,6]')
(4, '[3,5,7]')
(5, '[7,8,9]')


### 2. Azure Cosmos DB for MongoDB vCore


#### Using native wiring API for MongoDB

This option does not seem to support several MongoDB commands for management purposes. It is unclear whether it can create a database through Python code. Based on some trials, I have concluded that vCore does not support the use of `customAction`.

https://learn.microsoft.com/en-us/azure/cosmos-db/mongodb/vcore/compatibility

In [None]:
import pymongo
import urllib

# client = pymongo.MongoClient("mongodb://localhost:27017/")
# db = client["test"]
# exampleCollection = db["exampleCollection"]

encoded_pwd = urllib.parse.quote(os.getenv("MONGODB_PASSWD"))
client = pymongo.MongoClient(os.getenv("MONGODB_CONNECTION_STRING").format(password=encoded_pwd))

# Create database if it doesn't exist
DB_NAME = os.getenv("MONGODB_DB")
print(client.list_database_names())
db = client[DB_NAME]
if DB_NAME not in client.list_database_names():
    # Create a database with 400 RU throughput that can be shared across
    # the DB's collections
    db.command({"customAction": "CreateDatabase", "offerThroughput": 400})
    print("Created db '{}' with shared throughput.\n".format(DB_NAME))
else:
    print("Using database: '{}'.\n".format(DB_NAME))

# Create collection if it doesn't exist
COLLECTION_NAME = os.getenv("MONGODB_COLLECTION")
exampleCollection = db[COLLECTION_NAME]
if COLLECTION_NAME not in db.list_collection_names():
    # Creates a unsharded collection that uses the DBs shared throughput
    db.command(
        {"customAction": "CreateCollection", "collection": COLLECTION_NAME}
    )
    print("Created collection '{}'.\n".format(COLLECTION_NAME))
else:
    print("Using collection: '{}'.\n".format(COLLECTION_NAME))

# Create indexes
indexes = [
    {
      'name': 'vectorSearchIndex',
      'key': {
        "vectorContent": "cosmosSearch"
      },
      'cosmosSearchOptions': {
        'kind': 'vector-ivf',
        'numLists': 100,
        'similarity': 'COS',
        'dimensions': 3
      }
    }
]

db.command(
  {
    'createIndexes': COLLECTION_NAME,
    'indexes': indexes
  }
)

# Insert data
exampleCollection.insert_many([
  {'name': "Eugenia Lopez", 'bio': "Eugenia is the CEO of AdvenureWorks.", 'vectorContent': [0.51, 0.12, 0.23]},
  {'name': "Cameron Baker", 'bio': "Cameron Baker CFO of AdvenureWorks.", 'vectorContent': [0.55, 0.89, 0.44]},
  {'name': "Jessie Irwin", 'bio': "Jessie Irwin is the former CEO of AdventureWorks and now the director of the Our Planet initiative.", 'vectorContent': [0.13, 0.92, 0.85]},
  {'name': "Rory Nguyen", 'bio': "Rory Nguyen is the founder of AdventureWorks and the president of the Our Planet initiative.", 'vectorContent': [0.91, 0.76, 0.83]},
])

# Query data
queryVector = [0.52, 0.28, 0.12]
exampleCollection.aggregate([
  {
    '$search': {
      "cosmosSearch": {
        "vector": queryVector,
        "path": "vectorContent",
        "k": 2
      },
    "returnStoredSource": True
    }
  }
])

# Get metadata
exampleCollection.index_information()


```cmd
OperationFailure: Command CreateDatabase not supported., full error: {'ok': 0.0, 'errmsg': 'Command CreateDatabase not supported.', 'code': 115, 'codeName': 'CommandNotSupported'}
```

### 3. Azure Cognitive Search

#### Vector search (private preview) - Azure Cognitive Search

- Connect to [Azure SDK Python Dev Feed](https://dev.azure.com/azure-sdk/public/_artifacts/feed/azure-sdk-for-python/connect/pip) to use the alpha version of the azure-search-documents pip package.
  - [Download Python](https://www.python.org/downloads/)
  - Update Pip: `python -m pip install --upgrade pip`
  - Install the keyring `pip install keyring artifacts-keyring`
  - If you're using Linux, ensure you've installed the [prerequisites](https://pypi.org/project/artifacts-keyring/), which are required for artifacts-keyring.
  - Add a `pip.ini` (Windows) or `pip.conf` (Mac/Linux) file to your virtualenv or where Python is located on your machine:
  ```plaintext
  [global]
  index-url=https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/
  ```
  - For example, on my machine, I placed mine in the following directory: `%AppData%\pip\pip.ini`
  - **Note**: Be sure you don't save it as a `.txt` file

- The command to check location of `pip.ini` in Windows.

    - `pip config -v list`

In [None]:
%pip install azure-search-documents==11.4.0a20230509004

In [None]:
%pip install keyring artifacts-keyring

In [3]:
# Import required libraries  
import os  
import json  
import openai  
from dotenv import load_dotenv  
from tenacity import retry, wait_random_exponential, stop_after_attempt  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import Vector  
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    VectorSearchAlgorithmConfiguration,  
)  
  
# Configure environment variables  
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")  
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")  
key = os.getenv("AZURE_SEARCH_ADMIN_KEY")  
openai.api_type = "azure"  
openai.api_key = os.getenv("OPENAI_API_KEY")  
openai.api_base = os.getenv("OPENAI_ENDPOINT")  
openai.api_version = os.getenv("OPENAI_API_VERSION")  
credential = AzureKeyCredential(key)

In [15]:
# Generate Document Embeddings using OpenAI Ada 002

# Read the text-sample.json
sample_path = os.path.join('.', 'text-sample.json')
with open(sample_path, 'r', encoding='utf-8') as file:
    input_data = json.load(file)

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
# Function to generate embeddings for title and content fields, also used for query embeddings
# Note: "engine" should be set to the deployment name you chose when you deployed the text-embedding-ada-002 model
def generate_embeddings(text):
    response = openai.Embedding.create(
        input=text, engine=os.getenv('OPENAI_DEPLOYMENT_NAME'))
    embeddings = response['data'][0]['embedding']
    return embeddings


# Generate embeddings for title and content fields
for item in input_data:
    title = item['title']
    content = item['content']
    title_embeddings = generate_embeddings(title)
    content_embeddings = generate_embeddings(content)
    item['titleVector'] = title_embeddings
    item['contentVector'] = content_embeddings
    item['@search.action'] = 'upload'

# Output embeddings to docVectors.json file
sample_output_path = os.path.join('.', 'text-sample-output-vector.json')
with open(sample_output_path, "w") as f:
    json.dump(input_data, f)

In [16]:
# Create a search index
index_client = SearchIndexClient(
    endpoint=service_endpoint, credential=credential)
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchableField(name="title", type=SearchFieldDataType.String,
                    searchable=True, retrievable=True),
    SearchableField(name="content", type=SearchFieldDataType.String,
                    searchable=True, retrievable=True),
    SearchableField(name="category", type=SearchFieldDataType.String,
                    filterable=True, searchable=True, retrievable=True),
    SearchField(name="titleVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, dimensions=1536, vector_search_configuration="my-vector-config"),
    SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, dimensions=1536, vector_search_configuration="my-vector-config"),
]

vector_search = VectorSearch(
    algorithm_configurations=[
        VectorSearchAlgorithmConfiguration(
            name="my-vector-config",
            kind="hnsw",
            hnsw_parameters={
                "m": 4,
                "efConstruction": 400,
                "efSearch": 1000,
                "metric": "cosine"
            }
        )
    ]
)

semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=PrioritizedFields(
        title_field=SemanticField(field_name="title"),
        prioritized_keywords_fields=[SemanticField(field_name="category")],
        prioritized_content_fields=[SemanticField(field_name="content")]
    )
)

# Create the semantic settings with the configuration
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Create the search index with the semantic settings
index = SearchIndex(name=index_name, fields=fields,
                    vector_search=vector_search, semantic_settings=semantic_settings)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')


 demoindex created


In [17]:
# Upload some documents to the index
with open(sample_output_path, 'r') as file:  
    documents = json.load(file)  
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
result = search_client.upload_documents(documents)  
print(f"Uploaded {len(documents)} documents") 

Uploaded 108 documents


In [20]:
# Pure Vector Search
query = "tools for software development using AI"  
  
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))  
  
results = search_client.search(  
    search_text="",  
    vector=Vector(value=generate_embeddings(query), k=3, fields="contentVector"),  
    select=["title", "content", "category"] 
)  
  
for result in results:  
    print(f"Title: {result['title']}")  
    print(f"Content: {result['content']}")  
    print(f"Category: {result['category']}\n")  


Title: Azure Cognitive Services
Content: Azure Cognitive Services are a set of AI services that enable you to build intelligent applications with powerful algorithms using just a few lines of code. These services cover a wide range of capabilities, including vision, speech, language, knowledge, and search. They are designed to be easy to use and integrate into your applications. Cognitive Services are fully managed, scalable, and continuously improved by Microsoft. It allows developers to create AI-powered solutions without deep expertise in machine learning.
Category: AI + Machine Learning

Title: Azure Batch AI
Content: Azure Batch AI is a fully managed, AI-powered service that enables you to run distributed training and inferencing workloads for your machine learning models at scale. It provides features like automatic scaling, job scheduling, and integration with popular deep learning frameworks, such as TensorFlow, PyTorch, and Caffe. Batch AI supports various platforms, such as .

### 4. Azure Cache for Redis

#### Azure Cache for Redis Enterprise

Most expensive vector database option in Azure. Redis Enterprise only supports limited regions.

https://redis.readthedocs.io/en/latest/examples/search_vector_similarity_examples.html

https://github.com/openai/openai-cookbook/blob/main/examples/vector_databases/redis/getting-started-with-redis-and-openai.ipynb

**Note**: Modules must be enabled at the time you create an `Azure Cache for Redis` instance. Must enable `RedisSearch` Module at the time of creation.

In [31]:
import redis
import numpy as np
from redis.commands.search.field import TagField, VectorField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query

# r = redis.Redis(host="localhost", port=6379)
host_name = os.getenv("REDIS_HOST")
access_key = os.getenv("REDIS_ACCESS_KEY")

r = redis.StrictRedis(host=host_name, port=10000,
                      password=access_key, ssl=True)

result = r.ping()
print("Ping returned : " + str(result))

INDEX_NAME = os.getenv("REDIS_INDEX_NAME")  # Vector Index Name
DOC_PREFIX = os.getenv("REDIS_DOC_PREFIX")  # RediSearch Key Prefix for the Index

def create_index(vector_dimensions: int):
    try:
        # check to see if index exists
        r.ft(INDEX_NAME).info()
        print("Index already exists!")
    except:
        # schema
        schema = (
            TagField("tag"),                       # Tag Field Name
            VectorField("vector",                  # Vector Field Name
                "FLAT", {                          # Vector Index Type: FLAT or HNSW
                    "TYPE": "FLOAT32",             # FLOAT32 or FLOAT64
                    "DIM": vector_dimensions,      # Number of Vector Dimensions
                    "DISTANCE_METRIC": "COSINE",   # Vector Search Distance Metric
                }
            ),
        )

        # index Definition
        definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.HASH)

        # create Index
        r.ft(INDEX_NAME).create_index(fields=schema, definition=definition)

Ping returned : True


In [None]:
# define vector dimensions
VECTOR_DIMENSIONS = 1536

# create the index
create_index(vector_dimensions=VECTOR_DIMENSIONS)

In [None]:
import openai

# Create Embeddings with OpenAI text-embedding-ada-002
# https://openai.com/blog/new-and-improved-embedding-model

texts = [
    "Today is a really great day!",
    "The dog next door barks really loudly.",
    "My cat escaped and got out before I could close the door.",
    "It's supposed to rain and thunder tomorrow."
]

response = openai.Embedding.create(input=texts, engine=os.getenv('OPENAI_DEPLOYMENT_NAME'))
embeddings = np.array([r["embedding"] for r in response["data"]], dtype=np.float32)

# Write to Redis
pipe = r.pipeline()
for i, embedding in enumerate(embeddings):
    pipe.hset(f"doc:{i}", mapping = {
        "vector": embedding.tobytes(),
        "content": texts[i],
        "tag": "openai"
    })
res = pipe.execute()

In [None]:
text = "animals"

# create query embedding
response = openai.Embedding.create(input=[text], engine=os.getenv('OPENAI_DEPLOYMENT_NAME'))
query_embedding = np.array([r["embedding"] for r in response["data"]], dtype=np.float32)[0]

In [None]:
# query for similar documents that have the openai tag
query = (
    Query("(@tag:{ openai })=>[KNN 2 @vector $vec as score]")
     .sort_by("score")
     .return_fields("content", "tag", "score")
     .paging(0, 2)
     .dialect(2)
)

query_params = {"vec": query_embedding.tobytes()}
r.ft(INDEX_NAME).search(query, query_params).docs

# the two pieces of content related to animals are returned