In [3]:
from opensearchpy import OpenSearch, helpers, AWSV4SignerAuth, RequestsHttpConnection
from sentence_transformers import SentenceTransformer
import json
import boto3

# Authentication
# SigV4 signing needs both the AWS region and the service name of the target.
region = 'us-east-1'  # e.g., 'us-east-1'
service = 'es'  # 'es' for managed OpenSearch Service domains ('aoss' for Serverless collections)

# Get AWS credentials (uses default credentials chain)
credentials = boto3.Session().get_credentials()
# Pass `service` explicitly so that changing the setting above actually takes
# effect instead of silently relying on the signer's built-in default.
awsauth = AWSV4SignerAuth(credentials, region, service)

# OpenSearch domain endpoint (host name only, without scheme or port)
host = 'search-jpc2-rag-hiy7k2n5gjqhj4y67yebre5t7m.us-east-1.es.amazonaws.com'  # e.g., 'search-my-domain.us-east-1.es.amazonaws.com'

# Create an OpenSearch client with AWS authentication.
# RequestsHttpConnection is the transport that works with the requests-based
# AWSV4SignerAuth object; TLS is enforced and certificates are verified.
client = OpenSearch(
    hosts=[{'host': host, 'port': 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)

In [4]:
# Index configuration
index_name = 'first_index'
# Must equal the output size of the embedding model used at ingest time.
embedding_dimension = 384  # Dimension for 'all-MiniLM-L6-v2' model

# Delete the index if it exists.
# NOTE: this is destructive -- re-running the cell drops every document that
# was previously indexed so the notebook always starts from an empty index.
# The index is passed by keyword, matching the delete/create calls below.
if client.indices.exists(index=index_name):
    client.indices.delete(index=index_name)

# Create the index with settings and mappings for KNN search
index_body = {
    'settings': {
        # Single shard, no replicas: sized for a small demo data set.
        'number_of_shards': 1,
        'number_of_replicas': 0,
        'index': {
            'knn': True  # Enable KNN search
        }
    },
    'mappings': {
        'properties': {
            # Raw text of the record, searchable with regular full-text queries.
            'content': {'type': 'text'},
            # Dense vector of the record, searchable with k-NN queries.
            'embedding': {
                'type': 'knn_vector',
                'dimension': embedding_dimension
            }
        }
    }
}

# Left as the last expression so the notebook displays the server response.
client.indices.create(index=index_name, body=index_body)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'first_index'}

In [7]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

# Read and process the raw data
raw_data_file = 'patients.jsonl'  # Replace with your data file

# Collect (line number, text) pairs. The trailing newline is stripped so it is
# neither stored nor embedded, and blank lines are skipped instead of being
# indexed as empty documents. The original line number is kept as the document
# id so ids stay stable even when blank lines are dropped.
records = []
with open(raw_data_file, 'r', encoding='utf-8') as f:
    for idx, line in enumerate(f):
        content = line.strip()
        if content:
            records.append((idx, content))

# Encode all texts in one call so the model can batch them internally rather
# than running one forward pass per line. Skipped entirely for an empty file.
contents = [content for _, content in records]
embeddings = model.encode(contents) if contents else []

actions = [
    {
        "_index": index_name,
        "_id": idx,
        "_source": {
            "content": content,
            "embedding": embedding.tolist()
        }
    }
    for (idx, content), embedding in zip(records, embeddings)
]

# Bulk upload the documents.
# helpers.bulk returns (number of successfully executed actions, errors);
# report the count acknowledged by the cluster rather than the number of
# actions that were merely submitted.
success_count, _ = helpers.bulk(client, actions)
print(f"Uploaded {success_count} documents to OpenSearch index '{index_name}' with embeddings.")

Uploaded 100 documents to OpenSearch index 'first_index' with embeddings.
