In [337]:
from langchain.document_loaders import PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from opensearchpy import OpenSearch

if True:
    import sys
    sys.path.append("../")


from utils.open_search_vector_search_cs import OpenSearchVectorSearchCS
import os

In [338]:
#### Langchain

# Load Document
doc = PDFMinerLoader('contract.pdf').load()

# Split in chunks
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200
)
split_docs = splitter.split_documents(doc)[:5]

# Add metadata
for doc in split_docs:
    doc.metadata['session_id'] = "123456"

# Create index from documents
embedding_function = OpenAIEmbeddings()

In [320]:
host = os.getenv('OPENSEARCH_HOST')
port = os.getenv('OPENSEARCH_PORT')
user = os.getenv('OPENSEARCH_USER')
pwd = os.getenv('OPENSEARCH_PWD')

In [366]:
vs = OpenSearchVectorSearchCS(
    opensearch_url=f"https://{host}:{port}",
    embedding_function=embedding_function,
    http_auth=(user, pwd),
    index_name='python-test2',
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

In [367]:
filter = {
            "bool" : {
                "filter" : {
                    "term" : {
                        "metadata.session_id" : "123456" 
                    }
                }
            }
        }

In [368]:
search = vs.similarity_search_with_score(
    query="What is the title of the document?", 
    kwargs={
        "search_type":"script_scoring",
        "pre_filter": filter
    }
)
print(len(search))

0


In [369]:
ret = vs.as_retriever(
    search_kwargs={'search_type' : 'script_scoring','pre_filter':filter}
)

In [370]:
result = ret.get_relevant_documents(
    query='what is the title of this document?'
    )

In [371]:
len(result)

0

In [373]:
vs.add_documents(split_docs)

['ee9388a4-2eae-4064-9897-cc3921546e5c',
 '61ee8d45-6389-4411-9d90-04d9eab7720b',
 'df4c773a-00df-492b-88ca-ca5b98c34234',
 'ac0f3eb8-110a-4e62-9d88-10b89bae933b',
 '8877886d-1191-4ae3-8094-705e1c6b22b7']

In [382]:
%%markdown
#### Client -> Check index & elements through query

#### Client -> Check index & elements through query


In [290]:

client = OpenSearch(
        hosts=[{"host": host, "port": port}],
        http_auth=(user, pwd),
        use_ssl=True,
        verify_certs=False,
        ssl_assert_hostname=False,
        ssl_show_warn=False,
    )
print(client.info())

{'name': 'opensearch-node1', 'cluster_name': 'opensearch-cluster', 'cluster_uuid': '2mfrglYGQuauyiBKUe09xA', 'version': {'distribution': 'opensearch', 'number': '2.10.0', 'build_type': 'tar', 'build_hash': 'eee49cb340edc6c4d489bcd9324dda571fc8dc03', 'build_date': '2023-09-20T23:54:29.889267151Z', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '7.10.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'The OpenSearch Project: https://opensearch.org/'}


In [376]:
test_vector = OpenAIEmbeddings().embed_query("What are the consultant responsibilities?")

In [389]:
# This works
vs.client.search(
    index="python-test2",
    body={
        "query": {
            "bool": {
                "must": {
                    "match": {
                        "metadata.session_id":  "142425"
                        }
                    }
                }
            }
        }
)

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 0, 'relation': 'eq'},
  'max_score': None,
  'hits': []}}

In [296]:
filter = {
            "bool" : {
                "filter" : {
                    "term" : {
                        "metadata.session_id" : "123456" 
                    }
                }
            }
        }

In [297]:
body = {"size": 4,
        "query": {
            "script_score": {
                "query": filter,
                "script": {
                    "source": "knn_score",
                    "lang": "knn",
                    "params": {
                        "field": "vector_field",
                        "query_value": test_vector,
                        "space_type": "l2",
                    },
                },
            }
        },
    }

In [289]:
print(body['query']['script_score']['query'])

{'bool': {'filter': {'term': {'metadata.session_id': '123456'}}}}


In [298]:
search_result = client.search(
    index="python-test",
    body=body
)

{'took': 13,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10, 'relation': 'eq'},
  'max_score': 0.6907443,
  'hits': [{'_index': 'python-test',
    '_id': '14a5b70a-0ca7-41b7-881a-0626e49469ed',
    '_score': 0.6907443,
    '_source': {'vector_field': [0.013221181876605394,
      3.1256185924137814e-05,
      0.00472667392328795,
      -0.018839171081470044,
      0.008116373760057638,
      -4.0461592855828826e-05,
      -0.0034943623329506093,
      0.0048144548008871475,
      0.02375491140289597,
      -0.037759370951900136,
      0.02002759269866907,
      0.014342078882122666,
      0.008237916943497714,
      0.02199929012562173,
      -0.0037847151829499335,
      -0.00683679615226241,
      0.020918907514584483,
      0.008717336657296298,
      -0.004794197603647135,
      -0.004237125610869359,
      -0.02599670541393051,
      -0.004824583399507154,
      -0.03224942075319727,
      0.015233394629360

In [241]:
element = client.get(index='python-test', id='83a9685e-c8c2-4982-a9ad-3f604f2960c9')

In [246]:
element['_source'].keys()

dict_keys(['vector_field', 'text', 'metadata'])

In [381]:
client.indices.get_alias()

{'.ql-datasources': {'aliases': {}},
 '.kibana_92668751_admin_1': {'aliases': {'.kibana_92668751_admin': {}}},
 '.kibana_1': {'aliases': {'.kibana': {}}},
 '.opendistro_security': {'aliases': {}},
 '.plugins-ml-config': {'aliases': {}},
 '.opensearch-observability': {'aliases': {}},
 '.opensearch-sap-log-types-config': {'aliases': {}},
 'security-auditlog-2023.09.26': {'aliases': {}},
 'python-test': {'aliases': {}},
 'security-auditlog-2023.09.27': {'aliases': {}},
 'security-auditlog-2023.09.28': {'aliases': {}},
 'hubsync-ai-assistant': {'aliases': {}},
 'python-test2': {'aliases': {}}}

In [359]:
# Search for index parameters to create an empty one where we could init and manage separately
client.indices.get('python-test')

{'python-test': {'aliases': {},
  'mappings': {'properties': {'metadata': {'properties': {'session_id': {'type': 'text',
       'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
      'source': {'type': 'text',
       'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}}},
    'text': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'vector_field': {'type': 'knn_vector',
     'dimension': 1536,
     'method': {'engine': 'nmslib',
      'space_type': 'l2',
      'name': 'hnsw',
      'parameters': {'ef_construction': 512, 'm': 16}}}}},
  'settings': {'index': {'replication': {'type': 'DOCUMENT'},
    'number_of_shards': '1',
    'knn.algo_param': {'ef_search': '512'},
    'provided_name': 'python-test',
    'knn': 'true',
    'creation_date': '1695824699052',
    'number_of_replicas': '1',
    'uuid': 'U7z0JfgQQeCkYUfv1crcpA',
    'version': {'created': '136317827'}}}}}

In [363]:
body_create_index = {
        "aliases": {},
        "mappings": {
            "properties": {
                "metadata": {
                    "properties": {
                        "session_id": {
                            "type": "text",
                            "fields": {
                                "keyword": {"type": "keyword", "ignore_above": 256}
                            },
                        },
                        "source": {
                            "type": "text",
                            "fields": {
                                "keyword": {"type": "keyword", "ignore_above": 256}
                            },
                        },
                    }
                },
                "text": {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
                },
                "vector_field": {
                    "type": "knn_vector",
                    "dimension": 1536,
                    "method": {
                        "engine": "nmslib",
                        "space_type": "l2",
                        "name": "hnsw",
                        "parameters": {"ef_construction": 512, "m": 16},
                    },
                },
            }
        },
        "settings": {
            "index": {
                "number_of_shards": "1",
                "knn.algo_param": {"ef_search": "512"},
                "knn": "true",
                "number_of_replicas": "1",
            }
        },
}

In [364]:
client.indices.create(
    index='python-test2',
    body=body_create_index
)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'python-test2'}

In [None]:
client.search

In [394]:
client.indices.get(index='hubsync-ai-assistant')

{'hubsync-ai-assistant': {'aliases': {},
  'mappings': {'properties': {'metadata': {'properties': {'session_id': {'type': 'text',
       'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
      'source': {'type': 'text',
       'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}}},
    'text': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'vector_field': {'type': 'knn_vector',
     'dimension': 1536,
     'method': {'engine': 'nmslib',
      'space_type': 'l2',
      'name': 'hnsw',
      'parameters': {'ef_construction': 512, 'm': 16}}}}},
  'settings': {'index': {'replication': {'type': 'DOCUMENT'},
    'number_of_shards': '1',
    'knn.algo_param': {'ef_search': '512'},
    'provided_name': 'hubsync-ai-assistant',
    'knn': 'true',
    'creation_date': '1695919399036',
    'number_of_replicas': '1',
    'uuid': '6ILpQSAjRM20mQiBoZFf-Q',
    'version': {'created': '136317827'}}}}}

In [392]:
client.delete_by_query(
    index='python-test2',
    body={
        "query": {
            "bool": {
                "must": {
                    "match": {
                        "metadata.session_id":  "123456"
                        }
                    }
                }
            }
        }
)

{'took': 533,
 'timed_out': False,
 'total': 0,
 'deleted': 0,
 'batches': 0,
 'version_conflicts': 0,
 'noops': 0,
 'retries': {'bulk': 0, 'search': 0},
 'throttled_millis': 0,
 'requests_per_second': -1.0,
 'throttled_until_millis': 0,
 'failures': []}

In [396]:
search = client.search(
    index='hubsync-ai-assistant',
    body={
        "query": {
            "bool": {
                "must": {
                    "match": {
                        "metadata.session_id":  "123456"
                        }
                    }
                }
            }
        }
)

In [398]:
len(search)

4