In [4]:
import pandas as pd  
from getpass import getpass  
from elasticsearch import Elasticsearch, helpers 
from elasticsearch.client import MlClient
from langchain_community.vectorstores.elasticsearch import ElasticsearchStore


In [1]:
! pip install --upgrade --quiet  elasticsearch langchain-openai tiktoken langchain


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
! pip install -r requirements.txt

In [2]:
# Prompt the user to enter their Elastic Cloud ID and API Key securely
ELASTIC_CLOUD_ID = getpass("Elastic Cloud ID: ")
ELASTIC_API_KEY = getpass("Elastic API Key: ")

# Create an Elasticsearch client using the provided credentials
es = Elasticsearch(
    cloud_id=ELASTIC_CLOUD_ID,  # cloud id can be found under deployment management
    api_key=ELASTIC_API_KEY, # your username and password for connecting to elastic, found under Deplouments - Security
)

In [9]:
df = pd.read_csv("collection.tsv", delimiter="	", header=None, names=["docid", "body"])

In [10]:
df.head()

Unnamed: 0,docid,body
0,0,The presence of communication amid scientific ...
1,1,The Manhattan Project and its atomic bomb help...
2,2,Essay on The Manhattan Project - The Manhattan...
3,3,The Manhattan Project was the name for a proje...
4,4,versions of each volume as well as complementa...


In [None]:
queries = pd.read_csv("queries/queries.eval.tsv", delimiter="	", header=None, names=["docid", "body"])

In [None]:
from langchain_openai import OpenAIEmbeddings

ELASTIC_CLOUD_ID = getpass("Elastic Cloud ID: ")
ELASTIC_API_KEY = getpass("Elastic API Key: ")

In [8]:
index_name = "hp"
model_id = "sentence-transformers__msmarco-minilm-l-12-v3"

### Three options:

In [None]:
# + generating embeddings within Elatic w/ inference runs
db = ElasticsearchStore(
    es_cloud_id=ELASTIC_CLOUD_ID,
    es_api_key=ELASTIC_API_KEY,
    index_name=index_name,
    
    query_field="text_field",
    vector_query_field="vector_query_field.predicted_value",
    strategy=ElasticsearchStore.ApproxRetrievalStrategy(
            query_model_id=model_id
    ),
)

In [None]:
embeddings = OpenAIEmbeddings()

# with external embeddings
db = ElasticsearchStore(
    es_cloud_id=ELASTIC_CLOUD_ID,
    es_api_key=ELASTIC_API_KEY,
    index_name=index_name,

    embeddings = embeddings,
)

In [7]:
# with ELSER
db = ElasticsearchStore(
    es_cloud_id=ELASTIC_CLOUD_ID,
    es_api_key=ELASTIC_API_KEY,
    index_name=index_name,
    
    strategy=ElasticsearchStore.SparseVectorRetrievalStrategy(),
)


In [9]:
# with no embeddings
db = ElasticsearchStore(
    es_cloud_id=ELASTIC_CLOUD_ID,
    es_api_key=ELASTIC_API_KEY,
    index_name=index_name,
    )

In [14]:
docs = db._search( query = {"match": {"House": "Gryffindor"}})

ValueError: You must provide an embedding function or a query_model_id to perform a similarity search.

In [None]:
db.client.indices.refresh(index=index_name)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from datasets import load_dataset

dataset = load_dataset("reuters21578", 'ModHayes', split="train[:1%]")

metadata = []
content = []
chunk_size = 300
chunk_overlap_part = 2

for doc in dataset:
    content.append(doc["text"])
    metadata.append({
        "name": doc["title"]
    })

#docs = split_by_chunk(documents=content, chunk=chunk_size, metadata = metadata)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_size/chunk_overlap_part,
    length_function=len,
    is_separator_regex=False,
    keep_separator=False,
    separators = ["     ", ". ", " ", ""]
)

docs = text_splitter.create_documents(content, metadata)


In [None]:
db.client.ingest.put_pipeline(
    id="embeddings",
    processors=[
        {
            "inference": {
                "model_id": model_id,
                "field_map": {"_ingest._value.page_content": "text_field"},
                "target_field": "vector_query_field",
            }
        }
    ],
)

db.client.indices.create(
    index=index_name,
    mappings={
        "dynamic": "true",
        "properties": {
            "vector_query_field": {
                "properties": {
                    "predicted_value": {
                        "type": "dense_vector",
                        "dims": 384,
                        "index": True,
                        "similarity": "cosine",
                    }
                }
            },
            "name" :{"type": "text"}, 
        }
    },
    settings={"index": {"default_pipeline": "embeddings", "refresh_interval" : "1000s"}},
)

In [None]:
db.from_documents(
    docs,
    es_cloud_id=ELASTIC_CLOUD_ID,
    es_api_key=ELASTIC_API_KEY,
    index_name=index_name,
    query_field="text_field",
    vector_query_field="vector_query_field.predicted_value",
    strategy=ElasticsearchStore.ApproxRetrievalStrategy(
        query_model_id=model_id
    ),
)