In [None]:
import pandas as pd  
from getpass import getpass  
from elasticsearch import Elasticsearch, helpers 
from elasticsearch.client import MlClient
from langchain_community.vectorstores.elasticsearch import ElasticsearchStore

In [2]:
! pip install --upgrade --quiet  elasticsearch langchain-openai tiktoken langchain

In [None]:
! pip install -r requirements.txt

In [4]:
# Read the connection secrets interactively so they never appear in the
# notebook source. The Cloud ID can be found under deployment management.
ELASTIC_CLOUD_ID = getpass("Elastic Cloud ID: ")
ELASTIC_API_KEY = getpass("Elastic API Key: ")

# Client for the Elastic Cloud deployment, authenticated with the API key
# entered above (an API key, not a username/password pair).
es = Elasticsearch(cloud_id=ELASTIC_CLOUD_ID, api_key=ELASTIC_API_KEY)

In [5]:
from datasets import load_dataset

# Ask for the "train" split by name so load_dataset hands back the dataset
# itself rather than a one-element list that has to be unwrapped afterwards.
dataset = load_dataset("imdb", split="train")

  from .autonotebook import tqdm as notebook_tqdm


In [32]:
index_name = "imdb"

# Create the index only when it does not exist yet: indices.create raises
# (resource_already_exists_exception) if this cell is re-run against an
# index that is already there.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name)

def df_to_doc(dataset, name_of_index):
    """Lazily yield one bulk-index action per record of ``dataset``.

    Args:
        dataset: Iterable of records; each record is used unchanged as the
            action's ``_source``.
        name_of_index: Value stored under ``_index`` in every action.

    Yields:
        dict: ``{"_index": name_of_index, "_source": record}``.
    """
    for record in dataset:
        yield {"_index": name_of_index, "_source": record}

# Stream the generated actions to the cluster through the bulk helper and
# keep the helper's return value for inspection.
load = helpers.bulk(client=es, actions=df_to_doc(dataset, index_name))

In [8]:
# Full-text `match` query on the `text` field.
query = {"match": {"text": "i really loved this movie"}}

response = es.search(index=index_name, query=query)

# Report the total reported by the search, then print the text of each returned hit.
hits = response["hits"]
print("We get back {total} results, here are the top ones:".format(total=hits["total"]["value"]))
for hit in hits["hits"]:
    print(hit["_source"]["text"])

We get back 10000 results, here are the top ones:
I was really disappointed with this film. The first Waters movie I saw was Serial Mom and I loved it. Then I saw Pecker and I loved it. Then I watched Polyester and really sort of hated it. The only thing I liked about that movie was DIVINE. She/He had a hell of a lot of talent. I was truly surprised. As a whole, I wouldn't recommend this film...
I remember I loved this movie when it came out. I was 12 years old, had a Commodore 64 and loved to play Rambo on it. I was therefore really thrilled when I got to buy this movie really cheap. I put it in my VCR and started up: Man this movie is really bad! Sylvester Stallone says like 3 words in the entire movie (except for that awful sentimental speech at the end), and has the same expression on his face all the way. And that stupid love thing in the middle, it's just so amazingly predictable. I just ended up fast forwarding the entire thing and went to exchange the movie for something else.


In [36]:
# Semantic query: expand the query text with the ELSER model and match the
# expanded tokens against the `ml.tokens` field.
# NOTE(review): the recorded run of this cell failed with a 409 because
# `.elser_model_1` was not allocated to any node — the model deployment has to
# be started before this search can succeed.
query = {
    "text_expansion": {
        "ml.tokens": {
            "model_id": ".elser_model_1",
            "model_text": "I really loved this movie",
        }
    }
}

result = es.search(index=index_name, query=query)

for element in result["hits"]["hits"]:
    # Two values are printed, so the template must have exactly two
    # placeholders (three placeholders with two arguments raises IndexError).
    print("{}, score {}".format(element["_source"]["text"], element["_score"]))

ConflictError: ConflictError(409, 'status_exception', 'Trained model deployment [.elser_model_1] is not allocated to any nodes')

### Three options for generating embeddings (followed by a store with no embeddings):

In [None]:
# Option: embeddings are generated inside Elastic by an inference model,
# referenced here through `query_model_id`.
model_id = "sentence-transformers__msmarco-minilm-l-12-v3"

db = ElasticsearchStore(
    index_name=index_name,
    es_api_key=ELASTIC_API_KEY,
    es_cloud_id=ELASTIC_CLOUD_ID,
    strategy=ElasticsearchStore.ApproxRetrievalStrategy(query_model_id=model_id),
    # Field names the store uses for the raw text and for the vector.
    vector_query_field="vector_query_field.predicted_value",
    query_field="text_field",
)

In [None]:
# Imported in this cell because no other cell brings OpenAIEmbeddings into
# scope; the langchain-openai package is installed by this notebook's
# pip install cell.
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

# with external embeddings
db = ElasticsearchStore(
    es_cloud_id=ELASTIC_CLOUD_ID,
    es_api_key=ELASTIC_API_KEY,
    index_name=index_name,
    # The constructor's keyword is `embedding` (singular); `embeddings=` is
    # rejected as an unexpected keyword argument.
    embedding=embeddings,
)

In [7]:
# Option: ELSER — the store is configured with the sparse-vector retrieval
# strategy and no embedding object is passed.
db = ElasticsearchStore(
    index_name=index_name,
    strategy=ElasticsearchStore.SparseVectorRetrievalStrategy(),
    es_api_key=ELASTIC_API_KEY,
    es_cloud_id=ELASTIC_CLOUD_ID,
)


In [9]:
# Baseline: neither an embedding object nor an explicit retrieval strategy is
# supplied, only the connection details and the index name.
db = ElasticsearchStore(index_name=index_name, es_api_key=ELASTIC_API_KEY, es_cloud_id=ELASTIC_CLOUD_ID)

In [None]:
# Ask Elasticsearch to refresh `index_name` through the store's underlying
# client, so documents written so far become visible to searches.
db.client.indices.refresh(index=index_name)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from datasets import load_dataset

# Small sample (first 1% of the train split) of the Reuters-21578 'ModHayes' configuration.
dataset = load_dataset("reuters21578", 'ModHayes', split="train[:1%]")

# Parallel lists: content[i] is an article's text, metadata[i] carries its title.
metadata = []
content = []
chunk_size = 300
chunk_overlap_part = 2

for doc in dataset:
    content.append(doc["text"])
    metadata.append({
        "name": doc["title"]
    })

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    # Floor division keeps the overlap an integer number of characters
    # (`/` would pass the float 150.0 where an int is expected).
    chunk_overlap=chunk_size // chunk_overlap_part,
    length_function=len,
    is_separator_regex=False,
    keep_separator=False,
    # Tried in order: a run of five spaces, sentence ends, single spaces,
    # then individual characters.
    separators = ["     ", ". ", " ", ""]
)

# One Document per chunk; each chunk carries the metadata of the article it came from.
docs = text_splitter.create_documents(content, metadatas=metadata)


In [None]:
# Register an ingest pipeline named "embeddings" that runs the inference model
# `model_id` on incoming documents and writes its output under
# `vector_query_field`.
db.client.ingest.put_pipeline(
    id="embeddings",
    processors=[
        {
            "inference": {
                "model_id": model_id,
                # NOTE(review): `_ingest._value` is normally only populated
                # inside a `foreach` processor — confirm this mapping really
                # feeds the intended text into the model's `text_field` input.
                "field_map": {"_ingest._value.page_content": "text_field"},
                "target_field": "vector_query_field",
            }
        }
    ],
)

# Create the index with a dense_vector mapping for the model output and make
# the "embeddings" pipeline its default pipeline.
# NOTE(review): `index_name` is the same name the earlier
# `es.indices.create(index=index_name)` cell uses; creating an index that
# already exists fails — confirm which index this cell is meant to target.
db.client.indices.create(
    index=index_name,
    mappings={
        "dynamic": "true",
        "properties": {
            "vector_query_field": {
                "properties": {
                    "predicted_value": {
                        "type": "dense_vector",
                        # presumably the embedding size produced by `model_id` — TODO confirm
                        "dims": 384,
                        "index": True,
                        "similarity": "cosine",
                    }
                }
            },
            "name" :{"type": "text"}, 
        }
    },
    # refresh_interval of "1000s": automatic refreshes happen at most every
    # 1000 seconds, so newly indexed documents rely on an explicit refresh to
    # become searchable sooner.
    settings={"index": {"default_pipeline": "embeddings", "refresh_interval" : "1000s"}},
)

In [None]:
# from_documents is a classmethod: it indexes `docs` and returns a NEW store.
# Call it on the class and keep the result; otherwise the returned store is
# discarded and `db` keeps pointing at whichever store an earlier cell built.
db = ElasticsearchStore.from_documents(
    docs,
    es_cloud_id=ELASTIC_CLOUD_ID,
    es_api_key=ELASTIC_API_KEY,
    index_name=index_name,
    # Field names must line up with the ingest pipeline / index mapping:
    # raw text in `text_field`, model output in `vector_query_field.predicted_value`.
    query_field="text_field",
    vector_query_field="vector_query_field.predicted_value",
    strategy=ElasticsearchStore.ApproxRetrievalStrategy(
        query_model_id=model_id
    ),
)