In [88]:
from datasets import load_dataset
import numpy as np

In [218]:
# Load the "train" split of the "ModHayes" configuration of the Reuters-21578 corpus.
dataset = load_dataset(
    path="reuters21578",
    name="ModHayes",
    split="train",
)
# Bare last expression: the notebook renders the dataset summary.
dataset

Dataset({
    features: ['text', 'text_type', 'topics', 'lewis_split', 'cgis_split', 'old_id', 'new_id', 'places', 'people', 'orgs', 'exchanges', 'date', 'title'],
    num_rows: 20856
})

In [91]:
def avg_str_len(lst):
    """Return the mean length, in characters, of the strings in ``lst``.

    The value is whatever ``np.mean`` yields for the list of lengths.
    """
    lengths = list(map(len, lst))
    return np.mean(lengths)

# Build one sample text from the first two articles, then flatten newlines
# and blank out every occurrence of "Reuter".
example_doc = dataset[0]['text'] + dataset[1]['text']
example_doc = example_doc.replace("\n", " ")
example_doc = example_doc.replace("Reuter", " ")

print(f"The first two documents have {len(example_doc)} characters")

# Sentence granularity: split on a period followed by a space.
example_sentence = example_doc.split(". ")
print(f"We can split it into {len(example_sentence)} sentences, with an average of {np.round(avg_str_len(example_sentence), 2)} characters")

# Paragraph granularity: a run of five spaces is used as the paragraph separator.
example_paragraph = example_doc.split("     ")
paragraph_length = len(example_paragraph)
print(f"Or we can split it into {paragraph_length} paragraphs, with an average of {np.round(avg_str_len(example_paragraph), 2)} characters")

# Chunk granularity: concatenate consecutive, non-overlapping groups of THREE
# paragraphs (step 3). Trailing paragraphs that do not fill a complete group
# are left out, exactly as before.
example_chunk = ["".join(example_paragraph[i:i + 3]) for i in range(0, paragraph_length - 2, 3)]
print(f"Finally, we take {len(example_chunk)} chunks of three paragraphs each, averaging about {np.round(avg_str_len(example_chunk), 2)} characters")

The first two documents have 3291 charaters
We can split it into 24 sentences, with an average of 135.21 characters
Or we can split it into 20 paragraphs, with an average of 159.8 charaters
Finally, we take 6 chunks of two paragraphs each, averaging about 508.33 characters


In [92]:
# Show one concrete piece of text at each granularity: sentence, paragraph, chunk.
print("For example, from the first document we can get these different sections depending on the split: ")
print(f"One sentence of {len(example_sentence[3])} characters: \n{example_sentence[3]}")
print(f"One paragraph of {len(example_paragraph[2])} characters: \n{example_paragraph[2]}")
print(f"One larger chunk of {len(example_chunk[0])} characters: \n{example_chunk[0]}")

For example, from the first document we can get these different sections depenging on the split: 
One sentence of 95 characters: 
Again it seems that cocoa delivered earlier on consignment was included in the arrivals figures
One paragraph of 261 characters: 
Arrivals for the week ended February 22 were 155,221 bags of 60 kilos making a cumulative total for the season of 5.93 mln against 5.81 at the same stage last year. Again it seems that cocoa delivered earlier on consignment was included in the arrivals figures.
One larger chunk of 573 characters: 
Showers continued throughout the week in the Bahia cocoa zone, alleviating the drought since early January and improving prospects for the coming temporao, although normal humidity levels have not been restored, Comissaria Smith said in its weekly review.The dry period means the temporao will be late this year.Arrivals for the week ended February 22 were 155,221 bags of 60 kilos making a cumulative total for the season of 5.93 mln agains

The indexes within the different lists can quickly get hard to track. For example, we need to be able to tell that sentence 3 was in paragraph 2 of the first chunk in document 0.
We would need a system to keep track of the different hierarchical IDs, so when we run a search we can provide the necessary context with different granularities. We can visualise this ID hierarchy in a simplified example like this:


Doc 0, Sen 0, Par 0, Chunk 0
Doc 0, Sen 1, Par 0, Chunk 0

Doc 0, Sen 2, Par 1, Chunk 0
Doc 0, Sen 3, Par 1, Chunk 0
Doc 0, Sen 4, Par 1, Chunk 0


Doc 0, Sen 5, Par 2, Chunk 1
Doc 0, Sen 6, Par 2, Chunk 1

Doc 0, Sen 7, Par 3, Chunk 1
Doc 0, Sen 8, Par 3, Chunk 1

In [135]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_by_chunk(documents, chunk, metadata):
    """Split raw texts into LangChain documents of at most ``chunk`` characters.

    Args:
        documents: list of strings to split.
        chunk: target chunk size, measured with ``len`` (characters).
        metadata: list of metadata dicts, one per entry of ``documents``,
            or ``None`` when no metadata should be attached.

    Returns:
        The documents produced by
        ``RecursiveCharacterTextSplitter.create_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk,
        # Overlap is one fifth of the chunk size. Floor division keeps it an
        # integer character count instead of the float that `chunk / 5` gives.
        chunk_overlap=chunk // 5,
        length_function=len,
        is_separator_regex=False,
        keep_separator=False,
        # Tried in order: five-space paragraph gap, sentence end, word gap,
        # and finally individual characters.
        separators=["     ", ". ", " ", ""],
    )

    return text_splitter.create_documents(documents, metadatas=metadata)

# Preview the first chunk produced for three increasing chunk sizes.
for preview_size in (150, 300, 600):
    texts = split_by_chunk([example_doc], preview_size, None)
    print(texts[0])


page_content='Showers continued throughout the week in the Bahia cocoa zone, alleviating the drought since early January and improving prospects for the coming'
page_content='Showers continued throughout the week in the Bahia cocoa zone, alleviating the drought since early January and improving prospects for the coming temporao, although normal humidity levels have not been restored, Comissaria Smith said in its weekly review.'
page_content='Showers continued throughout the week in the Bahia cocoa zone, alleviating the drought since early January and improving prospects for the coming temporao, although normal humidity levels have not been restored, Comissaria Smith said in its weekly review.     The dry period means the temporao will be late this year.     Arrivals for the week ended February 22 were 155,221 bags of 60 kilos making a cumulative total for the season of 5.93 mln against 5.81 at the same stage last year. Again it seems that cocoa delivered earlier on consignment was incl

In [94]:
! pip install --upgrade --quiet  elasticsearch langchain-openai tiktoken langchain

In [None]:
from getpass import getpass  # For securely getting user input
# ID of the embedding model. It is reused further down as the `model_id` of the
# ingest pipeline's inference processor and as the `query_model_id` of the
# retrieval strategy.
model_id = "sentence-transformers__msmarco-minilm-l-12-v3"

# Prompt for the Elastic Cloud credentials so they are never stored in the notebook.
ELASTIC_CLOUD_ID = getpass("Elastic Cloud ID: ")
ELASTIC_API_KEY = getpass("Elastic API Key: ")

In [168]:
from langchain_community.vectorstores.elasticsearch import ElasticsearchStore
# Vector-store handle for the "reuters" index. Queries are embedded with
# `model_id` through the approximate retrieval strategy.
index_name = "reuters"
approx_strategy = ElasticsearchStore.ApproxRetrievalStrategy(query_model_id=model_id)
db = ElasticsearchStore(
    index_name=index_name,
    es_cloud_id=ELASTIC_CLOUD_ID,
    es_api_key=ELASTIC_API_KEY,
    query_field="text_field",
    vector_query_field="vector_query_field.predicted_value",
    strategy=approx_strategy,
)


In [172]:
# Register an ingest pipeline with id "embeddings" that consists of a single
# inference processor running `model_id`.
db.client.ingest.put_pipeline(
    id="embeddings",
    processors=[
        {
            "inference": {
                "model_id": model_id,
                # NOTE(review): `_ingest._value` is normally only populated
                # inside a `foreach` processor - confirm this mapping is the
                # one the model actually needs.
                "field_map": {"_ingest._value.page_content": "text_field"},
                # The inference result is written under "vector_query_field".
                "target_field": "vector_query_field",
            }
        }
    ],
)

ObjectApiResponse({'acknowledged': True})

In [173]:
# Mapping of the embedding field written by the "embeddings" pipeline:
# a 384-dimensional, indexed dense vector compared with cosine similarity.
embedding_mapping = {
    "type": "dense_vector",
    "dims": 384,
    "index": True,
    "similarity": "cosine",
}

index_mappings = {
    "dynamic": "true",
    "properties": {
        "vector_query_field": {"properties": {"predicted_value": embedding_mapping}},
        "name": {"type": "text"},
    },
}

# Every document indexed here goes through the "embeddings" pipeline by default.
index_settings = {"index": {"default_pipeline": "embeddings"}}

db.client.indices.create(
    index=index_name,
    mappings=index_mappings,
    settings=index_settings,
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'reuters'})

In [None]:
# Keep only the first 1000 articles.
# `Dataset.select` returns a Dataset, so the loop below yields one row (a dict)
# per article. Plain slicing (`dataset[0:1000]`) returns a dict of columns
# instead, and iterating that yields column names, which breaks `doc["text"]`.
# `min(...)` makes the cell safe to re-run and safe on smaller datasets.
dataset = dataset.select(range(min(1000, len(dataset))))
metadata = []
content = []
chunk_size = 500

# Collect the article bodies and, in the same order, their titles as metadata.
for doc in dataset:
    content.append(doc["text"])
    metadata.append({
        "name": doc["title"]
    })

docs = split_by_chunk(documents=content, chunk=chunk_size, metadata=metadata)

In [176]:
# Index the chunks. `from_documents` is a classmethod, so calling it on the
# class is equivalent to calling it through the `db` instance; it builds its
# own store from the connection settings passed here.
ElasticsearchStore.from_documents(
    docs,
    index_name=index_name,
    es_cloud_id=ELASTIC_CLOUD_ID,
    es_api_key=ELASTIC_API_KEY,
    query_field="text_field",
    vector_query_field="vector_query_field.predicted_value",
    strategy=ElasticsearchStore.ApproxRetrievalStrategy(query_model_id=model_id),
)

<langchain_community.vectorstores.elasticsearch.ElasticsearchStore at 0x2ac607a10>

In [286]:
# Sanity check: fetch the two chunks most similar to a sample query and print
# each chunk together with the title stored in its "name" metadata.
response = db.similarity_search("rising unemployment concerns", k=2)
for doc in response:
    print (f'Article name: {doc.metadata["name"]}: \n {doc.page_content} \n')

Article name: FRANCE FACES PRESSUE TO CHANGE POLICIES: 
 top priority.
 But with unemployment nearing 11 per cent last month, and
still rising, government supporters and some economic analysts
said they were 

Article name: SWISS ECONOMY IN EXCELLENT CONDITION, OECD SAYS: 
 But it said job creation should continue to absorb a modest
increase in the workforce, leaving the unemployment rate
unchanged at around one pct, the 



In [None]:
# In one go: setup for the chunk-size experiment that follows
# (model id, credentials, and the dataset to index).

model_id = "sentence-transformers__msmarco-minilm-l-12-v3"

# Prompt for the Elastic Cloud credentials so they are never stored in the notebook.
ELASTIC_CLOUD_ID = getpass("Elastic Cloud ID: ")
ELASTIC_API_KEY = getpass("Elastic API Key: ")

# "train[:1%]" requests only the first 1% of the training split.
dataset = load_dataset("reuters21578", 'ModHayes', split="train[:1%]")

In [282]:
def create_index_with_chunk(db, index_name, chunk_size, chunk_overlap_part):
    """Create ``index_name`` and fill it with chunks of the global ``dataset``.

    Steps:
      1. register the "embeddings" ingest pipeline (inference with ``model_id``);
      2. create the index with a dense-vector mapping and that pipeline as default;
      3. split every article of ``dataset`` into chunks;
      4. index the chunks through ``db.from_documents``.

    Relies on the notebook-level names ``model_id``, ``dataset``,
    ``ELASTIC_CLOUD_ID`` and ``ELASTIC_API_KEY``.

    Args:
        db: store object; its ``client`` is used for the pipeline and index
            calls and its ``from_documents`` for ingestion.
        index_name: name of the index to create and fill.
        chunk_size: target chunk size in characters.
        chunk_overlap_part: divisor applied to ``chunk_size`` to obtain the
            overlap (e.g. 5 means the overlap is a fifth of the chunk size).

    Returns:
        None.
    """
    db.client.ingest.put_pipeline(
        id="embeddings",
        processors=[
            {
                "inference": {
                    "model_id": model_id,
                    "field_map": {"_ingest._value.page_content": "text_field"},
                    "target_field": "vector_query_field",
                }
            }
        ],
    )

    db.client.indices.create(
        index=index_name,
        mappings={
            "dynamic": "true",
            "properties": {
                "vector_query_field": {
                    "properties": {
                        "predicted_value": {
                            "type": "dense_vector",
                            "dims": 384,
                            "index": True,
                            "similarity": "cosine",
                        }
                    }
                },
                "name": {"type": "text"},
            },
        },
        # NOTE(review): the long refresh interval presumably exists to speed up
        # bulk ingestion - confirm searches issued right after indexing still
        # see the documents.
        settings={"index": {"default_pipeline": "embeddings", "refresh_interval": "1000s"}},
    )

    # Article bodies and, in the same order, their titles as metadata.
    metadata = []
    content = []
    for doc in dataset:
        content.append(doc["text"])
        metadata.append({"name": doc["title"]})

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        # Floor division keeps the overlap an integer character count instead
        # of the float produced by `/`.
        chunk_overlap=chunk_size // chunk_overlap_part,
        length_function=len,
        is_separator_regex=False,
        keep_separator=False,
        separators=["     ", ". ", " ", ""],
    )

    docs = text_splitter.create_documents(content, metadata)

    db.from_documents(
        docs,
        es_cloud_id=ELASTIC_CLOUD_ID,
        es_api_key=ELASTIC_API_KEY,
        index_name=index_name,
        query_field="text_field",
        vector_query_field="vector_query_field.predicted_value",
        strategy=ElasticsearchStore.ApproxRetrievalStrategy(
            query_model_id=model_id
        ),
    )

In [294]:
chunk_sizes = [600, 300, 150]
chunk_overlap_parts = [2, 5, 10]

# Named `queries` rather than `list`, so the built-in `list` type is not
# shadowed for the rest of the kernel session.
queries = ["how is grain production affected by weather",
           "countries performing well in the stock market",
           "projection for crude oil demand",
           "rising unemployment concerns"]

# results[chunk_size][overlap_part] is a list with one entry per query (in the
# order of `queries`), each of the form {"query": ..., "answers": [...]}.
results = {}

for chunk in chunk_sizes:
    chunk_responses = {}
    for chunk_overlap_part in chunk_overlap_parts:
        # One index per (chunk size, overlap divisor) combination.
        index_name = "reuters_" + str(chunk) + "_" + str(chunk_overlap_part)

        db = ElasticsearchStore(
            es_cloud_id=ELASTIC_CLOUD_ID,
            es_api_key=ELASTIC_API_KEY,
            index_name=index_name,
            query_field="text_field",
            vector_query_field="vector_query_field.predicted_value",
            strategy=ElasticsearchStore.ApproxRetrievalStrategy(
                query_model_id=model_id
            ),
        )

        create_index_with_chunk(db, index_name, chunk, chunk_overlap_part)

        overlap_response = []
        for query in queries:
            response = db.similarity_search(query, k=8)
            answers = [
                {'name': doc.metadata["name"], 'content': doc.page_content}
                for doc in response
            ]
            overlap_response.append({"query": query, "answers": answers})

        chunk_responses[chunk_overlap_part] = overlap_response
    results[chunk] = chunk_responses

In [295]:
import json
# Persist the collected search results. JSON object keys are strings, so the
# integer chunk-size and overlap keys are written as strings.
with open('result.json', 'w') as fp:
    fp.write(json.dumps(results))

In [299]:
# Title of the top hit for the second query (index 1), for every chunk size
# and overlap divisor, in the same order as the individual prints.
for size in (600, 300, 150):
    for part in (2, 5, 10):
        print(results[size][part][1]["answers"][0]["name"])


U.S. DATA POINT TO CAPITAL SPENDING SLOWDOWN
U.S. DATA POINT TO CAPITAL SPENDING SLOWDOWN
U.S. DATA POINT TO CAPITAL SPENDING SLOWDOWN
U.S. DATA POINT TO CAPITAL SPENDING SLOWDOWN
U.S. DATA POINT TO CAPITAL SPENDING SLOWDOWN
U.S. DATA POINT TO CAPITAL SPENDING SLOWDOWN
BAHIA COCOA REVIEW
SWISS ECONOMY IN EXCELLENT CONDITION, OECD SAYS
OPEC MAY HAVE TO MEET TO FIRM PRICES - ANALYSTS


In [300]:
# Title of the top hit for the first query (index 0), for every chunk size
# and overlap divisor, in the same order as the individual prints.
for size in (600, 300, 150):
    for part in (2, 5, 10):
        print(results[size][part][0]["answers"][0]["name"])

U.S. GRAIN CARLOADINGS FALL IN WEEK
U.S. GRAIN CARLOADINGS FALL IN WEEK
U.S. GRAIN CARLOADINGS FALL IN WEEK
U.S. GRAIN CARLOADINGS FALL IN WEEK
U.S. GRAIN CARLOADINGS FALL IN WEEK
U.S. GRAIN CARLOADINGS FALL IN WEEK
U.S. GRAIN CARLOADINGS FALL IN WEEK
U.S. GRAIN CARLOADINGS FALL IN WEEK
U.S. GRAIN CARLOADINGS FALL IN WEEK
