In [None]:
#%pip install langchain
#%pip install unstructured
#%pip install markdown
#%pip install pandas
#%pip install opensearch-py
#%pip install boto3
#%pip install requests-aws4auth
#%pip install python-dotenv
#%pip install openai
#%pip install tiktoken

# Import content

Start by importing content. The content can be used to create embeddings and store in a vector store.

At the moment there are three files that are in the format of smaller blocks of text in a file. The following files with content are available:
- help-account.txt
- help-search.txt
- help-sustainability.txt

In the end, all texts are store in an array called _available_texts_.


In [5]:
from langchain.text_splitter import CharacterTextSplitter


def split_up_file(file_name: str):
    with open(file_name) as split_file:
        help_account = split_file.read()

    # Problem with this splitter is that it splits, but also merges if the chunk size is not reached. 
    # BY choosing a small chunk it work fine. But do test id this is what you need.
    text_splitter = CharacterTextSplitter(
        separator="\n\n",
        chunk_size=200,
        chunk_overlap=0,
        length_function=len,
        is_separator_regex=False,
    )

    return text_splitter.create_documents([help_account])


In [6]:
# available_texts = (split_up_file('./help-account.txt')
#                    + split_up_file('./help-search.txt')
#                    + split_up_file('./help-sustainability.txt'))
available_texts = (split_up_file('./help-account.txt'))

print(len(available_texts))

Created a chunk of size 302, which is longer than the specified 200
Created a chunk of size 310, which is longer than the specified 200
Created a chunk of size 278, which is longer than the specified 200
Created a chunk of size 244, which is longer than the specified 200
Created a chunk of size 205, which is longer than the specified 200
Created a chunk of size 218, which is longer than the specified 200
Created a chunk of size 253, which is longer than the specified 200


12


There is one other file that has a different structure. This is the shop-locations file. As the structure is different, we made a special importer. The function generates an array of dictionaries containing the available information for a shop.


In [16]:
from unstructured.partition.md import partition_md
from unstructured.staging.base import convert_to_dict
import os

In [17]:
def extract_store_line(input_string):
    # Split the input string by newline and colon
    lines = input_string.split('\n')
    data_dict = {}

    for line in lines:
        if ':' in line:
            key, value = line.split(':', 1)
            key = key.strip()
            value = value.strip()

            # Replace spaces with underscores in keys and convert to lowercase
            key = key.lower().replace(' ', '_')

            data_dict[key] = value

    return data_dict


def extract_stores():
    filename = os.path.join('./', 'help-shop-locations.txt')
    with open(filename, "rb") as f:
        elements = partition_md(filename=filename)

    elements_dict = convert_to_dict(elements)

    available_stores = []
    current_store = {}
    for el in elements_dict:
        line_dict = extract_store_line(el["text"])
        if "store_name" in line_dict:
            available_stores.append(current_store)
            line_dict["opening_hours"] = []
            current_store = line_dict
        else:
            if current_store.get("opening_hours"):
                current_store["opening_hours"].append(line_dict)
            else:
                current_store["opening_hours"] = [line_dict]
    return available_stores

In [18]:
found_stores = extract_stores()
print(found_stores)

[{}, {'store_name': 'Trendy Finds', 'city': 'Pijnacker', 'street': 'Kerkweg 1', 'telephone': '06-12345678', 'opening_hours': [{'monday': '11:00-18:00'}, {'tuesday': '09:00-18:00'}, {'wednesday': '09:00-18:00'}, {'thursday': '11:00-18:00'}, {'friday': '09:00-18:00'}, {'saturday': '10:00-17:00'}, {'sunday': 'Closed'}]}, {'store_name': 'Urban Chic Emporium', 'city': 'Rotterdam', 'street': 'Hoogstraat 23', 'telephone': '06-12345678', 'opening_hours': [{'monday': '10:00-18:00'}, {'tuesday': '10:00-18:00'}, {'wednesday': '10:00-18:00'}, {'thursday': '10:00-20:00'}, {'friday': '10:00-18:00'}, {'saturday': '10:00-17:00'}, {'sunday': 'Closed'}]}, {'store_name': 'The Boutique Haven', 'city': 'Ghent', 'street': 'Veldstraat 8', 'telephone': '06-12345678', 'opening_hours': [{'monday': '09:30-18:00'}, {'tuesday': '09:30-18:00'}, {'wednesday': '09:30-18:00'}, {'thursday': '09:30-18:00'}, {'friday': '09:30-20:00'}, {'saturday': '10:00-17:00'}, {'sunday': 'Closed'}]}, {'store_name': 'Chic & Cozy', 'cit

Next we import the products that are available in the store

In [1]:
import pandas as pd

# df = pd.read_csv('./small.csv')
df = pd.read_csv('./extract-data-brickheadz.csv')
df["image_name"] = df["image_link"].str.rsplit('/').str.get(-1)
df["id"] = df["Position"]

df.head()

Unnamed: 0,Position,product_link,age,number_of_pieces,title,price,image_link,product_description,image_name,id
0,1,https://www.lego.com/nl-nl/product/professors-...,10+,601,Leraren van Zweinstein™,"€39,99",https://www.lego.com/cdn/cs/set/assets/blt8c72...,Dit is een betoverende verrassing voor fans va...,40560.png,1
1,2,https://www.lego.com/nl-nl/product/harry-hermi...,10+,466,"Harry, Hermelien, Ron & Hagrid™","€24,99",https://www.lego.com/cdn/cs/set/assets/bltbc78...,LEGO® BrickHeadz™ versies van 4 van de bekends...,40495.jpg,2
2,3,https://www.lego.com/nl-nl/product/chip-dale-4...,10+,226,Knabbel & Babbel,"€19,99",https://www.lego.com/cdn/cs/set/assets/blt0bac...,Keer terug naar je jeugd met deze leuke LEGO® ...,40550.png,3
3,4,https://www.lego.com/nl-nl/product/woody-and-b...,10+,296,Woody & Bo Peep,"€19,99",https://www.lego.com/cdn/cs/set/assets/blt2bea...,Zorg dat je twee favoriete filmpersonages alti...,40553.png,4
4,5,https://www.lego.com/nl-nl/product/goofy-pluto...,10+,214,Goofy en Pluto,"€14,99",https://www.lego.com/cdn/cs/set/assets/blt4306...,Deze Goofy en Pluto set (40378) met 2 klassiek...,40378.jpg,5


Below we download the images

In [75]:
import requests

for key, value in df.iterrows():
    url = value.image_link
    image_name = url.rsplit('/', 1)[-1]
    response = requests.get(url)

    with open("images/" + image_name, "wb") as f:
        f.write(response.content)

In [2]:
from retriever import find_auth_opensearch, OpenSearchClient

config = find_auth_opensearch()
client = OpenSearchClient(config, alias_name="sg-products")

if client.ping():
    print("We have a connection to the Amazon OpenSearch Cluster")
else:
    print("ERROR: no connection to the Amazon OpenSearch Cluster")

We have a connection to the Amazon OpenSearch Cluster


In [4]:

from retriever import OpenSearchTemplate

template = OpenSearchTemplate(
    client=client,
    index_template_name="sg_product_index_template",
    component_name_settings="sg_product_component_settings",
    component_name_dyn_mappings="sg_product_component_dynamic_mappings",
    component_name_mappings="sg_product_component_mappings"
)

for result in template.create_update_template():
    print(result)

The version 2 of the component template sg_product_component_settings is up-to-date
The version 1 of the component template sg_product_component_dynamic_mappings is up-to-date
Update the component template sg_product_component_mappings to version 1.
Update the template to version 3.


# Loading content
Loading the content is a tricky beast. They make it feel so easy. You have a document, do some chunking, create embeddings, store the embeddings in a vector store and do similarity search. Having a extensive search background, there are so many facets to return relevant results, also for semantic search. Often you want more structure in your content.

To have more control, we are indexing the documents ourselves. We do use some Langchain components to make our life easier.


In [5]:
index_name = client.create_index()
print(f"Index created with the name {index_name}")
# for key, value in df.iterrows():
#     client.index_product(index_name=index_name, product=value.to_dict())

client.switch_alias_to(index_name=index_name)

First we use Langchain to index some of the help content

In [6]:
import os

from langchain.vectorstores import OpenSearchVectorSearch
from langchain.embeddings import OpenAIEmbeddings
from opensearchpy import RequestsHttpConnection
from dotenv import load_dotenv

load_dotenv()

# print(os.getenv('OPEN_AI_API_KEY'))

vector_store = OpenSearchVectorSearch(
    index_name=index_name,
    embedding_function=OpenAIEmbeddings(openai_api_key=os.getenv('OPEN_AI_API_KEY')),
    opensearch_url=f"https://{config['host']}:{config['port']}",
    use_ssl=True,
    verify_certs=True,
    http_auth=config["auth"],
    connection_class=RequestsHttpConnection
)


With the vector store in place, we can start indexing documents. You can use the kwargs to configure some of the engine specific aspects:
- text_field: Name of the field to store the text in
- vector_field: Name of the field to store the vector in


In [7]:

vector_store.add_texts(texts=df["title"].to_list(), metadatas=df.to_dict('records'), ids=df["id"].to_list(), text_field="title", vector_field="title_vector")


[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34]

Now we want to query the vector store to see if it works better than lexical search

In [12]:
found_docs = vector_store.similarity_search_with_score(query="disney",text_field="title", vector_field="title_vector")
print(f"\nResults from: OpenSearch")
for doc, _score in found_docs:
    print(f"{_score} - {doc.page_content}")



Results from: OpenSearch
0.7957365 - Donald Duck
0.75702626 - Vaiana & Merida
0.74488837 - Cruella & Maleficent
0.7438165 - Goofy en Pluto
