In [None]:
#%pip install langchain
#%pip install unstructured
#%pip install markdown

# Import content

Start by importing content. The content can be used to create embeddings and store in a vector store.

At the moment there are three files that are in the format of smaller blocks of text in a file. The following files with content are available:
- help-account.txt
- help-search.txt
- help-sustainability.txt

In the end, all texts are store in an array called _available_texts_.


In [2]:
from langchain.text_splitter import CharacterTextSplitter


def split_up_file(file_name: str):
    with open(file_name) as split_file:
        help_account = split_file.read()

    # Problem with this splitter is that it splits, but also merges if the chunk size is not reached. 
    # BY choosing a small chunk it work fine. But do test id this is what you need.
    text_splitter = CharacterTextSplitter(
        separator="\n\n",
        chunk_size=200,
        chunk_overlap=0,
        length_function=len,
        is_separator_regex=False,
    )

    return text_splitter.create_documents([help_account])


In [6]:
available_texts = (split_up_file('./help-account.txt') 
                   + split_up_file('./help-search.txt') 
                   + split_up_file('./help-sustainability.txt'))

print(len(available_texts))

Created a chunk of size 302, which is longer than the specified 200
Created a chunk of size 310, which is longer than the specified 200
Created a chunk of size 278, which is longer than the specified 200
Created a chunk of size 244, which is longer than the specified 200
Created a chunk of size 205, which is longer than the specified 200
Created a chunk of size 218, which is longer than the specified 200
Created a chunk of size 253, which is longer than the specified 200
Created a chunk of size 228, which is longer than the specified 200
Created a chunk of size 203, which is longer than the specified 200
Created a chunk of size 282, which is longer than the specified 200
Created a chunk of size 247, which is longer than the specified 200
Created a chunk of size 234, which is longer than the specified 200
Created a chunk of size 242, which is longer than the specified 200
Created a chunk of size 256, which is longer than the specified 200
Created a chunk of size 234, which is longer tha

36


There is one other file that has a different structure. This is the shop-locations file. As the structure is different, we made a special importer. The function generates an array of dictionaries containing the available information for a shop.


In [16]:
from unstructured.partition.md import partition_md
from unstructured.staging.base import convert_to_dict
import os

In [17]:
def extract_store_line(input_string):
    # Split the input string by newline and colon
    lines = input_string.split('\n')
    data_dict = {}

    for line in lines:
        if ':' in line:
            key, value = line.split(':', 1)
            key = key.strip()
            value = value.strip()

            # Replace spaces with underscores in keys and convert to lowercase
            key = key.lower().replace(' ', '_')

            data_dict[key] = value

    return data_dict


def extract_stores():
    filename = os.path.join('./', 'help-shop-locations.txt')
    with open(filename, "rb") as f:
        elements = partition_md(filename=filename)

    elements_dict = convert_to_dict(elements)
    
    available_stores = []
    current_store = {}
    for el in elements_dict:
        line_dict = extract_store_line(el["text"])
        if "store_name" in line_dict:
            available_stores.append(current_store)
            line_dict["opening_hours"] = []
            current_store = line_dict
        else:
            if current_store.get("opening_hours"):
                current_store["opening_hours"].append(line_dict)
            else:
                current_store["opening_hours"] = [line_dict]
    return available_stores

In [18]:
found_stores = extract_stores()
print(found_stores)

[{}, {'store_name': 'Trendy Finds', 'city': 'Pijnacker', 'street': 'Kerkweg 1', 'telephone': '06-12345678', 'opening_hours': [{'monday': '11:00-18:00'}, {'tuesday': '09:00-18:00'}, {'wednesday': '09:00-18:00'}, {'thursday': '11:00-18:00'}, {'friday': '09:00-18:00'}, {'saturday': '10:00-17:00'}, {'sunday': 'Closed'}]}, {'store_name': 'Urban Chic Emporium', 'city': 'Rotterdam', 'street': 'Hoogstraat 23', 'telephone': '06-12345678', 'opening_hours': [{'monday': '10:00-18:00'}, {'tuesday': '10:00-18:00'}, {'wednesday': '10:00-18:00'}, {'thursday': '10:00-20:00'}, {'friday': '10:00-18:00'}, {'saturday': '10:00-17:00'}, {'sunday': 'Closed'}]}, {'store_name': 'The Boutique Haven', 'city': 'Ghent', 'street': 'Veldstraat 8', 'telephone': '06-12345678', 'opening_hours': [{'monday': '09:30-18:00'}, {'tuesday': '09:30-18:00'}, {'wednesday': '09:30-18:00'}, {'thursday': '09:30-18:00'}, {'friday': '09:30-20:00'}, {'saturday': '10:00-17:00'}, {'sunday': 'Closed'}]}, {'store_name': 'Chic & Cozy', 'cit