In [62]:
import glob
import chromadb
from dataclasses import dataclass

exclude = ['./CLAUDE.md','./progress-tracker.md','./project-description.md']
list_of_files = glob.glob('meridian-islands/**/*.md') 
filtered_list = []
for a in list_of_files:
    if a not in exclude:
        filtered_list.append(a)

filtered_list = sorted(filtered_list)

@dataclass
class Chunk:
        text: str
        chunk_id: int
        start_pos: int
        end_pos: int
        word_count: int
        origin_file: str


class Chunker:
    def __init__(self,chunk_size: int,overlap: int):
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.id = 0
        
       
    def chunk(self, given_list: str) -> list[Chunk]:

        with open(given_list, "r", encoding="utf-8") as f:
            converted_text = f.read() 
        #print(given_list)
        chunk_size = self.chunk_size
        overlap = self.overlap
        chunks = []
        
        words = converted_text.split()  

        for start in (range(0, len(words), chunk_size)):
            if start == 0:
                adj_start = start
            else:
                adj_start = max(0, start - overlap)

            end = min(adj_start+chunk_size, len(words))
            chunk = words[adj_start:end]

            chunks.append(
                Chunk(
                    text=chunk,
                    chunk_id=self.id,
                    start_pos=adj_start,
                    end_pos=end-1,
                    word_count=len(chunk),
                    origin_file=given_list
                )
            )
            self.id += 1
            
        return chunks
    
chunker: Chunker = Chunker(150,25)

all_chunks = []
for a in filtered_list:
    chunks: list = chunker.chunk(a)
    all_chunks.extend(chunks)

for c in all_chunks[-2:]:
    print(c,"\n")

for c in all_chunks:
    print(c.chunk_id)

Chunk(text=['touch', 'coral', 'or', 'marine', 'animals,', 'some', 'species', 'dangerous', 'if', 'disturbed.', '**Cultural', 'Sensitivity**:', 'Respect', 'photography', 'restrictions', 'at', 'sacred', 'sites,', 'dress', 'modestly', 'in', 'villages.', 'Emergency', 'services:', 'Police,', 'Fire,', 'Medical', '-', 'Phone', '911', 'Tourist', 'Police:', '+684-5555-7777', '(English-speaking', 'officers)', 'Medical', 'Center:', '+684-5555-8888', '(24-hour', 'emergency)', '###', 'Best', 'Times', 'to', 'Visit', '**Optimal', 'conditions**:', 'May', 'through', 'October', 'offers', 'cooler', 'temperatures', 'and', 'less', 'rain', '**Avoid**:', 'December', 'through', 'February', 'can', 'be', 'very', 'hot', 'and', 'humid', '**Cultural', 'Events**:', 'March', 'and', 'September', 'offer', 'most', 'festivals', '**Marine', 'Activities**:', 'Visibility', 'best', 'during', 'dry', 'season', '**Mountain', 'Activities**:', 'Year-round', 'but', 'easier', 'during', 'dry', 'season', '###', 'Budget', 'Planning', 

In [None]:
import chromadb
from chromadb.utils import embedding_functions

client = chromadb.PersistentClient(path="./chroma_db")
default_ef = embedding_functions.DefaultEmbeddingFunction()
collection = client.get_or_create_collection(
    name="meridian_travel_docs",
    embedding_function=default_ef
)

chunk_ids = []
docs = []
metas = []
for chunk in all_chunks:
    chunk_ids.append(str(chunk.chunk_id))

    txt = " ".join(chunk.text)   
    docs.append(txt)

    metas.append({
        "origin_file": chunk.origin_file,
        "start_pos": chunk.start_pos,
        "end_pos": chunk.end_pos
    })
collection.add(
    ids=chunk_ids,
    documents=docs,
    metadatas=metas
)

queries = ["What kind of accomodations are in there?",
           "What are the best restaurants?"]
results = collection.query(query_texts=queries, n_results=3)

query_choice = 0

for i in range(len(results["ids"][query_choice])):       
    print(f"id={results['ids'][query_choice][i]} dist={results['distances'][query_choice][i]:.3f}")
    print(results["documents"][query_choice][i])        
                  


id=126 dist=0.895
private bathroom facilities with hot water - Basic furnishings: bed, storage, seating area - Fan cooling or natural ventilation (air conditioning rare) - Shared or private access to kitchen facilities - Common areas for socializing and relaxation **Common Facilities:** - WiFi internet access in common areas - Laundry facilities (washing machines or hand-washing areas) - Luggage storage before check-in and after check-out - Information desk with local knowledge and tour booking - Bicycle rental or loan programs - Cultural activity organization and coordination ### Enhanced Budget Services **Cultural Integration Programs:** - Traditional meal sharing with host families - Cultural workshop participation and skill learning - Community event and celebration participation - Language exchange and learning opportunities - Traditional knowledge sharing with elders - Village and community life integration experiences **Activity Support:** - Adventure equipment rental and basic 