In [16]:
import os
import json
import chromadb
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.schema import Document
from chromadb.config import Settings

In [17]:
import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

OPENAI_API_KEY  = os.getenv('OPENAI_API_KEY')

*Step 1: Release the database with the following code*

In [1]:
vectorstore = None  # Release the vector database
import gc
gc.collect()

0

*Step 2: Restart the Kernel associated to the use of the Vectorstore database*

*Step 3: Delete the vectorstore by running the following code*

In [None]:
import os
import shutil

VECTORSTORE_PATH = r"C:\Users\larry\chromadb_store\municipalities_db"  # Update to the actual path

# Delete the existing database folder
if os.path.exists(VECTORSTORE_PATH):
    shutil.rmtree(VECTORSTORE_PATH)
    print("Previous RAG deleted successfully.")
else:
    print("No existing RAG found. Proceeding with a fresh build.")

Previous RAG deleted successfully.


# **Step 1: Data pre-treatment and Loading**

In [2]:
import os
import json
import pandas as pd
import numpy as np

# Ruta donde están los archivos .txt de municipios
municipalities_folder = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\municipalities_cleaning\step3_final_cleaned_data"

# Lista para almacenar la información procesada
processed_municipalities = []

# Recorrer cada archivo en el folder de municipios
for filename in os.listdir(municipalities_folder):
    if filename.endswith(".txt"):
        file_path = os.path.join(municipalities_folder, filename)
        
        # Leer el archivo JSON
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        
        # Extraer los valores de interés
        geo_metadata = data.get("geo_metadata", {})
        
        processed_entry = {
            "title": geo_metadata.get("title", "Unknown"),  # Nombre del municipio
            "coordinates": geo_metadata.get("coordinates", "N/A"),  # Coordenadas geográficas
            "relevant_links": data.get("relevant_links", []),  # Enlaces relevantes
            "content": data.get("main_content", "")  # Contenido principal
        }
        
        processed_municipalities.append(processed_entry)

# Convertir a DataFrame
df_municipalities = pd.DataFrame(processed_municipalities)

# Mostrar información
print(df_municipalities.head())  # Vista previa de los datos

# Convertir a un NumPy array si es necesario
municipalities_array = df_municipalities.to_numpy()
print(f"Shape of NumPy array: {municipalities_array.shape}")  # Dimensiones del array


          title            coordinates  \
0      Adjuntas  [18.16278, -66.72222]   
1        Aguada  [18.37944, -67.18833]   
2     Aguadilla     [18.43, -67.15444]   
3  Aguas Buenas  [18.25694, -66.10306]   
4      Aibonito     [18.14, -66.26611]   

                                      relevant_links  \
0  [https://en.wikipedia.org/w/index.php?title=Ad...   
1  [https://en.wikivoyage.org/wiki/Aguada, https:...   
2  [https://es.wikipedia.org/wiki/Aguadilla, http...   
3  [https://en.wikivoyage.org/wiki/Aguas_Buenas, ...   
4  [https://en.wikivoyage.org/wiki/Aibonito, http...   

                                             content  
0  Contents\nAdjuntas, Puerto Rico\n\n\nAdjuntas ...  
1  Contents\nAguada, Puerto Rico\n\n\nAguada , or...  
2  Contents\nAguadilla, Puerto Rico\n\n\nAguadill...  
3  Contents\nAguas Buenas, Puerto Rico\n\n\nAguas...  
4  Contents\nAibonito, Puerto Rico\n\n\nAibonito ...  
Shape of NumPy array: (78, 4)


In [3]:
df_municipalities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78 entries, 0 to 77
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           78 non-null     object
 1   coordinates     77 non-null     object
 2   relevant_links  78 non-null     object
 3   content         78 non-null     object
dtypes: object(4)
memory usage: 2.6+ KB


In [5]:
# Statistics on the document lengths for evaluation and decision-making on the chunking strategy
import numpy as np

# Compute document lengths again
lengths = [len(doc.split()) for doc in df_municipalities["content"]]

# Compute statistics
avg_length = np.mean(lengths)
max_length = np.max(lengths)
min_length = np.min(lengths)
std_dev = np.std(lengths)

print(f"Total landmarks processed: {len(df_municipalities)}")
print(f"Average document length: {avg_length:.2f} words")
print(f"Max length: {max_length} words | Min length: {min_length} words")
print(f"Standard deviation of lengths: {std_dev:.2f}")


Total landmarks processed: 78
Average document length: 2048.62 words
Max length: 11387 words | Min length: 641 words
Standard deviation of lengths: 1792.54


In [7]:
import json
import os
from langchain.schema import Document

def process_txt_file(filepath):
    """ Extract metadata and content from a .txt file """
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.loads(f.read())  # Assuming JSON structure inside .txt

    # Extract content
    content = data.get("main_content", "")

    # Extract metadata
    geo_metadata = data.get("geo_metadata", {}) or {}
    title = geo_metadata.get("title", "Unknown Title")

    # Ensure coordinates is always a list
    coordinates = geo_metadata.get("coordinates", [])
    if not isinstance(coordinates, list) or len(coordinates) != 2:
        coordinates_str = "Unknown"
    else:
        coordinates_str = f"{coordinates[0]}, {coordinates[1]}"

    # Extract relevant links
    relevant_links = ", ".join(data.get("relevant_links", []))  

    return Document(
        page_content=content,
        metadata={
            "title": title,
            "coordinates": coordinates_str,
            "relevant_links": relevant_links
        }
    )

# Process all .txt files
documents = []
for filename in os.listdir(municipalities_folder):
    if filename.endswith(".txt"):
        filepath = os.path.join(municipalities_folder, filename)
        doc = process_txt_file(filepath)
        documents.append(doc)

# Check some processed documents
print("Sample Processed Docs:", documents[:3])

Sample Processed Docs: [Document(metadata={'title': 'Adjuntas', 'coordinates': '18.16278, -66.72222', 'relevant_links': 'https://en.wikipedia.org/w/index.php?title=Adjuntas,_Puerto_Rico&oldid=1248121926, https://en.wikivoyage.org/wiki/Adjuntas, https://es.wikipedia.org/wiki/Adjuntas, https://geohack.toolforge.org/geohack.php?pagename=Adjuntas,_Puerto_Rico&params=18_09_46_N_66_43_20_W_region:US-PR_type:city(18020), https://en.wikipedia.org/wiki/Adjuntas,_Puerto_Rico'}, page_content='Contents\nAdjuntas, Puerto Rico\n\\n\nAdjuntas  is a small mountainside town and municipality in Puerto Rico located central midwestern portion of the island on the Cordillera Central , north of Yauco , Guayanilla , and Peñuelas ; southeast of Utuado ; east of Lares and Yauco; and northwest of Ponce . Adjuntas is spread over 16 barrios and Adjuntas Pueblo (the downtown area and the administrative center of the city). Adjuntas is about two hours by car westward from the capital, San Juan .\\n\nAdjuntas is nic

In [8]:
import json

# Define the output file path
output_file = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\pre-chunk clean data\municipalities_step1_docs_ext.json"

# Convert Document objects to a serializable format
serializable_docs = [
    {
        "metadata": doc.metadata,
        "page_content": doc.page_content
    }
    for doc in documents
]

# Save to JSON file
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(serializable_docs, f, indent=4, ensure_ascii=False)

print(f"Successfully saved {len(documents)} documents to {output_file}")

Successfully saved 78 documents to C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\pre-chunk clean data\municipalities_step1_docs_ext.json


# **Step 2: Chunking Data Process**

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Adaptive chunking strategy for municipalities
SMALL_DOC_THRESHOLD = 700  # Increased to better fit the dataset
MEDIUM_DOC_THRESHOLD = 2048  # Adjusted to match the average document length

# Define chunking parameters dynamically
def get_chunk_parameters(doc_length):
    if doc_length <= SMALL_DOC_THRESHOLD:
        return None  # No chunking needed
    elif doc_length <= MEDIUM_DOC_THRESHOLD:
        return {"chunk_size": 512, "chunk_overlap": 100}
    else:
        return {"chunk_size": 1024, "chunk_overlap": 200}

chunked_documents = []

processed_docs = documents.copy()  # Assuming updated_docs has been pre-processed

for doc in processed_docs:
    title = doc.metadata.get("title", "Unknown Title")
    coordinates = doc.metadata.get("coordinates", "Unknown")
    relevant_links = doc.metadata.get("relevant_links", "")

    doc_length = len(doc.page_content.split())

    chunk_params = get_chunk_parameters(doc_length)
    if chunk_params is None:
        # Small docs, no chunking, add as-is
        chunked_documents.append({
            "content": doc.page_content,
            "metadata": {
                "title": title,
                "coordinates": coordinates,
                "relevant_links": relevant_links
            }
        })
    else:
        # Apply optimized chunking
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_params["chunk_size"],
            chunk_overlap=chunk_params["chunk_overlap"]
        )
        chunks = text_splitter.split_documents([doc])  # Maintain document structure

        for chunk in chunks:
            chunked_documents.append({
                "content": chunk.page_content,
                "metadata": {
                    "title": title,
                    "coordinates": coordinates,
                    "relevant_links": relevant_links
                }
            })

print(f"Total chunks created: {len(chunked_documents)}")

Total chunks created: 1998


**Step No. 2a: Qucik Chunks Verification**

In [10]:
# Check first 5 chunks for verification
for i, chunk in enumerate(chunked_documents[:5]):
    print(f"🔹 Chunk {i+1}: {len(chunk['content'].split())} words | Title: {chunk['metadata']['title']}")
    print(chunk['content'][:300])  # Preview first 300 chars
    print("-" * 80)

🔹 Chunk 1: 84 words | Title: Adjuntas
Contents
Adjuntas, Puerto Rico
\n
Adjuntas  is a small mountainside town and municipality in Puerto Rico located central midwestern portion of the island on the Cordillera Central , north of Yauco , Guayanilla , and Peñuelas ; southeast of Utuado ; east of Lares and Yauco; and northwest of Ponce . A
--------------------------------------------------------------------------------
🔹 Chunk 2: 119 words | Title: Adjuntas
Adjuntas is nicknamed "the Switzerland of Puerto Rico", because of its relatively chilly weather. Many Puerto Rican mountain towns have cooler weather than the rest of the island; Adjuntas is no exception: the average yearly weather is 70 °F (21 °C ) (High: 83 °F/28 °C; Low: 58 °F/14 °C).  Puerto Ri
--------------------------------------------------------------------------------
🔹 Chunk 3: 168 words | Title: Adjuntas
Adjuntas\' ZIP Code, 00601, is the lowest standard ZIP code in the United States ZIP code system .\n
Etymology and nickn

In [11]:
import json

# Define the output file path
output_file = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\chunked data\municipalities_chunks.json"

# Save chunked documents as JSON
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(chunked_documents, f, indent=4, ensure_ascii=False)

print(f"Successfully saved {len(chunked_documents)} chunks to {output_file}")

Successfully saved 1998 chunks to C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\chunked data\municipalities_chunks.json


In [14]:
print(f"Loaded {len(chunked_documents)} chunks")
print("Sample Chunk:", chunked_documents[0])

Loaded 5088 chunks
Sample Chunk: {'content': 'Contents\nAcademia del Perpetuo Socorro\nMrs. Jeannette Sánchez (1-6)\\n\nAcademia del Perpetuo Socorro (English: "Academy of Our Lady of Perpetual Help") was founded in 1921 as a Catholic parochial school of the Perpetuo Socorro Parish at the Archdiocese of San Juan, Puerto Rico . The school is located in Miramar in Puerto Rico \\\'s capital city of San Juan . Students, teachers and alumni commonly refer to their school as Perpetuo.  \\n\nMission\nThe goal of the  Academia del Perpetuo Socorro is the complete development of the student - spiritually, mentally, morally, socially, culturally, and physically in order to live a full life and to prepare for his/her final goal - union with God. To this end, the school works to create a Catholic academic community in which faith, knowledge, and recreation are shared in a spirit of freedom, love, and creativity. The school, however, does not emphasize religion over other subjects, accepts students

# **Step 3: Initialize ChromaDB**

In [12]:
import chromadb
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.schema import Document
from chromadb.config import Settings

# **Step 3: Create a Persistent ChromaDB Client**

In [13]:
municipalitiesdb_path = r"C:\Users\larry\chromadb_store\municipalities_db"
chroma_client = chromadb.PersistentClient(path=municipalitiesdb_path)

# **Step 4: Create a Collection**

In [14]:
# Define the name of the collection
collection_name = "municipalities_rag"

# Check if the collection already exists and delete it to avoid duplication
existing_collections = [col.name for col in chroma_client.list_collections()]
if collection_name in existing_collections:
    chroma_client.delete_collection(name=collection_name)

# Create a new collection in ChromaDB
collection = chroma_client.get_or_create_collection(name=collection_name)

# **Step 5: Embeddings Preparation Using OpenAI**

In [18]:
# Initialize OpenAI's embedding model
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002", api_key=OPENAI_API_KEY)

# Prepare lists to store data before inserting into ChromaDB
documents = []  # Store the main content (text) of each landmark
metadatas = []  # Store metadata (title, coordinates, categories, etc.)
ids = []        # Unique document IDs for ChromaDB
embeddings = [] # Store generated embedding vectors

# **Step 6: Process and Embed the Municipalities Data**

In [19]:
for idx, doc in enumerate(chunked_documents):
    doc_id = f"chunk_{idx}"  # Ensure unique ID per chunk
    document_text = doc["content"]

    # Generate embedding for each chunk
    embedding_vector = embedding_model.embed_query(document_text)

    # Store metadata
    metadata = doc["metadata"]

    # Store in lists for batch insertion
    documents.append(document_text)
    metadatas.append(metadata)
    ids.append(doc_id)
    embeddings.append(embedding_vector)


# **Step 7: Insert Data into ChromaDB**

In [20]:
# Insert chunked embeddings into ChromaDB
collection.add(
    ids=ids,                 # Unique document IDs
    embeddings=embeddings,   # Precomputed embeddings
    metadatas=metadatas,     # Metadata for filtering and retrieval
    documents=documents      # Original landmark descriptions
)

# Confirm successful insertion by checking the number of documents stored
print(f"Number of chunks stored in ChromaDB: {collection.count()}")
print(f"Number of documents chunks: {len(chunked_documents)}")

Number of chunks stored in ChromaDB: 1998
Number of documents chunks: 1998


# **Step 8: Perform a Test Query**

In [27]:
# Load the stored vectorstore from ChromaDB
vectorstore = Chroma(
    persist_directory=municipalitiesdb_path, 
    embedding_function=embedding_model
)

# Define a sample query
query_text = "Which municipalities of Puerto Rico have more spanish heritage"

# Perform a similarity search in ChromaDB
retrieval_results = vectorstore.similarity_search(query_text, k=5)  # Retrieve top 5 most relevant results

# Display retrieved results with similarity scores
print("\n**Search Results with Similarity Scores:**\n")
for idx, (doc, score) in enumerate(retrieval_results):
    print(f"Result {idx+1}:")
    print(f"Title: {doc.metadata.get('title', 'Unknown')}")
    print(f"Similarity Score: {score:.4f}")  # Display similarity score
    print(f"Snippet: {doc.page_content[:300]}...")  # Show first 300 characters
    print("-" * 80)


**Search Results with Similarity Scores:**

