In [1]:
import os
import json
import chromadb
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.schema import Document
from chromadb.config import Settings
from dotenv import load_dotenv, find_dotenv

In [2]:
_ = load_dotenv(find_dotenv())

OPENAI_API_KEY  = os.getenv('OPENAI_API_KEY')
OPENWEATHER_API_KEY = os.getenv('OPENWEATHER_API_KEY')

*Step 1: Release the database with the following code*

In [None]:
vectorstore = None  # Release the vector database
import gc
gc.collect()

*Step 2: Restart the Kernel associated to the use of the Vectorstore database*

*Step 3: Delete the vectorstore by running the following code*

In [None]:
import os
import shutil

VECTORSTORE_PATH = r"C:\Users\larry\chromadb_store\landmarks_db"  # Update to the actual path

# Delete the existing database folder
if os.path.exists(VECTORSTORE_PATH):
    shutil.rmtree(VECTORSTORE_PATH)
    print("Previous RAG deleted successfully.")
else:
    print("No existing RAG found. Proceeding with a fresh build.")

# **Step 1: Data pre-treatment and Loading**

In [None]:
import json
import pandas as pd
import numpy as np

# Load the JSON dataset
json_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\pre-chunk clean data\landmarks_step1a_updated_cat.json"
with open(json_path, "r", encoding="utf-8") as f:
    landmarks_data = json.load(f)

# Process and structure the data for ingestion
processed_landmarks = []

for entry in landmarks_data:
    metadata = entry["metadata"]
    document_text = entry["page_content"]
    
    # Extract relevant metadata fields
    processed_entry = {
        "title": metadata.get("title", "Unknown"),
        "coordinates": metadata.get("coordinates", "N/A"),
        "categories": metadata.get("categories", "").split(", "),  # Convert categories to a list
        "relevant_links": metadata.get("relevant_links", "").split(", "),  # Convert links to a list
        "content": document_text  # Store main content
    }

    processed_landmarks.append(processed_entry)

# Convert processed data into a Pandas DataFrame
df_landmarks = pd.DataFrame(processed_landmarks)

# Display DataFrame information
print(df_landmarks.head())  # Show the first few rows

# Convert DataFrame to a NumPy array
landmarks_array = df_landmarks.to_numpy()
print(f"Shape of NumPy array: {landmarks_array.shape}")  # Display array shape


In [None]:
df_landmarks.info()

In [None]:
# Statistics on the document lengths for evaluation and decision-making on the chunking strategy
import numpy as np

# Compute document lengths again
lengths = [len(doc.split()) for doc in df_landmarks["content"]]

# Compute statistics
avg_length = np.mean(lengths)
max_length = np.max(lengths)
min_length = np.min(lengths)
std_dev = np.std(lengths)

print(f"Total landmarks processed: {len(df_landmarks)}")
print(f"Average document length: {avg_length:.2f} words")
print(f"Max length: {max_length} words | Min length: {min_length} words")
print(f"Standard deviation of lengths: {std_dev:.2f}")


In [None]:
df_landmarks.to_csv("df_landmarks_backup.csv", index=False)
print("DataFrame saved as CSV.")

In [None]:
import numpy as np

np.save("landmarks_array_backup.npy", landmarks_array)
print("NumPy array saved.")

# **Step 2: Chunking Data Process**

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Define chunking thresholds
SMALL_DOC_THRESHOLD = 500
MEDIUM_DOC_THRESHOLD = 2000

# Function to determine chunking parameters
def get_chunk_parameters(doc_length):
    if doc_length <= SMALL_DOC_THRESHOLD:
        return None  # No chunking needed
    elif doc_length <= MEDIUM_DOC_THRESHOLD:
        return {"chunk_size": 500, "chunk_overlap": 50}
    else:
        return {"chunk_size": 1000, "chunk_overlap": 100}

# Apply chunking where needed
chunked_documents = []

for idx, row in df_landmarks.iterrows():
    title = row["title"]
    coordinates = row["coordinates"]
    categories = ", ".join(row["categories"])
    relevant_links = ", ".join(row["relevant_links"])
    
    document_text = row["content"]
    doc_length = len(document_text.split())

    chunk_params = get_chunk_parameters(doc_length)
    
    if chunk_params is None:
        # Small docs, store as is
        chunked_documents.append({
            "content": document_text,
            "metadata": {
                "title": title,
                "coordinates": coordinates,
                "categories": categories,
                "relevant_links": relevant_links
            }
        })
    else:
        # Apply chunking
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_params["chunk_size"],
            chunk_overlap=chunk_params["chunk_overlap"]
        )
        chunks = text_splitter.split_text(document_text)

        for chunk in chunks:
            chunked_documents.append({
                "content": chunk,
                "metadata": {
                    "title": title,
                    "coordinates": coordinates,
                    "categories": categories,
                    "relevant_links": relevant_links
                }
            })

print(f"Total chunks created: {len(chunked_documents)}")

**Step No. 2a: Qucik Chunks Verification**

In [None]:
# Check first 5 chunks for verification
for i, chunk in enumerate(chunked_documents[:5]):
    print(f"🔹 Chunk {i+1}: {len(chunk['content'].split())} words | Title: {chunk['metadata']['title']}")
    print(chunk['content'][:300])  # Preview first 300 chars
    print("-" * 80)

**Step No. 2b: Categories Distribution Verification**

In [None]:
# Check categories distribution
from collections import Counter
category_counts = Counter([doc['metadata']['categories'] for doc in chunked_documents])
print(category_counts)

In [None]:
import json

# Define the output file path
output_file = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\chunked data\landmarks_chunks_12feb25v03.json"

# Save chunked documents as JSON
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(chunked_documents, f, indent=4, ensure_ascii=False)

print(f"Successfully saved {len(chunked_documents)} chunks to {output_file}")

In [None]:
print(f"Loaded {len(chunked_documents)} chunks")
print("Sample Chunk:", chunked_documents[0])

# **Step 3: Initialize ChromaDB**

In [None]:
import chromadb
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.schema import Document
from chromadb.config import Settings

# **Step 4: Create a Persistent ChromaDB Client**

In [None]:
!pip uninstall chromadb

In [None]:

landmarksdb_path = r"C:\Users\larry\chromadb_store\landmarks_db"

chroma_client = chromadb.PersistentClient(path=landmarksdb_path)

#chroma_client = chromadb.Client(Settings(
#    chroma_db_impl="duckdb+parquet", # Use DuckDB as database with Parquet storage format
#    persist_directory=landmarksdb_path)) # Store the database in the specified directory

# **Step 5: Create a Collection**

In [None]:
# Define the name of the collection
collection_name = "landmarks_rag"

# Check if the collection already exists and delete it to avoid duplication
existing_collections = [col.name for col in chroma_client.list_collections()]
if collection_name in existing_collections:
    chroma_client.delete_collection(name=collection_name)

# Create a new collection in ChromaDB
collection = chroma_client.get_or_create_collection(name=collection_name)

In [None]:
chroma_client.list_collections()

# **Step 6: Embeddings Preparation Using OpenAI**

In [None]:
# Initialize OpenAI's embedding model
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002", api_key=OPENAI_API_KEY)

# Prepare lists to store data before inserting into ChromaDB
documents = []  # Store the main content (text) of each landmark
metadatas = []  # Store metadata (title, coordinates, categories, etc.)
ids = []        # Unique document IDs for ChromaDB
embeddings = [] # Store generated embedding vectors

# **Step 7: Process and Embed the Landmarks Data**

In [None]:
for idx, doc in enumerate(chunked_documents):
    doc_id = f"chunk_{idx}"  # Ensure unique ID per chunk
    document_text = doc["content"]

    # Generate embedding for each chunk
    embedding_vector = embedding_model.embed_query(document_text)

    # Store metadata
    metadata = doc["metadata"]

    # Store in lists for batch insertion
    documents.append(document_text)
    metadatas.append(metadata)
    ids.append(doc_id)
    embeddings.append(embedding_vector)


# **Step 8: Insert Data into ChromaDB**

In [None]:
# Insert chunked embeddings into ChromaDB
collection.add(
    ids=ids,                 # Unique document IDs
    embeddings=embeddings,   # Precomputed embeddings
    metadatas=metadatas,     # Metadata for filtering and retrieval
    documents=documents      # Original landmark descriptions
)

# Confirm successful insertion by checking the number of documents stored
print(f"Number of chunks stored in ChromaDB: {collection.count()}")
print(f"Number of documents chunks: {len(chunked_documents)}")

# **Step 9: Perform a Test Query**

In [31]:
# Ensure we use the same embedding model
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")

# Define a sample query
query_text = "historical site with a large museum in Puerto Rico"

# Generate embedding for the query
query_embedding = embedding_model.embed_query(query_text)

# Perform a similarity search in ChromaDB using the query embedding
retrieval_results = collection.query(
    query_embeddings=[query_embedding],  # Use embeddings instead of raw text
    n_results=3  # Retrieve top 3 most relevant results
)

# Display retrieved results
for i, result in enumerate(retrieval_results["documents"][0]):
    print(f"\nResult {i+1}:")
    print(f"Title: {retrieval_results['metadatas'][0][i]['title']}")
    print(f"Coordinates: {retrieval_results['metadatas'][0][i]['coordinates']}")
    print(f"Categories: {retrieval_results['metadatas'][0][i]['categories']}")
    print(f"Content: {result[:500]}...")  # Show first 500 characters

AttributeError: 'Chroma' object has no attribute 'query'

In [None]:
# Ensure we use the same embedding model
#embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")

# Define a sample query
query_text = "best beaches for family vacations in Puerto Rico"

# Generate embedding for the query
query_embedding = embedding_model.embed_query(query_text)

# Perform a similarity search in ChromaDB using the query embedding
retrieval_results = collection.query(
    query_embeddings=[query_embedding],  # Use embeddings instead of raw text
    n_results=5  # Retrieve top 3 most relevant results
)

# Display retrieved results
for i, result in enumerate(retrieval_results["documents"][0]):
    print(f"\nResult {i+1}:")
    print(f"Title: {retrieval_results['metadatas'][0][i]['title']}")
    print(f"Coordinates: {retrieval_results['metadatas'][0][i]['coordinates']}")
    print(f"Categories: {retrieval_results['metadatas'][0][i]['categories']}")
    print(f"Content: {result[:500]}...")  # Show first 500 characters

**Update Your Retrieval Query to Include Similarity Scores**

In [None]:
# Ensure query text is embedded
query_embedding = embedding_model.embed_query(query_text)

# Define a sample query
query_text = "Which is the top ranked beach in Puerto Rico?"

# Perform a similarity search and include the similarity scores
retrieval_results = collection.query(
    query_embeddings=[query_embedding],  
    n_results=10,  # Retrieve top 5 results
    include=["documents", "metadatas", "distances"]  # Add similarity scores
)

# Display retrieved results with similarity scores
for i, result in enumerate(retrieval_results["documents"][0]):
    similarity_score = 1 - retrieval_results["distances"][0][i]  # Convert distance to similarity
    print(f"\nResult {i+1}: (Similarity Score: {similarity_score:.4f})")
    print(f"Title: {retrieval_results['metadatas'][0][i]['title']}")
    print(f"Coordinates: {retrieval_results['metadatas'][0][i]['coordinates']}")
    print(f"Categories: {retrieval_results['metadatas'][0][i]['categories']}")
    print(f"Content: {result[:500]}...")  # Show first 500 characters

In [None]:
# Ensure query text is embedded
query_embedding = embedding_model.embed_query(query_text)

# Define a sample query
query_text = "Tell me about El Yunque."

# Perform a similarity search and include the similarity scores
retrieval_results = collection.query(
    query_embeddings=[query_embedding],  
    n_results=10,  # Retrieve top 5 results
    include=["documents", "metadatas", "distances"]  # Add similarity scores
)

# Display retrieved results with similarity scores
for i, result in enumerate(retrieval_results["documents"][0]):
    similarity_score = 1 - retrieval_results["distances"][0][i]  # Convert distance to similarity
    print(f"\nResult {i+1}: (Similarity Score: {similarity_score:.4f})")
    print(f"Title: {retrieval_results['metadatas'][0][i]['title']}")
    print(f"Coordinates: {retrieval_results['metadatas'][0][i]['coordinates']}")
    print(f"Categories: {retrieval_results['metadatas'][0][i]['categories']}")
    print(f"Content: {result[:500]}...")  # Show first 500 characters

In [None]:
collection.peek()

In [29]:
print(f"Number of chunks stored in ChromaDB: {collection.count()}")

NameError: name 'collection' is not defined

In [28]:
print(collection.get())

NameError: name 'collection' is not defined

In [None]:
from langchain_openai import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
chain = ConversationalRetrievalChain.from_llm(
    OpenAI(temperature=0),
    vectorstore.as_retriever(), # see below for vectorstore definition
    memory=memory,
    condense_question_prompt=condense_prompt,
    combine_docs_chain_kwargs=dict(prompt=combine_docs_custom_prompt)
)

In [None]:
from langchain.prompts import PromptTemplate

condense_prompt = PromptTemplate(
    input_variables=["chat_history", "question"],
    template="""
Given the conversation history and the latest user query, rephrase the question to be standalone, keeping it concise but maintaining all necessary details.

### Conversation History:
{chat_history}

### Latest User Query:
{question}

### Standalone Question for Retrieval:
""")

In [None]:
combine_docs_custom_prompt = PromptTemplate(
    input_variables=["chat_history", "question", "context"],
    template="""
You are an AI travel planner helping users design an itinerary. Use the retrieved information about landmarks and the user's past preferences to generate a relevant and coherent travel recommendation.

### Conversation History:
{chat_history}

### User's Latest Question:
{question}

### Retrieved Landmark Information:
{context}

### Instructions:
- Provide a well-structured travel recommendation based on the retrieved landmarks.
- Ensure continuity with previous discussions.
- Prioritize landmarks that match the user’s preferences.
- If multiple options exist, suggest the best ones with reasoning.
- Avoid repeating information already given in the conversation.
- In the end ask the user which of these locations you will like to visit.

### Final Answer:
""")


In [None]:
import requests
from langchain.chat_models import ChatOpenAI
from langchain.tools import Tool
from langchain.agents import initialize_agent, AgentType

OPENWEATHER_API_KEY = "593b25670563b5c443274eb3677c971e"

def get_weather(city: str):
    """Fetches the current weather for a given city using OpenWeather API."""
    base_url = "http://api.openweathermap.org/data/2.5/weather"
    params = {
        "q": city,
        "appid": OPENWEATHER_API_KEY,
        "units": "metric",  # Get temperature in Celsius
    }
    
    try:
        response = requests.get(base_url, params=params)
        data = response.json()
        
        if response.status_code == 200:
            weather_description = data["weather"][0]["description"]
            temperature = data["main"]["temp"]
            humidity = data["main"]["humidity"]
            return f"The weather in {city} is {weather_description} with a temperature of {temperature}°C and {humidity}% humidity."
        else:
            return f"Error fetching weather for {city}: {data.get('message', 'Unknown error')}"
    
    except Exception as e:
        return f"API request failed: {str(e)}"

# Define the tool for the agent
weather_tool = Tool(
    name="Weather Info",
    func=get_weather,
    description="Provides real-time weather details for a given city."
)

In [None]:
from langchain_openai import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_openai import OpenAIEmbeddings
from langchain_core.tools import tool
from langchain_community.utilities.openweathermap import OpenWeatherMapAPIWrapper
from langchain.agents import load_tools

#Initialize LLM with ReAct Chain
llm = ChatOpenAI(api_key=OPENAI_API_KEY, temperature=0, model="gpt-4-turbo")

agent_weather = initialize_agent(
    tools=[weather_tool],  # Attach weather tool
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,  # Enables reasoning & tool use
    verbose=True
)


#tools = [weather_tool]


#llm_with_tools = llm.bind_tools(tools)

# Define memory to store conversation history
memory = ConversationBufferMemory(
    memory_key="chat_history", return_messages=True
)

# Function to print chat history
def print_chat_history():
    print("\nChat History:")
    for idx, msg in enumerate(memory.chat_memory.messages):
        role = "User" if msg.type == "human" else "AI"
        print(f"{role}: {msg.content}")

# Connect to your existing ChromaDB collection
vectorstore = Chroma(
    collection_name="landmarks_rag",
    embedding_function=OpenAIEmbeddings()
)

#Define the retriever and chain
retriever = vectorstore.as_retriever(k=3)  # This fetches relevant landmarks

# Set up the conversational retrieval chain
retrieval_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,  # Using OpenAI's ChatGPT as the LLM
    retriever=retriever,  # Connect to your ChromaDB retriever
    memory=memory,
    condense_question_prompt=condense_prompt,  # Ensures refined queries for retrieval
    combine_docs_chain_kwargs=dict(prompt=combine_docs_custom_prompt)  # Customizes how retrieved docs are used
)

# Wrap QA Chain as a Tool 
qa_tool = Tool( name="Puerto Rico Travel Guide", func=retrieval_chain.run, description="Retrieve the best places to visit in Puerto Rico based on user queries." )


In [None]:
from langchain.chains import ConversationalRetrievalChain, LLMChain

# =======================# 2. Extracting Locations from QA Response# ======================= 
location_extraction_prompt = PromptTemplate( input_variables=["response"], 
                                            template=""" Extract only the location names from the following text: "{response}" Provide the locations as a comma-separated list. """ 
                                            ) 
                                            
location_extraction_chain = LLMChain( llm=ChatOpenAI(model_name="gpt-4"),
                                     prompt=location_extraction_prompt )

def extract_locations_from_response(response):
    """Extracts locations using the LLM chain."""
    location_list = location_extraction_chain.run(response)
    return [loc.strip() for loc in location_list.split(",") if loc.strip()] 

# =======================# 3. Asking the User for Their Selected Places# =======================
def ask_user_for_places(query): 
    """Asks the user to select places they are interested in visiting."""
    # Retrieve recommended locations from QA #
    recommended_places = qa_tool.run(query) 
    extracted_locations = extract_locations_from_response(recommended_places)
    if extracted_locations:
        weather_details = get_weather_for_selected_places(extracted_locations)
        res = f"Here are some great places to visit: {', '.join(extracted_locations)}. {weather_details}. Which ones do you want to visit?"
        return res
    else:
        return "I couldn't find relevant locations. Please try another query."
    # Wrap as a Tool

ask_places_tool = Tool( name="Ask User for Selected Places", func=ask_user_for_places, description="Ask the user which places they want to visit from the recommended list." )

In [None]:
# =======================# 4. Getting Weather for Selected Locations# =======================# 
def get_weather(location):
    """Fetch real-time weather for a given location."""
    api_key = "593b25670563b5c443274eb3677c971e"  # Replace with your API key
    base_url = "http://api.openweathermap.org/data/2.5/weather"
    params = {"q": location, "appid": api_key, "units": "metric"}
    response = requests.get(base_url, params=params) 
    
    if response.status_code == 200:
        data = response.json()
        weather = data["weather"][0]["description"]
        temp = data["main"]["temp"]
        return f"The weather in {location} is {weather} with a temperature of {temp}°C."
    
    else: return f"Could not fetch weather data for {location}."
    
def get_weather_for_selected_places(selected_places):
    """Fetches weather for the user-selected locations."""
    locations = [loc.strip() for loc in selected_places.split(",") if loc.strip()]
    if not locations:
        return "Please provide at least one valid location."
    weather_reports = [get_weather(location) for location in locations]
    return "\n".join(weather_reports) 

# Wrap Weather Fetching as a Tool
weather_tool = Tool( name="Get Weather for Selected Places", func=get_weather_for_selected_places, description="Retrieve weather for the places selected by the user." )

In [None]:
llm = ChatOpenAI(model_name="gpt-4") 
agent = initialize_agent( tools=[qa_tool, ask_places_tool, weather_tool],
                          # Adding all tools 
                          llm=llm,
                          agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION, # Keeps conversation context
                          verbose=True,
                          memory=memory 
                          )

In [None]:
# Step 1: Get recommended places
response1 = agent.run("What are the best places to visit in Puerto Rico?")

print(response1)


In [None]:
# Expected: "Here are some great places: San Juan, El Yunque, Culebra... Which ones do you want to visit?"# Simulate User Input 
user_selected_places = "San Juan and Ponce"

# Step 2: Fetch weather for selected places 
response2 = agent.run(f"I want to visit {user_selected_places}")

print(response2) # Expected: Weather details for "San Juan" and "El Yunque".

In [None]:

# Step 2: Fetch weather for selected places 
response2 = agent.run("what is the wether in those places?")

print(response2) # Expected: Weather details for "San Juan" and "El Yunque".

In [24]:
# Expected: "Here are some great places: San Juan, El Yunque, Culebra... Which ones do you want to visit?"# Simulate User Input 
user_selected_places = "San Juan, El Yunque"

# Step 2: Fetch weather for selected places 
response2 = agent.run(f"I want to visit {user_selected_places}")

print(response2) # Expected: Weather details for "San Juan" and "El Yunque".#User Query Example: Travel Plan
question = "What are the best places to visit in Puerto Rico?"

# Step 1: Retrieve locations from landmarks database (RAG)
retrieved_docs = retriever.get_relevant_documents(question)

# Extract most relevant location
if retrieved_docs:
    suggested_location = retrieved_docs[0].metadata.get("title", "San Juan")  # Default to San Juan if no match
else:
    suggested_location = "San Juan"

print(f"Recommended Location: {suggested_location}")

# Step 2: Get Weather for Recommended Location
weather_info = get_weather(suggested_location)
print(f"Weather Info: {weather_info}")

# Step 3: Generate Final Answer (LLM integrates both)
final_query = f"{question} Also, include the current temperature in {suggested_location}: {weather_info}."

response = agent_weather.run(final_query)

print("\nAI Response:")
print(response)

NameError: name 'agent' is not defined

In [None]:
# User Query Example

query_01 = "What are the top landmarks I should visit in Puerto Rico based on my interests of beaches and nightlife?"
question = query_01

# Retrieve documents separately so you can see them
retrieved_docs = retriever.get_relevant_documents(question)

#Print chat history BEFORE the query
print_chat_history()

print("\nRetrieved Documents:")
for idx, doc in enumerate(retrieved_docs):
    print(f"{idx+1}. {doc.metadata.get('title', 'No Title')} - {doc.page_content[:200]}...")

# Run the retrieval chain with the retrieved documents
response = retrieval_chain.run(question)
print("\n💡 AI Response:")
print(response)

#Print chat history AFTER the query
print_chat_history()

In [None]:
# User Query Example

query_02 = "I will like to visit Condado Beach and El Yunque."
question = query_02

# Retrieve documents separately so you can see them
retrieved_docs = retriever.get_relevant_documents(question)

#Print chat history BEFORE the query
print_chat_history()

print("\nRetrieved Documents:")
for idx, doc in enumerate(retrieved_docs):
    print(f"{idx+1}. {doc.metadata.get('title', 'No Title')} - {doc.page_content[:200]}...")

# Run the retrieval chain with the retrieved documents
response = retrieval_chain.run(question)
print("\n💡 AI Response:")
print(response)

#Print chat history AFTER the query
print_chat_history()

In [None]:
# User Query Example

query_02 = "I also what to see museums."
question = query_02

# Retrieve documents separately so you can see them
retrieved_docs = retriever.get_relevant_documents(question)

#Print chat history BEFORE the query
print_chat_history()

print("\nRetrieved Documents:")
for idx, doc in enumerate(retrieved_docs):
    print(f"{idx+1}. {doc.metadata.get('title', 'No Title')} - {doc.page_content[:200]}...")

# Run the retrieval chain with the retrieved documents
response = retrieval_chain.run(question)
print("\n💡 AI Response:")
print(response)

#Print chat history AFTER the query
print_chat_history()

In [None]:
# User Query Example

query_03 = "Create an itinerary for me."
question = query_03

# Retrieve documents separately so you can see them
retrieved_docs = retriever.get_relevant_documents(question)

#Print chat history BEFORE the query
print_chat_history()

print("\nRetrieved Documents:")
for idx, doc in enumerate(retrieved_docs):
    print(f"{idx+1}. {doc.metadata.get('title', 'No Title')} - {doc.page_content[:200]}...")

# Run the retrieval chain with the retrieved documents
response = retrieval_chain.run(question)
print("\n💡 AI Response:")
print(response)

#Print chat history AFTER the query
print_chat_history()

In [13]:
#Block 1: Import Required Libraries

import os
import random
import requests
from langchain_openai import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain_chroma import Chroma
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.memory import ConversationBufferMemory
from langchain_openai import OpenAIEmbeddings
from langchain_core.tools import tool
from langchain.prompts import PromptTemplate
from langchain.agents import initialize_agent, AgentType
from langchain_community.utilities.openweathermap import OpenWeatherMapAPIWrapper
from dotenv import load_dotenv, find_dotenv

In [14]:
# Load API keys
_ = load_dotenv(find_dotenv())

OPENAI_API_KEY  = os.getenv('OPENAI_API_KEY')
OPENWEATHER_API_KEY = os.getenv('OPENWEATHER_API_KEY')

In [15]:
# Function to Fetch Weather Data

def get_weather(city: str):
    """Fetches real-time weather data for a given city using OpenWeather API."""
    base_url = "http://api.openweathermap.org/data/2.5/weather"
    params = {
        "q": city,
        "appid": OPENWEATHER_API_KEY,
        "units": "metric",
    }

    try:
        response = requests.get(base_url, params=params)
        data = response.json()

        if response.status_code == 200:
            weather_description = data["weather"][0]["description"]
            temperature = data["main"]["temp"]
            humidity = data["main"]["humidity"]
            return f"The current temperature in {city} is {temperature}°C with {weather_description} and {humidity}% humidity."
        else:
            return f"Error fetching weather for {city}: {data.get('message', 'Unknown error')}"
    
    except Exception as e:
        return f"API request failed: {str(e)}"

In [16]:
weather_tool = tool(get_weather)

llm = ChatOpenAI(api_key=OPENAI_API_KEY, temperature=0, model="gpt-4-turbo")

agent_weather = initialize_agent(
    tools=[weather_tool],  # Attach weather tool
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,  # Enables reasoning & tool use
    verbose=True
)

memory = ConversationBufferMemory(
    memory_key="chat_history", return_messages=True
)

def print_chat_history():
    """Displays stored chat history for debugging."""
    print("\nChat History:")
    for idx, msg in enumerate(memory.chat_memory.messages):
        role = "User" if msg.type == "human" else "AI"
        print(f"{role}: {msg.content}")

vectorstore = Chroma(
    collection_name="landmarks_rag",
    embedding_function=OpenAIEmbeddings(model="text-embedding-ada-002", api_key=OPENAI_API_KEY)
)
retriever = vectorstore.as_retriever(k=3)  # Retrieves top 3 relevant landmarks

In [17]:
# Travel Recommendation Chain

travel_prompt = PromptTemplate(
    input_variables=["chat_history", "question"],
    template="""
You are a travel planner to visit Puerto Rico . Based on the conversation history and user question, recommend the best landmarks to visit in a structured day-by-day itinerary.

### Chat History:
{chat_history}

### User Question:
{question}

### Travel Recommendation:
"""
)

travel_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    chain_type="stuff",  # Ensures all retrieved docs are passed to the LLM
    condense_question_prompt=travel_prompt
)

In [18]:
# Location Extraction Chain

location_extraction_prompt = PromptTemplate(
    input_variables=["travel_response"],
    template="""
Extract the names of locations mentioned in the travel response.

### Travel Recommendation:
{travel_response}

### Extracted Locations (Comma-Separated):
"""
)

location_extraction_chain = LLMChain(llm=llm, prompt=location_extraction_prompt)

In [19]:
# Itinerary Generator

def generate_itinerary(selected_places, num_days):
    """Generates a structured itinerary based on user preferences and trip duration."""
    itinerary = f"\n🏝️ **Your {num_days}-Day Itinerary** 🏝️\n"
    places_per_day = max(1, len(selected_places) // num_days)
    random.shuffle(selected_places)

    for day in range(1, num_days + 1):
        start_idx = (day - 1) * places_per_day
        end_idx = start_idx + places_per_day
        day_places = selected_places[start_idx:end_idx]

        if not day_places:
            break

        itinerary += f"\n📅 **Day {day}:**\n"
        for place in day_places:
            itinerary += f"- Visit **{place}**\n"

    return itinerary.strip()

In [22]:
# Conversational Process & Final Recommendation

def travel2pr_chatbot():
    """Handles user interaction until the itinerary is confirmed."""
    confirmed = False
    while not confirmed:
        question = input("\nUser: ")  # User enters their query
        travel_response = travel_chain.invoke({"question": question})

        print("\n💡 AI Travel Recommendation:\n", travel_response["answer"])

        # Extract locations from response (FIXED)
        locations = extract_locations_from_response(travel_response["answer"])
        
        if not locations:
            print("\n⚠️ No valid locations found. Try asking about specific places.\n")
            continue

        # Infer trip duration from conversation or ask
        num_days = None
        for msg in reversed(memory.chat_memory.messages):
            if "days" in msg.content.lower():
                try:
                    words = msg.content.split()
                    for word in words:
                        if word.isdigit():
                            num_days = int(word)
                            break
                except ValueError:
                    continue

        if num_days is None:
            num_days = int(input("\nHow many days is your trip? "))
            memory.chat_memory.add_user_message(f"My trip is {num_days} days.")

        # Fetch weather info for each location
        weather_responses = {location: get_weather(location) for location in locations}

        # Generate final structured itinerary
        itinerary = generate_itinerary(locations, num_days)

        # Display final AI response
        print("\n📅 **Final Itinerary:**\n", itinerary)
        print("\n🌤 **Weather Report:**\n", "\n".join([f"{loc}: {weather}" for loc, weather in weather_responses.items()]))

        # Confirm itinerary
        user_feedback = input("\nDoes this itinerary look good? (yes/no): ").strip().lower()
        if user_feedback == "yes":
            confirmed = True
            print("\n✅ **Itinerary Finalized! Have a great trip!**")
        else:
            print("\n🔄 Adjusting itinerary... Let's refine your plan.")

In [23]:
# Run the chatbot
travel2pr_chatbot()


💡 AI Travel Recommendation:
 It looks like you've already provided a detailed and well-thought-out itinerary for a trip to Puerto Rico! If you have any specific questions or need further information about any of the places mentioned or activities suggested, feel free to ask. Enjoy your trip planning!


NameError: name 'extract_locations_from_response' is not defined

travel_planning_conversation()

# **Pre-RAG Verification Code (Full Implementation)**

In [None]:
import chromadb
import pandas as pd
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# Define the path to the vectorstore (RAG system)
VECTORSTORE_PATH = r"C:\Users\larry\chromadb_store\landmarks_db"

# Initialize embeddings and vectorstore
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")
vectorstore = Chroma(persist_directory=VECTORSTORE_PATH, embedding_function=embedding_model)

# Define test queries for RAG verification
test_queries = [
    "Where is El Yunque National Forest located?",
    "Tell me about Castillo San Felipe del Morro.",
    "What are the most famous beaches in Puerto Rico?",
    "Which landmarks should I visit in Old San Juan?",
    "What are the best hiking trails in Puerto Rico?"
]

# Initialize an empty list to store retrieval results
retrieval_results = []

# Function to check retrieval quality
def evaluate_retrieval(query):
    """Retrieves documents from the RAG system using collection.query and logs similarity scores."""
    
    # Ensure the query is embedded before searching
    query_embedding = embedding_model.embed_query(query)

    # Perform a similarity search using collection.query
    retrieval_data = vectorstore._collection.query(
        query_embeddings=[query_embedding],  
        n_results=5,  # Retrieve top 5 most relevant results
        include=["documents", "metadatas", "distances"]  # Include similarity scores
    )

    retrieved_chunks = []

    for i, result in enumerate(retrieval_data["documents"][0]):
        similarity_score = 1 - retrieval_data["distances"][0][i]  # Convert distance to similarity
        chunk_data = {
            "query": query,
            "retrieved_text": result[:300] + "...",  # Show only first 300 characters
            "source_title": retrieval_data["metadatas"][0][i]["title"],
            "coordinates": retrieval_data["metadatas"][0][i]["coordinates"],
            "categories": retrieval_data["metadatas"][0][i]["categories"],
            "similarity_score": round(similarity_score, 4)  # Format similarity score
        }
        retrieved_chunks.append(chunk_data)

    return retrieved_chunks

# Run retrieval tests on all queries
for query in test_queries:
    retrieval_results.extend(evaluate_retrieval(query))

# Convert results to DataFrame for better readability
retrieval_df = pd.DataFrame(retrieval_results)

# Display verification results
print("\n=== Pre-RAG Verification Results ===")
print(retrieval_df.to_string(index=False))

# Save results to a CSV file for further analysis
retrieval_df.to_csv(r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\RAG Tests\lm_test03\rag_verification_results.csv", index=False)

print("\nRAG verification results saved successfully.")

In [None]:
print(f"Number of chunks stored in ChromaDB: {collection.count()}")

In [None]:
print(vectorstore.get())

In [None]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

VECTORSTORE_PATH = r"C:\Users\larry\chromadb_store\landmarks_db"

# Reload Chroma with persistence
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")
vectorstore = Chroma(
    persist_directory=VECTORSTORE_PATH,
    embedding_function=embedding_model
)

# Check the number of stored documents
print(f"Total documents stored: {vectorstore._collection.count()}")


In [None]:
# Check if ChromaDB collection actually contains the expected embeddings
print(f"Number of chunks stored in ChromaDB: {collection.count()}")

# Retrieve an arbitrary document to check data integrity
test_retrieval = collection.query(
    query_embeddings=[embedding_model.embed_query("Test query")],  
    n_results=1,  
    include=["documents", "metadatas", "distances"]
)

print("\nSample Retrieved Document:\n", test_retrieval)

In [None]:
# Generar un embedding de prueba
test_embedding = embedding_model.embed_query("Test query")

print(f"Sample embedding vector (length: {len(test_embedding)}): {test_embedding[:5]} ...")  # Muestra los primeros 5 valores

In [None]:
# Genera embeddings para los documentos (asegúrate de usar una lista de textos)
embeddings_list = embedding_model.embed_documents([doc["content"] for doc in chunked_documents])

# Verifica la cantidad de embeddings generados
print(f"Total embeddings generated: {len(embeddings_list)}")
print(f"First embedding vector sample (length: {len(embeddings_list[0])}): {embeddings_list[0][:5]} ...")

In [None]:
# Verificar si los embeddings están almacenados en ChromaDB
sample_query = embedding_model.embed_query("Test query")
test_retrieval = collection.query(
    query_embeddings=[sample_query],
    n_results=1,
    include=["documents", "metadatas", "embeddings", "distances"]
)

print("\nSample Retrieved Document:\n", test_retrieval)