In [57]:
import pandas as pd

# Read the landmarks JSON dataset into a DataFrame
df = pd.read_json('./datasets/fully_classified_puerto_rico_landmarks.json')

# Sanity checks: (rows, columns) first ...
print(df.shape)

# ... then a three-row preview and a closer look at the first record
print(df.iloc[:3])
print("\n", df.iloc[:1]["content"])
print("\n", df.iloc[:1]["metadata"])
print("\n", df.iloc[:1]["metadata"][0]['type'])

(695, 2)
                                             content  \
0  Adjuntas barrio-pueblo is a barrio and the adm...   
1  Casa Pueblo is an environmental community orga...   
2  Casa Pueblo is an environmental community orga...   

                                            metadata  
0  {'type': 'landmark', 'city': 'Adjuntas', 'name...  
1  {'type': 'landmark', 'city': 'Adjuntas', 'name...  
2  {'type': 'landmark', 'city': 'Adjuntas', 'name...  

 0    Adjuntas barrio-pueblo is a barrio and the adm...
Name: content, dtype: object

 0    {'type': 'landmark', 'city': 'Adjuntas', 'name...
Name: metadata, dtype: object

 landmark


In [110]:
# Pull both columns out as plain Python lists; texts[k] and metadatas[k]
# come from the same DataFrame row
texts = df.content.tolist()
metadatas = df.metadata.tolist()

print(f'Texts: {len(texts)}  Metadatas: {len(metadatas)}')

Texts: 695  Metadatas: 695


In [106]:
### ---------
### IMPORT EVENTS
### ---------
import pandas as pd

# Read the events JSON dataset into a DataFrame
events_df = pd.read_json('./datasets/puerto_rico_events.json')

# Sanity checks: (rows, columns) first ...
print(events_df.shape)

# ... then a three-row preview and a closer look at the first record
print(events_df.iloc[:3])
print("\n", events_df.iloc[:1]["content"])
print("\n", events_df.iloc[:1]["metadata"])
print("\n", events_df.iloc[:1]["metadata"][0]['type'])

(29, 2)
                                             content  \
0  As in most countries, this holiday is celebrat...   
1  This is a Commonwealth of Puerto Rico official...   
2       A federal and commonwealth official holiday.   

                                            metadata  
0  {'type': 'event', 'city': 'Puerto Rico', 'name...  
1  {'type': 'event', 'city': 'Puerto Rico', 'name...  
2  {'type': 'event', 'city': 'Puerto Rico', 'name...  

 0    As in most countries, this holiday is celebrat...
Name: content, dtype: object

 0    {'type': 'event', 'city': 'Puerto Rico', 'name...
Name: metadata, dtype: object

 event


In [112]:
# Pull both columns out as plain Python lists; events_texts[k] and
# events_metadatas[k] come from the same DataFrame row
events_texts = events_df.content.tolist()
events_metadatas = events_df.metadata.tolist()

print(f'Texts: {len(events_texts)}  Metadatas: {len(events_metadatas)}')

Texts: 29  Metadatas: 29


In [None]:
### JOIN EVENTS AND LANDMARKS

In [114]:
# Events first, landmarks second. Both lists are concatenated in the same
# order, so each text keeps its metadata at the same position.
join_texts = [*events_texts, *texts]
join_metadatas = [*events_metadatas, *metadatas]

print(f'Texts: {len(join_texts)}  Metadatas: {len(join_metadatas)}')

Texts: 724  Metadatas: 724


In [None]:
## ONLY FOR LANDMARKS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Splitter configured with chunk_size=1000 and chunk_overlap=200
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

# Parallel lists: split_metadatas[k] is the metadata of split_texts[k]
split_texts = []
split_metadatas = []

# Split every landmark text and carry its metadata over to each chunk
for i, text in enumerate(texts):
    chunks = splitter.split_text(text)
    split_texts.extend(chunks)

    # Every chunk references the metadata dict of the text it came from
    split_metadatas.extend([metadatas[i]] * len(chunks))

# Report totals plus a one-item sample instead of dumping every chunk, which
# floods the cell output. The [:1] slice is also safe when nothing was split.
print("Chunks:", len(split_texts), "| sample:", split_texts[:1])
print("Metadata (propagate):", len(split_metadatas), "| sample:", split_metadatas[:1])


In [None]:
## Convert Metadata Dictionary to String (for embedding purpose)

In [85]:
def convert_metadata_to_str(metadata):
    """Return a copy of ``metadata`` with list values flattened to strings.

    Args:
        metadata: Mapping of metadata keys to values.

    Returns:
        A new dict with the same keys. Each list value is replaced by its
        items joined with ", " (an empty list becomes ""); every other value
        is carried over unchanged. The input mapping is not modified.
    """
    return {
        # str() each item first: str.join raises TypeError on non-string
        # items such as numbers or None.
        key: ", ".join(map(str, value)) if isinstance(value, list) else value
        for key, value in metadata.items()
    }

# Run convert_metadata_to_str over every landmark metadata dict
converted_metadatas = list(map(convert_metadata_to_str, metadatas))

In [75]:
import os
# Import OpenAI API key from env file
from dotenv import load_dotenv

# Populate os.environ from the local .env file. load_dotenv() returns a
# success flag, not the key, so its result is deliberately not assigned to
# openai_api_key.
load_dotenv(dotenv_path='.env')

# Get the API key from the environment variable (None when it is not set)
openai_api_key = os.getenv("OPENAI_API_KEY")

In [81]:
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

In [87]:
# Embedding client handed to the batch insert below
open_ai_embedding_model = OpenAIEmbeddings(
    api_key=openai_api_key,
    model='text-embedding-ada-002',
)

print('ChromaDb Storing process ...')

# Batch insertion function to handle large datasets
def batch_insert_texts(texts, metadatas, open_ai_embedding_model, persist_directory, batch_size=40000):
    """Store texts and their metadata through Chroma.from_texts, one batch at a time.

    Args:
        texts: Sequence of strings to store.
        metadatas: Sequence of metadata dicts; ``metadatas[k]`` belongs to ``texts[k]``.
        open_ai_embedding_model: Embedding object passed as ``embedding`` to
            ``Chroma.from_texts``.
        persist_directory: Directory passed to ``Chroma.from_texts`` for every batch.
        batch_size: Maximum number of texts per ``Chroma.from_texts`` call.

    Raises:
        ValueError: If ``batch_size`` is not positive, or if ``texts`` and
            ``metadatas`` differ in length.
    """
    # range() rejects a zero step with an unrelated message and yields nothing
    # for a negative one, which would silently skip the whole insert.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Slicing both sequences with the same bounds only pairs them correctly
    # when they have the same length.
    if len(texts) != len(metadatas):
        raise ValueError(
            f"texts and metadatas must have the same length, "
            f"got {len(texts)} and {len(metadatas)}"
        )

    for batch_number, start in enumerate(range(0, len(texts), batch_size), start=1):
        stop = start + batch_size
        batch_texts = texts[start:stop]
        batch_metadatas = metadatas[start:stop]
        print(f"Inserting batch {batch_number} of size {len(batch_texts)}...")
        Chroma.from_texts(
            batch_texts,
            embedding=open_ai_embedding_model,
            metadatas=batch_metadatas,
            persist_directory=persist_directory
        )


# Store the landmark texts together with their converted metadata.
# NOTE(review): `texts` is the unsplit landmark list, and a later cell writes
# the split events+landmarks chunks into this same persist_directory. Confirm
# this cell should still be run; otherwise landmark content may be stored twice.
batch_insert_texts(
    texts,
    converted_metadatas,
    open_ai_embedding_model,
    "./chroma_db_tourism_v2",
    batch_size=40000,  # Adjust batch size if necessary
)

print("Chunks have been successfully split, vectorized, and stored in the vector database.")

ChromaDb Storing process ...
Inserting batch 1 of size 695...
Chunks have been successfully split, vectorized, and stored in the vector database.


In [None]:

# Create the vector store
# vectorstore = Chroma.from_texts(
#     texts=split_texts,
#     embedding=embedding_function,  # Your embedding function
#     metadatas=split_metadatas
# )

In [120]:
## EVENTS + LANDMARKS (joined texts)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Text splitter configured with chunk_size=1000 and chunk_overlap=200
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Parallel output lists: one metadata entry per produced chunk
join_split_texts = []
join_split_metadatas = []

# Walk the joined texts by position so each one can be matched to its metadata
for i in range(len(join_texts)):
    text = join_texts[i]
    chunks = splitter.split_text(text)
    join_split_texts += chunks

    # Repeat the source text's metadata (the same dict object) once per chunk
    join_split_metadatas += [join_metadatas[i]] * len(chunks)

# Spot check: first chunk and the metadata attached to it
print("Chunks:", join_split_texts[0])
print("Metadata (propagate):", join_split_metadatas[0])

Chunks: As in most countries, this holiday is celebrated with gatherings and fireworks. Although illegal,celebratory gunfirehas sometimes led to injuries and even deaths on certain occasions.Official commonwealth and federal holiday.
Metadata (propagate): {'type': 'event', 'city': 'Puerto Rico', 'name': "New Year's Day", 'categories': [], 'date': 'January 1', 'url': 'https://en.wikipedia.org/wiki/Public_holidays_in_Puerto_Rico', 'image_url': ''}


In [122]:
# NOTE(review): a function with this same name is also defined in an earlier
# cell of this notebook; this definition replaces it when the cell runs.
def convert_metadata_to_str(metadata):
    """Return a copy of ``metadata`` with list values flattened to strings.

    Args:
        metadata: Mapping of metadata keys to values.

    Returns:
        A new dict with the same keys. Each list value is replaced by its
        items joined with ", " (an empty list becomes ""); every other value
        is carried over unchanged. The input mapping is not modified.
    """
    return {
        # str() each item first: str.join raises TypeError on non-string
        # items such as numbers or None.
        key: ", ".join(map(str, value)) if isinstance(value, list) else value
        for key, value in metadata.items()
    }

# Run convert_metadata_to_str over the metadata of every chunk
converted_metadatas = list(map(convert_metadata_to_str, join_split_metadatas))

In [124]:
# Embedding client handed to the batch insert below
open_ai_embedding_model = OpenAIEmbeddings(
    api_key=openai_api_key,
    model='text-embedding-ada-002',
)

print('ChromaDb Storing process ...')

# Batch insertion function to handle large datasets
def batch_insert_texts(texts, metadatas, open_ai_embedding_model, persist_directory, batch_size=40000):
    """Store texts and their metadata through Chroma.from_texts, one batch at a time.

    Args:
        texts: Sequence of strings to store.
        metadatas: Sequence of metadata dicts; ``metadatas[k]`` belongs to ``texts[k]``.
        open_ai_embedding_model: Embedding object passed as ``embedding`` to
            ``Chroma.from_texts``.
        persist_directory: Directory passed to ``Chroma.from_texts`` for every batch.
        batch_size: Maximum number of texts per ``Chroma.from_texts`` call.

    Raises:
        ValueError: If ``batch_size`` is not positive, or if ``texts`` and
            ``metadatas`` differ in length.
    """
    # range() rejects a zero step with an unrelated message and yields nothing
    # for a negative one, which would silently skip the whole insert.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Slicing both sequences with the same bounds only pairs them correctly
    # when they have the same length.
    if len(texts) != len(metadatas):
        raise ValueError(
            f"texts and metadatas must have the same length, "
            f"got {len(texts)} and {len(metadatas)}"
        )

    for batch_number, start in enumerate(range(0, len(texts), batch_size), start=1):
        stop = start + batch_size
        batch_texts = texts[start:stop]
        batch_metadatas = metadatas[start:stop]
        print(f"Inserting batch {batch_number} of size {len(batch_texts)}...")
        Chroma.from_texts(
            batch_texts,
            embedding=open_ai_embedding_model,
            metadatas=batch_metadatas,
            persist_directory=persist_directory
        )

# Store the split events+landmarks chunks together with their converted metadata
batch_insert_texts(
    join_split_texts,
    converted_metadatas,
    open_ai_embedding_model,
    "./chroma_db_tourism_v2",
    batch_size=40000,  # Adjust batch size if necessary
)

print("Chunks have been successfully split, vectorized, and stored in the vector database.")

ChromaDb Storing process ...
Inserting batch 1 of size 940...
Chunks have been successfully split, vectorized, and stored in the vector database.
