In [12]:
# Cell 1: Imports and Setup
import json
from typing import Dict, Any, List
import pinecone
from sentence_transformers import SentenceTransformer
import numpy as np
from dotenv import load_dotenv
import os
import re
import unicodedata


In [2]:
load_dotenv()
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
INDEX_NAME = "rso-chatbot"

In [3]:
# Initialize Pinecone
pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(INDEX_NAME)

# Initialize the embedding model
model = SentenceTransformer('all-mpnet-base-v2')

In [9]:
def safe_get(dictionary: Dict, key: str, default: Any = "None") -> Any:
    """Safely get a value from a dictionary, returning default if None or missing."""
    value = dictionary.get(key)
    return default if value is None else value

def transform_rso_data(rso_data: Dict[str, Any]) -> Dict[str, Any]:
    """Transform RSO data for Pinecone database with safe None handling."""
    # Handle None case for entire RSO
    if rso_data is None:
        return {
            "name": "None",
            "full_url": "None",
            "description": "None",
            "categories": [],
            "contact_email": "None",
            "additional_info": {},
            "social_media_links": []
        }
    
    # Extract high confidence AI categories with None handling
    ai_categories = safe_get(rso_data, "ai_categories", [])
    high_confidence_categories = [
        cat["name"] 
        for cat in ai_categories 
        if cat.get("confidence", 0) >= 85 and cat.get("name")
    ]
    
    # Combine with original categories, remove duplicates, handle None
    original_categories = safe_get(rso_data, "categories", [])
    all_categories = list(set(
        [cat for cat in (original_categories + high_confidence_categories) if cat]
    ))
    
    # Use full_description if available, otherwise fall back to description_preview
    description = (
        safe_get(rso_data, "full_description") 
        if rso_data.get("full_description")
        else safe_get(rso_data, "description_preview")
    )
    
    # Safely get contact information
    contact = safe_get(rso_data, "contact", {})
    contact_email = safe_get(contact, "email") if isinstance(contact, dict) else "None"
    
    # Safely get and flatten additional_info
    additional_info = safe_get(rso_data, "additional_info", {})
    flattened_additional_info = {
        str(k): str(v) if v is not None else "None"
        for k, v in additional_info.items()
    } if isinstance(additional_info, dict) else {}
    
    # Safely get social media links
    social_media = safe_get(rso_data, "social_media", {})
    social_media_links = [
        str(link) for link in social_media.values()
        if link is not None
    ] if isinstance(social_media, dict) else []
    
    # Create transformed dictionary with only desired fields and proper types
    transformed_data = {
        "name": safe_get(rso_data, "name"),
        "full_url": safe_get(rso_data, "full_url"),
        "description": description,
        "categories": all_categories,
        "contact_email": contact_email,
        "additional_info": flattened_additional_info,
        "social_media_links": social_media_links
    }
    
    return transformed_data

In [17]:
def generate_safe_id(name: str) -> str:
    """
    Generate a safe ASCII ID from a string.
    Handles unicode characters, spaces, and special characters.
    """
    if not name or name.lower() == "none":
        return f"unknown-rso-{hash(str(name))}"
        
    # Convert to ASCII, remove diacritics
    normalized = unicodedata.normalize('NFKD', name)
    ascii_name = normalized.encode('ASCII', 'ignore').decode('ASCII')
    
    # Replace special characters and spaces with hyphens
    safe_id = re.sub(r'[^a-zA-Z0-9]+', '-', ascii_name.lower())
    
    # Remove leading/trailing hyphens
    safe_id = safe_id.strip('-')
    
    # Ensure we have a valid ID
    if not safe_id:
        return f"unnamed-rso-{hash(name)}"
        
    return safe_id

In [5]:
def generate_embedding(rso_data: Dict[str, Any]) -> np.ndarray:
    """Generate embedding for RSO data."""
    # Combine relevant text fields for embedding
    text_to_embed = f"{rso_data['name']} {rso_data['description']} {' '.join(rso_data['categories'])}"
    
    # Generate embedding
    embedding = model.encode(text_to_embed)
    return embedding

def prepare_pinecone_data(rso_data: Dict[str, Any], embedding: np.ndarray) -> Dict[str, Any]:
    """Prepare data for Pinecone upsert."""
    safe_id = generate_safe_id(rso_data["name"])
    return {
        "id": safe_id,
        "values": embedding.tolist(),
        "metadata": {
            "name": rso_data["name"],
            "full_url": rso_data["full_url"],
            "description": rso_data["description"],
            "categories": rso_data["categories"],
            "contact_email": rso_data["contact_email"],
            "social_media_links": rso_data["social_media_links"],
            # Convert additional_info dict to list of strings for Pinecone compatibility
            "additional_info": [f"{k}: {v}" for k, v in rso_data["additional_info"].items()]
        }
    }

In [6]:
# test for one rso
with open("categorized_rsos.json", 'r', encoding='utf-8') as f:
    data = json.load(f)

# Process single RSO
sample_rso = data[0] if isinstance(data, list) else data
transformed_rso = transform_rso_data(sample_rso)
test_embedding = generate_embedding(transformed_rso)
pinecone_vector = prepare_pinecone_data(transformed_rso, test_embedding)

# Try upserting
try:
    index.upsert(vectors=[pinecone_vector])
    print("Successfully upserted test vector to Pinecone")
except Exception as e:
    print(f"Error upserting to Pinecone: {str(e)}")

Successfully upserted test vector to Pinecone


In [16]:
# Function to process all RSOs (only run after testing single RSO)
def process_all_rsos(data, batch_size=100):
    """Process all RSOs and upsert to Pinecone in batches."""
    # Ensure data is a list
    if not isinstance(data, list):
        data = [data]
    
    vectors = []
    for i, rso in enumerate(data):
        try:
            transformed_data = transform_rso_data(rso)
            embedding = generate_embedding(transformed_data)
            vector = prepare_pinecone_data(transformed_data, embedding)
            vectors.append(vector)
            
            # Upsert when batch is full
            if len(vectors) >= batch_size:
                index.upsert(vectors=vectors)
                print(f"Upserted batch of {len(vectors)} vectors")
                vectors = []
                
            if (i + 1) % 10 == 0:
                print(f"Processed {i + 1} RSOs")
                
        except Exception as e:
            print(f"Error processing RSO {rso.get('name', 'unknown')}: {str(e)}")
    
    # Upsert any remaining vectors
    if vectors:
        index.upsert(vectors=vectors)
        print(f"Upserted final batch of {len(vectors)} vectors")

In [15]:
process_all_rsos(data)

Processed 10 RSOs
Processed 20 RSOs
Processed 30 RSOs
Processed 40 RSOs
Processed 50 RSOs
Processed 60 RSOs
Processed 70 RSOs
Processed 80 RSOs
Processed 90 RSOs
Upserted batch of 100 vectors
Processed 100 RSOs
Processed 110 RSOs
Processed 120 RSOs
Processed 130 RSOs
Processed 140 RSOs
Processed 150 RSOs
Processed 160 RSOs
Processed 170 RSOs
Processed 180 RSOs
Processed 190 RSOs
Error processing RSO Midwave Radio: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Sat, 11 Jan 2025 23:36:27 GMT', 'Content-Type': 'application/json', 'Content-Length': '83', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '376', 'x-pinecone-request-id': '5661391943124982288', 'x-envoy-upstream-service-time': '39', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"Vector ID must be ASCII, but got 'kreyòl-club'","details":[]}

Error processing RSO Midway Ventures: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Sat, 11 Jan 2025 23:36:27

KeyboardInterrupt: 