What I need:

Which information should go into the database?

Data engineering to make it better:

- labels?
    - have to create
    - types of RSOs:
        
        
        -finance

        - investing
        - private equity
        

        -marketing

        -tech

        -science?

            -bio
            -chem
            -physics
            -engineering?
        -sport

        -humanities

        -art?



todo:
- db
- chatbot
- data engineering (labelling, other stuff?)

In [6]:
import json
import json
from typing import Dict, Any, List
import pinecone
from sentence_transformers import SentenceTransformer
import numpy as np
from dotenv import load_dotenv
import os


In [14]:
# Name of the Pinecone index that stores the RSO vectors.
INDEX_NAME = "rso-chatbot"

# Load environment variables via python-dotenv before reading the key
# (presumably from a local .env file -- verify against the project setup).
load_dotenv()

# Indexing os.environ directly raises KeyError if the key is not set.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]


In [9]:
class RSOProcessor:
    def __init__(self, pinecone_api_key: str, index_name: str):
        """
        Initialize the RSO processor with Pinecone credentials.
        
        Args:
            pinecone_api_key (str): Your Pinecone API key
            pinecone_environment (str): Pinecone environment
            index_name (str): Name of your Pinecone index
        """
        # Initialize Pinecone

        pc = pinecone.Pinecone(api_key=pinecone_api_key)
        self.index = pc.Index(index_name)
        
        # Initialize the embedding model
        self.model = SentenceTransformer('all-MiniLM-L6-v2')  # 768-dimension embeddings
    
    def transform_rso_data(self, rso_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Transform RSO data for Pinecone database.
        """
        # Extract high confidence AI categories
        high_confidence_categories = [
            cat["name"] 
            for cat in rso_data["ai_categories"] 
            if cat["confidence"] >= 85
        ]
        
        # Combine with original categories, remove duplicates
        all_categories = list(set(rso_data["categories"] + high_confidence_categories))
        
        # Use full_description if available, otherwise fall back to description_preview
        description = (
            rso_data["full_description"]
            if rso_data.get("full_description")
            else rso_data.get("description_preview", "")
        )
        
        # Create transformed dictionary with only desired fields
        transformed_data = {
            "name": rso_data["name"],
            "full_url": rso_data["full_url"],
            "description": description,
            "categories": all_categories,
            "contact": {"email": rso_data["contact"]["email"]},
            "additional_info": rso_data["additional_info"],
            "social_media": rso_data["social_media"]
        }
        
        return transformed_data

    def generate_embedding(self, rso_data: Dict[str, Any]) -> np.ndarray:
        """
        Generate embedding for RSO data.
        """
        # Combine relevant text fields for embedding
        text_to_embed = f"{rso_data['name']} {rso_data['description']} {' '.join(rso_data['categories'])}"
        
        # Generate embedding
        embedding = self.model.encode(text_to_embed)
        return embedding

    def prepare_pinecone_data(self, rso_data: Dict[str, Any], embedding: np.ndarray) -> Dict[str, Any]:
        """
        Prepare data for Pinecone upsert.
        """
        return {
            "id": rso_data["name"].lower().replace(" ", "-"),  # Create URL-friendly ID
            "values": embedding.tolist(),
            "metadata": rso_data
        }

    def process_and_upsert(self, input_path: str, batch_size: int = 100):
        """
        Process RSO data and upsert to Pinecone.
        
        Args:
            input_path (str): Path to input JSON file
            batch_size (int): Size of batches for Pinecone upsert
        """
        try:
            # Read input file
            with open(input_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            # Convert to list if single object
            if not isinstance(data, list):
                data = [data]
            
            # Process in batches
            vectors = []
            for rso in data:
                # Transform data
                transformed_data = self.transform_rso_data(rso)
                # Generate embedding
                embedding = self.generate_embedding(transformed_data)
                # Prepare for Pinecone
                vector = self.prepare_pinecone_data(transformed_data, embedding)
                vectors.append(vector)
                
                # Upsert when batch is full
                if len(vectors) >= batch_size:
                    self.index.upsert(vectors=vectors)
                    print(f"Upserted batch of {len(vectors)} vectors")
                    vectors = []
            
            # Upsert any remaining vectors
            if vectors:
                self.index.upsert(vectors=vectors)
                print(f"Upserted final batch of {len(vectors)} vectors")
            
            print("Successfully processed all RSO data and upserted to Pinecone")
            
        except FileNotFoundError:
            print(f"Error: Could not find input file {input_path}")
        except json.JSONDecodeError:
            print(f"Error: Invalid JSON in input file {input_path}")
        except Exception as e:
            print(f"Error: An unexpected error occurred: {str(e)}")

In [11]:
def main():
    """Create an RSOProcessor for the "rso-chatbot" index and load categorized_rsos.json into it."""
    index_name = "rso-chatbot"
    source_path = "categorized_rsos.json"

    # The API key is read from the module-level PINECONE_API_KEY.
    rso_processor = RSOProcessor(
        pinecone_api_key=PINECONE_API_KEY,
        index_name=index_name,
    )
    rso_processor.process_and_upsert(source_path)

In [16]:
# JSON file holding the categorized RSO records to load.
input_file = "categorized_rsos.json"

# Processor bound to the module-level API key and index name.
processor = RSOProcessor(pinecone_api_key=PINECONE_API_KEY, index_name=INDEX_NAME)

In [17]:
# Transform, embed and upsert every record from input_file. process_and_upsert
# prints any error instead of raising, so check the cell output for failures.
processor.process_and_upsert(input_file)

Error: An unexpected error occurred: 'social_media'
