In [36]:
from utils import Config
config = Config('config.yml')
role = config.get_role_prompt()
role

[nltk_data] Downloading package punkt to /Users/halcolo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


{'label': 'Le rôle du chatbot',
 'value': "Vous êtes un agent conversationnel spécialisé dans le recyclage et la gestion des déchets pour la ville de Lyon.\nVotre rôle est d’aider les utilisateurs à comprendre le processus de recyclage et de gestion des déchets en se basant uniquement sur les informations disponibles dans votre base de connaissances. Si un utilisateur pose une question qui dépasse les informations disponibles, répondez avec :\n« Je suis désolé, mais ma fonction d'agent conversationnel est limitée aux données que j'ai en mémoire. Pour plus d'informations, je vous invite à contacter la mairie de votre localité. »\nAdoptez un ton professionnel, clair et engageant tout en vous assurant que vos réponses respectent les consignes suivantes :\n  Précision : Appuyez-vous exclusivement sur les données disponibles.\n  Accessibilité : Fournissez des explications compréhensibles par le grand public.\n  Orientation : Lorsque les informations sont insuffisantes, guidez les utilisateu

In [2]:
# import os
# from utils import PDFProcessor
# file = os.path.join('data', 'grand_lyon_tri.pdf')
# pdf = PDFProcessor()
# chunks = pdf.process_pdf(file)

In [3]:
# len(chunks)

In [74]:
import json
import re
from typing import List

class JSONProcessor:
    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Initialize the JSON processor with configurable chunk parameters.
        
        Args:
            chunk_size (int): Target size of each chunk in characters
            chunk_overlap (int): Number of characters to overlap between chunks
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def read_json(self, file_path: str) -> str:
        """
        Read and extract text from a JSON file.
        
        Args:
            file_path (str): Path to the JSON file
            
        Returns:
            str: Extracted text from the JSON
        """
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
            text = json.dumps(data)
                
        return text

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize the extracted text.
        
        Args:
            text (str): Raw text to clean
            
        Returns:
            str: Cleaned text
        """
        # Convert to lowercase
        text = text.lower()
        
        # Remove extra whitespace
        text = ' '.join(text.split())
        
        # Remove special characters but keep periods for sentence boundaries
        text = re.sub(r'[^a-z0-9\s\.]', '', text)
        
        # Fix spacing after periods
        text = re.sub(r'\.(?! )', '. ', text)
        
        # Remove multiple periods
        text = re.sub(r'\.{2,}', '.', text)
        
        return text

    def create_chunks(self, text: str) -> List[str]:
        """
        Split the text into chunks of a specified length.
        
        Args:
            text (str): Cleaned text to chunk
            
        Returns:
            List[str]: List of text chunks
        """
        chunks = []
        start = 0
        while start < len(text):
            end = start + self.chunk_size
            chunks.append(text[start:end])
            start = end - self.chunk_overlap
        
        return chunks

    def process_json(self, file_path: str) -> List[str]:
        """
        Process a JSON file and return chunks ready for RAG.
        
        Args:
            file_path (str): Path to the JSON file
            
        Returns:
            List[str]: List of processed text chunks
        """
        # Extract text from JSON
        raw_text = self.read_json(file_path)
        
        # Clean the text
        cleaned_text = self.clean_text(raw_text)
        
        # Create chunks
        chunks = self.create_chunks(cleaned_text)
        
        return chunks

In [76]:
import os
import json
from pymongo import MongoClient

class MongoDB:
    def __init__(self, db_name: str, collection_name: str, data_dir: str = 'data', host: str = 'localhost', port: int = 27017):
        """
        Initialize the MongoDB client with the specified host and port.
        
        Args:
            db_name (str): Name of the database
            collection_name (str): Name of the collection
            data_dir (str): Directory containing JSON files
            host (str): MongoDB host address
            port (int): MongoDB port number
        """
        self.client = MongoClient(host, port)
        self.db_name = db_name
        self.collection_name = collection_name
        self.data_dir = data_dir
        self.db = self.client[db_name]
        self.collection = self.db[collection_name]
        
        # Check if the collection is empty and load data if necessary
        if self.collection.estimated_document_count() == 0:
            self.setup_database()

    def create_collection(self, db_name: str, collection_name: str):
        """
        Create a collection in the specified database.
        
        Args:
            db_name (str): Name of the database
            collection_name (str): Name of the collection
            
        Returns:
            Collection: The created collection
        """
        db = self.client[db_name]
        collection = db[collection_name]
        return collection
    
    def insert_item(self, db_name: str, collection_name: str, item: dict):
        """
        Insert an item into the specified collection.
        
        Args:
            db_name (str): Name of the database
            collection_name (str): Name of the collection
            item (dict): Item to insert
        """
        collection = self.create_collection(db_name, collection_name)
        collection.update_one(
            {'id': item['id'], 'ville': item['ville']},  # Filter to check if the item already exists
            {'$set': item},      # Update the item if it exists
            upsert=True          # Insert the item if it doesn't exist
        )
        print("Inserted item with ID:", item['id'])
    
    def query_collection(self, db_name: str, collection_name: str, query: dict):
        """
        Query the specified collection.
        
        Args:
            db_name (str): Name of the database
            collection_name (str): Name of the collection
            query (dict): Query to execute
            
        Returns:
            list: List of query results
        """
        collection = self.create_collection(db_name, collection_name)
        results = collection.find(query)
        return list(results)
    
    def setup_database(self):
        """
        Setup the database by inserting data from all JSON files in the specified directory.
        """
        for file_name in os.listdir(self.data_dir):
            if file_name.endswith('.json'):
                file_path = os.path.join(self.data_dir, file_name)
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                
                for item in data:
                    mandatory = ['id', 'ville', 'type_dechet', 'produits', 'action', 'instructions']
                    for field in mandatory:
                        if field not in item:
                            raise ValueError(f"Missing field '{field}' in item with ID: {item['id']}")
                    self.insert_item(self.db_name, self.collection_name, item)
        
        print(f"Data from all JSON files in '{self.data_dir}' verified and inserted into MongoDB collection '{self.collection_name}' if not already present")


In [80]:
mongoDb = MongoDB('rag', 'dechets')

Inserted item with ID: 1
Inserted item with ID: 2
Inserted item with ID: 3
Inserted item with ID: 4
Inserted item with ID: 5
Inserted item with ID: 6
Inserted item with ID: 7
Inserted item with ID: 8
Inserted item with ID: 9
Inserted item with ID: 10
Inserted item with ID: 11
Data from all JSON files in 'data' verified and inserted into MongoDB collection 'dechets' if not already present


In [43]:
all_documents_list = mongoDb.query_collection('rag', 'dechets', {'ville': 'Grand Lyon Métropole'})

In [44]:
import chromadb
chroma_client = chromadb.Client()
chroma_collection = chroma_client.get_or_create_collection("recycling_db")

In [79]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

json_processor = JSONProcessor()

ids = [str(doc['_id']) for doc in all_documents_list]
documents = [" ".join([f"{k}: {v}" for k, v in doc.items() if k != '_id' and k != 'id']) for doc in all_documents_list]
documents = [json_processor.clean_text(doc) for doc in documents]
# Ensure you have the necessary NLTK data files




In [78]:
ids, documents

(['678d690bcef7ed37800f2fbd',
  '678d690bcef7ed37800f2fbe',
  '678d690bcef7ed37800f2fbf',
  '678d690bcef7ed37800f2fc0',
  '678d690bcef7ed37800f2fc1',
  '678d690bcef7ed37800f2fc2',
  '678d690bcef7ed37800f2fc3',
  '678d690bcef7ed37800f2fc4',
  '678d690bcef7ed37800f2fc5',
  '678d690bcef7ed37800f2fc6',
  '678d690bcef7ed37800f2fc7'],
 ['ville grand lyon mtropole action recyclage avantages chaque tonne consigne trie transforme don recherche contre cancer. recyclage verre permet rduire mission co2. verre recycl utilis fabriquer nouveaux contenants. bouteille recycle peut fabriquer nimporte quel produit verre. recyclage verre conomise lnergie car verre recycl fond temprature infrieure production dorigine. couleurrecipient vert silo verre emplacement ville voie publique territoire silo verre proche chez exception vaisselle casse. vaisselle verre. miroirs. dampoules. plats. porcelaine. faence. dbris vaisselle. instruction dposez contenants vids sans couvercle vrac silo verre. retirez bouchons co

In [54]:
from chromadb.config import Settings

chroma_client = chromadb.PersistentClient(
            path="./chromadb",
            settings=Settings(anonymized_telemetry=False)
        )

In [55]:
chroma_collection = chroma_client.get_or_create_collection(name="recycling_db")

In [84]:
if chroma_collection.count() == 0:
    chroma_collection.add(documents=documents, ids=ids)
    
print(chroma_collection.count())


11


In [60]:
results = chroma_collection.query(
    query_texts=["Recyclage de verres"], # Chroma will embed this for you
    n_results=2
)
print(results)

{'ids': [['678d690bcef7ed37800f2fbd', '678d690bcef7ed37800f2fc4']], 'distances': [[1.0600780625621293, 1.168714399602189]], 'metadatas': [[None, None]], 'embeddings': None, 'documents': [['ville Grand Lyon Mtropole action Recyclage avantages Chaque tonne consigne trie transforme don recherche contre cancer recyclage verre permet rduire mission CO2 verre recycl utilis fabriquer nouveaux contenants bouteille recycle peut fabriquer nimporte quel produit verre recyclage verre conomise lnergie car verre recycl fond temprature infrieure production dorigine couleurrecipient Vert silo verre emplacement ville Voie publique territoire Silo verre proche chez exception vaisselle casse vaisselle verre miroirs dampoules plat porcelaine faence dbris vaisselle instruction Dposez contenants vids sans couvercle vrac silo verre Retirez bouchons couvercles pointscollecte Silo verre proche chez produits nom Bouteille verre synonymes Rcipient verre Bouteille recyclable tag verre bouteille recyclage nom Bout

In [68]:
from bson import ObjectId

# Extract the list of IDs from the results variable
result_ids = results['ids'][0]

# Convert the IDs to ObjectId instances if necessary
result_object_ids = [ObjectId(id) for id in result_ids]

# Query MongoDB using the extracted IDs
queried_documents = mongoDb.query_collection('rag', 'dechets', {'_id': {'$in': result_object_ids}})
print(queried_documents)

[{'_id': ObjectId('678d690bcef7ed37800f2fbd'), 'id': '1', 'ville': 'Grand Lyon Métropole', 'action': 'Recyclage', 'avantages': ['Chaque tonne consignée ou triée se transforme en un don pour la recherche contre le cancer.', 'Le recyclage du verre permet de réduire les émissions de CO2.', 'Le verre recyclé est utilisé pour fabriquer de nouveaux contenants.', 'Avec une bouteille recyclée, on peut fabriquer n’importe quel produit en verre.', 'Le recyclage du verre économise de l’énergie, car le verre recyclé se fond à une température inférieure à sa production d’origine.'], 'couleur_recipient': 'Vert (silo à verre)', 'emplacement': ['Dans la ville', 'Voie publique du territoire', 'Silo à verre proche de chez vous'], 'exceptions': ['Pas de vaisselle cassée.', 'Pas de vaisselle en verre.', 'Pas de miroirs.', "Pas d'ampoules.", 'Pas de plats.', 'Pas de porcelaine.', 'Pas de faïence.', 'Pas de débris de vaisselle.'], 'instructions': ['Déposez les contenants vidés, sans couvercle et en vrac, da

In [71]:
import chromadb
from chromadb.config import Settings


chroma_client = chromadb.HttpClient(
    host='localhost',
    port=8000,
    settings=Settings(
        # chroma_client_auth_provider="chromadb.auth.token_authn.TokenAuthClientProvider",
        # chroma_client_auth_credentials='test-token'
    )
)


ValueError: Could not connect to tenant default_tenant. Are you sure it exists?