# Building a Retrieval Augmented Generation (RAG) Chatbot

In [341]:
!pip install --quiet --upgrade langchain-text-splitters langchain-community langgraph
!pip install -qU langchain-mistralai
!pip install -qU langchain-chroma
# !pip install -qU chromadb
# !pip install xmltodict

# To correct UnstructuredXMLLoader errors
# !pip install "unstructured>=0.6.7"

## Setup

In [165]:
import getpass
import os

os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"] = getpass.getpass("Enter API key for Langsmith: ")

Enter API key for Langsmith:  ········


## Components

In [396]:
import getpass
import os

# Chat model

MODEL_NAME = "mistral-small-latest"
MISTRAL_API_KEY = getpass.getpass("Enter API key for Mistral AI: ")

Enter API key for Mistral AI:  ········


In [None]:
from langchain_mistralai import ChatMistralAI, MistralAIEmbeddings

# Chat model initialization
llm = ChatMistralAI(
    mistral_api_key=MISTRAL_API_KEY,
    model="mistral-small-latest",
    streaming=True,
    temperature=0.6
)

# Embeddings model
embeddings = MistralAIEmbeddings(model="mistral-embed")

In [408]:
from langchain_chroma import Chroma

# Vector store
vector_store = Chroma(embedding_function=embeddings)

## Loading JSON and XML files

Transforming the data JSON and XML files we got online into LangChain Document objects

### Test: loading a single JSON file

In [11]:
import json
from pathlib import Path
from pprint import pprint

file_path="../parcoursup/parcoursup_data.json"
data = json.loads(Path(file_path).read_text())
pprint(data)

[{'acceptations': 0,
  'annee_du_bac': 2021,
  'doublette': 'Art,Art',
  'formation': 'Licence Chimie',
  'propositions_d_admissions': 0.0,
  'voeux': 0},
 {'acceptations': 0,
  'annee_du_bac': 2021,
  'doublette': 'Art,Art',
  'formation': 'Licence Langues et littÃ©ratures franÃ§aises',
  'propositions_d_admissions': 0.0,
  'voeux': 0},
 {'acceptations': 0,
  'annee_du_bac': 2021,
  'doublette': 'Art,Art',
  'formation': 'Licence Philosophie, EpistÃ©mologie',
  'propositions_d_admissions': 0.0,
  'voeux': 0},
 {'acceptations': 0,
  'annee_du_bac': 2021,
  'doublette': 'Art,Art',
  'formation': 'Licence Physique',
  'propositions_d_admissions': 0.0,
  'voeux': 0},
 {'acceptations': 0,
  'annee_du_bac': 2021,
  'doublette': 'Art,Art',
  'formation': 'Licence Pluri Droit - Sciences Ã©co - AES',
  'propositions_d_admissions': 0.0,
  'voeux': 0},
 {'acceptations': 0,
  'annee_du_bac': 2021,
  'doublette': 'Art,Art',
  'formation': 'Licence Pluri Langues',
  'propositions_d_admissions': 0.0

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [12]:
from langchain_community.document_loaders import JSONLoader

loader = JSONLoader(
    file_path="../parcoursup/parcoursup_data.json",
    jq_schema=".[]",
    text_content=False
)

documents = loader.load()
pprint(documents[0])

Document(metadata={'source': 'C:\\Users\\Laura GALINDO\\Documents\\Academique_et_pro\\Epitech Digital School\\Bootcamp Chatbot IA\\chatbot-orientation-pro\\parcoursup\\parcoursup_data.json', 'seq_num': 1}, page_content='{"annee_du_bac": 2021, "doublette": "Art,Art", "formation": "Licence Chimie", "voeux": 0, "propositions_d_admissions": 0, "acceptations": 0}')


### Test: loading a single XML file

In [14]:
from bs4 import BeautifulSoup

with open('../onisep/Onisep_Fiches_Formations.xml', 'r', encoding="utf-8") as f:
    data = f.read()

bs_data = BeautifulSoup(data, 'xml')

print(bs_data.prettify()[:500])

<?xml version="1.0" encoding="utf-8"?>
<formations xmlns:exsl="http://exslt.org/common">
 <formation>
  <identifiant>
   FOR.1000
  </identifiant>
  <code_scolarite>
   16X20003
  </code_scolarite>
  <libelle_complet>
   mastère spé. management de la qualité, de la sécurité et de l'environnement
  </libelle_complet>
  <sigle/>
  <libelle_generique/>
  <libelle_specifique/>
  <type_option/>
  <type_Formation>
   <type_formation_sigle>
    mastère spé.
   </type_formation_sigle>
   <type_formation


In [15]:
from langchain_community.document_loaders import UnstructuredXMLLoader
from pprint import pprint

loader = UnstructuredXMLLoader('../onisep/Onisep_Fiches_Formations.xml')

documents = loader.load()

print(documents[0].page_content[:500])

FOR.1000

16X20003

mastère spé. management de la qualité, de la sécurité et de l'environnement

mastère spé.

MASTERE SPE

mastère spécialisé

"1 an"

https://www.francecompetences.fr/recherche/rncp/37084/

200

Spécialités pluritechnologiques de la production

R

Contrôle, prévention, entretien

niveau 7

REF.423

bac + 6

16X

Label de la Conférence des grandes écoles (CGE)

Formation inscrite au RNCP

MET.878

chargé/e hygiène sécurité environnement (HSE)

chargée hygiène sécurité environnem


### Converting XML files to JSON files

The layout of Document objects loaded from XML files is too different from JSON files, and less well structured, so the MistralAI API cannot later embed them into the vector store

In [259]:
import json
import xmltodict

def convert_xml_to_json(xml_file: str, json_file: str):
    try:
        with open(xml_file, 'r', encoding='utf-8') as f:
            xml_content = f.read()

        data_dict = xmltodict.parse(xml_content)

        formations = data_dict.get("formations", {})
        if isinstance(formations, dict):
            formations = formations.get("formation", [])
        else:
            formations = []

        processed_formations = []

        for formation in formations:
            processed_entry = {}

            # Direct string fields
            processed_entry["identifiant"] = formation["identifiant"] if "identifiant" in formation else None
            processed_entry["libelle_complet"] = formation["libelle_complet"] if "libelle_complet" in formation else None
            processed_entry["duree_formation"] = formation["duree_formation"].strip('"') if "duree_formation" in formation else None
            processed_entry["url"] = formation["url"] if "url" in formation else None
            processed_entry["niveau_certification"] = formation["niveau_certification"] if "niveau_certification" in formation else None
            processed_entry["poursuites_etudes"] = formation["poursuites_etudes"] if "poursuites_etudes" in formation else None
            processed_entry["publications"] = formation["publications"] if "publications" in formation else None

            # Nested structures
            if "type_Formation" in formation and isinstance(formation["type_Formation"], dict):
                processed_entry["type_Formation"] = formation["type_Formation"].get("type_formation_libelle", None)
            else:
                processed_entry["type_Formation"] = None

            if "niveau_etudes" in formation and isinstance(formation["niveau_etudes"], dict):
                processed_entry["niveau_etudes"] = formation["niveau_etudes"].get("libelle", None)
            else:
                processed_entry["niveau_etudes"] = None

            if "nsf_discipline" in formation and isinstance(formation["nsf_discipline"], dict):
                processed_entry["nsf_discipline"] = formation["nsf_discipline"].get("NSF_discipline_libelle", None)
            else:
                processed_entry["nsf_discipline"] = None

            if "nature_certificat" in formation and isinstance(formation["nature_certificat"], dict):
                certif = formation["nature_certificat"].get("libelle_nature_certificat", None)
                processed_entry["nature_certificat"] = ", ".join(certif) if isinstance(certif, list) else certif
            else:
                processed_entry["nature_certificat"] = None

            # Handling "metiers_formation" properly
            processed_entry["metiers_formation"] = []
            if "metiers_formation" in formation and isinstance(formation["metiers_formation"], dict):
                metiers = formation["metiers_formation"].get("metier", [])

                if isinstance(metiers, dict):  # If there is only one "metier"
                    metiers = [metiers]

                for metier in metiers:
                    metier_data = {"nom_metier": metier.get("nom_metier", None), "synonymes": []}

                    if "synonymes" in metier and isinstance(metier["synonymes"], dict):
                        synonyms = metier["synonymes"].get("synonyme", [])

                        if isinstance(synonyms, dict):  # If there is only one synonym
                            synonyms = [synonyms]

                        metier_data["synonymes"] = [syn.get("nom_metier") for syn in synonyms if isinstance(syn, dict)]

                    processed_entry["metiers_formation"].append(metier_data)

            processed_formations.append(processed_entry)

        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(processed_formations, f, ensure_ascii=False, indent=4)

        print(f"Conversion réussie : '{xml_file}' → '{json_file}'")

    except Exception as e:
        print(f"Erreur lors de la conversion : {e}")

# Exemple d'utilisation
convert_xml_to_json("../onisep/Onisep_Fiches_Formations.xml", "../onisep/Onisep_Fiches_Formations.json")

Conversion réussie : '../onisep/Onisep_Fiches_Formations.xml' → '../onisep/Onisep_Fiches_Formations.json'


In [728]:
import json
import xmltodict

def convert_xml_to_json_2(xml_file: str, json_file: str):
    try:
        with open(xml_file, 'r', encoding='utf-8') as f:
            xml_content = f.read()

        data_dict = xmltodict.parse(xml_content)

        metiers = data_dict.get("metiers", {})
        if isinstance(metiers, dict):
            metiers = metiers.get("metier", [])
        else:
            metiers = []

        processed_metiers = []

        for metier in metiers:
            processed_entry = {}

            # Direct string fields
            processed_entry["identifiant"] = metier.get("identifiant")
            processed_entry["nom_metier"] = metier.get("nom_metier")
            processed_entry["libelle_feminin"] = metier.get("libelle_feminin")
            processed_entry["libelle_masculin"] = metier.get("libelle_masculin")
            processed_entry["competences"] = metier.get("competences")
            processed_entry["condition_travail"] = metier.get("condition_travail")
            processed_entry["nature_travail"] = metier.get("nature_travail")
            processed_entry["acces_metier"] = metier.get("acces_metier")
            processed_entry["vie_professionnelle"] = metier.get("vie_professionnelle")
            processed_entry["accroche_metier"] = metier.get("accroche_metier")

            # Nested structures
            if "romesV3" in metier and isinstance(metier["romesV3"], dict):
                processed_entry["romesV3"] = metier["romesV3"].get("romesV3")
            else:
                processed_entry["romesV3"] = None

            if "niveau_acces_min" in metier and isinstance(metier["niveau_acces_min"], dict):
                processed_entry["niveau_acces_min"] = metier["niveau_acces_min"].get("libelle")
            else:
                processed_entry["niveau_acces_min"] = None

            # Deep-nested structures

            # Traitement des synonymes
            processed_entry["synonymes"] = []
            if "synonymes" in metier and isinstance(metier["synonymes"], dict):
                synonymes = metier["synonymes"].get("synonyme", [])
                if isinstance(synonymes, dict):  # Si il n'y a qu'un "synonyme"
                    synonymes = [synonymes]
                for synonyme in synonymes:
                    synonyme_data = {"nom_metier": synonyme.get("nom_metier")}
                    processed_entry["synonymes"].append(synonyme_data)

            # Traitement des statuts
            processed_entry["statuts"] = []
            if "statuts" in metier and isinstance(metier["statuts"], dict):
                statuts = metier["statuts"].get("statut", [])
                if isinstance(statuts, dict):  # Si il n'y a qu'un "statut"
                    statuts = [statuts]
                for statut in statuts:
                    statut_data = {"libelle": statut.get("libelle")}
                    processed_entry["statuts"].append(statut_data)

            # Traitement des metiers_associes
            processed_entry["metiers_associes"] = []
            if "metiers_associes" in metier and isinstance(metier["metiers_associes"], dict):
                metiers_associes = metier["metiers_associes"].get("metier_associe", [])
                if isinstance(metiers_associes, dict):  # Si il n'y a qu'un "metier_associe"
                    metiers_associes = [metiers_associes]
                for metier_associe in metiers_associes:
                    metier_associe_data = {"libelle": metier_associe.get("libelle")}
                    processed_entry["metiers_associes"].append(metier_associe_data)

            # Traitement des formations_min_requise
            processed_entry["formations_min_requise"] = []
            if "formations_min_requise" in metier and isinstance(metier["formations_min_requise"], dict):
                formations_min_requise = metier["formations_min_requise"].get("formation_min_requise", [])
                if isinstance(formations_min_requise, dict):  # Si il n'y a qu'une "formation_min_requise"
                    formations_min_requise = [formations_min_requise]
                for formation in formations_min_requise:
                    formation_data = {"libelle": formation.get("libelle")}
                    processed_entry["formations_min_requise"].append(formation_data)

            # Traitement des secteurs_activite
            processed_entry["secteurs_activite"] = []
            if "secteurs_activite" in metier and isinstance(metier["secteurs_activite"], dict):
                secteurs_activite = metier["secteurs_activite"].get("secteur_activite", [])
                if isinstance(secteurs_activite, dict):  # Si il n'y a qu'un "secteur_activite"
                    secteurs_activite = [secteurs_activite]
                for secteur in secteurs_activite:
                    secteur_data = {"libelle": secteur.get("libelle")}
                    processed_entry["secteurs_activite"].append(secteur_data)

            # Traitement des centres_interet
            processed_entry["centres_interet"] = []
            if "centres_interet" in metier and isinstance(metier["centres_interet"], dict):
                centres_interet = metier["centres_interet"].get("centre_interet", [])
                if isinstance(centres_interet, dict):  # Si il n'y a qu'un "centre_interet"
                    centres_interet = [centres_interet]
                for centre in centres_interet:
                    centre_data = {"libelle": centre.get("libelle")}
                    processed_entry["centres_interet"].append(centre_data)

            processed_metiers.append(processed_entry)

        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(processed_metiers, f, ensure_ascii=False, indent=4)

        print(f"Conversion réussie : '{xml_file}' → '{json_file}'")

    except Exception as e:
        print(f"Erreur lors de la conversion : {e}")

# Exemple d'utilisation
convert_xml_to_json_2("../onisep/Onisep_Fiches_Metiers.xml", "../onisep/Onisep_Fiches_Metiers.json")

Conversion réussie : '../onisep/Onisep_Fiches_Metiers.xml' → '../onisep/Onisep_Fiches_Metiers.json'


### Loading multiple JSON files as LangChain Document objects

In [730]:
import os
import json
from pathlib import Path
from langchain.docstore.document import Document
from langchain_community.document_loaders import JSONLoader, UnstructuredXMLLoader

# To correct UnstructuredXMLLoader errors
# import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

def load_json_documents(file_path: str) -> list:
    """
    Load a JSON file using JSONLoader and return a list of Documents
    """
    loader = JSONLoader(
        file_path=file_path,
        jq_schema=".[]",
        text_content=False,
    )
    try:
        documents = loader.load()
        print(f"[JSON] Loaded file: {file_path} ({len(documents)} document(s))")
        return documents
    except Exception as e:
        print(f"Error while loading JSON file {file_path} : {e}")
        return []

# def load_xml_documents(file_path: str) -> list:
#    """
#    Load an XML file using UnstructuredXMLLoader and return a list of Documents
#    """
#    loader = UnstructuredXMLLoader(file_path)
#    try:
#        documents = loader.load()
#        print(f"[XML] Loaded file: {file_path} ({len(documents)} document(s))")
#        return documents
#    except Exception as e:
#        print(f"Error while loading XML file {file_path} : {e}")
#        return []

def load_documents(file_paths: list) -> list:
    """
    For each file in the list, we determine its extension and call the appropriate loader
    (JSONLoader or UnstructuredXMLLoader), and then return a complete list of Documents
    """
    documents = []
    for fp in file_paths:
        ext = Path(fp).suffix.lower()
        if ext == ".json":
            documents.extend(load_json_documents(fp))
#        elif ext == ".xml":
#            documents.extend(load_xml_documents(fp))
        else:
            print(f"Unsupported file (extension {ext}) : {fp}")
    return documents

def export_documents(documents: list):
    """
    Save the Document objects to a text file for inspection or debugging
    """
    with open("loaded_documents.txt", "w", encoding="utf-8") as f:
        for doc in documents:
            f.write(f"Source: {doc.metadata.get('source')}\n")
            f.write(f"Content:\n{doc.page_content}\n\n---\n\n")
    
    print("Documents saved to 'loaded_documents.txt'")

In [732]:
# List of JSON and XML files to load
file_list = [
    # DONE:
    # "../parcoursup/parcoursup_data.json",
    # "../mon_master/monmaster_data.json",
    # "../salons/salons_january.json",
    # "../salons/salons_february.json",
    # "../salons/salons_march.json",
    # "../salons/salons_may_june.json"
    # "../France_travail/Competences-themes.json",
    # "../France_travail/Francetravail Scraper debutant.json",
    # "../France_travail/Francetravail Scraper junior .json",
    # "../France_travail/Francetravail_Scraper_Global.json",
    # "../France_travail/Nombre_offres_themes.json",
    # "../mbti/mbti_jobs.json",
    # "../onisep/Onisep_Fiches_Formations.json",
    # TODO:
    "../onisep/Onisep_Fiches_Metiers.json",
    
]

# Load all the documents
all_documents = load_documents(file_list)

print(f"\nTotal number of loaded documents: {len(all_documents)}")

export_documents(all_documents)

[JSON] Loaded file: ../onisep/Onisep_Fiches_Metiers.json (958 document(s))

Total number of loaded documents: 958
Documents saved to 'loaded_documents.txt'


In [734]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def dynamic_split_documents(documents, base_chunk_size=1000, min_chunks=3, max_chunks=200):
    """
    Dynamically split a list of documents into smaller chunks based on document size
    
    Parameters:
        - documents: List of LangChain Document objects
        - base_chunk_size: Default chunk size in characters
        - min_chunks: Minimum number of chunks per document
        - max_chunks: Maximum number of chunks per document
    
    Returns:
        - List of split documents
    """
    all_splits = []
    
    for doc in documents:
        doc_length = len(doc.page_content)
        
        # Dynamically adjust the chunk size:
        # - Ensure chunks are not too small by using at least `base_chunk_size`
        # - Ensure chunks are not too big by keeping at least `min_chunks` per document
        # - This balances the number and size of chunks for better retrieval
        chunk_size = max(base_chunk_size, doc_length // max_chunks)  # Prevents chunks from being too tiny
        chunk_size = min(chunk_size, doc_length // min_chunks)  # Ensures we don't have too few large chunks
        
        # Define the text splitter
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_size // 5,  # 20% overlap
            add_start_index=True # Keeps track of where in the original document each chunk came from
        )
        
        # Split the document and extend the list
        splits = text_splitter.split_documents([doc])
        # print(f"Document: {doc.metadata.get('source')} | Original Length: {len(doc.page_content)} | Chunks Created: {len(splits)}")
        
        all_splits.extend(splits) # Adds all the elements in splits to all_splits, one by one
    
    print(f"\nTotal number of chunks created: {len(all_splits)}")
    return all_splits

def export_split_documents(chunks):
    """
    Save the split documents to a text file with delimiters between chunks

    Each document chunk's source and content are written to 'split_documents.txt', 
    with '---' as a separator between different chunks.

    Parameters:
        - chunks (list): A list of LangChain Document objects after splitting

    Returns:
        - File containing all split document chunks with delimiters
    """
    with open("split_documents.txt", "w", encoding="utf-8") as f:
        for i, chunk in enumerate(chunks):
            # Add delimiter before every chunk except the first one
            if i > 0:
                f.write("\n---\n\n")

            f.write(f"Source: {chunk.metadata.get('source')}\n")
            f.write(f"Content:\n{chunk.page_content}\n")

    print("Split documents saved to 'split_documents.txt'")


# Split the loaded documents
split_documents = dynamic_split_documents(all_documents)

export_split_documents(split_documents)


Total number of chunks created: 9947
Split documents saved to 'split_documents.txt'


## Storing Document splits

Indexing our chunks of documents to be able to vector search over them at runtime, which entails embedding the contents of each chunk of Document into the Chroma vector store

In [737]:
import time

def batch_embed_documents(split_documents, vector_store, batch_size=10, sleep_time=2):
    """
    Embed Documents in batches and add them to the vector store

    Parameters:
        - split_documents (list): List of document chunks to embed
        - vector_store: The vector database where embeddings are stored
        - batch_size (int): Number of documents per batch (default is 10)
        - sleep_time (int): Time to wait (in seconds) between batches to avoid API rate limits (default is 2)
    
    Returns:
        - None
    """
    for i in range(0, len(split_documents), batch_size):
        batch = [chunk for chunk in split_documents[i:i + batch_size]]

        try:
            document_ids = vector_store.add_documents(documents=batch)
            print(f"Batch {i // batch_size + 1}: {document_ids[:3]}")
        except Exception as e:
            print(f"Error in batch {i // batch_size + 1}: {e}")

        time.sleep(sleep_time)  # Pause to prevent API rate limiting


batch_embed_documents(split_documents, vector_store)

Batch 1: ['c8e1a078-5c5e-4fb6-8c23-958e7663d69c', '0d661a41-93dd-411a-91ea-32f922798b72', '7540fd83-ff69-4421-ac88-47ac51c3e403']
Batch 2: ['ed634a21-63ec-4614-a3cb-39ba9cc7f66f', '3f2fbb40-11a2-4f95-b6af-27b070def675', '197ab50f-8735-408f-a705-208b37e4a7a2']
Batch 3: ['121f21bf-8315-4a6e-a02d-b2732fd9d5ef', 'd25e203d-f322-4e03-9f79-107284f68797', 'f1d85903-4132-4080-b26f-08509f545f97']
Batch 4: ['9f6fd665-c2ec-4d7c-a3aa-5d2e2088d5c3', '780c5471-6f53-44f2-8f8e-6a777822779e', 'fbff4c4a-b28a-4f70-a8cd-7de58e181392']
Batch 5: ['dc6e584d-bed7-4f42-bd8d-b0ef34c131ba', '5bcc678b-e07f-4359-b3b2-cce35533d2ea', 'd85e69bc-df4e-4e65-8640-b94ecf408e39']
Batch 6: ['c78fe4d3-1ca0-4864-80ff-87a0f2928469', 'b1c1864d-5060-4ca6-8dcb-17cadd8952c9', 'ecac62b4-c82d-489b-b57b-c40b90258067']
Batch 7: ['56b08afe-41ad-44fb-a87d-44b8455ad25f', '00cf0a81-a0c8-4311-93e2-99680fa0bbc5', '8bfaa851-c006-4f4f-862e-e13a286df21b']
Batch 8: ['1b83e3da-a254-43f7-96a8-aa65cef7218d', 'd7253c1c-8335-46ff-8630-801aef395efc', 

An error occurred with MistralAI: Client error '429 Too Many Requests' for url 'https://api.mistral.ai/v1/embeddings'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429


Error in batch 157: Client error '429 Too Many Requests' for url 'https://api.mistral.ai/v1/embeddings'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429
Batch 158: ['24048338-0398-4607-bf7e-8d065294d8c6', '6ca5f2a8-caad-4463-a769-f61660e67d7f', '1dcb16fb-2f6c-4ca6-b382-f099c8092e8a']
Batch 159: ['9035a67c-e972-4125-9641-fd49a3366562', '2f2ac2b4-26d2-47db-93cb-58fe390c9c96', '396ef3cf-84f9-49d0-ba3d-98db8256d7d1']
Batch 160: ['26704657-41b7-4410-b9f6-9e8fade977e3', '8a5102c0-e24d-4bdc-b438-e396283c6453', 'd4425ea1-c130-4d66-aa60-e2cd1ed2acf5']
Batch 161: ['a231d30e-7410-487c-8ebf-2e31b8f92b74', '7634146e-1a78-49be-9344-785c63b9e18a', 'bbcfcdb5-5f5f-4396-b67a-fe4c6b65cfa3']
Batch 162: ['bbf0aba6-ce90-4267-9c33-9efb68a4be59', 'ee5d8481-ce24-442b-b809-e956d2597413', '5ad4fc1d-5ad6-46ac-89f1-6171bb5a1ac5']
Batch 163: ['6e9a94d0-db24-45dd-87a4-6d7d633b0dc0', 'e7220815-f033-45b9-b281-b3a9078fd26c', '4273cf71-ef30-4b3f-9a95-39a513b3829e']
Batch 164: ['b4

An error occurred with MistralAI: Client error '429 Too Many Requests' for url 'https://api.mistral.ai/v1/embeddings'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429


Error in batch 192: Client error '429 Too Many Requests' for url 'https://api.mistral.ai/v1/embeddings'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429


An error occurred with MistralAI: Client error '429 Too Many Requests' for url 'https://api.mistral.ai/v1/embeddings'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429


Error in batch 193: Client error '429 Too Many Requests' for url 'https://api.mistral.ai/v1/embeddings'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429
Batch 194: ['feeea531-0aa1-4881-b172-b13e6abb5c90', 'f3f6f07a-ed74-43db-ae05-5a2851de7c58', '93aadedb-9fdc-4a46-92f8-4851d5050d11']
Batch 195: ['ceab4e3b-1fd6-466c-8a8e-d9594c408599', 'bf45b91b-1c97-4668-abea-b2052c86ba26', 'ff54f97a-ec3f-4b9b-b4e1-9d34136d75fa']
Batch 196: ['eebeb991-7ca5-41b6-adcf-8a52a95d8311', '004b0171-5423-4092-b901-d074e25f37a8', '8427eb43-8dbf-4553-bdc9-10c603859edd']
Batch 197: ['f15e6d13-850c-4cb2-a6d2-acbd61edb86a', '6120bdab-a18d-4e61-95b2-cf12377b4455', '6dc19206-2fd9-4b90-b2ae-73b79b795520']
Batch 198: ['fcd7e5b1-b495-4213-a3e2-cd827129f3b7', 'f3000b71-c742-4494-8bbb-ff9d2add86b5', '7e13a963-c280-4ee1-92fc-291c0c8c9d72']
Batch 199: ['d3f381ef-7d9c-402c-8b6b-c0b7e039a98a', '7dd3c3ca-0ea5-45ad-987e-a9a32f957192', '78d4514a-275e-4b9c-9557-2f7442c2135c']
Batch 200: ['36

An error occurred with MistralAI: Server disconnected without sending a response.


Error in batch 373: Server disconnected without sending a response.
Batch 374: ['5aec6b43-c13a-414c-96a3-0264b7f7c0cd', '7ed408e3-d983-4c5a-8a6d-e50b15d61b10', 'b45dfcb1-a8d4-46bc-9670-74d2bfbc0287']
Batch 375: ['f2753abb-4f34-432d-a88b-e6a89c1ab8bc', 'c59c49cc-90b8-4286-8c60-2db72a572a98', 'a9344780-31a4-477f-bdcd-806f6eac54ee']
Batch 376: ['4e32a408-10d7-4150-88cc-8ff8f29c6d1a', 'cf3b03cb-6660-496e-8f2d-dc7a793277a5', '62988e2a-e213-400c-9e52-94e4bf4b913d']
Batch 377: ['ddef36b3-f207-4670-ab60-248462b95ac9', 'b5d3dcde-0f98-44ac-82be-9f858ac3ee80', '52e01c91-5bad-4313-8e9d-d53a3c08d1c4']
Batch 378: ['add0d8a3-45b9-4444-bb12-e0baeb6151cf', 'b27536dc-69b9-4508-8acf-1d660a055160', '4db4575c-9ca2-482e-b8d4-58a3f0fc84db']
Batch 379: ['d24dd9b7-a25c-4f12-ba98-ac2312ce0d60', '40b2efa4-8426-4b41-99b8-48e857f2d0fd', '055900e0-2a1b-4fb5-b949-df7890659443']
Batch 380: ['21520002-34f0-42d4-9ded-92e52e0902ea', 'ca882993-471f-45cc-88d9-c92f1fe85e58', '81721512-291c-48c6-91a2-e927afcac655']
Batch 38

In [739]:
failed_batches = [157, 192, 193, 373]

import time

def rerun_failed_batches(split_documents, vector_store, failed_batches, batch_size=10, sleep_time=2):
    """
    Re-run failed batches of document embeddings.

    Parameters:
        - split_documents (list): List of document chunks to embed
        - vector_store: The vector database where embeddings are stored
        - failed_batches (list): List of failed batch indices (starting from 1)
        - batch_size (int): Number of documents per batch (default is 10)
        - sleep_time (int): Time to wait (in seconds) between batches to avoid API rate limits (default is 2)
    
    Returns:
        - None
    """
    for batch_num in failed_batches:
        start_idx = (batch_num - 1) * batch_size
        batch = split_documents[start_idx:start_idx + batch_size]

        if not batch:  # Skip if the batch is empty
            print(f"Skipping empty batch {batch_num}")
            continue

        try:
            document_ids = vector_store.add_documents(documents=batch)
            print(f"Re-ran batch {batch_num}: {document_ids[:3]}")
        except Exception as e:
            print(f"Error re-running batch {batch_num}: {e}")

        time.sleep(sleep_time)  # Pause to prevent API rate limiting

# Example usage:
rerun_failed_batches(split_documents, vector_store, failed_batches)

Re-ran batch 157: ['aaab792e-bbba-49e5-b25e-aea53bb81c47', 'ba3fd212-bf69-416b-9274-dc46213e8683', 'e6d222d3-554d-4dc7-8f68-1331544727fc']
Re-ran batch 192: ['aacb0185-d14b-45e4-ac53-7ee6ec113920', 'bd807fe9-932b-4ade-bc9f-515abb70de79', '23157a83-54df-4faa-9d5a-ba64ec1617b9']
Re-ran batch 193: ['fe25484d-7a97-4856-99e9-4cce3ba852a9', '39e76b1a-05a3-4c76-ad59-4cfce75aa559', '80fe5896-9189-45b0-938b-a1c748eb05e0']
Re-ran batch 373: ['40847e1a-c5b3-4828-aa3a-c69cf6ea3f03', '5f499c7f-754d-41bc-a606-4ea2a90898fe', 'd1830fd9-106b-4bdb-9578-b8417988ce9c']


In [745]:
# Number of Document objects stored in our vector store
print("Number of Documents:")
print(vector_store._collection.count())

# IDs of the first 10 Documents stored in our vector store
print("\nIDs of the first 10 Documents stored in the vector store:")
print(vector_store._collection.get()["ids"][:10])

# Test query on our vector store
query = "gouvernement"
print(f'\nQuerying "{query}" on our vector store:')
results = vector_store.similarity_search(query, k=3)
for doc in results:
    print(doc.page_content)

Number of Documents:
160472

IDs of the first 10 Documents stored in the vector store:
['e4e290e7-3ad0-4d26-8c13-078ae83824f4', '719fe31e-7cbd-4152-9cf2-5262553ffa01', '0493492c-a637-4bcb-a4f7-2f597daca379', 'e4ea8808-f8f7-4690-9304-bf222d227ee7', '5e5ee20a-1dac-4b75-aa25-86c65dcf4257', 'ff2bfd8a-083f-4d48-a908-829cac595dd7', '818fcd79-c451-4f65-bfa0-8d1068b32c0b', 'eb59f67c-0fdd-4435-b1da-7251d52c1983', '6a2ebf7c-ca2c-4706-b176-1c8514bd7059', 'd7669c3a-e27e-4c6d-8cac-ad2e666db1e9']

Querying "gouvernement" on our vector store:
G\u00e9opolitique et Sciences politiques", "formation": "Formations
G\u00e9opolitique et Sciences politiques", "formation": "Formations
G\u00e9opolitique et Sciences politiques", "formation": "Formations


## Persisting the vector store to a ChromaDB vectorial database

In [None]:
import chromadb
from chromadb import PersistentClient
from tqdm.notebook import tqdm  # Notebook-friendly version to display loading progress

# Define database and collection names
db_path = "./final_chroma_db"  # New database
collection_name = "final_vector_store_collection"  # New collection

# 1️⃣ Create a new persistent database
client = PersistentClient(path=db_path)

# 2️⃣ Delete the old collection if it exists (to avoid conflicts)
try:
    client.delete_collection(name=collection_name)
    print(f"🗑️ Old collection '{collection_name}' deleted.")
except Exception:
    pass  # If it doesn't exist, no issue

# 3️⃣ Create the new collection
collection = client.get_or_create_collection(name=collection_name)
print(f"✅ New collection '{collection_name}' created in '{db_path}'.")

# 4️⃣ Load data from the vector store
stored_data = vector_store.get()
num_embeddings = len(stored_data["ids"])
print(f"🔍 {num_embeddings} embeddings found in the vector store.")

# 5️⃣ Insert embeddings into the new collection with progress tracking
batch_size = 1000  # Insert in batches for better performance
for i in tqdm(range(0, num_embeddings, batch_size), desc="📥 Adding embeddings to ChromaDB"):
    collection.add(
        ids=stored_data["ids"][i:i+batch_size],
        embeddings=stored_data["embeddings"][i:i+batch_size],
        documents=stored_data.get("documents")[i:i+batch_size] if stored_data.get("documents") else None,
        metadatas=stored_data.get("metadatas")[i:i+batch_size] if stored_data.get("metadatas") else None
    )

print(f"🎉 All embeddings have been successfully stored in '{db_path}/{collection_name}'!")


In [431]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain import hub

def build_retrieval_chain(vector_store, llm, k=5):
    """
    Build a retrieval chain using a given vector store and LLM
    
    Parameters:
        - vector_store: The vector store containing document embeddings
        - llm: The language model to generate responses
        - k: Number of most relevant document chunks to retrieve (default is 5)
    
    Returns:
        - retrieval_chain: A LangChain retrieval chain object.
    """
    # Create a retriever from the vector store
    retriever = vector_store.as_retriever(search_kwargs={"k": k})
    
    # Load a pre-defined prompt template for retrieval-based Q&A
    retrieval_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
    
    # Create a document combination chain
    combine_docs_chain = create_stuff_documents_chain(llm, retrieval_chat_prompt)
    
    # Create and return the retrieval chain
    return create_retrieval_chain(retriever, combine_docs_chain)

## LLM Prompt-Engineering

In this part, we implement an advanced AI chatbot using LangChain and LangGraph.
It integrates :
- RAG (Retrieval-Augmented Generation) with source display
- Streaming responses for enhanced user experience
- Conversational memory for tracking interactions

In [817]:
#  Importation des bibliothèques
import os
import json
import getpass
import requests

from langchain_mistralai.chat_models import ChatMistralAI
from langgraph.graph import START, MessagesState, StateGraph
from langchain_core.messages import HumanMessage
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.prompts import PipelinePromptTemplate

### Prompt definition

In [820]:
prompt_principal = ChatPromptTemplate.from_template(
    """
    Tu es **Crystal Bot**, un assistant intelligent d'orientation professionnelle.
    Tu dois fournir des réponses basées sur les informations récupérées.
    
    Ton objectif est d'aider chaque utilisateur à **clarifier son avenir** en lui proposant **des conseils sur-mesure et motivants**.

    **Contexte pertinent issu de la base de connaissances :** 
    {context}
    
    ➤ **Adapte-toi à l’utilisateur** : si indécis, propose des pistes d’exploration.
    ➤ **Si comparaison de plusieurs options**, aide-le à peser le pour et le contre.
    ➤ Ajoute parfois une touche légère de **vision d’avenir**, sans trop en faire.
    """
)

topic_prompt = PromptTemplate.from_template(
    """
    Analyse la question et identifie son domaine principal :
    Question: {input}
    """
)

intent_prompt = PromptTemplate.from_template(
    """
    Analyse la question et identifie l’intention principale de l’utilisateur :
    Question: {input}
    """
)

chatbot_prompt = PromptTemplate.from_template(
    """
    L'utilisateur a demandé : {input}.
    Domaine détecté : {topic} (caché à l'utilisateur)
    Intention détectée : {intent} (caché à l'utilisateur)
    
    ➤ **Si l'intention est "Métiers"** :
        - Propose des métiers adaptés et innovants.
        - Donne une vision inspirante et dynamique.
        
    ➤ **Si l'intention est "Formations"** :
        - Propose des formations adaptées et alternatives.
        - Explique les opportunités de spécialisation.

    ➤ **Si l'intention est "Salaire"** :
        - Explique les différences selon l’expérience et le secteur.
        - Évoque les tendances actuelles.

    ➤ **Si l'intention est "Offres d'emploi"** :
        - Donne des conseils personnalisés sur la recherche d’emploi.
        - Propose une approche proactive.

    ➤ **Si l'intention est "Parcoursup"** :
        - Explique comment optimiser son dossier et lettre de motivation.

    ➤ **Si l'intention est "Autre"** :
        - Propose une réponse originale en fonction du contexte.
    """
)

closing_prompt = PromptTemplate.from_template(
    """
    J’espère que cette réponse **t’a aidé à y voir plus clair !** 🔮  
    **As-tu d’autres questions ?** Je suis là pour explorer toutes les possibilités avec toi.  
    """
)

### Setting up conversation memory

In [823]:
# Mise en place de la mémoire conversationnelle
memory = MemorySaver()
workflow = StateGraph(state_schema=MessagesState)

def call_model(state: MessagesState):
    """Appelle le modèle avec historique de conversation"""
    response = llm.invoke(state["messages"])
    return {"messages": response}

workflow.add_edge(START, "model")
workflow.add_node("model", call_model)
chatbot = workflow.compile(checkpointer=memory)

## Creating the retrieval chain with LangChain

Creating a retrieval chain that, based on a user question, will retrieve relevant Documents from Chroma and provide them in context to the MistralAI language model to generate an informed answer

In [833]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain import hub

def build_retrieval_chain(vector_store, llm, k=5):
    """
    Build a retrieval chain using a given vector store and LLM
    
    Parameters:
        - vector_store: The vector store containing document embeddings
        - llm: The language model to generate responses
        - k: Number of most relevant document chunks to retrieve (default is 5)
    
    Returns:
        - retrieval_chain: A LangChain retrieval chain object.
    """
    # Create a retriever from the vector store
    retriever = vector_store.as_retriever(search_kwargs={"k": k})
    
    # Load a pre-defined prompt template for retrieval-based Q&A
    retrieval_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
    
    # Create a document combination chain
    combine_docs_chain = create_stuff_documents_chain(llm, retrieval_chat_prompt)
    
    # Create and return the retrieval chain
    return create_retrieval_chain(retriever, combine_docs_chain)

def retrieve_and_format_context(user_query, vector_store, llm):
    """Retrieve documents from vector store and format the context"""
    retrieval_chain = build_retrieval_chain(vector_store, llm)
    retrieved_docs = retrieval_chain.invoke({"input": user_query})

    documents = retrieved_docs["context"]

    formatted_context = "\n".join([doc.page_content for doc in documents])
    sources = [doc.metadata.get("source", "Unknown source") for doc in documents]

    return formatted_context, sources

## Building a full ChatBot response

In [845]:
def generate_response(user_query, vector_store, llm):
    """Generate an optimized response with CrystalBot and keep the history"""

    context, sources = retrieve_and_format_context(user_query, vector_store, llm)
    
    global conversation_history  # Utilisation d'une variable distincte pour stocker les messages

    # Charger l'historique des messages de manière locale
    past_messages = conversation_history if 'conversation_history' in globals() else []  

    # **🔹 Étape 1 : Détection du domaine / topic**
    topic_output = topic_prompt.format(input=user_query)

    # **🔹 Étape 2 : Détection de l’intention utilisateur**
    intent_output = intent_prompt.format(input=user_query)
    if not intent_output:
        intent_output = "Autre"

    # **🔹 Étape 3 : Génération de la réponse principale**
    chatbot_prompt_hidden = chatbot_prompt.format(
        input=user_query, 
        topic=topic_output, 
        intent=intent_output,
        context=context,
    )

    chatbot_output = chatbot_prompt_hidden.replace(topic_output, "").replace(intent_output, "")

    # **🔹 Étape 4 : Génération des suggestions d’exploration selon l’intention détectée**
    exploration_suggestions = ""

    if "Métiers" in intent_output:
        exploration_suggestions += """
        **🌍 Pour aller plus loin :**  
        - 🔍 **Découvre des métiers similaires** via des plateformes comme Onisep ou Studyrama.  
        - 🎭 **Participe à des salons professionnels** et rencontres avec des experts du domaine.  
        - 👥 **Échange avec des professionnels** sur LinkedIn ou lors d’événements.  
        """

    if "Formations" in intent_output or "Parcoursup" in intent_output:
        exploration_suggestions += """
        **📚 Pour approfondir ton parcours :**  
        - 🎤 **Découvre les parcours d’anciens étudiants** via des témoignages en ligne.  
        - 🔎 **Explore les formations en alternance** et internationales.  
        - 🏫 **Assiste aux journées portes ouvertes** des écoles et universités.  
        """

    if "Salaire" in intent_output:
        exploration_suggestions += """
        **💰 Pour mieux comprendre les salaires et évolutions de carrière :**  
        - 📊 **Consulte des études de rémunération** sur Glassdoor et l’APEC.  
        - 📈 **Analyse les évolutions de carrière possibles** en fonction de ton secteur.  
        """

    if "Offres d'emploi" in intent_output:
        exploration_suggestions += """
        **📝 Pour trouver des opportunités professionnelles :**  
        - 🔍 **Consulte des plateformes spécialisées** comme Indeed ou Pôle Emploi.  
        - 📩 **Optimise ton CV et ta lettre de motivation** pour te démarquer.  
        """

    if "Les deux" in intent_output:
        exploration_suggestions += """
        **🌟 Pour lier formations et métiers :**  
        - 📚 **Découvre les formations qui recrutent le plus**.  
        - 👀 **Explore les tendances de recrutement dans ton secteur**.  
        """

    if "Autre" in intent_output:
        exploration_suggestions += """
        **🔍 Explorons d’autres pistes !**  
        - 🤔 **Précise un peu plus ta demande**, veux-tu parler de reconversion, d’entrepreneuriat, d’études à l’étranger ?  
        - 💡 **Inspiration** : Parfois, explorer d’autres secteurs peut ouvrir des portes inattendues.  
        - 🔎 **Découvre des parcours inspirants** : interviews, conférences, podcasts sur des choix de carrière atypiques.  
        """

    # **🔹 Étape 5 : Conclusion dynamique**
    closing_output = closing_prompt.format()

    # ✅ **Ajout d’une introduction engageante pour la toute première réponse**
    intro_message = f"""
    Bonjour ! 😊  

    Je suis **Crystal Bot**, ton conseiller d'orientation professionnelle. 🔮✨  
    Mon objectif est de **t’aider à clarifier ton avenir** en te proposant des pistes adaptées et personnalisées.  
    Voici quelques idées et conseils pour t’aider à avancer :  
    """

    # ✅ **Création du message final à envoyer au modèle**
    final_prompt = f"""
    {intro_message}  

    **💡 Recommandation de Crystal Bot :**  
    {chatbot_output}  

    {exploration_suggestions}  

    {closing_output}  
    """

    # ✅ **Ajout du message dans l'historique**
    messages = past_messages + [HumanMessage(content=final_prompt)]

    # ✅ **Envoi de la requête complète au modèle**
    response = llm.invoke(messages)

    # ✅ **Mise à jour de l'historique de la conversation**
    conversation_history = messages + [response]

    return {
        "response": response.content,
        "sources": sources
    }

def refine_response(chatbot_output, user_query, llm):
    """Improve clarity and precision of response with LLM"""
    refinement_prompt = f"""
    Améliore la réponse suivante en la rendant plus précise et engageante :
    Contexte : {user_query}
    Réponse initiale : {chatbot_output}
    Réponse améliorée :
    """

    refined_response = llm.invoke([HumanMessage(content=refinement_prompt)]).content

    return refined_response

## Testing the ChatBot

In [863]:
user_query = "Quels métiers sont accessibles après un diplôme en informatique ?"

# Tester la récupération des documents depuis ChromaDB
context, sources = retrieve_and_format_context(user_query, vector_store, llm)

print("🔹 Contexte récupéré :")
print(context)
print("\n🔹 Sources associées :")
print(sources)

# Tester la réponse initiale générée par le chatbot
initial_result = generate_response(user_query, vector_store, llm)

print("\n🔹 Réponse initiale du chatbot :")
print(initial_result["response"])
print("\n🔹 Sources utilisées :")
print(initial_result["sources"])

# 🔥 Appel de refine_response pour améliorer la réponse
refined_result = refine_response(initial_result["response"], user_query, llm)

print("\n🔹 Réponse affinée du chatbot :")
print(refined_result)

🔹 Contexte récupéré :
en informatique est surtout accessible aux dipl\u00f4m\u00e9s de niveau bac\u00a0+\u00a04, +\u00a05\u00a0. Qu'il s'agisse des \u00e9coles d'ing\u00e9nieurs ou des universit\u00e9s, ces fili\u00e8res permettent d'acqu\u00e9rir une double comp\u00e9tence technologique et manag\u00e9riale. \u00c0 compl\u00e9ter par une exp\u00e9rience professionnelle de 5 ou 6\u00a0ans minimum.</p>\n            <p>Niveau bac\u00a0+\u00a05</p>\n            <p>Master en informatique</p>\n            <p>Dipl\u00f4me d'ing\u00e9nieur</p>\n            <p>Niveau bac\u00a0+\u00a06</p>\n            <p>Mast\u00e8re sp\u00e9cialis\u00e9</p>", "vie_professionnelle": "<h3>Salaire</h3>\n            <h5>Salaire du d\u00e9butant</h5>\n            <p>2915 euros brut par mois </p>\n            <h3>Int\u00e9grer le march\u00e9 du travail</h3>\n            <h5>Recrutements \u00e0 la hausse</h5>\n            <p>Secteur priv\u00e9 ou public : le poste de chef de projet b\u00e9n\u00e9ficie de recrutements

## Alternative (with COT + refinement)

In [857]:
def generate_response_with_cot(user_query, vector_store, llm):
    """ Génère une réponse en utilisant Chain of Thought et le LLM """

    context, sources = retrieve_and_format_context(user_query, vector_store, llm)
    
    # Étape 1 : Identification du sujet et de l'intention
    topic_output = llm.invoke([HumanMessage(content=topic_prompt.format(input=user_query))]).content
    intent_output = llm.invoke([HumanMessage(content=intent_prompt.format(input=user_query))]).content
    
    # Étape 2 : Génération d'une réponse avec le chatbot_prompt
    chatbot_input = chatbot_prompt.format(input=user_query, topic=topic_output, intent=intent_output)
    chatbot_output = llm.invoke([HumanMessage(content=chatbot_input)]).content

    # Étape 3 : Ajout d'un message de clôture
    closing_output = llm.invoke([HumanMessage(content=closing_prompt.format())]).content

    # ✅ Correction : Retourne un dictionnaire structuré
    return {
        "response": chatbot_output + "\n" + closing_output,
        "sources": sources
    }

In [None]:
user_query = "Quels métiers sont accessibles après un diplôme en informatique ?"

# Tester la récupération des documents depuis ChromaDB
context, sources = retrieve_and_format_context(user_query, vector_store, llm)

# Tester la réponse initiale générée par le chatbot
initial_result = generate_response_with_cot(user_query, vector_store, llm)

# 🔥 Appel de refine_response pour améliorer la réponse
refined_result = refine_response(initial_result["response"], user_query, llm)

print("\n🔹 Réponse affinée du chatbot :")
print(refined_result)