# Building a Retrieval Augmented Generation (RAG) Chatbot

In [None]:
!pip install --quiet --upgrade langchain-text-splitters langchain-community langgraph
!pip install -qU langchain-mistralai
!pip install -qU langchain-chroma

# To correct UnstructuredXMLLoader errors
!pip install "unstructured>=0.6.7"

## Setup

In [3]:
import getpass
import os

os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"] = getpass.getpass("Enter API key for Langsmith: ")

Enter API key for Langsmith:  ········


## Components

In [5]:
import getpass
import os

# Chat model

if not os.environ.get("MISTRAL_API_KEY"):
  os.environ["MISTRAL_API_KEY"] = getpass.getpass("Enter API key for Mistral AI: ")

Enter API key for Mistral AI:  ········


In [None]:
from langchain_mistralai import ChatMistralAI, MistralAIEmbeddings

# Chat model
llm = ChatMistralAI(model="mistral-large-latest")

# Embeddings model
embeddings = MistralAIEmbeddings(model="mistral-embed")

In [9]:
from langchain_chroma import Chroma

# Vector store
vector_store = Chroma(embedding_function=embeddings)

## Loading JSON and XML files

#### Test: loading a single JSON file

In [13]:
import json
from pathlib import Path
from pprint import pprint

file_path="../parcoursup/parcoursup_data.json"
data = json.loads(Path(file_path).read_text())
pprint(data)

[{'acceptations': 0,
  'annee_du_bac': 2021,
  'doublette': 'Art,Art',
  'formation': 'Licence Chimie',
  'propositions_d_admissions': 0.0,
  'voeux': 0},
 {'acceptations': 0,
  'annee_du_bac': 2021,
  'doublette': 'Art,Art',
  'formation': 'Licence Langues et littÃ©ratures franÃ§aises',
  'propositions_d_admissions': 0.0,
  'voeux': 0},
 {'acceptations': 0,
  'annee_du_bac': 2021,
  'doublette': 'Art,Art',
  'formation': 'Licence Philosophie, EpistÃ©mologie',
  'propositions_d_admissions': 0.0,
  'voeux': 0},
 {'acceptations': 0,
  'annee_du_bac': 2021,
  'doublette': 'Art,Art',
  'formation': 'Licence Physique',
  'propositions_d_admissions': 0.0,
  'voeux': 0},
 {'acceptations': 0,
  'annee_du_bac': 2021,
  'doublette': 'Art,Art',
  'formation': 'Licence Pluri Droit - Sciences Ã©co - AES',
  'propositions_d_admissions': 0.0,
  'voeux': 0},
 {'acceptations': 0,
  'annee_du_bac': 2021,
  'doublette': 'Art,Art',
  'formation': 'Licence Pluri Langues',
  'propositions_d_admissions': 0.0

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [15]:
from langchain_community.document_loaders import JSONLoader

loader = JSONLoader(
    file_path="../parcoursup/parcoursup_data.json",
    jq_schema=".[]",
    text_content=False
)

documents = loader.load()
pprint(documents[0])

Document(metadata={'source': 'C:\\Users\\Laura GALINDO\\Documents\\Academique_et_pro\\Epitech Digital School\\Bootcamp Chatbot IA\\chatbot-orientation-pro\\parcoursup\\parcoursup_data.json', 'seq_num': 1}, page_content='{"annee_du_bac": 2021, "doublette": "Art,Art", "formation": "Licence Chimie", "voeux": 0, "propositions_d_admissions": 0, "acceptations": 0}')


#### Test: loading a single XML file

In [44]:
from bs4 import BeautifulSoup

with open('../onisep/Onisep_Fiches_Formations.xml', 'r', encoding="utf-8") as f:
    data = f.read()

bs_data = BeautifulSoup(data, 'xml')

print(bs_data.prettify()[:500])

<?xml version="1.0" encoding="utf-8"?>
<formations xmlns:exsl="http://exslt.org/common">
 <formation>
  <identifiant>
   FOR.1000
  </identifiant>
  <code_scolarite>
   16X20003
  </code_scolarite>
  <libelle_complet>
   mastère spé. management de la qualité, de la sécurité et de l'environnement
  </libelle_complet>
  <sigle/>
  <libelle_generique/>
  <libelle_specifique/>
  <type_option/>
  <type_Formation>
   <type_formation_sigle>
    mastère spé.
   </type_formation_sigle>
   <type_formation


In [48]:
from langchain_community.document_loaders import UnstructuredXMLLoader
from pprint import pprint

loader = UnstructuredXMLLoader('../onisep/Onisep_Fiches_Formations.xml')

documents = loader.load()

print(documents[0].page_content[:500])

FOR.1000

16X20003

mastère spé. management de la qualité, de la sécurité et de l'environnement

mastère spé.

MASTERE SPE

mastère spécialisé

"1 an"

https://www.francecompetences.fr/recherche/rncp/37084/

200

Spécialités pluritechnologiques de la production

R

Contrôle, prévention, entretien

niveau 7

REF.423

bac + 6

16X

Label de la Conférence des grandes écoles (CGE)

Formation inscrite au RNCP

MET.878

chargé/e hygiène sécurité environnement (HSE)

chargée hygiène sécurité environnem


#### Loading multiple JSON and XML documents into LangChain Document objects

In [3]:
import os
import json
from pathlib import Path
from langchain.docstore.document import Document
from langchain_community.document_loaders import JSONLoader, UnstructuredXMLLoader

# To correct UnstructuredXMLLoader errors
# import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

def load_json_documents(file_path: str) -> list:
    """
    Load a JSON file using JSONLoader and return a list of Documents
    """
    loader = JSONLoader(
        file_path=file_path,
        jq_schema=".[]",
        text_content=False,
    )
    try:
        documents = loader.load()
        print(f"[JSON] Loaded file: {file_path} ({len(documents)} document(s))")
        return documents
    except Exception as e:
        print(f"Error while loading JSON file {file_path} : {e}")
        return []

def load_xml_documents(file_path: str) -> list:
    """
    Load an XML file using UnstructuredXMLLoader and return a list of Documents
    """
    loader = UnstructuredXMLLoader(file_path)
    try:
        documents = loader.load()
        print(f"[XML] Loaded file: {file_path} ({len(documents)} document(s))")
        return documents
    except Exception as e:
        print(f"Error while loading XML file {file_path} : {e}")
        return []

def load_documents(file_paths: list) -> list:
    """
    For each file in the list, we determine its extension and call the appropriate loader
    (JSONLoader or UnstructuredXMLLoader), and then return a complete list of Documents
    """
    documents = []
    for fp in file_paths:
        ext = Path(fp).suffix.lower()
        if ext == ".json":
            documents.extend(load_json_documents(fp))
        elif ext == ".xml":
            documents.extend(load_xml_documents(fp))
        else:
            print(f"Unsupported file (extension {ext}) : {fp}")
    return documents

def export_documents(documents: list):
    """
    Save the Document objects to a text file for inspection or debugging
    """
    with open("loaded_documents.txt", "w", encoding="utf-8") as f:
        for doc in documents:
            f.write(f"Source: {doc.metadata.get('source')}\n")
            f.write(f"Content:\n{doc.page_content}\n\n---\n\n")
    
    print("Documents saved to 'loaded_documents.txt'")

In [5]:
# List of JSON and XML files to load
file_list = [
    "../parcoursup/parcoursup_data.json",
    "../mon_master/monmaster_data.json",
    "../onisep/Onisep_Fiches_Formations.xml",
    "../onisep/Onisep_Fiches_Metiers.xml"
]

# Load all the documents
all_documents = load_documents(file_list)

print(f"\nTotal number of loaded documents: {len(all_documents)}")

# export_documents(all_documents)

[JSON] Loaded file: ../parcoursup/parcoursup_data.json (12610 document(s))
[JSON] Loaded file: ../mon_master/monmaster_data.json (19603 document(s))
[XML] Loaded file: ../onisep/Onisep_Fiches_Formations.xml (1 document(s))
[XML] Loaded file: ../onisep/Onisep_Fiches_Metiers.xml (1 document(s))

Total number of loaded documents: 32215
Documents saved to 'loaded_documents.txt'


In [41]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def dynamic_split_documents(documents, base_chunk_size=1000, min_chunks=3, max_chunks=200):
    """
    Dynamically split a list of documents into smaller chunks based on document size
    
    Parameters:
        - documents: List of LangChain Document objects
        - base_chunk_size: Default chunk size in characters
        - min_chunks: Minimum number of chunks per document
        - max_chunks: Maximum number of chunks per document
    
    Returns:
        - List of split documents
    """
    all_splits = []
    
    for doc in documents:
        doc_length = len(doc.page_content)
        
        # Dynamically adjust the chunk size:
        # - Ensure chunks are not too small by using at least `base_chunk_size`
        # - Ensure chunks are not too big by keeping at least `min_chunks` per document
        # - This balances the number and size of chunks for better retrieval
        chunk_size = max(base_chunk_size, doc_length // max_chunks)  # Prevents chunks from being too tiny
        chunk_size = min(chunk_size, doc_length // min_chunks)  # Ensures we don't have too few large chunks
        
        # Define the text splitter
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_size // 5,  # 20% overlap
            add_start_index=True # Keeps track of where in the original document each chunk came from
        )
        
        # Split the document and extend the list
        splits = text_splitter.split_documents([doc])
        # print(f"Document: {doc.metadata.get('source')} | Original Length: {len(doc.page_content)} | Chunks Created: {len(splits)}")
        
        all_splits.extend(splits) # Adds all the elements in splits to all_splits, one by one
    
    print(f"\nTotal number of chunks created: {len(all_splits)}")
    return all_splits

def export_split_documents(chunks):
    """
    Save the split documents to a text file with delimiters between chunks

    Each document chunk's source and content are written to 'split_documents.txt', 
    with '---' as a separator between different chunks.

    Parameters:
        - chunks (list): A list of LangChain Document objects after splitting

    Output:
        - File containing all split document chunks with delimiters
    """
    with open("split_documents.txt", "w", encoding="utf-8") as f:
        for i, chunk in enumerate(chunks):
            # Add delimiter before every chunk except the first one
            if i > 0:
                f.write("\n---\n\n")

            f.write(f"Source: {chunk.metadata.get('source')}\n")
            f.write(f"Content:\n{chunk.page_content}\n")

    print("Split documents saved to 'split_documents.txt'")

In [43]:
# Split the loaded documents
split_documents = dynamic_split_documents(all_documents)

# Save the split documents to a file
export_split_documents(split_documents)


Total number of chunks created: 129801
Split documents saved to 'split_documents.txt'


## création de la base de donnée vectorielle 

In [None]:

# Import du module Chroma pour la base vectorielle
from langchain_chroma import Chroma

# Création et stockage des embeddings dans Chroma
db = Chroma.from_documents(
    documents_chunks,  # Documents segmentés en chunks
    embedding=embeddings,  
    persist_directory="chroma_db"  # Dossier où stocker la base Chroma
)

# Sauvegarde de la base de données vectorielle
db.persist()

print("✅ Base de données vectorielle Chroma créée et sauvegardée avec succès !")


##  Recherche dynamique avec réglage de K

# Fonction de recherche améliorée avec ajustement dynamique de k

def query_chroma(query_text, initial_k=3, threshold=0.5):
    """
    Recherche des documents les plus pertinents en fonction d'une requête utilisateur.

    Args:
        query_text (str): La question posée par l'utilisateur.
        initial_k (int): Nombre initial de documents à retourner.
        threshold (float): Seuil de similarité minimum.

    Returns:
        list: Liste des documents les plus proches de la requête.
    """

    # Convertir la base Chroma en un outil de recherche
    retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": initial_k})

    # Recherche des documents
    results = retriever.invoke(query_text)

    # Vérification des scores de similarité
    avg_similarity = sum(doc.metadata.get("similarity_score", 0) for doc in results) / len(results)

    # Ajustement de k si nécessaire
    if avg_similarity < threshold:
        print(f"⚠️ Similarité moyenne faible ({avg_similarity:.2f}), augmentation de k...")
        retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": initial_k + 2})
        results = retriever.invoke(query_text)

    # Affichage des résultats
    print(f"🔍 Résultats pour la requête : "{query_text}" (k={len(results)})")
    for i, doc in enumerate(results):
        print(f"📄 Document {i+1}:\n{doc.page_content[:300]}\n{'-'*50}")
    
    return results

# Exemple de requête utilisateur avec ajustement de k
query_chroma("Quels sont les métiers accessibles après un BTS Informatique ?")


# vérification des embedding

In [None]:

# Création de la base de données vectorielle Chroma
from langchain_chroma import Chroma

db = Chroma.from_documents(
    documents_chunks,
    embedding=embeddings,
    persist_directory="chroma_db"
)

# Sauvegarde des embeddings
db.persist()

# Vérification de la base
print("Base Chroma créée avec succès.")
print(f" Nombre total de documents indexés: {db._collection.count()}")

## Optimisation de K 

In [None]:

# Fonction améliorée pour le retrieval avec ajustement dynamique de k

def query_chroma(query_text, initial_k=3, max_k=8, threshold=0.5):
    """
    Recherche des documents les plus pertinents en fonction d'une requête utilisateur.
    
    Args:
        query_text (str): La question posée.
        initial_k (int): Valeur initiale de k.
        max_k (int): Valeur maximale de k en cas de faible pertinence.
        threshold (float): Seuil de pertinence minimale.

    Returns:
        list: Liste des documents les plus pertinents.
    """
    
    # Convertir Chroma en outil de recherche
    retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": initial_k})
    results = retriever.invoke(query_text)

    # Vérifier la pertinence des résultats
    avg_similarity = sum(doc.metadata.get("similarity_score", 1) for doc in results) / len(results)

    if avg_similarity < threshold and initial_k < max_k:
        print(f"⚠️ Similarité moyenne faible ({avg_similarity:.2f}), augmentation de k...")
        retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": max_k})
        results = retriever.invoke(query_text)

    # Affichage des résultats
    print(f"🔍 Résultats pour '{query_text}' (k={len(results)})")
    for i, doc in enumerate(results):
        print(f"📄 Document {i+1}:\n{doc.page_content[:300]}\n{'-'*50}")
    
    return results

# Test avec une requête d'orientation professionnelle
query_chroma("Quels sont les métiers accessibles après un BTS Informatique ?")