# **Mission Extraction SCR**

1. Extraction SCR avec GPT-4o
2. Classification documents avec GPT-4o
3. Classification documents avec Mistral 7B Q4
4. Extraction SCR regex
5. Comparaison résultats entre regex et GPT-4o

## **1. Extraction SCR avec GPT-4o**

**Note:** Cette extraction se fait via colab. Les scripts ont donc été recopiés.

#### Script script_extract_classif_scr_openai.py


In [None]:
!pip install PyPDF2
!pip install json_repair
!pip install openai
!pip install tqdm
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [None]:
import pandas as pd
from PyPDF2 import PdfReader
from json_repair import repair_json
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from openai import OpenAI
import tiktoken
import time

In [None]:
class ExtracteurSCROpenAI:
    def __init__(self, url, openai_api_key):
        """
        - url : chemin complet vers le PDF (ex : "/content/drive/MyDrive/chemin/vers/doc.pdf")
        - openai_api_key : clé API OpenAI (doit être définie pour accéder à GPT-4)
        """
        self.url = url
        self.pdf_reader = PdfReader(self.url)
        # Créer le client OpenAI avec la nouvelle interface
        self.client = OpenAI(api_key=openai_api_key)
        self.df_scr = pd.DataFrame()

    def extract_equipment(self):
        """
        Extrait le nom de l'équipement depuis la première page du document.
        """
        first_page_text = self.pdf_reader.pages[0].extract_text()
        prompt = f"""
{first_page_text}

What is the equipment this document is dealing with? Return only the response.
"""
        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,
            max_tokens=100
        )
        return response.choices[0].message.content.strip()

    def alimentation_df(self, data_json, page_num, equipment):
        """
        Transforme la sortie JSON en DataFrame.
        Gère le cas où chaque item est un dict ou une liste.
        """
        rows = []
        for defect in data_json:
            if isinstance(defect, dict):
                symptom = defect.get("symptom", "Unknown")
                cause = defect.get("cause", "Unknown")
                remedy = defect.get("remedy", "Unknown")
            elif isinstance(defect, list):
                symptom = defect[0] if len(defect) > 0 else "Unknown"
                cause = defect[1] if len(defect) > 1 else "Unknown"
                remedy = defect[2] if len(defect) > 2 else "Unknown"
            else:
                symptom, cause, remedy = "Unknown", "Unknown", "Unknown"
            row = {
                "URL": self.url.rsplit("/", 1)[-1],
                "equipment": equipment,
                "page": page_num,
                "symptom": symptom,
                "cause": cause,
                "remedy": remedy,
            }
            rows.append(row)
        return pd.DataFrame(rows)

    def process_page(self, page_num, equipment):
        """
        Traite une page du PDF pour extraire les défauts (SCR) via le prompt.
        """
        page_text = self.pdf_reader.pages[page_num - 1].extract_text()
        if not page_text.strip():
            return pd.DataFrame([], columns=["URL", "equipment", "page", "symptom", "cause", "remedy"])

        prompt = f"""
{page_text}

Extract all defects and their associated causes and remedies from the provided text. Return a JSON array.
There can be more than one line for a single defect if it has different causes and if a cause has different remedies,
one line should represent a unique group of a symptom, a cause, and a remedy.

* symptom: A description of the defect. Include any error codes mentioned.
* cause: A possible explanation for the defect.
* remedy: The suggested solution or troubleshooting steps.

If a defect lacks one or more of these components (symptom, cause, or remedy), include the missing information as "Unknown".

Example JSON format:
[
    {{
        "symptom": "PNT1-166 Linear Potentiometer Unstable",
        "cause": "During Auto Calibration, the feedback from the linear potentiometer revealed large fluctuations.",
        "remedy": "Change the applicator and repair the malfunctioning linear potentiometer."
    }}
]
"""
        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,
            max_tokens=2000
        )
        generated_text = response.choices[0].message.content
        repaired_json = repair_json(generated_text)
        try:
            data_json = json.loads(repaired_json)
        except json.JSONDecodeError:
            data_json = []

        return self.alimentation_df(data_json, page_num, equipment)

    def extract_defects(self, start_page=0, end_page=0):
        """
        Extrait les défauts SCR du document entre start_page et end_page
        avec respect de la limite de tokens par minute.
        """
        if end_page == 0:
            end_page = len(self.pdf_reader.pages)

        equipment = self.extract_equipment()
        all_results = []

        for page in tqdm(range(start_page, end_page + 1), desc="Extraction SCR"):
            df_page = self.process_page(page, equipment)
            all_results.append(df_page)
            time.sleep(60)  # pour rester sous les 10 000 tokens/min

        self.df_scr = pd.concat(all_results, ignore_index=True)
        self.df_scr = self.df_scr[["URL", "equipment", "page", "symptom", "cause", "remedy"]]
        self.df_scr.sort_values(by="page", ascending=True, inplace=True)
        return self.df_scr

    def classify_document(self, start_page=0, end_page=0, chunk_size_tokens=50000):
        """
        Découpe le document en chunks d'environ chunk_size_tokens tokens.
        Classe chaque chunk avec GPT-4, en respectant un délai de 60s entre chaque.
        Affiche une barre de progression avec tqdm et le temps par chunk.
        Retourne : "x/y chunks structured SCR"
        """
        if end_page == 0:
            end_page = len(self.pdf_reader.pages)

        # Texte brut
        texts = []
        for i in range(start_page, end_page):
            text = self.pdf_reader.pages[i].extract_text()
            if text and text.strip():
                texts.append(text)
        full_text = "\n\n".join(texts)

        # Découpage en tokens
        encoding = tiktoken.encoding_for_model("gpt-4")
        tokens = encoding.encode(full_text)
        chunks = []
        for i in range(0, len(tokens), chunk_size_tokens):
            chunk_tokens = tokens[i:i + chunk_size_tokens]
            chunk_text = encoding.decode(chunk_tokens)
            chunks.append(chunk_text)

        results = []

        for idx, chunk in enumerate(tqdm(chunks, desc="Classification SCR chunks"), 1):
            start_time = time.time()

            prompt = f"""
    Here is an excerpt from a technical document:

    {chunk}

    Based on the text above, determine whether this excerpt is structured in a way that facilitates the extraction of defects and their associated causes and remedies using a simple regex extraction. In particular, check if the excerpt clearly delineates sections or markers corresponding to:
    * symptom: a description of the defect (including any error codes),
    * cause: a possible explanation for the defect,
    * remedy: the suggested solution or troubleshooting steps.

    Answer only "Yes" or "No".
    """

            response = self.client.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0,
                max_tokens=10
            )
            answer = response.choices[0].message.content.strip().lower()
            results.append("yes" in answer)

            elapsed = round(time.time() - start_time, 2)
            print(f"✅ Chunk {idx} terminé en {elapsed} secondes.")
            time.sleep(30)  # Limite TPM

        yes_count = sum(results)
        total_chunks = len(results)
        return f"{yes_count}/{total_chunks} chunks structured SCR"



#### Connexion à l'API OpenAI

In [None]:
from openai import OpenAI

In [None]:
# Création du client OpenAI avec clé API
client = OpenAI(api_key="sk-secret")

models = client.models.list()

for m in models.data:
    print(m.id)

gpt-4o-realtime-preview-2024-12-17
gpt-4o-audio-preview-2024-12-17
dall-e-3
dall-e-2
gpt-4o-audio-preview-2024-10-01
gpt-4o-realtime-preview-2024-10-01
gpt-4o-transcribe
gpt-4o-mini-transcribe
gpt-4o-realtime-preview
babbage-002
gpt-4o-mini-tts
tts-1-hd-1106
text-embedding-3-large
gpt-4
text-embedding-ada-002
omni-moderation-latest
tts-1-hd
gpt-4o-mini-audio-preview
gpt-4o-audio-preview
o1-preview-2024-09-12
gpt-4o-mini-realtime-preview
gpt-4o-mini-realtime-preview-2024-12-17
gpt-3.5-turbo-instruct-0914
gpt-4o-mini-search-preview
tts-1-1106
davinci-002
gpt-3.5-turbo-1106
gpt-4-turbo
gpt-4-0125-preview
gpt-3.5-turbo-instruct
gpt-3.5-turbo
gpt-4-turbo-preview
chatgpt-4o-latest
gpt-4o-mini-search-preview-2025-03-11
gpt-4o-2024-11-20
whisper-1
gpt-3.5-turbo-0125
gpt-4o-2024-05-13
gpt-3.5-turbo-16k
gpt-4-turbo-2024-04-09
gpt-4-1106-preview
o1-preview
gpt-4-0613
gpt-4o-search-preview
gpt-4.5-preview
gpt-4.5-preview-2025-02-27
gpt-4o-search-preview-2025-03-11
tts-1
omni-moderation-2024-09-26


#### Test de la connexion à l'API

In [None]:
# Fonction pour interroger GPT-4
def call_gpt(prompt, model="gpt-4o", max_tokens=1024, temperature=0.0):
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        max_tokens=max_tokens
    )
    return response.choices[0].message.content.strip()

# Exemple d’appel
prompt = "What is the capital of France?"
response = call_gpt(prompt)
print(response)

The capital of France is Paris.


### **Extraction SCR**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# from script_extract_classif_scr_openai import ExtracteurSCROpenAI
import pandas as pd

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_colwidth', None)

# Clé API OpenAI
OPENAI_API_KEY = "sk-secret"

# Chemin complet vers le PDF dans Google Drive
url = "/content/drive/MyDrive/projet_fil_rouge/data/doc_simple/doc-R-30iB.pdf"

# Initialisation de l'extracteur via l'API OpenAI (GPT-4)
extracteur = ExtracteurSCROpenAI(
    url=url,
    openai_api_key=OPENAI_API_KEY
)

# Extraction des défauts (SCR) entre la page 40 et 60
df_result = extracteur.extract_defects(start_page=40, end_page=60)

display(df_result)


Extraction SCR: 100%|██████████| 21/21 [24:54<00:00, 71.18s/it]


Unnamed: 0,URL,equipment,page,symptom,cause,remedy
0,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,40,ERROR RECOVER Y MARRBERCD04121E REV N,Overtravel of the robot axis,"Press and hold down the SHIFT key until you have completed Step 7b through Step 7d. Press and continue pressing SHIFT and press F2, RESET. Wait for servo power. Continuously press and hold the DEADMAN switch and turn the teach pendant ON/OFF switch to ON. Jog the overtraveled axis off the overtravel switch. When you have finished jogging, you can release the SHIFT key."
1,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,40,Robot is not calibrated,Unknown,"Press and hold down the SHIFT key until you have completed Step 8a through Step 8d. Press and continue pressing SHIFT and press F2, RESET. Wait for servo power. Press COORD until you select the JOINT coordinate system. Continuously press and hold the DEADMAN switch and turn the teach pendant ON/OFF switch to ON. Jog the overtraveled axis off the overtravel switch. When you have finished jogging, you can release the SHIFT key."
2,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,40,Robot is not in an actual overtravel condition,Unknown,Check CRM68 & CRF7 connection on the amplifier PCB
3,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,40,Hand breakage error,"The hand breakage detection switch is tripped on robots equipped with hand breakage hardware. The switch is tripped when the robot tool strikes an obstacle, which could possibly cause the tool to break.",Unknown
4,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,41,Hand breakage error message is displayed,"The tool struck an object, causing the hand to break","1. Continuously press and hold the DEADMAN switch and turn the teach pendant ON/OFF switch to ON. 2. Hold down the SHIFT key and press RESET. The robot can now be moved. 3. Jog the robot to a safe position. 4. Press the EMERGENCY STOP button. 5. Request a trained service person to inspect and, if necessary, repair the tool. 6. Determine what caused the tool to strike an object, causing the hand to break. 7. If the hand breakage occurred while a program was being executed, you might need to reteach positions, modify the program, or move the object that was struck. 8. Test run the program if it has been modified, if new positions have been recorded, or if objects in the work envelope have been moved."
...,...,...,...,...,...,...
81,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,60,3.1.1.70 ACAL-069 Auto Update is ON.,Auto Update is on therefore the frame has been updated automatically .,An update is not required.
82,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,60,3.1.1.71 ACAL-070 No update on record points.,No update on record points. The teach pendant program has a different number of DETECT instructions than expected. The positions cannot automatically update in the screen.,Manually record positions to create the calibration program.
83,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,60,3.1.1.72 ACAL-071 Invalid joint number .,The axis number specified in the DETECT Joint instruction is invalid.,Change the axis number to a valid one.
84,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,60,3.1.1.73 ACAL-072 Invalid joint sensor type.,The Detect Joint instruction cannot use T OS WRIST sensor type for contact detection.,Change sensor type to either IO or T OS all axes in the detection schedule.


In [None]:
# Chemin de destination dans ton Google Drive
output_path = "/content/drive/MyDrive/projet_fil_rouge/extraction_scr/extract_scr_openai_test.csv"

# Sauvegarde du DataFrame en .csv
df_result.to_csv(output_path, index=False)

print(f"✅ CSV sauvegardé ici : {output_path}")

✅ CSV sauvegardé ici : /content/drive/MyDrive/projet_fil_rouge/extraction_scr/extract_scr_openai_test.csv


## **2. Classification Documents avec GPT-4o**

**Note:** Cette classification se fait via colab. Les scripts ont donc été recopiés.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# from script_extract_classif_scr_openai import ExtracteurSCROpenAI
import pandas as pd

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_colwidth', None)

# Chemin complet vers votre PDF dans Google Drive
url = "/content/drive/MyDrive/projet_fil_rouge/data/doc_simple/doc-R-30iB.pdf"

# Clé API OpenAI
OPENAI_API_KEY = "sk-secret"

# Initialisation de l'extracteur via l'API OpenAI (GPT-4 ou autre)
extracteur = ExtracteurSCROpenAI(
    url=url,
    openai_api_key=OPENAI_API_KEY
)

# Classification du document par chunks de 5000 tokens
result = extracteur.classify_document(start_page=40, end_page=60, chunk_size_tokens=5000)

print(f"Résultat classification : {result}")


Classification SCR chunks:   0%|          | 0/5 [00:00<?, ?it/s]

✅ Chunk 1 terminé en 1.61 secondes.


Classification SCR chunks:  20%|██        | 1/5 [01:01<04:06, 61.61s/it]

✅ Chunk 2 terminé en 1.32 secondes.


Classification SCR chunks:  40%|████      | 2/5 [02:02<03:04, 61.44s/it]

✅ Chunk 3 terminé en 1.12 secondes.


Classification SCR chunks:  60%|██████    | 3/5 [03:04<02:02, 61.29s/it]

✅ Chunk 4 terminé en 1.19 secondes.


Classification SCR chunks:  80%|████████  | 4/5 [04:05<01:01, 61.25s/it]

✅ Chunk 5 terminé en 0.66 secondes.


Classification SCR chunks: 100%|██████████| 5/5 [05:05<00:00, 61.18s/it]

Résultat classification : 2/5 chunks structured SCR





### Conclusions:

1. **Code mis en place fonctionne...**

Le code mis en place marche. La classe ExtracteurSCROpenAI peut à l'aide d'un LLM GPT-4o:

- Classifier les documents selon si oui ou non ils ont une structure SCR bien visible par le LLM. Le résultats est renvoyé en une fraction qui correspond à {nombre de chunks structuré SCR}/{nombre de chunks total}. Un chunk = 5000 tokens.
- Extraire les SCRs d'un documents et les stocker dans un dataframe

---

2. **...mais impossible à appliquer en tant qu'étudiant.**

- Une limite de 10000 tokens d'Openai par minutes qui fait exploser les temps d'execution(les documents font parfois plus de 700000 tokens).
- Un coût beaucoup trop important, 1 euros pour 100 000 tokens dans l'API OpenAI.




## **3. Classification de documents via LLM local Mistral 7B Q4 via llama.cpp**

#### Test LLM local Mistral 7B Q4

In [4]:
import os
from llama_cpp import Llama

model_path = os.path.expanduser("~/llama_models/mistral/mistral-7b-instruct-v0.2.Q4_K_M.gguf")

llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_gpu_layers=-1,
    use_mlock=True,
    verbose=False
)

prompt = "[INST] Is this excerpt structured in SCR format? [/INST] Yes or No."
output = llm(prompt, max_tokens=20)
print(output['choices'][0]['text'].strip())


llama_init_from_model: n_ctx_per_seq (4096) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row              (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16                  (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h80           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_b

I'd be happy to help, but I need more context to provide an accurate answer. S


#### Script de classification et extraction

In [16]:
import os
import pandas as pd
from PyPDF2 import PdfReader
from json_repair import repair_json
import json
from tqdm import tqdm
from llama_cpp import Llama


class Extracteur_SCR_Mistral:
    def __init__(self, url, model_path="~/llama_models/mistral/mistral-7b-instruct-v0.2.Q4_K_M.gguf", n_ctx=4096, n_gpu_layers=-1, verbose=False):
        self.url = url
        self.pdf_reader = PdfReader(self.url)
        self.df_scr = pd.DataFrame()

        self.model_path = os.path.expanduser(model_path)
        self.llm = Llama(
            model_path=self.model_path,
            n_ctx=n_ctx,
            n_gpu_layers=n_gpu_layers,
            use_mlock=True,
            verbose=verbose
        )

    def query_llm(self, prompt, max_tokens=1024):
        response = self.llm(
            f"[INST] {prompt.strip()} [/INST]",
            max_tokens=max_tokens,
            stop=["</s>"]
        )
        return response["choices"][0]["text"].strip()

    def extract_equipment(self):
        first_page_text = self.pdf_reader.pages[0].extract_text()
        prompt = f"""
{first_page_text}

What is the equipment this document is dealing with? Return only the response.
"""
        return self.query_llm(prompt, max_tokens=100)

    def alimentation_df(self, data_json, page_num, equipment):
        rows = []
        for defect in data_json:
            row = {
                "URL": self.url.rsplit("/", 1)[-1],
                "equipment": equipment,
                "page": page_num,
                "symptom": defect.get("symptom", "Unknown"),
                "cause": defect.get("cause", "Unknown"),
                "remedy": defect.get("remedy", "Unknown"),
            }
            rows.append(row)
        return pd.DataFrame(rows)

    def process_page(self, page_num, equipment):
        page_text = self.pdf_reader.pages[page_num - 1].extract_text()
        if not page_text.strip():
            return pd.DataFrame([], columns=["URL", "equipment", "page", "symptom", "cause", "remedy"])

        prompt = f"""
{page_text}

Extract all defects and their associated causes and remedies from the provided text. Return a JSON array.
There can be more than one line for a single defect if it has different causes and if a cause has different remedies,
one line should represent a unique group of a symptom, a cause, and a remedy.

* symptom: A description of the defect. Include any error codes mentioned.
* cause: A possible explanation for the defect.
* remedy: The suggested solution or troubleshooting steps.

If a defect lacks one or more of these components (symptom, cause, or remedy), include the missing information as "Unknown".

Example JSON format:
[
    {
        "symptom": "PNT1-166 Linear Potentiometer Unstable",
        "cause": "During Auto Calibration, the feedback from the linear potentiometer revealed large fluctuations.",
        "remedy": "Change the applicator and repair the malfunctioning linear potentiometer."
    }
]
"""
        response = self.query_llm(prompt, max_tokens=2000)
        repaired_json = repair_json(response)
        try:
            data_json = json.loads(repaired_json)
        except json.JSONDecodeError:
            data_json = []
        return self.alimentation_df(data_json, page_num, equipment)

    def extract_defects(self, start_page=0, end_page=0):
        if end_page == 0:
            end_page = len(self.pdf_reader.pages)
        equipment = self.extract_equipment()
        results = []
        for page in tqdm(range(start_page, end_page + 1), desc="Extraction SCR"):
            df_page = self.process_page(page, equipment)
            results.append(df_page)
        self.df_scr = pd.concat(results, ignore_index=True)
        self.df_scr = self.df_scr[["URL", "equipment", "page", "symptom", "cause", "remedy"]]
        self.df_scr.sort_values(by="page", ascending=True, inplace=True)
        return self.df_scr

    def classify_document_par_chunks(self, start_page=0, end_page=0, chunk_size_tokens=1000):
        if end_page == 0:
            end_page = len(self.pdf_reader.pages)

        # 📄 Récupération du texte complet
        texts = []
        for i in range(start_page, end_page):
            text = self.pdf_reader.pages[i].extract_text()
            if text and text.strip():
                texts.append(text)
        full_text = "\n\n".join(texts)

        # 🧠 Tokenisation complète du texte
        token_ids = self.llm.tokenize(full_text.encode("utf-8"))

        # ✂️ Découpage en chunks de tokens
        chunks = [token_ids[i:i + chunk_size_tokens] for i in range(0, len(token_ids), chunk_size_tokens)]

        results = []

        for idx, chunk in enumerate(tqdm(chunks, desc="Classification SCR chunks"), 1):
            chunk_text = self.llm.detokenize(chunk).decode("utf-8", errors="ignore")

            prompt = f"""
    Here is an excerpt from a technical document:

    {chunk_text}

    Based on the text above, determine whether this excerpt is structured in a way that facilitates the extraction of defects and their associated causes and remedies using a simple regex extraction. In particular, check if the excerpt clearly delineates sections or markers corresponding to:
    * symptom: a description of the defect (including any error codes),
    * cause: a possible explanation for the defect,
    * remedy: the suggested solution or troubleshooting steps.

    Answer only "Yes" or "No".
    """

            try:
                response = self.query_llm(prompt, max_tokens=10).lower()
                results.append("yes" in response)
            except RuntimeError as e:
                print(f"[⚠️] Chunk {idx} skipped due to decode error: {e}")
                results.append(False)

        yes_count = sum(results)
        total_chunks = len(results)
        return f"{yes_count}/{total_chunks} chunks structured SCR"


### **Classification**

In [18]:
import pandas as pd

# Configuration d'affichage des DataFrames
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_colwidth', None)

# 🗂️ Chemin complet vers le document PDF
url = "/Users/robinguiavarch/Documents/Telecom Paris/Projet-Fil-Rouge/data/doc simple/doc-R-30iB.pdf"

# 🧠 Chemin vers le modèle GGUF local
model_path = "/Users/robinguiavarch/llama_models/mistral/mistral-7b-instruct-v0.2.Q4_K_M.gguf"

# 📥 Initialisation de l'extracteur local
extracteur = Extracteur_SCR_Mistral(
    url=url,
    model_path=model_path,
    n_ctx=4096,
    n_gpu_layers=-1,  # -1 = tout sur GPU (Apple M4 via Metal)
    verbose=False
)

# ✅ Classification du document par chunks (de 4000 caractères par défaut)
result = extracteur.classify_document_par_chunks(start_page=40, end_page=60, chunk_size_tokens=512)

print(f"Résultat classification : {result}")


llama_init_from_model: n_ctx_per_seq (4096) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row              (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16                  (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h80           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_b

[⚠️] Chunk 1 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 2 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 3 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 4 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 5 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 6 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 7 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 8 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 9 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 10 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 11 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 12 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 13 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 14 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 15 skipped due to decode error: llama_decode r

Classification SCR chunks: 100%|██████████| 68/68 [00:00<00:00, 266.92it/s]

[⚠️] Chunk 59 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 60 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 61 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 62 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 63 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 64 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 65 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 66 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 67 skipped due to decode error: llama_decode returned -3
[⚠️] Chunk 68 skipped due to decode error: llama_decode returned -3
Résultat classification : 0/68 chunks structured SCR





**Comment**:

Pourquoi les chunks échouaient avec llama.cpp en local (erreur -3):

Lorsque le prompt + chunk dépasse la fenêtre de contexte du modèle (ici n_ctx=4096),
llama.cpp échoue silencieusement avec l'erreur `llama_decode returned -3`.
Cela signifie que la taille totale (entrée + sortie) dépasse la capacité maximale.

Problème rencontré :
Même avec chunk_size_chars ou chunk_size_tokens raisonnables, le prompt système ajouté (avec [INST]...[/INST]) augmente le nombre de tokens.
Le modèle local ne fait pas de vérification automatique → il plante au moment du décodage.

---

L'approche par classification en local via llama.cpp est trop instable :
- On ne peut pas contrôler précisément la taille des tokens
- Les erreurs sont silencieuses (decode -3)
- Les chunks deviennent trop courts pour être utiles
- Pas de support natif du tokenizer/token count pour faire des découpages fiables

Mieux vaut déléguer cette tâche à une API OpenAI (comme GPT-4o) qui gère la tokenisation, le contexte, les prompts multi-lignes et les erreurs proprement.

Alternative possible :
Revenir à un modèle HuggingFace quantifié en 4-bit (comme Mistral 7B Instruct avec AutoModelForCausalLM), qui offre plus de contrôle + pipeline stable + tokenizer intégré.


## **4. Extraction SCR Regex**

**Note:** Cette extraction se fait via colab. Les scripts ont donc été recopiés.

#### Script script_extract_scr_regex_habibata.py

In [8]:
!pip install pdfplumber
!pip install fitz
!pip install tools


Collecting tools
  Downloading tools-0.1.9.tar.gz (34 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytils (from tools)
  Downloading pytils-0.4.3.tar.gz (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.4/101.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: tools, pytils
  Building wheel for tools (setup.py) ... [?25l[?25hdone
  Created wheel for tools: filename=tools-0.1.9-py3-none-any.whl size=46730 sha256=e45e2a55f9ad237f88f35e0d9b7a93114b0563202eb27844f53c80eef8cd115b
  Stored in directory: /root/.cache/pip/wheels/bc/d8/9d/52ad6058db295741fe0b776c0fcfdb6670036acab59ce4ccfd
  Building wheel for pytils (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pytils: filename=pytils-0.4.3-py3-none-any.whl size=32806 sha2

In [16]:
import re
import pdfplumber
import fitz  # PyMuPDF
from tqdm import tqdm
import json
import pandas as pd

class TreeNodeV2:
    """
    Classe représentant un nœud dans un arbre hiérarchique.
    Permet d'extraire des causes et remèdes à partir de son contenu.
    """
    def __init__(self, name, content=None):
        self.name = name
        self.content = content if content else ""
        self.children = []
        self.cause = None
        self.remedy = None

    def add_child(self, node):
        self.children.append(node)

    def find_or_create_child(self, name):
        for child in self.children:
            if child.name == name:
                return child
        new_child = TreeNodeV2(name)
        self.add_child(new_child)
        return new_child

    def is_child(self, name):
        """Vérifie si un nœud est un enfant direct basé sur sa hiérarchie."""
        return self._is_child(self.name, name)

    @staticmethod
    def _is_child(name1, name2):
        if name1 == 'root':
            return True  # tout le monde peut être enfant de root
        return name1 in name2 and name2.count('.') == name1.count('.') + 1

    def extract_cause_remedy(self):
        """Analyse le contenu pour extraire les causes et remèdes."""
        cause_match = re.search(r"Cause:\s*(.*?)(\n|$)", self.content, re.DOTALL)
        remedy_match = re.search(r"Remedy:\s*(.*?)(\n|$)", self.content, re.DOTALL)

        if cause_match:
            self.cause = cause_match.group(1).strip()
            self.content = self.content.replace(cause_match.group(0), "").strip()
        if remedy_match:
            self.remedy = remedy_match.group(1).strip()
            self.content = self.content.replace(remedy_match.group(0), "").strip()

    def to_dict(self):
        """Convertit le nœud et ses enfants en dictionnaire JSON."""
        node_dict = {
            "name": self.name,
            "content": self.content,
            "children": [child.to_dict() for child in self.children]
        }
        if self.cause:
            node_dict["cause"] = self.cause
        if self.remedy:
            node_dict["remedy"] = self.remedy
        return node_dict

    def to_string(self, level=0):
        """Affiche l'arbre sous forme textuelle."""
        ret = "  " * level + f"{self.name}: {self.content}\n"
        if self.cause:
            ret += "  " * (level + 1) + f"Cause: {self.cause}\n"
        if self.remedy:
            ret += "  " * (level + 1) + f"Remedy: {self.remedy}\n"
        for child in self.children:
            ret += child.to_string(level + 1)
        return ret


    def _to_rows(self, url=None, equipment=None, page=None, parent_symptom=""):
        """
        Parcourt récursivement le nœud et ses descendants pour construire une liste de dictionnaires contenant les informations SCR.
        """
        rows = []
        current_symptom = self.content.strip() if self.content.strip() else parent_symptom
        if current_symptom:
            rows.append({
                "URL": url if url is not None else "",
                "equipment": equipment if equipment is not None else "",
                "page": page if page is not None else "",
                "symptom": current_symptom,
                "cause": self.cause if self.cause else "Unknown",
                "remedy": self.remedy if self.remedy else "Unknown"
            })
        for child in self.children:
            rows.extend(child._to_rows(url=url, equipment=equipment, page=page, parent_symptom=current_symptom))
        return rows

    def to_dataframe(self, url=None, equipment=None, page=None):
        """
        Convertit ce nœud et tous ses descendants en un DataFrame structuré.
        Colonnes : URL, equipment, page, symptom, cause, remedy.
        """
        rows = self._to_rows(url=url, equipment=equipment, page=page)
        return pd.DataFrame(rows)

    @staticmethod
    def from_dict(d):
        """
        Reconstitue un objet TreeNodeV2 à partir d'un dictionnaire.
        """
        node = TreeNodeV2(d.get("name", ""), d.get("content", ""))
        node.cause = d.get("cause")
        node.remedy = d.get("remedy")
        for child_d in d.get("children", []):
            child_node = TreeNodeV2.from_dict(child_d)
            node.add_child(child_node)
        return node


class PDFProcessor:
    """
    Classe pour convertir un PDF en JSON avec extraction des causes et remèdes.
    """
    def __init__(self, pdf_path):
        self.pdf_path = pdf_path
        self.json_data = {"FileName": "Doc fournisseur", "Text": []}

    def convert_with_pdfplumber(self, start_page=1, end_page=0):
        """
        Convertit un PDF en JSON en utilisant pdfplumber,
        en traitant seulement les pages entre start_page et end_page.
        Si end_page vaut 0, toutes les pages à partir de start_page sont traitées.
        """
        try:
            with pdfplumber.open(self.pdf_path) as pdf:
                for page in tqdm(pdf.pages, total=len(pdf.pages), desc="Processing PDF"):
                    # pages sont 1-indexées (page.page_number)
                    if page.page_number < start_page:
                        continue
                    if end_page and page.page_number > end_page:
                        continue
                    raw_content = page.extract_text()
                    clean_content_tree = self._build_tree(raw_content).to_dict()
                    self.json_data["Text"].append({
                        "PageNumber": page.page_number,
                        "Raw Content": raw_content,
                        "Clean Content": clean_content_tree
                    })
        except FileNotFoundError:
            raise Exception(f"File not found: {self.pdf_path}")
        except Exception as e:
            raise Exception(f"Error processing PDF: {e}")
        return self.json_data

    def convert_with_pymupdf(self, start_page=1, end_page=0):
        """
        Convertit un PDF en JSON en utilisant PyMuPDF,
        en traitant seulement les pages entre start_page et end_page.
        """
        try:
            pdf_document = fitz.open(self.pdf_path)
            for page_num in tqdm(range(pdf_document.page_count), desc="Processing PDF"):
                current_page = page_num + 1
                if current_page < start_page:
                    continue
                if end_page and current_page > end_page:
                    continue
                page = pdf_document[page_num]
                raw_content = page.get_text("text")
                clean_content_tree = self._build_tree(raw_content).to_dict()
                self.json_data["Text"].append({
                    "PageNumber": current_page,
                    "Raw Content": raw_content,
                    "Clean Content": clean_content_tree
                })
            pdf_document.close()
        except FileNotFoundError:
            raise Exception(f"File not found: {self.pdf_path}")
        except Exception as e:
            raise Exception(f"Error processing PDF: {e}")
        return self.json_data

    @staticmethod
    def _build_tree(text):
        """Construit un arbre hiérarchique à partir du texte structuré."""
        root = TreeNodeV2("root")
        nodes_by_level = {"root": root}

        for line in text.split("\n"):
            line = re.sub(r"\s+", " ", line.strip())
            match = re.match(r"^(\d+(\.\d+)*\.?)\s+(.*)", line)

            if match:
                section, _, content = match.groups()
                section = section.strip(".")
                level = section.count(".")

                parent_section = ".".join(section.split(".")[:-1]) if level > 0 else "root"
                parent_node = nodes_by_level.get(parent_section, root)

                if TreeNodeV2._is_child(parent_section, section):
                    new_node = parent_node.find_or_create_child(section)
                    new_node.content = content
                    new_node.extract_cause_remedy()
                    nodes_by_level[section] = new_node
                else:
                    print(f"Avertissement : '{section}' ne peut pas être ajouté comme enfant de '{parent_section}'.")
            else:
                last_section = list(nodes_by_level.keys())[-1] if nodes_by_level else "root"
                last_node = nodes_by_level.get(last_section, root)
                last_node.content += f"\n{line.strip()}"
                last_node.extract_cause_remedy()

        return root



class JSONHandler:
    """
    Classe pour gérer la sauvegarde des fichiers JSON.
    """
    @staticmethod
    def save_json_to_file(obj, file_path):
        """Sauvegarde un objet JSON dans un fichier."""
        try:
            with open(file_path, "w", encoding="utf-8") as json_file:
                json.dump(obj, json_file, indent=4, ensure_ascii=False)
            print(f"JSON saved successfully to {file_path}")
        except Exception as e:
            raise Exception(f"Failed to save JSON to file: {e}")

### **Extraction SCR regex**

#### Fonction qui convertit le JSON généré en Dataframe standardisé SCR

In [10]:
# from script_extract_scr_regex_habibata_maj.py import TreeNodeV2 --> Google colab donc uncomment

def json_to_dataframe(json_data, url=None, equipment=None, start_page=1, end_page=0):
    """
    Convertit le fichier JSON généré par PDFProcessor en un DataFrame SCR.
    - json_data : dictionnaire JSON généré par PDFProcessor.
    - url : nom ou URL du document.
    - equipment : nom de l'équipement.
    - start_page / end_page : plage (1-indexée) de pages à traiter.
    """

    dfs = []
    for page_entry in json_data.get("Text", []):
        page_number = page_entry.get("PageNumber", 0)
        if page_number < start_page:
            continue
        if end_page and page_number > end_page:
            continue
        tree_dict = page_entry.get("Clean Content")
        if tree_dict:

            tree_root = TreeNodeV2.from_dict(tree_dict)
            df_page = tree_root.to_dataframe(url=url, equipment=equipment, page=page_number)
            dfs.append(df_page)
    if dfs:
        return pd.concat(dfs, ignore_index=True)
    else:
        return pd.DataFrame()


#### Extraction

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
import pandas as pd
import json
# from script_extract_scr_regex_habibata_maj.py import PDFProcessor, TreeNodeV2, JSONHandler

# Définir le chemin vers le PDF (Google Drive ou local)
pdf_path = "/content/drive/MyDrive/projet_fil_rouge/data/doc_simple/doc-R-30iB.pdf"

# Initialiser le processeur PDF avec les pages souhaitées
processor = PDFProcessor(pdf_path)
# Par exemple, pour extraire les pages 40 à 60, vous utilisez:
json_data = processor.convert_with_pdfplumber(start_page=40, end_page=60)

# Sauvegarde du JSON si besoin (optionnel)
output_json_path = "/content/drive/MyDrive/projet_fil_rouge/extraction_scr_json/extract_scr_regex_test.json"
with open(output_json_path, "w", encoding="utf-8") as f:
    json.dump(json_data, f, indent=4, ensure_ascii=False)

print(f"JSON saved successfully to {output_json_path}")

# Convertir le JSON en DataFrame SCR standardisé
# Assurez-vous que 'TreeNodeV2.from_dict' et 'to_dataframe' sont correctement définis dans votre module TreeNodeV2
df_scr_regex = json_to_dataframe(
    json_data,
    url="doc-R-30iB.pdf",
    equipment="SYSTEM R-30 i B and R-30 i B Mate Controller",
    start_page=40,
    end_page=60
)

display(df_scr_regex)

# Chemin de destination dans ton Google Drive
output_path = "/content/drive/MyDrive/projet_fil_rouge/extraction_scr/extract_scr_regex_test.csv"

# Sauvegarde du DataFrame en .csv
df_scr_regex.to_csv(output_path, index=False)

print(f"✅ CSV sauvegardé ici : {output_path}")


Processing PDF: 100%|██████████| 2007/2007 [00:02<00:00, 995.07it/s]


JSON saved successfully to /content/drive/MyDrive/projet_fil_rouge/extraction_scr_json/extract_scr_regex_test.json


Unnamed: 0,URL,equipment,page,symptom,cause,remedy
0,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,40,ERROR RECOVERY MARRBERCD04121E REV N\nNote For...,Unknown,Unknown
1,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,40,"Iftherobotisnotcalibrated, performthefollowing...",Unknown,Unknown
2,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,40,TurntheteachpendantON/OFFswitchtoOFFandrelease...,Unknown,Unknown
3,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,40,Check CRM68 & CRF7 connection on the amplifier...,Unknown,Unknown
4,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,40,Hand Breakage Recovery\nAhandbreakageerroroccu...,Unknown,Unknown
...,...,...,...,...,...,...
249,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,60,ACAL-069 Auto Update is ON.,AutoUpdateisonthereforetheframehasbeenupdateda...,An update is not required.
250,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,60,ACAL-070 No update on record points.\ninstruct...,Noupdateonrecordpoints. Theteachpendantprogram...,Manuallyrecordpositionstocreatethecalibrationp...
251,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,60,ACAL-071 Invalid joint number.,TheaxisnumberspecifiedintheDETECTJointinstruct...,Change the axis number to a valid one.
252,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,60,ACAL-072 Invalid joint sensor type.,TheDetectJointinstructioncannotuseTOSWRISTsens...,ChangesensortypetoeitherIOorTOSallaxesinthedet...


✅ CSV sauvegardé ici : /content/drive/MyDrive/projet_fil_rouge/extraction_scr/extract_scr_regex_test.csv


In [15]:
import json

with open("/content/drive/MyDrive/projet_fil_rouge/extraction_scr_json/extract_scr_regex_test.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Exemple d’accès
print(data)



## **5. Comparaison extractions GPT-4o & Regex**

**Note:** Cette comparaison se fait via colab. Les scripts ont donc été recopiés.

Plusieurs métriques:

**a. Distance de Levenshtein**

- Mesure combien d'opérations (insertion, suppression, substitution) il faut pour passer d'une chaîne à l'autre.

**b. Similarité Cosine avec TF-IDF**

- Vectorisation par TF-IDF des textes, puis calcul du cosinus entre les vecteurs.

**c. Evaluation structuré: Précision Rappel F1-score**

- Calculer sur chaque champ (« symptom », « cause », « remedy ») ces métriques en considérant l'une des extractions comme référence (par exemple, celle effectuée par GPT-4o).

**d. Indices de Jaccard**

- Exactitude mot-à-mot pas nécessaire, vérifier l'intersection entre les deux ensembles extraits. Intersection / Union des ensembles de mots.

**e. Comparaison sémantique avancée : Sentence-BERT embeddings avec similarité cosinus**

- Cela permet de capturer une similarité sémantique beaucoup plus robuste que TF-IDF.

##### **a. Distance de Levenshtein**

In [19]:
!pip install Levenshtein

Collecting Levenshtein
  Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.27.1 rapidfuzz-3.13.0


In [20]:
import Levenshtein

# 1. Distance de Levenshtein
def levenshtein_ratio(str1, str2):
    """
    Retourne le ratio de similarité de Levenshtein (entre 0 et 1)
    entre str1 et str2.
    """
    return Levenshtein.ratio(str1, str2)

def levenshtein_distance(str1, str2):
    """
    Retourne le nombre d'opérations (insertion, suppression, substitution) nécessaires
    pour passer de str1 à str2.
    """
    return Levenshtein.distance(str1, str2)

##### **b. Similarité Cosine avec TF-IDF**

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 2. Similarité Cosine avec TF-IDF
def tfidf_cosine_similarity(text1, text2):
    """
    Calcule la similarité cosine entre deux textes après vectorisation TF-IDF.
    """
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    return similarity

##### **c.  Évaluation structurée: Précision, Rappel et F1-score (basée sur une comparaison tokenisée)**

In [22]:
from sklearn.metrics import precision_score, recall_score, f1_score

# 3. Évaluation structurée: Précision, Rappel et F1-score (basée sur une comparaison tokenisée)
def token_based_prf(gold, pred):
    """
    Calcule précision, rappel et F1 à partir des tokens extraits.
    On considère la référence 'gold' et la prédiction 'pred'.
    """
    gold_set = set(gold.split())
    pred_set = set(pred.split())

    if len(pred_set) == 0:
        precision = 0.0
    else:
        precision = len(gold_set.intersection(pred_set)) / len(pred_set)

    if len(gold_set) == 0:
        recall = 0.0
    else:
        recall = len(gold_set.intersection(pred_set)) / len(gold_set)

    if precision + recall == 0:
        f1 = 0.0
    else:
        f1 = 2 * precision * recall / (precision + recall)

    return precision, recall, f1

##### **d. Indice de Jaccard**

In [23]:
# 4. Indice de Jaccard
def jaccard_similarity(text1, text2):
    """
    Calcule l'indice de Jaccard entre les ensembles de mots de text1 et text2.
    """
    set1 = set(text1.split())
    set2 = set(text2.split())
    if not set1 and not set2:
        return 1.0  # Si les deux ensembles sont vides, considérer comme identiques
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union

##### **e. Comparaison sémantique avec Sentence-BERT**

In [24]:
from sentence_transformers import SentenceTransformer, util

# 5. Comparaison sémantique avec Sentence-BERT
def sentence_bert_similarity(text1, text2, model_name='all-MiniLM-L6-v2'):
    """
    Utilise Sentence-BERT pour calculer la similarité cosine entre text1 et text2.
    """
    model = SentenceTransformer(model_name)
    emb1 = model.encode(text1, convert_to_tensor=True)
    emb2 = model.encode(text2, convert_to_tensor=True)
    similarity = util.cos_sim(emb1, emb2).item()
    return similarity

### **Résultats comparaison extractions Regex & GPT-4o**

In [25]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
import pandas as pd

df_openai = pd.read_csv("/content/drive/MyDrive/projet_fil_rouge/extraction_scr/extract_scr_openai_test.csv")
df_regex = pd.read_csv("/content/drive/MyDrive/projet_fil_rouge/extraction_scr/extract_scr_regex_test.csv")

#### **Caractéristique des 2 dataframes**

In [28]:
import pandas as pd

def compute_caracteristics(df):
    """
    Calcule deux métriques pour un DataFrame d'extraction SCR :
      - total_scr : nombre total de lignes (SCR extraits)
      - complete_scr : nombre de SCR complets (aucune des colonnes 'symptom', 'cause', 'remedy' ne contient "Unknown")
    """
    total_scr = len(df)
    complete_scr = len(df[(df["symptom"] != "Unknown") &
                           (df["cause"] != "Unknown") &
                           (df["remedy"] != "Unknown")])
    return total_scr, complete_scr

In [29]:
total_openai, complete_openai = compute_caracteristics(df_openai)
total_regex, complete_regex   = compute_caracteristics(df_regex)

# Création d'un DataFrame de comparaison avec deux colonnes
df_caracteristics = pd.DataFrame({
    "df_openai": [total_openai, complete_openai],
    "df_regex": [total_regex, complete_regex]
}, index=["Nombre total de SCR", "Nombre de SCR complets"])

display(df_caracteristics)


Unnamed: 0,df_openai,df_regex
Nombre total de SCR,86,254
Nombre de SCR complets,79,73


#### **Comparaison des 2 dataframes**

#### **1ère méthode:**

- De manière brut, **sans prendre en compte un alignement des SCRs** entre les 2 dataframes, on calcule les métriques pour chaque colonnes "symptom", "cause", "remedy".

In [44]:
import pandas as pd

# 1. Concaténer les colonnes de chaque extraction dans des variables distinctes
text_openai_symptom = " ".join(df_openai["symptom"].astype(str).tolist())
text_regex_symptom  = " ".join(df_regex["symptom"].astype(str).tolist())

text_openai_cause   = " ".join(df_openai["cause"].astype(str).tolist())
text_regex_cause    = " ".join(df_regex["cause"].astype(str).tolist())

text_openai_remedy  = " ".join(df_openai["remedy"].astype(str).tolist())
text_regex_remedy   = " ".join(df_regex["remedy"].astype(str).tolist())

# 2. Initialiser un dictionnaire pour stocker les résultats
metrics_dict = {}

# Pour chaque champ, calculer les métriques en comparant le texte extrait par OpenAI et par regex
for field, (text_openai, text_regex) in {
    "symptom": (text_openai_symptom, text_regex_symptom),
    "cause":   (text_openai_cause, text_regex_cause),
    "remedy":  (text_openai_remedy, text_regex_remedy)
}.items():
    # Calculer le ratio de Levenshtein
    lev_ratio = levenshtein_ratio(text_openai, text_regex)
    # Calculer la similarité cosinus via TF-IDF
    tfidf_sim = tfidf_cosine_similarity(text_openai, text_regex)
    # Calculer l'indice de Jaccard
    jac_sim = jaccard_similarity(text_openai, text_regex)
    # Calculer la similarité semantique via Sentence-BERT
    sen_sim = sentence_bert_similarity(text_openai, text_regex)
    # Calculer précision, rappel et F1-score (basé sur une comparaison tokenisée)
    precision, recall, f1 = token_based_prf(text_openai, text_regex)

    metrics_dict[field] = {
        "levenshtein_ratio": lev_ratio,
        "tfidf_cosine": tfidf_sim,
        "jaccard": jac_sim,
        "sentence_bert": sen_sim,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# 3. Convertir le dictionnaire en DataFrame
df_sim1 = pd.DataFrame(metrics_dict).T  # Transpose pour avoir une ligne par champ

print("Métriques moyennes pour chaque champ d'extraction:")
display(df_sim1)

Métriques moyennes pour chaque champ d'extraction:


Unnamed: 0,levenshtein_ratio,tfidf_cosine,jaccard,sentence_bert,precision,recall,f1
symptom,0.232321,0.806326,0.322967,0.458055,0.343949,0.841121,0.488246
cause,0.786822,0.221976,0.314861,0.3715,0.694444,0.365497,0.478927
remedy,0.67365,0.158887,0.189687,0.316888,0.64375,0.211934,0.318885


#### **2e méthode:**

- On commence par aligner les SCRs par similarité sémantique **sur les symptoms seulement**. Puis on se débarrasse des SCRs non-alignés. Et on calcule les métriques **lignes par lignes** pour chaque symptom, chaque cause, chaque remedy. Et on fait la **moyenne** sur les lignes pour avoir un score pour symptom, pour cause et pour remedy.


##### Fonction d'alignement

In [30]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from scipy.optimize import linear_sum_assignment

def align_scr_dataframes(df_openai, df_regex, threshold=0.7, model_name='all-MiniLM-L6-v2'):
    """
    Aligne de manière one-to-one les SCR des DataFrames df_openai et df_regex basées sur le champ "symptom".
    Seules les paires dont la similarité semantique (cosine) dépasse le seuil spécifié sont conservées.

    Paramètres:
      - df_openai : DataFrame issu de l'extraction par OpenAI (doit contenir une colonne "symptom").
      - df_regex  : DataFrame issu de l'extraction par regex (doit contenir une colonne "symptom").
      - threshold : seuil minimal de similarité (entre 0 et 1) pour considérer une paire comme correctement alignée.
      - model_name: le modèle SentenceTransformer à utiliser pour générer les embeddings.

    Retourne:
      - df_openai_aligned : DataFrame avec les SCR alignés provenant de df_openai.
      - df_regex_aligned  : DataFrame avec les SCR alignés provenant de df_regex.
      - similarity_matrix : la matrice de similarité (optionnellement, pour analyse complémentaire).

    Dans les deux DataFrames retournés, la ligne i de df_openai_aligned correspond au SCR aligné en i dans df_regex_aligned.
    """
    # Extraire la colonne "symptom" de chaque DataFrame
    symptoms_openai = df_openai['symptom'].astype(str).tolist()
    symptoms_regex = df_regex['symptom'].astype(str).tolist()

    # Charger le modèle SentenceTransformer et générer les embeddings
    model = SentenceTransformer(model_name)
    embeddings_openai = model.encode(symptoms_openai, convert_to_tensor=True)
    embeddings_regex = model.encode(symptoms_regex, convert_to_tensor=True)

    # Calculer la matrice de similarité cosine (dimensions: [n_openai x n_regex])
    cosine_scores = util.cos_sim(embeddings_openai, embeddings_regex).cpu().numpy()

    # Pour obtenir un alignement one-to-one, on utilise l'algorithme hongrois
    # Comme nous voulons maximiser la similarité, on minimise le coût défini par -cosine_scores
    cost_matrix = -cosine_scores
    row_ind, col_ind = linear_sum_assignment(cost_matrix)

    aligned_openai_indices = []
    aligned_regex_indices = []

    # Filtrer uniquement les paires dont la similarité dépasse le seuil
    for i, j in zip(row_ind, col_ind):
        if cosine_scores[i, j] >= threshold:
            aligned_openai_indices.append(i)
            aligned_regex_indices.append(j)

    # Extraire les lignes correspondantes et les réinitialiser (pour qu'ils aient les mêmes indices)
    df_openai_aligned = df_openai.iloc[aligned_openai_indices].copy().reset_index(drop=True)
    df_regex_aligned = df_regex.iloc[aligned_regex_indices].copy().reset_index(drop=True)

    return df_openai_aligned, df_regex_aligned, cosine_scores

In [32]:
# Aligner les SCR avec un seuil de similarité de 0.7 (par exemple)
df_openai_aligned, df_regex_aligned, sim_matrix = align_scr_dataframes(df_openai, df_regex, threshold=0.7)

# Affichage des premières lignes des DataFrames alignés

display(df_openai_aligned.head())
display(df_regex_aligned.head())
print("Nombre de SCR alignés :", len(df_openai_aligned))

Unnamed: 0,URL,equipment,page,symptom,cause,remedy
0,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,40,ERROR RECOVER Y MARRBERCD04121E REV N,Overtravel of the robot axis,Press and hold down the SHIFT key until you ha...
1,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,41,Pulse Coder SRVO-062 Alarm,The pulse counts at power up do not match the ...,"1. Press MENU. 2. Select SYSTEM. 3. Press F1, ..."
2,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,44,ERROR RECOVER MARRBERCD04121E,Unknown,Unknown
3,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,44,ALARM: Active SRVO-230 Chain 1(+24V) abnormal,Unknown,"Press F4, RES_CH1. You will see a screen simil..."
4,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,44,ALARM: Active SRVO-230 Chain 1(+24V) abnormal,Chain failure fault,Fix the cause of the chain failure before you ...


Unnamed: 0,URL,equipment,page,symptom,cause,remedy
0,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,46,ERROR CODES MARRBERCD04121E REV N,Unknown,Unknown
1,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,41,Pulse Coder Alarm Recovery\nIfthepulsecountsat...,Unknown,Unknown
2,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,44,ERROR RECOVERY MARRBERCD04121E REV N,Unknown,Unknown
3,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,44,SelectALARMS.Youwillseeascreensimilartothefoll...,Unknown,Unknown
4,doc-R-30iB.pdf,SYSTEM R-30 i B and R-30 i B Mate Controller,44,"PressF4,RES_CH1. Youwillseeascreensimilartothe...",Unknown,Unknown


Nombre de SCR alignés : 77


**A vérifier:** Est-ce que un symptom de df_open_ai peut-être aligné avec plusieurs symptoms de df_regex et vice versa.

##### Résultats

In [43]:
# Liste des champs à comparer
fields = ["symptom", "cause", "remedy"]
metrics = ["levenshtein_ratio", "tfidf_cosine", "jaccard", "sentence_bert", "precision", "recall", "f1"]

similarities = []

# Parcours des lignes alignées
for idx, row in df_openai_aligned.iterrows():
    metrics_dict = {}
    for field in fields:
        text_openai = str(row.get(field, ""))
        text_regex = str(df_regex_aligned.loc[idx, field]) if idx < len(df_regex_aligned) else ""

        metrics_dict[f"{field}_levenshtein_ratio"] = levenshtein_ratio(text_openai, text_regex)
        metrics_dict[f"{field}_tfidf_cosine"] = tfidf_cosine_similarity(text_openai, text_regex)
        metrics_dict[f"{field}_jaccard"] = jaccard_similarity(text_openai, text_regex)
        metrics_dict[f"{field}_sentence_bert"] = sentence_bert_similarity(text_openai, text_regex)

        pr, rc, f1 = token_based_prf(text_openai, text_regex)
        metrics_dict[f"{field}_precision"] = pr
        metrics_dict[f"{field}_recall"] = rc
        metrics_dict[f"{field}_f1"] = f1

    similarities.append(metrics_dict)

# DataFrame plat
df_flat = pd.DataFrame(similarities)

# Moyenne des colonnes
mean_series = df_flat.mean()

# Reformattage en DataFrame lignes = fields, colonnes = metrics
data = []
for field in fields:
    row = [mean_series[f"{field}_{metric}"] for metric in metrics]
    data.append(row)

df_sim = pd.DataFrame(data, index=fields, columns=metrics)

# Résultat final
print("Moyenne des métriques par champ:")
print(df_sim)

# Optionnel : pour vérifier visuellement
print("\ndf_sim.head():")
print(df_sim.head())


Moyenne des métriques par champ:
         levenshtein_ratio  tfidf_cosine   jaccard  sentence_bert  precision  \
symptom           0.800108      0.807268  0.685416       0.920871   0.780400   
cause             0.903969      0.422919  0.416558       0.643622   0.428571   
remedy            0.843672      0.333778  0.310563       0.590812   0.358442   

           recall        f1  
symptom  0.850783  0.787963  
cause    0.416558  0.421640  
remedy   0.312418  0.325060  

df_sim.head():
         levenshtein_ratio  tfidf_cosine   jaccard  sentence_bert  precision  \
symptom           0.800108      0.807268  0.685416       0.920871   0.780400   
cause             0.903969      0.422919  0.416558       0.643622   0.428571   
remedy            0.843672      0.333778  0.310563       0.590812   0.358442   

           recall        f1  
symptom  0.850783  0.787963  
cause    0.416558  0.421640  
remedy   0.312418  0.325060  


#### **Résultats combinés**

In [45]:
print("1ère méthode")
display(df_sim1)
print("2e méthode")
display(df_sim)


1ère méthode


Unnamed: 0,levenshtein_ratio,tfidf_cosine,jaccard,sentence_bert,precision,recall,f1
symptom,0.232321,0.806326,0.322967,0.458055,0.343949,0.841121,0.488246
cause,0.786822,0.221976,0.314861,0.3715,0.694444,0.365497,0.478927
remedy,0.67365,0.158887,0.189687,0.316888,0.64375,0.211934,0.318885


2e méthode


Unnamed: 0,levenshtein_ratio,tfidf_cosine,jaccard,sentence_bert,precision,recall,f1
symptom,0.800108,0.807268,0.685416,0.920871,0.7804,0.850783,0.787963
cause,0.903969,0.422919,0.416558,0.643622,0.428571,0.416558,0.42164
remedy,0.843672,0.333778,0.310563,0.590812,0.358442,0.312418,0.32506
