### Classificateur de documents

In [1]:
!pip install -q huggingface_hub transformers PyPDF2 pandas tqdm json_repair


#### **Chargement du modèle Mistral**

In [2]:
!pip install bitsandbytes




In [3]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
from transformers import BitsAndBytesConfig

model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# Charger le tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)



quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # passer en float16 pour accélérer
    bnb_4bit_quant_type="nf4",             # ou une autre valeur recommandée
    bnb_4bit_use_double_quant=True
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    quantization_config=quant_config
)

# Créer le pipeline de génération
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
    max_new_tokens=1024,
    do_sample=False, # pour des réponses déterministes
    pad_token_id=tokenizer.eos_token_id
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Device set to use cuda:0


In [15]:
import pandas as pd
from PyPDF2 import PdfReader
from json_repair import repair_json
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

class Extracteur_SCR_Mistral:
    def __init__(self, url, model_id="mistralai/Mistral-7B-Instruct-v0.2", load_in_4bit=True):
        # Utiliser directement le chemin complet vers le PDF
        self.url = url
        self.pdf_reader = PdfReader(self.url)
        self.df_scr = pd.DataFrame()
        # Charger le modèle local en 4-bit via bitsandbytes
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            load_in_4bit=load_in_4bit,
            torch_dtype=torch.float16,
            trust_remote_code=True
        )
        self.pipe = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            torch_dtype=torch.float16,
            device_map="auto",
            max_new_tokens=1024,
            do_sample=False  # Pour des réponses déterministes
        )

    def extract_equipment(self):
        first_page_text = self.pdf_reader.pages[0].extract_text()
        prompt = f"""
{first_page_text}

What is the equipment this document is dealing with? Return only the response.
"""
        response = self.pipe(prompt, max_new_tokens=100)[0]["generated_text"]
        return response.strip()

    def alimentation_df(self, data_json, page_num, equipment):
        rows = []
        for defect in data_json:
            row = {
                "URL": self.url.rsplit("/", 1)[-1],
                "equipment": equipment,
                "page": page_num,
                "symptom": defect.get("symptom", "Unknown"),
                "cause": defect.get("cause", "Unknown"),
                "remedy": defect.get("remedy", "Unknown"),
            }
            rows.append(row)
        return pd.DataFrame(rows)

    def process_page(self, page_num, equipment):
        page_text = self.pdf_reader.pages[page_num - 1].extract_text()
        if not page_text.strip():
            return pd.DataFrame([], columns=["URL", "equipment", "page", "symptom", "cause", "remedy"])
        prompt = f"""
{page_text}

Extract all defects and their associated causes and remedies from the provided text. Return a JSON array.
There can be more than one line for a single defect if it has different causes and if a cause has different remedies,
one line should represent a unique group of a symptom, a cause, and a remedy.

* symptom: A description of the defect. Include any error codes mentioned.
* cause: A possible explanation for the defect.
* remedy: The suggested solution or troubleshooting steps.

If a defect lacks one or more of these components (symptom, cause, or remedy), include the missing information as "Unknown".

Example JSON format:
[
    {{
        "symptom": "PNT1-166 Linear Potentiometer Unstable",
        "cause": "During Auto Calibration, the feedback from the linear potentiometer revealed large fluctuations.",
        "remedy": "Change the applicator and repair the malfunctioning linear potentiometer."
    }}
]
"""
        response = self.pipe(prompt, max_new_tokens=2000)[0]["generated_text"]
        repaired_json = repair_json(response)
        data_json = json.loads(repaired_json)
        return self.alimentation_df(data_json, page_num, equipment)

    def extract_defects(self, start_page=0, end_page=0):
        if end_page == 0:
            end_page = len(self.pdf_reader.pages)
        equipment = self.extract_equipment()
        with ThreadPoolExecutor() as executor:
            results = list(tqdm(
                executor.map(lambda page: self.process_page(page, equipment), range(start_page, end_page + 1)),
                total=end_page - start_page + 1,
                desc="Extraction SCR",
                leave=False
            ))
        self.df_scr = pd.concat(results, ignore_index=True)
        self.df_scr = self.df_scr[["URL", "equipment", "page", "symptom", "cause", "remedy"]]
        self.df_scr.sort_values(by="page", ascending=True, inplace=True)
        return self.df_scr

    def classify_document(self, start_page=0, end_page=0):
        """
        Classify the document to verify whether it is structured explicitly into sections "Symptoms", "Causes", "Remedies".
        The function extracts text from the pages between start_page and end_page (if end_page is 0, it uses the entire document),
        constructs a QA prompt, and returns True if the generated answer contains "yes".
        """
        if end_page == 0:
            end_page = len(self.pdf_reader.pages)
        texts = []
        for i in range(start_page, end_page):
            text = self.pdf_reader.pages[i].extract_text()
            if text and text.strip():
                texts.append(text)
        full_text = "\n\n".join(texts)
        prompt = f"""
Here is an excerpt from a technical document:

{full_text}

Based on the text above, determine whether this document is structured in a way that facilitates the extraction of defects and their associated causes and remedies using a simple regex extraction. In particular, check if the document clearly delineates sections or markers corresponding to:
* symptom: a description of the defect (including any error codes),
* cause: a possible explanation for the defect,
* remedy: the suggested solution or troubleshooting steps.

Answer only "Yes" or "No".
"""
        answer = self.pipe(prompt, max_new_tokens=10)[0]["generated_text"].strip().lower()
        return "yes" in answer


#### **Extraction**

In [None]:
### TEST ###

#from script_extracteur_scr_mistral import Extracteur_SCR_Mistral ---> uncomment
import pandas as pd

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_colwidth', None)

TOKEN_HF = "secret_token"  # Remplacez par votre token Hugging Face

# Chemin complet vers ton PDF dans Google Drive
url = "/content/drive/MyDrive/projet_fil_rouge/data/doc_simple/doc-R-30iB.pdf"


# Initialisation de l'extracteur local (en 4 bits, sans token)
extracteur = ExtracteurSCRMistralLocal(
    url=url,
    model_id="mistralai/Mistral-7B-Instruct-v0.2",
    load_in_4bit=True
)

# Extraction des défauts (SCR) entre la page 42 et 60
df_result = extracteur.extract_defects(start_page=42, end_page=60)

display(df_result)


FileNotFoundError: [Errno 2] No such file or directory: 'doc/doc-R-30iB.pdf'

#### **Classification**

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
#from script_extracteur_scr_mistral import Extracteur_SCR_Mistral

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_colwidth', None)

# Chemin complet vers votre PDF dans Google Drive
url = "/content/drive/MyDrive/projet_fil_rouge/data/doc_simple/doc-R-30iB.pdf"

TOKEN_HF = "secret" "

# Initialisation de l'extracteur en passant les bons arguments
extracteur = Extracteur_SCR_Mistral(
    url=url,
    model_id="mistralai/Mistral-7B-Instruct-v0.2"
)

# Utilisation de la méthode de classification sur le document
# Ici, nous analysons les pages 48 à 60.
is_structured = extracteur.classify_document(start_page=48, end_page=60)

if is_structured:
    print("Le document est structuré pour une extraction SCR.")
else:
    print("Le document n'est pas bien structuré pour une extraction SCR.")


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Le document est structuré pour une extraction SCR.
