In [1]:
import os 
os.environ['KMP_DUPLICATE_LIB_OK']='True'

from includes.functions import *
import pandas as pd
import json
import os
os.chdir(r'./')

from gliner import GLiNER
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
from tqdm import tqdm



In [2]:
import re
import time
from difflib import SequenceMatcher

def filtrer_entites(donnees, types_a_supprimer):
    """Return shallow copies of the records with unwanted reference types removed.

    Args:
        donnees: Iterable of dict-like records. Only the ``'reference'`` entry is
            inspected; it is filtered when it is a list of indexable entities whose
            element at index 2 is the entity type.
        types_a_supprimer: Container of entity types to drop from ``'reference'``.

    Returns:
        list: One shallow copy per input record, in input order. Records without a
        list-valued ``'reference'`` are copied unchanged. The inputs are not mutated.
    """
    def _epurer(element):
        # Shallow copy: the caller's record keeps its complete reference list.
        clone = element.copy()
        if 'reference' in clone and isinstance(clone['reference'], list):
            clone['reference'] = list(filter(
                lambda ref: ref[2] not in types_a_supprimer,
                clone['reference'],
            ))
        return clone

    return list(map(_epurer, donnees))

def evaluate_model(sample, segments_data, concepts, model_inference, mode="thematic", threshold=0.4, tokenizer = None, max_tokens=512, stride = 512):
    """Predict entities segment by segment and map them back to document offsets.

    Each segment is split with ``split_text_into_sliding_windows``, located in its
    document (exact match, then whitespace-normalized match, then fuzzy match) and
    passed to the model. Predicted spans are shifted by the located offset.

    Args:
        sample: Sequence of ``(text, annotations)`` pairs; ``annotations['entities']``
            is stored untouched as the reference of the document.
        segments_data: In ``"thematic"`` mode, an iterable of dicts with
            ``"Document_Index"`` and ``"Segment"`` keys. In ``"segments"`` mode, one
            list of segment strings per document, zipped with ``sample``.
        concepts: Labels forwarded to ``model_inference.predict_entities``.
        model_inference: Object exposing ``predict_entities(text, labels, threshold=...)``
            and returning dicts with ``'start'``, ``'end'`` and ``'label'`` entries.
        mode: ``"thematic"`` or ``"segments"``.
        threshold: Threshold forwarded to ``predict_entities``.
        tokenizer: Forwarded to ``split_text_into_sliding_windows``.
        max_tokens: Forwarded to ``split_text_into_sliding_windows``.
        stride: Forwarded to ``split_text_into_sliding_windows``.

    Returns:
        tuple: ``(results, timing_stats)``. ``results`` holds one dict per document with
        ``'text'``, ``'reference'`` and ``'predictions'`` (a list of
        ``[start, end, label]``). ``timing_stats`` holds ``'total_inference_time'``,
        ``'total_segments_processed'`` and ``'avg_time_per_segment'``.

    Raises:
        ValueError: If ``mode`` is neither ``"thematic"`` nor ``"segments"``.
    """

    def normalize_text(text):
        # Turn newlines, tabs, '@' and '_' into spaces, then collapse whitespace runs.
        text = re.sub(r'[\n\t@_]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def find_segment_position(text, segment, threshold=0.8):
        # Fuzzy fallback: compare the first 100 characters of `segment` with windows of
        # `text` and return the best offset whose ratio exceeds `threshold`, else -1.
        if segment in text:
            return text.find(segment)

        best_match = 0
        best_pos = -1
        step = min(20, max(1, len(text) // 100))

        for i in range(0, len(text) - min(len(segment), len(text)) + 1, step):
            window_size = min(len(segment) + 50, len(text) - i)
            sub_text = text[i:i + window_size]
            ratio = SequenceMatcher(None, segment[:min(100, len(segment))], sub_text[:min(100, len(sub_text))]).ratio()
            if ratio > best_match and ratio > threshold:
                best_match = ratio
                best_pos = i

        return best_pos

    def has_similar_entity(predictions, pred_start, pred_end, pred_label, tolerance=5):
        # True when a same-label prediction already exists within `tolerance` characters.
        for p in predictions:
            if (p[2] == pred_label and
                abs(p[0] - pred_start) <= tolerance and
                abs(p[1] - pred_end) <= tolerance):
                return True
        return False

    def report_missing(segment_idx, total_segments, sub_idx, sub_total, sub_segment):
        # `sub_idx` comes from enumerate(), so duplicated windows report their own index.
        if sub_total > 1:
            print(f"Sous-segment non trouvé (segment {segment_idx+1}/{total_segments}, sous-segment {sub_idx+1}/{sub_total}): {sub_segment[:50]}...")
        else:
            print(f"Segment non trouvé (segment {segment_idx+1}/{total_segments}): {sub_segment[:50]}...")

    def report_coverage(doc_idx, found_segments, total_segments, sub_segments_processed):
        # Guard the percentage: a document may come with an empty segment list.
        coverage = found_segments / total_segments * 100 if total_segments else 0.0
        print(f"Document {doc_idx}: Segments originaux trouvés: {found_segments}/{total_segments} ({coverage:.1f}%)")
        if sub_segments_processed > total_segments:
            print(f"  → Sous-segments traités: {sub_segments_processed} (segments divisés: {sub_segments_processed - total_segments})")

    results = []
    total_inference_time = 0
    total_segments_processed = 0

    def timed_predict(sub_segment):
        # Only the model call is timed; counters are updated after a successful call.
        nonlocal total_inference_time, total_segments_processed
        start_time = time.perf_counter()
        predictions = model_inference.predict_entities(sub_segment, concepts, threshold=threshold)
        total_inference_time += time.perf_counter() - start_time
        total_segments_processed += 1
        return predictions

    if mode == "thematic":
        # Group the flat list of retrieved segments by document index.
        segments_by_doc = {}
        for result in segments_data:
            doc_idx = result["Document_Index"]
            segment = result["Segment"]
            segments_by_doc.setdefault(doc_idx, []).append(segment)

        for doc_idx, (text, annotations) in enumerate(sample):
            reference_entities = annotations['entities']
            normalized_text = normalize_text(text)

            article_predictions = []
            segments = segments_by_doc.get(doc_idx, [])

            if not segments:
                print(f"Aucun segment trouvé pour le document {doc_idx}")
                results.append({
                    'text': text,
                    'reference': reference_entities,
                    'predictions': []
                })
                continue

            found_segments = 0
            total_segments = len(segments)
            sub_segments_processed = 0

            for segment_idx, segment in enumerate(segments):
                if not segment or not segment.strip():
                    continue

                # Split segments that exceed the model's token budget.
                sub_segments = list(split_text_into_sliding_windows(segment, tokenizer, max_tokens, stride = stride))
                segment_located = False

                for sub_idx, sub_segment in enumerate(sub_segments):
                    normalized_segment = normalize_text(sub_segment)
                    sub_segments_processed += 1

                    segment_start = text.find(sub_segment)

                    # NOTE: the two fallbacks return offsets in the normalized text, which
                    # can drift from raw-text offsets when whitespace was collapsed.
                    if segment_start == -1:
                        segment_start = normalized_text.find(normalized_segment)

                    if segment_start == -1:
                        segment_start = find_segment_position(normalized_text, normalized_segment, threshold=0.7)

                    if segment_start == -1:
                        report_missing(segment_idx, total_segments, sub_idx, len(sub_segments), sub_segment)
                        continue

                    segment_located = True

                    try:
                        predictions = timed_predict(sub_segment)

                        for pred in predictions:
                            pred_start = segment_start + pred['start']
                            pred_end = segment_start + pred['end']

                            if pred_end > len(text):
                                pred_end = len(text)

                            if not has_similar_entity(article_predictions, pred_start, pred_end, pred['label']):
                                article_predictions.append([pred_start, pred_end, pred['label']])
                    except Exception as e:
                        # Best effort: report the failure and keep evaluating other segments.
                        print(f"Erreur lors du traitement du segment: {str(e)}")
                        continue

                # A segment counts as found only if at least one of its windows was located.
                if segment_located:
                    found_segments += 1

            report_coverage(doc_idx, found_segments, total_segments, sub_segments_processed)

            results.append({
                'text': text,
                'reference': reference_entities,
                'predictions': article_predictions
            })

    elif mode == "segments":
        # Segments are already grouped per document and aligned with `sample`.
        for doc_idx, ((text, annotations), segments) in enumerate(zip(sample, segments_data)):
            reference_entities = annotations['entities']
            normalized_text = None  # computed lazily, at most once per document

            article_predictions = []
            found_segments = 0
            total_segments = len(segments)
            sub_segments_processed = 0

            for segment_idx, segment in enumerate(segments):
                if not segment or not segment.strip():
                    continue

                # Split segments that exceed the model's token budget.
                sub_segments = list(split_text_into_sliding_windows(segment, tokenizer, max_tokens, stride = stride))
                segment_located = False

                for sub_idx, sub_segment in enumerate(sub_segments):
                    sub_segments_processed += 1

                    segment_start = text.find(sub_segment)
                    if segment_start == -1:
                        if normalized_text is None:
                            normalized_text = normalize_text(text)
                        # NOTE: this offset refers to the normalized text, not the raw text.
                        segment_start = normalized_text.find(normalize_text(sub_segment))

                    if segment_start == -1:
                        segment_start = find_segment_position(text, sub_segment, threshold=0.7)

                    if segment_start == -1:
                        report_missing(segment_idx, total_segments, sub_idx, len(sub_segments), sub_segment)
                        continue

                    segment_located = True
                    predictions = timed_predict(sub_segment)

                    for pred in predictions:
                        pred_start = segment_start + pred['start']
                        pred_end = segment_start + pred['end']
                        article_predictions.append([pred_start, pred_end, pred['label']])

                # A segment counts as found only if at least one of its windows was located.
                if segment_located:
                    found_segments += 1

            report_coverage(doc_idx, found_segments, total_segments, sub_segments_processed)

            # Drop exact duplicates while keeping first-seen order, so results are
            # reproducible from one run to the next.
            article_predictions = [
                list(pred) for pred in dict.fromkeys(tuple(pred) for pred in article_predictions)
            ]

            results.append({
                'text': text,
                'reference': reference_entities,
                'predictions': article_predictions
            })

    else:
        raise ValueError(f"Mode '{mode}' non reconnu. Utilisez 'thematic' ou 'segments'.")

    # Timing summary.
    if total_segments_processed > 0:
        avg_time_per_segment = total_inference_time / total_segments_processed
        print(f"Temps total d'inférence: {total_inference_time:.4f} secondes")
        print(f"Nombre de segments traités: {total_segments_processed}")
        print(f"Temps moyen par segment: {avg_time_per_segment:.4f} secondes")
    else:
        avg_time_per_segment = 0
        print("Aucun segment traité.")

    timing_stats = {
        'total_inference_time': total_inference_time,
        'total_segments_processed': total_segments_processed,
        'avg_time_per_segment': avg_time_per_segment
    }

    return results, timing_stats

In [3]:
def calculate_metrics(all_results, tolerance=5):
    """Count true positives, false positives and false negatives, overall and per type.

    A prediction and a reference match when they carry the same type (index 2) and
    both their start (index 0) and end (index 1) offsets differ by at most
    ``tolerance`` characters.

    Matching is not one-to-one: a prediction is a true positive as soon as any
    reference matches it, and a reference is a false negative only when no prediction
    matches it. ``tp + fn`` can therefore differ from ``support`` when several spans
    match the same counterpart.

    Args:
        all_results: Iterable of dicts with ``'reference'`` and ``'predictions'``
            entries, each a sequence of ``(start, end, type)`` items.
        tolerance: Maximum absolute offset difference, in characters, accepted on each
            boundary. Defaults to 5.

    Returns:
        dict: ``{'tp', 'fp', 'fn', 'by_type'}`` where ``by_type`` maps each entity type
        to its own ``{'tp', 'fp', 'fn', 'support'}`` counters; ``support`` is the
        number of reference entities of that type.
    """
    total_metrics = {
        'tp': 0, 'fp': 0, 'fn': 0,
        'by_type': {}
    }
    by_type = total_metrics['by_type']

    def spans_match(ref, pred):
        # Same type and both boundaries within the tolerance.
        return (ref[2] == pred[2] and
                abs(ref[0] - pred[0]) <= tolerance and
                abs(ref[1] - pred[1]) <= tolerance)

    def counters_for(entity_type):
        # Create the per-type counters on first sight of the type.
        if entity_type not in by_type:
            by_type[entity_type] = {'tp': 0, 'fp': 0, 'fn': 0, 'support': 0}
        return by_type[entity_type]

    for result in all_results:
        ref_entities = result['reference']
        pred_entities = result['predictions']

        # Precision side: every prediction is either a TP or a FP.
        for pred in pred_entities:
            type_counters = counters_for(pred[2])
            outcome = 'tp' if any(spans_match(ref, pred) for ref in ref_entities) else 'fp'
            total_metrics[outcome] += 1
            type_counters[outcome] += 1

        # Recall side: every unmatched reference is a FN.
        for ref in ref_entities:
            type_counters = counters_for(ref[2])
            type_counters['support'] += 1
            if not any(spans_match(ref, pred) for pred in pred_entities):
                total_metrics['fn'] += 1
                type_counters['fn'] += 1

    return total_metrics

def print_metrics(metrics):
    """Print a per-type precision/recall/F1 table followed by global statistics.

    Args:
        metrics: Dict with a ``'by_type'`` mapping from entity type to a dict holding
            integer ``'tp'``, ``'fp'``, ``'fn'`` and ``'support'`` counters.

    Returns:
        None. The report is written to standard output. Micro averages are computed
        from the summed counters, macro averages are the plain means of the per-type
        scores.
    """
    def _scores(tp, fp, fn):
        # Each ratio falls back to 0 when its denominator is empty.
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        return precision, recall, f1

    def _mean(values):
        return sum(values) / len(values) if values else 0

    separator = "-" * 80

    print("\nMétriques par type d'entité :")
    print("=" * 80)
    print(f"{'Type':<20} {'Précision':>10} {'Rappel':>10} {'F1':>10} {'Support':>10}")
    print(separator)

    per_type_scores = []
    totals = {'tp': 0, 'fp': 0, 'fn': 0}

    for entity_type, counters in metrics['by_type'].items():
        tp, fp, fn = counters['tp'], counters['fp'], counters['fn']
        support = counters['support']

        precision, recall, f1 = _scores(tp, fp, fn)
        per_type_scores.append((precision, recall, f1))

        print(f"{entity_type:<20} {precision:>10.3f} {recall:>10.3f} {f1:>10.3f} {support:>10d}")

        totals['tp'] += tp
        totals['fp'] += fp
        totals['fn'] += fn

    print(separator)

    total_tp, total_fp, total_fn = totals['tp'], totals['fp'], totals['fn']
    micro_precision, micro_recall, micro_f1 = _scores(total_tp, total_fp, total_fn)

    macro_precision = _mean([scores[0] for scores in per_type_scores])
    macro_recall = _mean([scores[1] for scores in per_type_scores])
    macro_f1 = _mean([scores[2] for scores in per_type_scores])

    print(f"{'micro avg':<20} {micro_precision:>10.3f} {micro_recall:>10.3f} {micro_f1:>10.3f}")
    print(f"{'macro avg':<20} {macro_precision:>10.3f} {macro_recall:>10.3f} {macro_f1:>10.3f}")

    print("\nStatistiques globales:")
    print(f"Nombre total d'entités dans les références: {sum(m['support'] for m in metrics['by_type'].values())}")
    print(f"Nombre total de prédictions: {total_tp + total_fp}")
    print(f"Vrais positifs (TP): {total_tp}")
    print(f"Faux positifs (FP): {total_fp}")
    print(f"Faux négatifs (FN): {total_fn}")


In [4]:
def remove_text_duplicates(annotations):
    """Keep the first item seen for each distinct ``item["data"]["text"]`` value.

    Args:
        annotations: Iterable of dict-like items.

    Returns:
        list: The retained items, in input order. Items without a ``"data"`` entry, or
        whose ``"data"`` has no ``"text"`` entry, are left out of the result.
    """
    known_texts = set()
    kept = []

    for entry in annotations:
        # Items that carry no text cannot be compared and are skipped.
        if "data" not in entry or "text" not in entry["data"]:
            continue

        body = entry["data"]["text"]
        if body in known_texts:
            continue

        known_texts.add(body)
        kept.append(entry)

    return kept

# Load the annotated export and keep a single record per distinct text.
with open('CORPUS/special/129_annotated_data.json', 'r', encoding='utf-8') as f:
    data = remove_text_duplicates(json.load(f))

# Report how many records remain after de-duplication.
print(f"Annotations uniques: {len(data)}")

def convert_labelstudio(data_list):
    """Convert Label Studio items into ``(text, {'entities': [...]})`` pairs.

    Args:
        data_list: Iterable of dicts with a ``['data']['text']`` entry and an optional
            ``'annotations'`` list. Each annotation may hold a ``'result'`` list whose
            elements carry a ``'value'`` dict with ``'start'``, ``'end'`` and
            ``'labels'``.

    Returns:
        list: One ``(text, {'entities': entities})`` tuple per item, where ``entities``
        lists a ``(start, end, label)`` tuple for every label of every result of every
        annotation, in encounter order.
    """
    converted = []

    for item in data_list:
        text = item['data']['text']

        # Flatten annotations -> results -> value dicts.
        values = (
            region.get('value', {})
            for annotation in item.get('annotations', [])
            for region in annotation.get('result', [])
        )

        # One entity per label, in case a region carries several labels.
        entities = [
            (value.get('start'), value.get('end'), label)
            for value in values
            for label in value.get('labels', [])
        ]

        converted.append((text, {'entities': entities}))

    return converted

# SLIDING WINDOW
def split_sliding_window(sample, stride=512, max_tokens=510, tokenizer=None):
    """Split the text of every sample entry with ``split_text_into_sliding_windows``.

    Args:
        sample: Iterable of indexable entries whose element 0 is the text to split.
        stride: Stride forwarded to ``split_text_into_sliding_windows``.
        max_tokens: Window size forwarded to ``split_text_into_sliding_windows``.
            Defaults to 510, the value previously hard-coded here.
        tokenizer: Tokenizer to reuse. When ``None``, a new one is built with
            ``create_tokenizer()``.

    Returns:
        list: One splitter result per entry, in input order.
    """
    if tokenizer is None:
        tokenizer = create_tokenizer()

    return [
        split_text_into_sliding_windows(entry[0], tokenizer, max_tokens=max_tokens, stride=stride)
        for entry in sample
    ]
    

Annotations uniques: 127


In [5]:
def _evaluate_segmentation_strategy(sample, segments, labels, model, tokenizer, types_a_supprimer, mode, max_tokens=510):
    """Run one segmentation strategy through the model and return its metrics dict."""
    predictions, timing_stats = evaluate_model(
        sample, segments, labels, model,
        mode=mode, tokenizer=tokenizer, max_tokens=max_tokens, stride=max_tokens
    )
    print(f"Temps d'exécution : {timing_stats}")

    # Drop reference entities whose type is excluded from the evaluation.
    filtered = filtrer_entites(predictions, types_a_supprimer)
    strategy_metrics = calculate_metrics(filtered)
    print_metrics(strategy_metrics)
    return strategy_metrics


def evaluate_gliner_with_segmentation(data, model_name="urchade/gliner_multi-v2.1", device="cuda"):
    """Evaluate a GLiNER model on Label Studio data under four segmentation strategies.

    The strategies are, in order: thematic segmentation followed by semantic passage
    retrieval, thematic segmentation alone, sliding windows with 10% overlap, and
    non-overlapping chunks. The recommended strategy is the one with the smallest
    FP + FN total; on a tie the first one evaluated wins.

    Args:
        data: Items in Label Studio format, as accepted by ``convert_labelstudio``.
        model_name: Name of the GLiNER model to load.
        device: Device passed to the model's ``to`` method ("cuda" or "cpu").

    Returns:
        dict: Keys ``"thematic_retriever"``, ``"thematic"``, ``"sliding_window"`` and
        ``"chunking"`` map to the ``calculate_metrics`` output of each strategy.
        ``"best_segmentation"`` maps to a dict with ``"method"``, ``"fp_fn_sum"`` and
        ``"detailed_metrics"`` (tp, fp, fn, precision, recall, f1 and fp_fn_sum per
        strategy).
    """
    from includes.DynamicSegmentation import ThematicSegmenter
    from includes.SemanticQueryV1 import SemanticSearchEngine, SearchConfig
    from gliner import GLiNER

    # Token budget used for every window and stride passed to evaluate_model.
    max_tokens = 510

    # Convert the Label Studio items to (text, {'entities': [...]}) pairs.
    sample = convert_labelstudio(data)

    # Load the GLiNER model.
    model_gliner = GLiNER.from_pretrained(model_name, max_length=512)
    model_gliner.to(device)

    # Labels requested from the model.
    labels = [
        "Magasin de producteurs",
        "Information temporelle",
        "Lieu",
        "Produit",
        "Producteur/Artisan",
        "Organisation",
    ]

    # Reference types that are not requested from the model; they are removed from
    # the references before scoring.
    types_a_supprimer = [
        "Réseau de magasins",
        "Nombre de magasins",
        "Déclencheur",
        "Nombre de producteurs",
        "Point de vente",
        "Autres"
    ]

    tokenizer = create_tokenizer()

    def run_strategy(segments, mode):
        return _evaluate_segmentation_strategy(
            sample, segments, labels, model_gliner, tokenizer,
            types_a_supprimer, mode, max_tokens=max_tokens
        )

    results_dict = {}

    # 1. THEMATIC + PASSAGE RETRIEVER
    window_size = 2
    segmenter = ThematicSegmenter(window_size=window_size)

    # Semantic search configuration.
    concepts = {
        'query': "Le magasin de producteurs est un point de vente collectif qui regroupe plusieurs agriculteurs qui ont pour objectif de valoriser leur exploitation et leur production fermière",
    }
    config = SearchConfig(threshold=0.1, batch_size=16, max_sequence_length=512)
    search_engine = SemanticSearchEngine("paraphrase-multilingual-MiniLM-L12-v2", config)

    # Thematic segmentation, then retrieval of the segments relevant to the query.
    thematic_seg = split_thematic_segment(sample, segmenter)
    query = concepts['query']
    segments_retrieved = search_engine.search(query=query, documents=thematic_seg)

    print("Évaluation avec THEMATIC + PASSAGE RETRIEVER...")
    results_dict["thematic_retriever"] = run_strategy(segments_retrieved, "thematic")

    # 2. ONLY THEMATIC
    print("\nÉvaluation avec ONLY THEMATIC...")
    results_dict["thematic"] = run_strategy(thematic_seg, "segments")

    # 3. SLIDING WINDOW
    print("\nÉvaluation avec SLIDING WINDOW...")
    overlap = 0.1  # 10% overlap between consecutive windows
    sliding_window = split_sliding_window(sample, stride=int(max_tokens * (1 - overlap)))
    results_dict["sliding_window"] = run_strategy(sliding_window, "segments")

    # 4. CHUNKING
    print("\nÉvaluation avec CHUNKING...")
    overlap = 0  # no overlap
    chunking = split_sliding_window(sample, int(max_tokens * (1 - overlap)))
    results_dict["chunking"] = run_strategy(chunking, "segments")

    # Rank the strategies by total error (FP + FN); the lowest total wins.
    best_segmentation = None
    min_fp_fn_sum = float('inf')
    segment_metrics = {}

    for segmentation_type, metrics_data in results_dict.items():
        total_fp = metrics_data.get('fp', 0)
        total_fn = metrics_data.get('fn', 0)
        total_tp = metrics_data.get('tp', 0)

        fp_fn_sum = total_fp + total_fn

        # Global precision, recall and F1 for this strategy.
        precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
        recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        segment_metrics[segmentation_type] = {
            'tp': total_tp,
            'fp': total_fp,
            'fn': total_fn,
            'precision': precision,
            'recall': recall,
            'f1': f1_score,
            'fp_fn_sum': fp_fn_sum
        }

        # Strict comparison: on a tie the earlier strategy is kept.
        if fp_fn_sum < min_fp_fn_sum:
            min_fp_fn_sum = fp_fn_sum
            best_segmentation = segmentation_type

    # Added after the ranking loop so it is never ranked itself.
    results_dict["best_segmentation"] = {
        "method": best_segmentation,
        "fp_fn_sum": min_fp_fn_sum,
        "detailed_metrics": segment_metrics
    }

    # Human-readable summary of the per-strategy metrics.
    print("\n=== Résumé des métriques par méthode de segmentation ===")
    for seg_type, summary in segment_metrics.items():
        print(f"\n{seg_type}:")
        print(f"  TP: {summary['tp']}")
        print(f"  FP: {summary['fp']}")
        print(f"  FN: {summary['fn']}")
        print(f"  Somme FP+FN: {summary['fp_fn_sum']}")
        print(f"  Précision: {summary['precision']:.4f}")
        print(f"  Rappel: {summary['recall']:.4f}")
        print(f"  F1-score: {summary['f1']:.4f}")

    print(f"\nMeilleure méthode recommandée: {best_segmentation}")
    print(f"(Somme FP+FN minimale: {min_fp_fn_sum})")

    return results_dict

# Evaluate every segmentation strategy on the records from index 114 onward
# (13 documents in the recorded run, out of 127 unique annotations).
all_res = evaluate_gliner_with_segmentation(data[114:], model_name="urchade/gliner_multi-v2.1", device="cuda")




Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]



Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:includes.SemanticQueryV1:Début de la recherche avec 43 segments issus de 13 documents
Calcul des embeddings:   0%|                                                                     | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Calcul des embeddings: 100%|████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 115.73it/s]
Calcul des embeddings:   0%|                                                                     | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Calcul des embeddings: 100%|█████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 25.73it/s]
INFO:includes.SemanticQueryV1:Recherche terminée: 41 résultats retournés
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Évaluation avec THEMATIC + PASSAGE RETRIEVER...
Document 0: Segments originaux trouvés: 6/6 (100.0%)
Document 1: Segments originaux trouvés: 3/3 (100.0%)
Document 2: Segments originaux trouvés: 5/5 (100.0%)
Document 3: Segments originaux trouvés: 2/2 (100.0%)
Document 4: Segments originaux trouvés: 2/2 (100.0%)
Document 5: Segments originaux trouvés: 5/5 (100.0%)
Document 6: Segments originaux trouvés: 2/2 (100.0%)
Document 7: Segments originaux trouvés: 2/2 (100.0%)
Document 8: Segments originaux trouvés: 2/2 (100.0%)
Document 9: Segments originaux trouvés: 2/2 (100.0%)
Document 10: Segments originaux trouvés: 2/2 (100.0%)
Document 11: Segments originaux trouvés: 4/4 (100.0%)
Document 12: Segments originaux trouvés: 4/4 (100.0%)
Temps total d'inférence: 1.6948 secondes
Nombre de segments traités: 41
Temps moyen par segment: 0.0413 secondes
Temps d'exécution : {'total_inference_time': 1.6947579011321068, 'total_segments_processed': 41, 'avg_time_per_segment': 0.041335558564197725}

Mét



Document 0: Segments originaux trouvés: 4/4 (100.0%)
  → Sous-segments traités: 5 (segments divisés: 1)
Document 1: Segments originaux trouvés: 2/2 (100.0%)
Document 2: Segments originaux trouvés: 2/2 (100.0%)
Document 3: Segments originaux trouvés: 1/1 (100.0%)
Document 4: Segments originaux trouvés: 1/1 (100.0%)
Document 5: Segments originaux trouvés: 2/2 (100.0%)
Document 6: Segments originaux trouvés: 2/2 (100.0%)
Document 7: Segments originaux trouvés: 1/1 (100.0%)
Document 8: Segments originaux trouvés: 1/1 (100.0%)
Document 9: Segments originaux trouvés: 2/2 (100.0%)
Document 10: Segments originaux trouvés: 1/1 (100.0%)
Document 11: Segments originaux trouvés: 2/2 (100.0%)
Document 12: Segments originaux trouvés: 2/2 (100.0%)
Temps total d'inférence: 1.6817 secondes
Nombre de segments traités: 24
Temps moyen par segment: 0.0701 secondes
Temps d'exécution : {'total_inference_time': 1.6816569063812494, 'total_segments_processed': 24, 'avg_time_per_segment': 0.0700690377658854}

Mé

In [6]:
# Dump the raw results dictionary: per-strategy metrics plus the "best_segmentation" summary.
print(all_res)

{'thematic_retriever': {'tp': 184, 'fp': 188, 'fn': 116, 'by_type': {'Produit': {'tp': 77, 'fp': 33, 'fn': 34, 'support': 111}, 'Lieu': {'tp': 65, 'fp': 52, 'fn': 13, 'support': 80}, 'Organisation': {'tp': 15, 'fp': 45, 'fn': 8, 'support': 23}, 'Producteur/Artisan': {'tp': 11, 'fp': 28, 'fn': 5, 'support': 16}, 'Magasin de producteurs': {'tp': 5, 'fp': 13, 'fn': 23, 'support': 28}, 'Information temporelle': {'tp': 11, 'fp': 17, 'fn': 33, 'support': 44}}}, 'thematic': {'tp': 184, 'fp': 194, 'fn': 116, 'by_type': {'Produit': {'tp': 77, 'fp': 33, 'fn': 34, 'support': 111}, 'Lieu': {'tp': 65, 'fp': 54, 'fn': 13, 'support': 80}, 'Producteur/Artisan': {'tp': 11, 'fp': 29, 'fn': 5, 'support': 16}, 'Organisation': {'tp': 15, 'fp': 47, 'fn': 8, 'support': 23}, 'Magasin de producteurs': {'tp': 5, 'fp': 13, 'fn': 23, 'support': 28}, 'Information temporelle': {'tp': 11, 'fp': 18, 'fn': 33, 'support': 44}}}, 'sliding_window': {'tp': 173, 'fp': 138, 'fn': 128, 'by_type': {'Produit': {'tp': 76, 'fp':