In [None]:
# Enhanced SpaCy NER System for CoNLL2003

# 1. Install and Import Required Libraries
# ----------------------------------------
!pip install -q datasets evaluate tabulate spacy seqeval
!python -m spacy download en_core_web_sm

import spacy
import re
from collections import defaultdict, Counter
from datasets import load_dataset
from evaluate import load
from tabulate import tabulate
from spacy.lang.en.stop_words import STOP_WORDS

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m128.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restar

In [None]:

# Enhanced Analysis Tools for SpaCy NER System
# --------------------------------------------

import spacy
import re
from collections import defaultdict, Counter
from datasets import load_dataset
from evaluate import load
from tabulate import tabulate
from spacy.lang.en.stop_words import STOP_WORDS
from IPython.display import display, HTML

# 1. Add detailed analysis methods to the EnhancedSpacyNER class
# ----------------------------------------------------------
class EnhancedSpacyNER:
    """spaCy NER wrapper that emits CoNLL-2003 BIO tags plus rule-based corrections.

    The class runs the ``en_core_web_sm`` pipeline on whitespace-joined tokens,
    maps spaCy entity labels to the four CoNLL-2003 types and then offers three
    additional label sequences: hand-written regex detections, corrections
    learned from training data, and a conservative merge of both on top of the
    spaCy output.

    Attributes:
        nlp: The loaded spaCy pipeline (any callable returning an object with
            ``ents`` works; each entity needs ``label_``, ``start_char`` and
            ``end_char``).
        allowed_entities: Entity types (without BIO prefix) that may be emitted.
        spacy_to_conll: Mapping from spaCy entity labels to CoNLL-2003 types.
        correction_patterns: ``{entity_type: {group_name: [compiled regex]}}``;
            each regex is matched against phrases of 1-5 consecutive tokens.
        misclassification_corrections: ``{entity_type: [compiled regex]}``,
            filled by :meth:`load_correction_patterns_from_training`.
    """

    # Longest phrase (in tokens) that is tried against the hand-written patterns.
    _MAX_PATTERN_WINDOW = 5

    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")

        # Initialize dictionaries
        self.allowed_entities = {"PER", "LOC", "ORG", "MISC"}
        self.spacy_to_conll = {
            "PERSON": "PER",
            "GPE": "LOC",
            "LOC": "LOC",
            "ORG": "ORG",
            "NORP": "MISC",
            "EVENT": "MISC",
            "PRODUCT": "MISC",
            "WORK_OF_ART": "MISC"
        }

        # All patterns are fully anchored (^...$) and are applied with
        # ``pattern.match`` to a space-joined window of tokens.
        self.correction_patterns = {
            "PER": {
                "title_patterns": [
                    re.compile(r"^(?:Mr\.|Mrs\.|Ms\.|Dr\.|Prof\.|Sir|Lord|Lady) [A-Z][a-z]+(?: [A-Z][a-z]+)*$"),
                ],
                "suffix_patterns": [
                    re.compile(r"^[A-Z][a-z]+(?: [A-Z][a-z]+)* (?:Jr\.|Sr\.|III|II|IV)$"),
                ]
            },
            "ORG": {
                "company_patterns": [
                    re.compile(r"^[A-Z][a-z]+(?: [A-Z][a-z]+)* (?:Inc|Corp|Co|Ltd|LLC|Limited|PLC|Group|Bank)\.?$"),
                    re.compile(r"^The [A-Z][a-z]+(?: [A-Z][a-z]+)* (?:Inc|Corp|Co|Ltd|LLC|Limited|PLC|Group|Bank)\.?$"),
                ],
                "educational_patterns": [
                    re.compile(r"^(?:University|College|School|Institute) of [A-Z][a-z]+(?: [A-Z][a-z]+)*$"),
                ]
            },
            "LOC": {
                "geographic_patterns": [
                    re.compile(r"^[A-Z][a-z]+(?: [A-Z][a-z]+)* (?:Mountains|River|Valley|Desert|Lake|Ocean|Sea|Bay|Gulf|Island)$"),
                    re.compile(r"^(?:Mount|Mt\.) [A-Z][a-z]+$"),
                ],
                "admin_patterns": [
                    re.compile(r"^[A-Z][a-z]+(?: [A-Z][a-z]+)* (?:State|Province|Prefecture|County|District)$"),
                ]
            },
            "MISC": {
                "event_patterns": [
                    re.compile(r"^[A-Z][a-z]+(?: [A-Z][a-z]+)* (?:Cup|Championship|Olympics|Games|Tournament|Festival|Summit)$"),
                ],
                "award_patterns": [
                    re.compile(r"^[A-Z][a-z]+(?: [A-Z][a-z]+)* (?:Award|Prize|Medal)$"),
                ]
            }
        }

        self.misclassification_corrections = defaultdict(list)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _token_char_offsets(tokens):
        """Return ``(start, end)`` character offsets of each token in ``" ".join(tokens)``."""
        offsets = []
        cursor = 0
        for token in tokens:
            offsets.append((cursor, cursor + len(token)))
            cursor += len(token) + 1  # +1 for the single joining space
        return offsets

    @staticmethod
    def _char_span_to_token_span(offsets, start_char, end_char):
        """Map a character range onto a ``(start, end)`` token range.

        Returns ``None`` unless the range begins exactly at a token start and
        ends exactly at a token end, so partial-token hits are never labelled.
        """
        start_idx = None
        end_idx = None
        for idx, (tok_start, tok_end) in enumerate(offsets):
            if tok_start == start_char:
                start_idx = idx
            if tok_end == end_char:
                end_idx = idx + 1
        if start_idx is None or end_idx is None or end_idx <= start_idx:
            return None
        return start_idx, end_idx

    @staticmethod
    def _write_entity(labels, start, end, entity_type):
        """Overwrite ``labels[start:end]`` in place with one B-/I- tagged entity."""
        labels[start] = f"B-{entity_type}"
        for idx in range(start + 1, end):
            labels[idx] = f"I-{entity_type}"

    @staticmethod
    def _gold_entity_spans(labels):
        """Return ``(start, end, entity_type)`` for every entity in a BIO label list.

        A ``B-`` tag always opens a new entity, so two adjacent entities of the
        same type are kept apart. An ``I-`` tag with no matching open entity
        also opens one.
        """
        spans = []
        open_start = None
        open_type = None
        for idx, label in enumerate(labels):
            prefix, _, entity_type = label.partition("-")
            if prefix == "I" and open_start is not None and entity_type == open_type:
                continue  # still inside the currently open entity
            if open_start is not None:
                spans.append((open_start, idx, open_type))
                open_start = open_type = None
            if label != "O":
                open_start, open_type = idx, entity_type
        if open_start is not None:
            spans.append((open_start, len(labels), open_type))
        return spans

    def _candidate_phrases(self, tokens):
        """List ``(start, end, phrase)`` for every window of 1.._MAX_PATTERN_WINDOW tokens.

        Windows are ordered by size first and start position second, which is
        the order in which pattern hits overwrite each other.
        """
        candidates = []
        for window_size in range(1, self._MAX_PATTERN_WINDOW + 1):
            for start in range(len(tokens) - window_size + 1):
                end = start + window_size
                candidates.append((start, end, " ".join(tokens[start:end])))
        return candidates

    def _spacy_bio_labels(self, tokens, restrict_to_allowed=True):
        """Run spaCy on the joined tokens and project its entities onto BIO tags.

        Entities are aligned by character offset, so every mention of a repeated
        entity is labelled (a token-text search would only find the first one).
        Entities whose boundaries do not coincide with token boundaries are
        skipped.
        """
        labels = ["O"] * len(tokens)
        if not tokens:
            return labels

        doc = self.nlp(" ".join(tokens))
        offsets = self._token_char_offsets(tokens)
        for ent in doc.ents:
            label = self.spacy_to_conll.get(ent.label_)
            if label is None:
                continue
            if restrict_to_allowed and label not in self.allowed_entities:
                continue
            span = self._char_span_to_token_span(offsets, ent.start_char, ent.end_char)
            if span is not None:
                self._write_entity(labels, span[0], span[1], label)
        return labels

    def _learned_matches(self, tokens):
        """Return ``(entity_type, start, end)`` for every token-aligned learned-pattern hit."""
        if not tokens:
            return []
        text = " ".join(tokens)  # joined once instead of once per pattern
        offsets = self._token_char_offsets(tokens)
        hits = []
        for entity_type, patterns in self.misclassification_corrections.items():
            for pattern in patterns:
                for match in pattern.finditer(text):
                    span = self._char_span_to_token_span(offsets, match.start(), match.end())
                    if span is not None:
                        hits.append((entity_type, span[0], span[1]))
        return hits

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def load_correction_patterns_from_training(self, dataset):
        """Learn phrases that spaCy frequently misses from the training split.

        Every gold entity in ``dataset["train"]`` for which spaCy left at least
        one token unlabelled ("O") is counted once. For each entity type, the
        100 most frequent phrases that were missed more than twice are compiled
        into whole-token regexes and appended to
        ``self.misclassification_corrections``.

        Args:
            dataset: Mapping with a ``"train"`` split exposing ``"tokens"``,
                ``"ner_tags"`` and ``features["ner_tags"].feature.int2str``.
        """
        id_to_label = dataset["train"].features["ner_tags"].feature.int2str

        spacy_errors = defaultdict(Counter)

        for tokens, tags in zip(dataset["train"]["tokens"], dataset["train"]["ner_tags"]):
            spacy_labels = self._spacy_bio_labels(tokens, restrict_to_allowed=False)
            gold_labels = [id_to_label(tag) for tag in tags]

            # Count each missed gold entity exactly once (not once per token),
            # otherwise multi-token entities would clear the threshold below
            # after a single occurrence.
            for start, end, entity_type in self._gold_entity_spans(gold_labels):
                if any(spacy_labels[idx] == "O" for idx in range(start, end)):
                    spacy_errors[entity_type][" ".join(tokens[start:end])] += 1

        for entity_type, error_counter in spacy_errors.items():
            for phrase, count in error_counter.most_common(100):
                if count > 2:
                    # Anchor on whitespace rather than \b: \b cannot match after a
                    # trailing "." (e.g. "U.S.") and would match inside "Ajax-Amsterdam".
                    pattern = re.compile(r"(?<!\S)" + re.escape(phrase) + r"(?!\S)")
                    self.misclassification_corrections[entity_type].append(pattern)

    def find_token_span_exact(self, tokens, target_tokens):
        """Return ``(start, end)`` of the first exact occurrence of ``target_tokens``.

        Returns ``None`` when there is no occurrence or ``target_tokens`` is
        empty (an empty target would otherwise "match" at position 0).
        """
        if not target_tokens:
            return None
        width = len(target_tokens)
        for i in range(len(tokens) - width + 1):
            if tokens[i:i + width] == target_tokens:
                return i, i + width
        return None

    def apply_regex_patterns(self, tokens):
        """Label tokens using only the hand-written ``correction_patterns``.

        Returns a new BIO list; later hits overwrite earlier ones.
        """
        regex_labels = ["O"] * len(tokens)
        candidates = self._candidate_phrases(tokens)

        for entity_type, pattern_groups in self.correction_patterns.items():
            for patterns in pattern_groups.values():
                for pattern in patterns:
                    for start, end, phrase in candidates:
                        if pattern.match(phrase):
                            self._write_entity(regex_labels, start, end, entity_type)

        return regex_labels

    def apply_learned_corrections(self, tokens, spacy_labels):
        """Overlay every learned-pattern hit on a copy of ``spacy_labels``."""
        custom_labels = spacy_labels.copy()
        for entity_type, start, end in self._learned_matches(tokens):
            self._write_entity(custom_labels, start, end, entity_type)
        return custom_labels

    def apply_high_confidence_corrections(self, tokens, spacy_labels):
        """Apply pattern and learned corrections only where spaCy disagrees.

        A hit rewrites the span when spaCy tagged its first token "O" or with a
        different entity type. Single-token pattern hits additionally rewrite
        an ``I-`` tag of the same type to ``B-``. The input list is not
        modified.
        """
        corrected_labels = spacy_labels.copy()
        candidates = self._candidate_phrases(tokens)

        # Hand-written patterns
        for entity_type, pattern_groups in self.correction_patterns.items():
            for patterns in pattern_groups.values():
                for pattern in patterns:
                    for start, end, phrase in candidates:
                        if not pattern.match(phrase):
                            continue
                        current = spacy_labels[start]
                        if end - start == 1:
                            needs_fix = current != f"B-{entity_type}"
                        else:
                            needs_fix = current == "O" or current[2:] != entity_type
                        if needs_fix:
                            self._write_entity(corrected_labels, start, end, entity_type)

        # Patterns learned from training data
        for entity_type, start, end in self._learned_matches(tokens):
            current = spacy_labels[start]
            if current == "O" or current[2:] != entity_type:
                self._write_entity(corrected_labels, start, end, entity_type)

        return corrected_labels

    def extract_entities_with_analysis(self, tokens):
        """Return the BIO predictions of every method for one tokenised sentence.

        Returns:
            dict with keys ``"spacy"`` (mapped spaCy output), ``"regex"``
            (hand-written patterns only), ``"custom"`` (spaCy + learned
            patterns) and ``"corrected"`` (spaCy + high-confidence corrections).
        """
        spacy_labels = self._spacy_bio_labels(tokens)

        return {
            "spacy": spacy_labels,
            "regex": self.apply_regex_patterns(tokens),
            "custom": self.apply_learned_corrections(tokens, spacy_labels),
            "corrected": self.apply_high_confidence_corrections(tokens, spacy_labels)
        }

In [None]:
# 2. Analysis function that compares all approaches
# ------------------------------------------------
def analyze_examples(dataset, ner_system, max_examples=30):
    """Collect per-sentence predictions for the test split and print disagreements.

    Every test sentence is tagged with all methods of ``ner_system``. Sentences
    where either the spaCy baseline or the final corrected output differs from
    the (filtered) gold tags are printed, stopping once ``max_examples`` have
    been shown.

    Returns:
        list of dicts (one per test sentence) with keys ``tokens``,
        ``ground_truth``, ``spacy_pred``, ``regex_pred``, ``custom_pred`` and
        ``corrected_pred``.
    """
    records = []

    # Pass 1: tag every test sentence
    for sample in dataset["test"]:
        words = sample["tokens"]
        by_method = ner_system.extract_entities_with_analysis(words)

        # Gold tag ids -> BIO strings; types outside allowed_entities become "O"
        tag_feature = dataset["train"].features["ner_tags"].feature
        gold_names = [tag_feature.int2str(tag_id) for tag_id in sample["ner_tags"]]
        gold = []
        for name in gold_names:
            if any(entity in name for entity in ner_system.allowed_entities):
                gold.append(name)
            else:
                gold.append("O")

        records.append({
            "tokens": words,
            "ground_truth": gold,
            "spacy_pred": by_method["spacy"],
            "regex_pred": by_method["regex"],
            "custom_pred": by_method["custom"],
            "corrected_pred": by_method["corrected"]
        })

    # Pass 2: print the sentences that contain at least one disagreement
    shown = 0
    for number, record in enumerate(records, start=1):
        words = record["tokens"]
        gold = record["ground_truth"]
        baseline = record["spacy_pred"]
        learned = record["custom_pred"]
        by_regex = record["regex_pred"]
        final = record["corrected_pred"]

        if gold == final and gold == baseline:
            continue

        print(f"\n\033[1;34mExample {number}:\033[0m")  # Blue header
        print("\033[1;33mTokens: \033[0m", " ".join(words))  # Yellow tokens

        # One row per label source
        print("\033[1;32mGround Truth:    \033[0m", " ".join(gold))
        print("\033[1;31mSpaCy Prediction: \033[0m", " ".join(baseline))
        print("\033[1;36mRegex Patterns:   \033[0m", " ".join(by_regex))
        print("\033[1;35mLearned Patterns: \033[0m", " ".join(learned))
        print("\033[1;36mFinal Corrected:  \033[0m", " ".join(final))

        # Token-level detail for every position where baseline or final is wrong
        mismatched = []
        for word, gt, sp, rp, cp, fp in zip(words, gold, baseline, by_regex, learned, final):
            if gt != fp or gt != sp:
                mismatched.append(f"{word}: GT={gt}, SpaCy={sp}, Regex={rp}, Learned={cp}, Final={fp}")

        if mismatched:
            print("\n\033[1;35mMismatched Entities:\033[0m")
            print("\n".join(mismatched))

        print("-" * 100)  # Visual separator
        shown += 1

        if shown >= max_examples:
            break

    return records

In [None]:
# 3. Generate statistical analysis
# -------------------------------
def generate_statistics(test_results):
    """Aggregate token-level error and correction counts over all examples.

    For every token where the spaCy baseline disagrees with the gold tag, the
    gold tag's counter is incremented in ``spacy_errors`` and, when the
    respective output equals the gold tag, in ``corrections`` (final output),
    ``regex_improvements`` and ``learned_improvements``.

    Returns:
        dict with four per-tag ``defaultdict(int)`` counters plus the integer
        totals ``total_corrections`` and ``total_tokens``.
    """
    stats = {
        name: defaultdict(int)
        for name in ("spacy_errors", "corrections", "regex_improvements", "learned_improvements")
    }
    stats["total_corrections"] = 0
    stats["total_tokens"] = 0

    for record in test_results:
        words = record["tokens"]
        aligned = zip(
            words,
            record["ground_truth"],
            record["spacy_pred"],
            record["regex_pred"],
            record["custom_pred"],
            record["corrected_pred"],
        )

        stats["total_tokens"] += len(words)

        for _word, gold, baseline, by_regex, learned, final in aligned:
            if gold == baseline:
                continue  # baseline already right: nothing to count

            stats["spacy_errors"][gold] += 1

            if final == gold:
                stats["corrections"][gold] += 1
                stats["total_corrections"] += 1

            # Credit whichever individual method produced the gold tag
            if by_regex == gold:
                stats["regex_improvements"][gold] += 1

            if learned == gold:
                stats["learned_improvements"][gold] += 1

    return stats

In [None]:
# 4. Display analytics
# -------------------
def display_analytics(stats):
    """Print a human-readable summary of the counters from ``generate_statistics``.

    Args:
        stats: dict with integer ``total_tokens`` / ``total_corrections`` and
            the per-tag mappings ``spacy_errors``, ``corrections``,
            ``regex_improvements`` and ``learned_improvements``.

    The correction rate is reported as 0.00% when no tokens were analyzed,
    instead of raising ``ZeroDivisionError``.
    """
    total_tokens = stats['total_tokens']
    total_corrections = stats['total_corrections']
    # Guard the empty-input case (e.g. an empty test split).
    correction_rate = total_corrections / total_tokens * 100 if total_tokens else 0.0

    print("\n\033[1;34mAnalysis Summary:\033[0m")
    print(f"Total tokens analyzed: {total_tokens}")
    print(f"Total corrections made: {total_corrections}")
    print(f"Correction rate: {correction_rate:.2f}%")

    print("\n\033[1;34mError Distribution (spaCy):\033[0m")
    for label, count in stats['spacy_errors'].items():
        print(f"{label}: {count} errors")

    print("\n\033[1;34mCorrections by Entity Type:\033[0m")
    for label, count in stats['corrections'].items():
        print(f"{label}: {count} corrections")

    print("\n\033[1;34mImprovement Source:\033[0m")
    print("\033[1;36mRegex Patterns:\033[0m")
    for label, count in stats['regex_improvements'].items():
        print(f"{label}: {count} improvements")

    print("\n\033[1;35mLearned Patterns:\033[0m")
    for label, count in stats['learned_improvements'].items():
        print(f"{label}: {count} improvements")


In [None]:
# 6. Standard evaluation function
# ------------------------------
def evaluate_ner_system(dataset, ner_system):
    """Score the spaCy baseline and the corrected output on the test split.

    Both label sequences are evaluated with the ``seqeval`` metric against gold
    tags in which entity types outside ``ner_system.allowed_entities`` are
    replaced by "O". Per-entity tables, overall F1 and the per-entity deltas
    (enhanced minus baseline) are printed; nothing is returned.

    Args:
        dataset: Mapping with ``"test"`` examples (``tokens``, ``ner_tags``)
            and a ``"train"`` split whose ``ner_tags`` feature provides
            ``int2str``.
        ner_system: Object exposing ``allowed_entities`` and
            ``extract_entities_with_analysis(tokens)``.
    """
    # The id -> tag-name mapping is identical for every example: look it up once.
    ner_labels = dataset["train"].features["ner_tags"].feature
    allowed = ner_system.allowed_entities

    spacy_predictions = []
    enhanced_predictions = []
    ground_truths = []

    for example in dataset["test"]:
        predictions = ner_system.extract_entities_with_analysis(example["tokens"])

        ground_truth = [ner_labels.int2str(tag) for tag in example["ner_tags"]]
        filtered_truth = [tag if any(e in tag for e in allowed) else "O" for tag in ground_truth]

        spacy_predictions.append(predictions["spacy"])
        enhanced_predictions.append(predictions["corrected"])
        ground_truths.append(filtered_truth)

    metric = load("seqeval")
    spacy_results = metric.compute(predictions=spacy_predictions, references=ground_truths)
    enhanced_results = metric.compute(predictions=enhanced_predictions, references=ground_truths)

    def print_results(title, overall_label, results):
        """Print one per-entity table followed by the overall F1 line."""
        print(f"\n\033[1;34m{title}:\033[0m")
        # Scores are rendered with four decimals, matching evaluate_enhanced_ner_system.
        table = [
            [entity, f"{m['precision']:.4f}", f"{m['recall']:.4f}", f"{m['f1']:.4f}", m["number"]]
            for entity, m in results.items()
            if isinstance(m, dict) and entity in allowed
        ]
        print(tabulate(table, headers=["Entity", "Precision", "Recall", "F1 Score", "Count"], tablefmt="pretty"))
        print(f"Overall {overall_label} F1: {results['overall_f1']:.4f}")

    print_results("Baseline spaCy Results", "spaCy", spacy_results)
    print_results("Enhanced spaCy NER Results", "Enhanced", enhanced_results)

    # Show improvements. allowed_entities is a set, so sort it to get the same
    # line order on every run.
    print("\n\033[1;34mImprovement Analysis:\033[0m")
    for entity in sorted(allowed):
        if entity in spacy_results and entity in enhanced_results:
            f1_improvement = enhanced_results[entity]["f1"] - spacy_results[entity]["f1"]
            precision_improvement = enhanced_results[entity]["precision"] - spacy_results[entity]["precision"]
            recall_improvement = enhanced_results[entity]["recall"] - spacy_results[entity]["recall"]

            print(f"{entity}: F1 Δ = {f1_improvement:+.4f}, Precision Δ = {precision_improvement:+.4f}, Recall Δ = {recall_improvement:+.4f}")

    overall_f1_improvement = enhanced_results['overall_f1'] - spacy_results['overall_f1']
    print(f"\nOverall F1 Improvement: {overall_f1_improvement:+.4f}")


In [None]:
# 5. Main execution function
# -------------------------
def run_full_analysis():
    """Driver: load CoNLL-2003, fit the correction patterns, then print the
    example analysis, the aggregate statistics and the seqeval evaluation."""
    print("Loading CoNLL2003 dataset...")
    conll = load_dataset("conll2003")

    print("Initializing Enhanced NER system...")
    tagger = EnhancedSpacyNER()

    print("Learning correction patterns from training data...")
    tagger.load_correction_patterns_from_training(conll)

    # Example-level comparison of all labelling methods
    print("\nAnalyzing examples where corrections were made...\n")
    per_example = analyze_examples(conll, tagger, max_examples=30)

    # Aggregate counters and their printed summary
    print("\nGenerating statistics...")
    display_analytics(generate_statistics(per_example))

    # Baseline vs. enhanced scores
    print("\n\nRunning standard evaluation...")
    evaluate_ner_system(conll, tagger)

In [None]:
# 7. Helper function for sample analysis
# ------------------------------------
def analyze_sample(ner_system, text):
    """Tag one whitespace-tokenised text with every method and print the result.

    Prints one row per label source and then lists each token whose final
    corrected tag differs from the spaCy tag.
    """
    words = text.split()
    by_method = ner_system.extract_entities_with_analysis(words)

    print("\n\033[1;34mToken Analysis:\033[0m")
    print("\033[1;33mTokens:     \033[0m", " ".join(words))
    rows = (
        ("\033[1;31mSpaCy:      \033[0m", "spacy"),
        ("\033[1;36mRegex:      \033[0m", "regex"),
        ("\033[1;35mLearned:    \033[0m", "custom"),
        ("\033[1;36mCorrected:  \033[0m", "corrected"),
    )
    for heading, method in rows:
        print(heading, " ".join(by_method[method]))

    # Tokens changed by the correction step
    print("\n\033[1;34mDifferences:\033[0m")
    for position, word in enumerate(words):
        before = by_method["spacy"][position]
        after = by_method["corrected"][position]
        if before == after:
            continue
        print(f"{word}: SpaCy={before}, Final={after}")


In [None]:
# Run the complete analysis
if __name__ == "__main__":
    # Entry point for this cell: loads the dataset, learns the correction
    # patterns and prints the example analysis, statistics and evaluation.
    run_full_analysis()

Loading CoNLL2003 dataset...
Initializing Enhanced NER system...
Learning correction patterns from training data...

Analyzing examples where corrections were made...


[1;34mExample 1:[0m
[1;33mTokens: [0m SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .
[1;32mGround Truth:    [0m O O B-LOC O O O O B-PER O O O O
[1;31mSpaCy Prediction: [0m O O O O O O O O O O B-ORG O
[1;36mRegex Patterns:   [0m O O O O O O O O O O O O
[1;35mLearned Patterns: [0m O O O O O O O O O O B-ORG O
[1;36mFinal Corrected:  [0m O O O O O O O O O O B-ORG O

[1;35mMismatched Entities:[0m
JAPAN: GT=B-LOC, SpaCy=O, Regex=O, Learned=O, Final=O
CHINA: GT=B-PER, SpaCy=O, Regex=O, Learned=O, Final=O
DEFEAT: GT=O, SpaCy=B-ORG, Regex=O, Learned=B-ORG, Final=B-ORG
----------------------------------------------------------------------------------------------------

[1;34mExample 3:[0m
[1;33mTokens: [0m AL-AIN , United Arab Emirates 1996-12-06
[1;32mGround Truth:    [0m B-LOC O B-LOC I-LOC I-LO

In [None]:
def evaluate_enhanced_ner_system(dataset, ner_system):
    """Score only the corrected ("enhanced") predictions with seqeval.

    Prints a per-entity table and the overall precision, recall, F1 and
    accuracy, and returns the dictionary produced by ``metric.compute``.
    """
    predicted_sequences = []
    reference_sequences = []

    for sample in dataset["test"]:
        # Only the corrected output is evaluated here
        outputs = ner_system.extract_entities_with_analysis(sample["tokens"])

        # Gold tag ids -> BIO strings; types outside allowed_entities become "O"
        tag_feature = dataset["train"].features["ner_tags"].feature
        gold_names = [tag_feature.int2str(tag_id) for tag_id in sample["ner_tags"]]
        reference = []
        for name in gold_names:
            if any(entity in name for entity in ner_system.allowed_entities):
                reference.append(name)
            else:
                reference.append("O")

        predicted_sequences.append(outputs["corrected"])
        reference_sequences.append(reference)

    seqeval_metric = load("seqeval")
    enhanced_results = seqeval_metric.compute(predictions=predicted_sequences, references=reference_sequences)

    print("\n\033[1;34mEnhanced SpaCy NER Evaluation Results:\033[0m")

    # Per-entity rows; overall_* entries are scalars and are skipped here
    rows = [
        [
            entity,
            f"{scores['precision']:.4f}",
            f"{scores['recall']:.4f}",
            f"{scores['f1']:.4f}",
            scores['number'],
        ]
        for entity, scores in enhanced_results.items()
        if isinstance(scores, dict) and entity in ner_system.allowed_entities
    ]
    print(tabulate(rows, headers=["Entity", "Precision", "Recall", "F1 Score", "Support"], tablefmt="pretty"))

    # Corpus-level figures
    print(f"\n\033[1;36mOverall Metrics:\033[0m")
    overall_rows = (
        ("Precision", "overall_precision"),
        ("Recall", "overall_recall"),
        ("F1 Score", "overall_f1"),
        ("Accuracy", "overall_accuracy"),
    )
    for caption, key in overall_rows:
        print(f"Overall {caption}: {enhanced_results[key]:.4f}")

    return enhanced_results

# Generate classification report using seqeval format
# ------------------------------------------------
def generate_seqeval_report(dataset, ner_system):
    """Print the raw seqeval result (strict mode, IOB2 scheme) for the corrected output.

    Returns:
        tuple ``(enhanced_predictions, ground_truths)``: the predicted and the
        filtered gold label sequences for every test sentence.
    """
    predicted_sequences = []
    reference_sequences = []

    for sample in dataset["test"]:
        outputs = ner_system.extract_entities_with_analysis(sample["tokens"])

        # Gold tag ids -> BIO strings; types outside allowed_entities become "O"
        tag_feature = dataset["train"].features["ner_tags"].feature
        gold_names = [tag_feature.int2str(tag_id) for tag_id in sample["ner_tags"]]
        reference = []
        for name in gold_names:
            if any(entity in name for entity in ner_system.allowed_entities):
                reference.append(name)
            else:
                reference.append("O")

        predicted_sequences.append(outputs["corrected"])
        reference_sequences.append(reference)

    # The result dictionary is printed as-is
    seqeval_metric = load("seqeval")
    print("\n\033[1;34mEnhanced NER Classification Report (seqeval):\033[0m")
    report = seqeval_metric.compute(
        predictions=predicted_sequences,
        references=reference_sequences,
        mode='strict',
        scheme='IOB2',
    )
    print(report)

    return predicted_sequences, reference_sequences

# Main execution function for enhanced evaluation only
# -------------------------------------------------
def run_enhanced_evaluation():
    """Driver: load CoNLL-2003, fit the correction patterns and evaluate only
    the enhanced output. Returns the result of ``evaluate_enhanced_ner_system``."""
    print("Loading CoNLL2003 dataset...")
    conll = load_dataset("conll2003")

    print("Initializing Enhanced NER system...")
    tagger = EnhancedSpacyNER()

    print("Learning correction patterns from training data...")
    tagger.load_correction_patterns_from_training(conll)

    # Table plus overall metrics
    print("\nEvaluating Enhanced NER system...")
    scores = evaluate_enhanced_ner_system(conll, tagger)

    # Raw strict-mode seqeval output
    print("\nGenerating detailed classification report...")
    generate_seqeval_report(conll, tagger)

    return scores

# Run the enhanced evaluation
if __name__ == "__main__":
    # Entry point for this cell: evaluates the enhanced system only. The
    # returned results dictionary is discarded here.
    run_enhanced_evaluation()

Loading CoNLL2003 dataset...
Initializing Enhanced NER system...
Learning correction patterns from training data...

Evaluating Enhanced NER system...

[1;34mEnhanced SpaCy NER Evaluation Results:[0m
+--------+-----------+--------+----------+---------+
| Entity | Precision | Recall | F1 Score | Support |
+--------+-----------+--------+----------+---------+
|  LOC   |  0.7699   | 0.7524 |  0.7611  |  1668   |
|  MISC  |  0.6810   | 0.6083 |  0.6426  |   702   |
|  ORG   |  0.4574   | 0.3359 |  0.3874  |  1661   |
|  PER   |  0.7415   | 0.6172 |  0.6736  |  1617   |
+--------+-----------+--------+----------+---------+

[1;36mOverall Metrics:[0m
Overall Precision: 0.6714
Overall Recall: 0.5733
Overall F1 Score: 0.6185
Overall Accuracy: 0.9164

Generating detailed classification report...

[1;34mEnhanced NER Classification Report (seqeval):[0m
{'LOC': {'precision': np.float64(0.7713583282114321), 'recall': np.float64(0.7523980815347722), 'f1': np.float64(0.7617602427921093), 'number'