In [None]:
# TODO: verify this function; it was suggested by Copilot.
import string
import unicodedata
from collections import Counter


def _is_text_dtype(dtype):
    """Return True when *dtype* is one the scan should treat as text by default."""
    # ``== "object"`` is the historical check. pandas' dedicated string dtypes
    # are named "string" / "str" and never compare equal to "object", so they
    # are matched by name; otherwise such columns would be silently skipped.
    return dtype == "object" or str(dtype) in ("string", "str")


def comprehensive_character_scan(df, text_columns=None):
    """
    Scan every character in the text columns of ``df`` and describe it.

    Args:
        df: DataFrame-like object; the function uses ``df.columns`` and
            ``df[col]`` (with ``.dtype`` and ``.dropna()``).
        text_columns: Column names to scan. When ``None``, every column whose
            dtype is ``object`` or a pandas string dtype is scanned.

    Returns:
        A dict mapping each character found to a dict with the keys
        ``count`` (total occurrences), ``unicode_code`` (code point),
        ``unicode_name`` (``"UNKNOWN"`` when the character has no name),
        ``category`` (Unicode general category), ``is_ascii``,
        ``is_printable`` and ``columns`` (set of column names in which the
        character occurs). Keys appear in first-seen order.
    """
    if text_columns is None:
        text_columns = [col for col in df.columns if _is_text_dtype(df[col].dtype)]

    char_stats = {}

    for col in text_columns:
        # Tally the whole column first; Counter keeps first-seen order, so the
        # resulting key order matches a character-by-character walk.
        col_counts = Counter()
        for text in df[col].dropna():
            # Values are stringified so non-string cells are scanned as well.
            col_counts.update(str(text))

        for char, count in col_counts.items():
            stats = char_stats.get(char)
            if stats is None:
                code_point = ord(char)
                stats = char_stats[char] = {
                    "count": 0,
                    "unicode_code": code_point,
                    "unicode_name": unicodedata.name(char, "UNKNOWN"),
                    "category": unicodedata.category(char),
                    "is_ascii": code_point <= 127,
                    "is_printable": char.isprintable(),
                    "columns": set(),
                }
            stats["count"] += count
            stats["columns"].add(col)

    return char_stats


# Run comprehensive scan over both datasets and merge the results.
# dict.update() would replace the entry of any character already seen in the
# first dataset, discarding its count and columns, so shared characters are
# combined instead.
char_analysis = comprehensive_character_scan(annotations_test)
for char, stats in comprehensive_character_scan(labeled_dataset_test).items():
    if char in char_analysis:
        char_analysis[char]["count"] += stats["count"]
        char_analysis[char]["columns"] |= stats["columns"]
    else:
        char_analysis[char] = stats

# Categorize problematic characters
categories = {
    "smart_quotes": [],
    "dashes": [],
    "spaces": [],
    "symbols": [],
    "accented": [],
    "other_non_ascii": [],
}

# Only non-ASCII characters are categorized; the first matching rule wins.
for char, stats in char_analysis.items():
    if stats["is_ascii"]:
        continue
    unicode_name = stats["unicode_name"].lower()
    # Curly quotes are named "... QUOTATION MARK" in Unicode, which does not
    # contain the substring "quote", so that spelling is matched explicitly.
    if "quote" in unicode_name or "quotation mark" in unicode_name:
        categories["smart_quotes"].append((char, stats))
    elif "dash" in unicode_name or "hyphen" in unicode_name:
        categories["dashes"].append((char, stats))
    elif "space" in unicode_name:
        categories["spaces"].append((char, stats))
    elif any(
        accent in unicode_name
        for accent in ["acute", "grave", "circumflex", "tilde", "diaeresis"]
    ):
        categories["accented"].append((char, stats))
    elif stats["category"].startswith("S"):  # Symbol categories
        categories["symbols"].append((char, stats))
    else:
        categories["other_non_ascii"].append((char, stats))

# Display results by category, most frequent character first
for category, chars in categories.items():
    if chars:
        print(f"\n=== {category.upper().replace('_', ' ')} ===")
        for char, stats in sorted(chars, key=lambda x: x[1]["count"], reverse=True):
            print(
                f"'{char}' ({stats['unicode_code']:04x}) - {stats['unicode_name']} - Count: {stats['count']}"
            )
