In [12]:
# Core NLP and ML libraries
!pip install -q pandas numpy tqdm scikit-learn matplotlib seaborn fpdf

# Stanza for tokenization & lemmatization
!pip install -q stanza

# Sentence-Transformers (for ConfliBERT)
!pip install -q sentence-transformers

# HuggingFace Transformers for tokenizer compatibility
!pip install -q transformers

# spaCy for POS tagging (optional, but included if needed later)
!pip install -q spacy
!python -m spacy download en_core_web_lg

# NLTK for WordNet and lemmatization
!pip install -q nltk

!pip install -q unidecode
#!pip install unidecode

!pip install langid

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     ---------------------------------------- 1.0/587.7 MB 8.4 MB/s eta 0:01:11
     ---------------------------------------- 2.6/587.7 MB 8.4 MB/s eta 0:01:10
     ---------------------------------------- 4.2/587.7 MB 7.9 MB/s eta 0:01:15
     ---------------------------------------- 5.8/587.7 MB 7.7 MB/s eta 0:01:17
      --------------------------------------- 7.6/587.7 MB 8.0 MB/s eta 0:01:13
      --------------------------------------- 9.4/587.7 MB 8.0 MB/s eta 0:01:12
      -------------------------------------- 10.7/587.7 MB 7.9 MB/s eta 0:01:14
      -------------------------------------- 12.6/587.7 MB 8.0 MB/s eta 0:01:13
      -------------------------------------- 14.2/587.7 MB 7.8 MB/s eta 0:01:14
      ------------------------

In [13]:
import nltk
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("punkt_tab")
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brike\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\brike\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\brike\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\brike\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\brike\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# === 🧠 Ultra CASS+ Evaluation Script (Final Enhanced Version with Robust Filtering and Acronym Fixes) ===

import os
import pandas as pd
import numpy as np
import stanza
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.mixture import GaussianMixture
from sentence_transformers import SentenceTransformer, util
from unidecode import unidecode
import langid
from difflib import SequenceMatcher
import re

# === Constants ===
ORTHO_PENALTY_DIACRITIC = 0.10
ORTHO_PENALTY_ORTHO = 0.15
ORTHO_PENALTY_CHAR_NOISE = 0.20
STATIC_THRESHOLD = 0.62
STOPWORDS = set(stopwords.words('english'))

# === Setup ===
DATA_PATH = "C:/Users/brike/CASS/DeepL"
OUTPUT_PATH = os.path.join(DATA_PATH, "output_cass")
STATS_PATH = os.path.join(OUTPUT_PATH, "stats")
os.makedirs(OUTPUT_PATH, exist_ok=True)
os.makedirs(STATS_PATH, exist_ok=True)

nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download("stopwords", quiet=True)
stop_words = set(stopwords.words("english"))

lemmatizer = WordNetLemmatizer()

stanza.download('en')
nlp = stanza.Pipeline(lang='en', processors='tokenize,lemma,pos,ner', use_gpu=True)

model = SentenceTransformer("eventdata-utd/ConfliBERT-scr-uncased")

# === Fix encoding corruption from legacy character sets ===
def fix_encoding(text):
    try:
        return text.encode("latin1").decode("utf8")
    except:
        return text

# === Load CSV files with fallback for encoding issues ===
def safe_read_csv(path):
    try:
        return pd.read_csv(path, encoding="utf-8")
    except UnicodeDecodeError:
        try:
            df = pd.read_csv(path, encoding="latin1")
            for col in df.columns:
                df[col] = df[col].apply(lambda x: fix_encoding(str(x)) if isinstance(x, str) else x)
            return df
        except Exception:
            return pd.read_csv(path, encoding="utf-8", errors="replace")


df_pairs = safe_read_csv(os.path.join(DATA_PATH, "sentence_pairs.csv"))
lexicon_df = safe_read_csv(os.path.join(DATA_PATH, "conflict_lexicon.csv"))
acronym_map = {}
acro_path = os.path.join(DATA_PATH, "acronyms_expansions.csv")
if os.path.exists(acro_path):
    acro_df = pd.read_csv(acro_path)
    for _, row in acro_df.iterrows():
        acronym_map[str(row['lost_rare_lemma']).lower()] = str(row['representative_translation_unit']).lower()

lexicon_df['term'] = lexicon_df['term'].apply(fix_encoding)
lexicon = {t.lower() for t in lexicon_df['term'] if len(t) > 2 or t.isupper()}

# === Helper Functions ===
def extract_abbreviation_pairs(text):
    pairs = {}
    matches = re.findall(r'\(([A-Z]{2,})\)', text)
    for match in matches:
        pattern = r'([\w\s\-]+)\s+\(' + match + r'\)'
        full_match = re.search(pattern, text, re.IGNORECASE)
        if full_match:
            pairs[match.lower()] = full_match.group(1).strip()
    return pairs

def matches_acronym_expansion(acronym, mt_text):
    words = [w for w in mt_text.split() if w and w[0].isupper()]
    pattern = ''.join(w[0].upper() for w in words[:len(acronym)])
    return pattern == acronym.upper()

def is_diacritic_variant(term, candidate):
    return term != candidate and unidecode(term) == unidecode(candidate) and any(ord(c) > 127 for c in term + candidate)

def is_char_noise_match(term, candidate):
    return SequenceMatcher(None, term.lower(), candidate.lower()).ratio() >= 0.85

def is_weird_token(token):
    return any(ord(c) > 126 for c in token) or any(c in token for c in ['\u2019', '\u201c', '\u201d', '\u00a0'])

def get_stanza_features(text):
    doc = nlp(text)
    lemmas, pos_tags, ents = set(), {}, set()
    for sent in doc.sentences:
        for word in sent.words:
            lemmas.add(word.lemma.lower())
            pos_tags[word.text.lower()] = word.upos
        for ent in sent.ents:
            ents.add(ent.text.lower())
    return lemmas, pos_tags, ents

# === Evaluation ===
records = []
for _, row in df_pairs.iterrows():
    sid, original, mt = str(row['id']), str(row['Original_EN']), str(row['MT_EN'])
    abbrev_map = extract_abbreviation_pairs(original)
    lang, _ = langid.classify(mt)
    if lang != 'en':
        continue

    orig_lemmas, orig_pos, orig_ents = get_stanza_features(original)
    mt_lemmas, mt_pos, mt_ents = get_stanza_features(mt)

    for term in lexicon:
        if term in orig_lemmas and term not in mt_lemmas:
            ner_flag = "NER_Drop" if term in orig_ents and term not in mt_ents else "No"
            pos_flag = "POS_Mismatch" if term in orig_pos and term in mt_pos and orig_pos[term] != mt_pos[term] else "No"

            if term in acronym_map and acronym_map[term] in mt.lower():
                records.append({"Sentence_ID": sid, "Original_EN": original, "MT_EN": mt,
                                "Missing_Term": term, "Best_Candidate": acronym_map[term],
                                "Similarity_Score": 1.0, "Conflict_Match": 1, "CASS_Score": 1.0,
                                "Orthographic_Diff": "AcronymCSV", "CharNoise_Flag": "No",
                                "Abbreviation_Match": "CSV", "NER_Match": ner_flag, "POS_Match": pos_flag,
                                "Explanation_GMM": "AcronymCSV", "Explanation_Static": "AcronymCSV"})
                continue

            if term.isupper() and matches_acronym_expansion(term, mt):
                records.append({"Sentence_ID": sid, "Original_EN": original, "MT_EN": mt,
                                "Missing_Term": term, "Best_Candidate": "[expanded]",
                                "Similarity_Score": 1.0, "Conflict_Match": 1, "CASS_Score": 1.0,
                                "Orthographic_Diff": "AcronymPattern", "CharNoise_Flag": "No",
                                "Abbreviation_Match": "Pattern", "NER_Match": ner_flag, "POS_Match": pos_flag,
                                "Explanation_GMM": "AcronymPattern", "Explanation_Static": "AcronymPattern"})
                continue

            try:
                src_emb = model.encode(term, convert_to_tensor=True)
                mt_embs = model.encode(list(mt_lemmas), convert_to_tensor=True)
                sims = util.pytorch_cos_sim(src_emb, mt_embs)[0]
                best_idx = sims.argmax().item()
                best_cand = list(mt_lemmas)[best_idx]
                sim = sims[best_idx].item()
                match = 1 if best_cand in lexicon else 0

                if best_cand == term:
                    ortho_diff, penalty, char_noise = "No", 0, "No"
                elif is_diacritic_variant(term, best_cand):
                    ortho_diff, penalty, char_noise = "Diacritic", ORTHO_PENALTY_DIACRITIC, "No"
                elif unidecode(term) == unidecode(best_cand):
                    ortho_diff, penalty, char_noise = "Orthographic", ORTHO_PENALTY_ORTHO, "No"
                elif is_char_noise_match(term, best_cand):
                    ortho_diff, penalty, char_noise = "CharNoise", ORTHO_PENALTY_CHAR_NOISE, "Yes"
                elif is_weird_token(best_cand):
                    ortho_diff, penalty, char_noise = "WeirdChar", ORTHO_PENALTY_CHAR_NOISE, "Yes"
                else:
                    ortho_diff, penalty, char_noise = "No", 0, "No"

                cass = round(max(0, 0.7 * sim + 0.3 * match - penalty), 4)

                records.append({"Sentence_ID": sid, "Original_EN": original, "MT_EN": mt,
                                "Missing_Term": term, "Best_Candidate": best_cand,
                                "Similarity_Score": round(sim, 4), "Conflict_Match": match, "CASS_Score": cass,
                                "Orthographic_Diff": ortho_diff, "CharNoise_Flag": char_noise,
                                "Abbreviation_Match": "No", "NER_Match": ner_flag, "POS_Match": pos_flag,
                                "Explanation_GMM": "", "Explanation_Static": ""})
            except Exception as e:
                print(f"❌ Error in sentence {sid}: {e}")
print("Starting data load...")
# Your data loading code here
print("Data loaded.")

print("Starting training loop...")
# Training loop
for epoch in range(10):
    print(f"Epoch {epoch+1} started")
    # Your training code here
    print(f"Epoch {epoch+1} completed")

# === Scoring ===
df = pd.DataFrame(records)
if df["CASS_Score"].nunique() > 1:
    gmm = GaussianMixture(n_components=2).fit(df[["CASS_Score"]])
    thresh_gmm = gmm.means_.mean()
else:
    thresh_gmm = STATIC_THRESHOLD

def explain(row, threshold):
    if pd.isna(row["CASS_Score"]):
        return "Error"
    f = []
    if row["Similarity_Score"] < 0.4:
        f.append("LowSim")
    if row["Conflict_Match"] == 0:
        f.append("NoLexMatch")
    if row["Orthographic_Diff"] == "Diacritic":
        f.append("DiacriticShift")
    elif row["Orthographic_Diff"] == "Orthographic":
        f.append("OrthoDivergence")
    elif row["Orthographic_Diff"] == "CharNoise":
        f.append("CharNoiseMatch")
    elif row["Orthographic_Diff"] == "WeirdChar":
        f.append("WeirdChar")
    if row["Abbreviation_Match"] != "No":
        f.append(f"Abbr:{row['Abbreviation_Match']}")
    if row["NER_Match"] == "NER_Drop":
        f.append("NER_Drop")
    if row["POS_Match"] == "POS_Mismatch":
        f.append("POS_Mismatch")
    return f"{'|'.join(f) if f else 'ExactMatch'} | CASS={row['CASS_Score']:.2f} ≥ {threshold:.2f}"

df["Classification_GMM"] = df["CASS_Score"].apply(lambda x: "Acceptable" if x >= thresh_gmm else "Divergence")
df["Classification_Static"] = df["CASS_Score"].apply(lambda x: "Acceptable" if x >= STATIC_THRESHOLD else "Divergence")
df["Explanation_GMM"] = df.apply(lambda r: explain(r, thresh_gmm), axis=1)
df["Explanation_Static"] = df.apply(lambda r: explain(r, STATIC_THRESHOLD), axis=1)

# === Save Outputs ===
df.to_csv(os.path.join(OUTPUT_PATH, "mt_eval_CASS.csv"), index=False)
df[["Sentence_ID", "Original_EN", "MT_EN", "Missing_Term", "Best_Candidate", "Orthographic_Diff", "CharNoise_Flag", "Abbreviation_Match", "NER_Match", "POS_Match"]].to_csv(
    os.path.join(OUTPUT_PATH, "missing_terms.csv"), index=False)

summary = {
    "Total Terms": len(df),
    "Accepted (GMM)": (df["Classification_GMM"] == "Acceptable").sum(),
    "Divergence (GMM)": (df["Classification_GMM"] == "Divergence").sum(),
    "Accepted (Static)": (df["Classification_Static"] == "Acceptable").sum(),
    "Divergence (Static)": (df["Classification_Static"] == "Divergence").sum(),
    "Orthographic Errors": (df["Orthographic_Diff"] == "Orthographic").sum(),
    "Diacritic Errors": (df["Orthographic_Diff"] == "Diacritic").sum(),
    "CharNoise Matches": (df["Orthographic_Diff"] == "CharNoise").sum(),
    "WeirdChar Issues": (df["Orthographic_Diff"] == "WeirdChar").sum(),
    "Abbreviation Matches": (df["Abbreviation_Match"] != "No").sum(),
    "NER Drops": (df["NER_Match"] == "NER_Drop").sum(),
    "POS Mismatches": (df["POS_Match"] == "POS_Mismatch").sum(),
    "Threshold GMM": round(thresh_gmm, 3),
    "Threshold Static": STATIC_THRESHOLD
}

pd.DataFrame([summary]).to_csv(os.path.join(STATS_PATH, "cass_summary_stats.csv"), index=False)

print("✅ Ultra CASS+ Evaluation Completed")
print(f"→ Outputs saved to: {OUTPUT_PATH}")


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-06-10 19:19:14 INFO: Downloaded file to C:\Users\brike\stanza_resources\resources.json
2025-06-10 19:19:14 INFO: Downloading default packages for language: en (English) ...
2025-06-10 19:19:16 INFO: File exists: C:\Users\brike\stanza_resources\en\default.zip
2025-06-10 19:19:20 INFO: Finished downloading models and saved to C:\Users\brike\stanza_resources
2025-06-10 19:19:20 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-06-10 19:19:20 INFO: Downloaded file to C:\Users\brike\stanza_resources\resources.json
2025-06-10 19:19:21 INFO: Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| pos       | combined_charlm           |
| lemma     | combined_nocharlm         |
| ner       | ontonotes-ww-multi_charlm |

2025-06-10 19:19:21 INFO: Using device: cpu
2025-06-10 19:19:21 INFO: Loading: tokenize
2025-06-10 19:19:21 INFO: Loading: mwt
2025-06-10 19:19:21 INFO: Loading: pos
2025-06-10 19:19:24 INFO: Loading: lemma
2025-06-10 19:19:26 INFO: Loading: ner
2025-06-10 19:19:30 INFO: Done loading processors!
No sentence-transformers model found with name eventdata-utd/ConfliBERT-scr-uncased. Creating a new one with mean pooling.
Some weights of BertModel were not initialized from the model checkpoint at eventdata-utd/ConfliBERT-scr-uncased and a

Starting data load...
Data loaded.
Starting training loop...
Epoch 1 started
Epoch 1 completed
Epoch 2 started
Epoch 2 completed
Epoch 3 started
Epoch 3 completed
Epoch 4 started
Epoch 4 completed
Epoch 5 started
Epoch 5 completed
Epoch 6 started
Epoch 6 completed
Epoch 7 started
Epoch 7 completed
Epoch 8 started
Epoch 8 completed
Epoch 9 started
Epoch 9 completed
Epoch 10 started
Epoch 10 completed
✅ Ultra CASS+ Evaluation Completed
→ Outputs saved to: C:/Users/brike/CASS/DeepL\output_cass




In [18]:
# === 2️⃣ Enhanced Sentence-Level Aggregation for CASS Results (with Diacritic/Ortho Analytics) ===

import pandas as pd
import os
import numpy as np

# === Paths ===
DATA_PATH = "C:/Users/brike/CASS/DeepL"
OUTPUT_PATH = os.path.join(DATA_PATH, "output_cass")
TERM_FREQ_FILE = os.path.join(OUTPUT_PATH, "cass_missing_term_frequencies.csv")
SENTENCE_SUMMARY_FILE = os.path.join(OUTPUT_PATH, "cass_sentence_level_summary.csv")
EXPANDED_SUMMARY_FILE = os.path.join(OUTPUT_PATH, "cass_expanded_term_summary.csv")
DIVERGENCE_REPORT = os.path.join(OUTPUT_PATH, "cass_divergence_breakdown.csv")

# === Threshold Logic ===
FALLBACK_THRESHOLD = 0.62
stats_path = os.path.join(OUTPUT_PATH, "stats/cass_summary_stats.csv")
if os.path.exists(stats_path):
    stats_df = pd.read_csv(stats_path)
    if "Threshold GMM" in stats_df.columns:
        CLASSIFICATION_THRESHOLD = stats_df["Threshold GMM"].iloc[0]
    else:
        CLASSIFICATION_THRESHOLD = FALLBACK_THRESHOLD
else:
    CLASSIFICATION_THRESHOLD = FALLBACK_THRESHOLD

print(f"🔧 Using classification threshold: {CLASSIFICATION_THRESHOLD:.4f}")

# === Load Data ===
df = pd.read_csv(os.path.join(OUTPUT_PATH, "mt_eval_CASS.csv"))

# === Enhanced Classification ===
if 'Classification_Static' in df.columns:
    df['Missing_Term_Classification'] = df['Classification_Static']
else:
    df['Missing_Term_Classification'] = np.where(
        df['CASS_Score'] >= CLASSIFICATION_THRESHOLD,
        'Acceptable',
        'Divergence'
    )

# === Term Frequency (Excluding Non-English) ===
valid_terms = df[df['Missing_Term'] != "—"]
term_freq = valid_terms['Missing_Term'].value_counts().reset_index()
term_freq.columns = ['Missing_Term', 'Frequency']
term_freq = term_freq.merge(
    valid_terms.groupby('Missing_Term')['CASS_Score'].mean().reset_index(),
    on='Missing_Term',
    how='left'
).rename(columns={'CASS_Score': 'Avg_CASS'})
term_freq['Divergence_Rate'] = valid_terms.groupby('Missing_Term')['Missing_Term_Classification'].apply(
    lambda x: (x == 'Divergence').mean()
).reset_index()['Missing_Term_Classification']
term_freq.to_csv(TERM_FREQ_FILE, index=False)

# === Enhanced Explanation Flags ===
def generate_explanation(row):
    reasons = []
    if 'NonEnglish' in str(row.get('Explanation_Static', '')):
        return "NonEnglish"
    sim = row.get('Similarity_Score', 0)
    if sim < 0.4:
        reasons.append(f"LowSim({sim:.2f})")
    if row.get('Conflict_Match', 0) == 0:
        reasons.append("NoLexMatch")
    ortho = row.get('Orthographic_Diff', 'No')
    if ortho == "Diacritic":
        reasons.append("DiacriticShift")
    elif ortho == "Yes":
        reasons.append("OrthoDivergence")
    cass = row.get('CASS_Score', 0)
    if cass < CLASSIFICATION_THRESHOLD:
        reasons.append(f"Threshold({cass:.2f}<{CLASSIFICATION_THRESHOLD:.2f})")
    return "|".join(reasons) if reasons else "AllGood"

df['Explanation_Flag'] = df.apply(generate_explanation, axis=1)

# === Save Term-Level Expanded Output ===
term_cols = [
    'Sentence_ID', 'Original_EN', 'MT_EN', 'Missing_Term',
    'Best_Candidate', 'CASS_Score', 'Similarity_Score',
    'Conflict_Match', 'Orthographic_Diff', 'Missing_Term_Classification',
    'Explanation_Flag'
]
if 'MT_Lang' in df.columns:
    term_cols.append('MT_Lang')
df[term_cols].to_csv(EXPANDED_SUMMARY_FILE, index=False)

# === Robust Sentence-Level Aggregation (with Ortho/Diacritic Error Breakdown) ===
df['Is_Divergence'] = df['Missing_Term_Classification'] == 'Divergence'
df['Is_Ortho_Error'] = df['Orthographic_Diff'] == 'Yes'
df['Is_Diacritic_Error'] = df['Orthographic_Diff'] == 'Diacritic'
df['Is_Lex_Mismatch'] = df['Conflict_Match'] == 0
df['Is_Low_Similarity'] = df['Similarity_Score'] < 0.4

agg_config = {
    'Original_EN': 'first',
    'MT_EN': 'first',
    'CASS_Score': ['count', 'min', 'max', 'mean'],
    'Is_Divergence': 'sum',
    'Is_Ortho_Error': 'sum',
    'Is_Diacritic_Error': 'sum',
    'Is_Lex_Mismatch': 'sum',
    'Is_Low_Similarity': 'sum',
    'Missing_Term_Classification': lambda x: '|'.join(x),
    'Explanation_Flag': lambda x: '||'.join(x)
}
if 'MT_Lang' in df.columns:
    agg_config['MT_Lang'] = 'first'

sentence_summary = df.groupby('Sentence_ID').agg(agg_config).reset_index()
sentence_summary.columns = [
    'Sentence_ID',
    'Original_EN',
    'MT_EN',
    'Num_Missing_Terms',
    'Min_CASS',
    'Max_CASS',
    'Mean_CASS',
    'Num_Divergent_Terms',
    'Num_Orthographic_Errors',
    'Num_Diacritic_Errors',
    'Num_Lexicon_Mismatches',
    'Num_Low_Similarity',
    'Term_Classifications',
    'Term_Explanation_Flags'
] + (['MT_Lang'] if 'MT_Lang' in df.columns else [])

# Sentence-level labels
sentence_summary['Sentence_Classification_Mean'] = np.where(
    sentence_summary['Mean_CASS'] >= CLASSIFICATION_THRESHOLD,
    'Acceptable',
    'Divergence'
)
sentence_summary['Sentence_Classification_Strict'] = np.where(
    sentence_summary['Num_Divergent_Terms'] > 0,
    'Divergence',
    'Acceptable'
)
sentence_summary['Divergence_Severity'] = np.where(
    sentence_summary['Mean_CASS'] < CLASSIFICATION_THRESHOLD - 0.2,
    'High',
    np.where(sentence_summary['Mean_CASS'] < CLASSIFICATION_THRESHOLD, 'Medium', 'None')
)

sentence_summary.to_csv(SENTENCE_SUMMARY_FILE, index=False)

# === Divergence Analytics Report (With Explicit Column Names) ===
divergence_terms = df[df['Missing_Term_Classification'] == 'Divergence'][[
    'Sentence_ID', 'Original_EN', 'MT_EN', 'Missing_Term',
    'Best_Candidate', 'CASS_Score', 'Similarity_Score', 'Conflict_Match',
    'Orthographic_Diff', 'Explanation_Flag'
] + (['MT_Lang'] if 'MT_Lang' in df.columns else [])]

divergence_terms.to_csv(DIVERGENCE_REPORT, index=False)

# === Final Output ===
print("\n✅ Enhanced CASS Aggregation Completed")
print(f"→ Classified {len(df)} term instances")
print(f"→ Analyzed {len(sentence_summary)} sentences")
print(f"→ Divergence rate: {sentence_summary['Num_Divergent_Terms'].sum()/len(df):.1%}")
print(f"→ Orthographic errors: {df['Orthographic_Diff'].eq('Yes').sum()} terms")
print(f"→ Diacritic errors: {df['Orthographic_Diff'].eq('Diacritic').sum()} terms")
print(f"→ Output files saved to {OUTPUT_PATH}")


🔧 Using classification threshold: 0.7270

✅ Enhanced CASS Aggregation Completed
→ Classified 289 term instances
→ Analyzed 240 sentences
→ Divergence rate: 92.0%
→ Orthographic errors: 0 terms
→ Diacritic errors: 13 terms
→ Output files saved to C:/Users/brike/CASS/DeepL\output_cass


In [19]:
# === 3️⃣ Enhanced CASS Visualization & Summary Script (Palette/Hue Fix) ===

import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# === Paths ===
DATA_PATH = "C:/Users/brike/CASS/DeepL"
OUTPUT_PATH = os.path.join(DATA_PATH, "output_cass")
STATS_PATH = os.path.join(OUTPUT_PATH, "stats")
os.makedirs(STATS_PATH, exist_ok=True)

# === Load Data ===
df_terms = pd.read_csv(os.path.join(OUTPUT_PATH, "cass_expanded_term_summary.csv"))
df_sentences = pd.read_csv(os.path.join(OUTPUT_PATH, "cass_sentence_level_summary.csv"))
term_freq = pd.read_csv(os.path.join(OUTPUT_PATH, "cass_missing_term_frequencies.csv"))

# === Term-Level CASS Score Distribution ===
plt.figure(figsize=(10, 6))
sns.histplot(df_terms['CASS_Score'], bins=20, kde=True, color="skyblue")
plt.axvline(0.62, color="red", linestyle="--", label="Static Threshold = 0.62")
plt.title("Term-Level CASS Score Distribution")
plt.xlabel("CASS Score")
plt.ylabel("Count")
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(STATS_PATH, "term_level_cass_distribution.png"))
plt.close()

# === Sentence-Level Mean CASS Distribution ===
plt.figure(figsize=(10, 6))
sns.histplot(df_sentences['Mean_CASS'], bins=20, kde=True, color="orange")
plt.axvline(0.62, color="red", linestyle="--", label="Static Threshold = 0.62")
plt.title("Sentence-Level Mean CASS Score Distribution")
plt.xlabel("Mean CASS Score")
plt.ylabel("Count")
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(STATS_PATH, "sentence_level_cass_distribution.png"))
plt.close()

# === Sentence-Level Classification Bar Chart ===
plt.figure(figsize=(6, 5))
sns.countplot(
    data=df_sentences,
    x="Sentence_Classification_Mean",
    hue="Sentence_Classification_Mean",
    palette="Set2",
    legend=False
)
plt.title("Sentence-Level Classification (Acceptable vs Divergence)")
plt.tight_layout()
plt.savefig(os.path.join(STATS_PATH, "sentence_level_classification.png"))
plt.close()

# === Term-Level Classification Bar Chart ===
plt.figure(figsize=(6, 5))
sns.countplot(
    data=df_terms,
    x="Missing_Term_Classification",
    hue="Missing_Term_Classification",
    palette="coolwarm",
    legend=False
)
plt.title("Term-Level Classification (Acceptable vs Divergence)")
plt.tight_layout()
plt.savefig(os.path.join(STATS_PATH, "term_level_classification.png"))
plt.close()

# === Orthographic/Diacritic Error Frequency Plots ===
plt.figure(figsize=(6, 5))
sns.countplot(
    data=df_terms,
    x="Orthographic_Diff",
    hue="Orthographic_Diff",
    order=["No", "Diacritic", "Yes"],
    palette="rocket",
    legend=False
)
plt.title("Orthographic & Diacritic Error Breakdown (Terms)")
plt.tight_layout()
plt.savefig(os.path.join(STATS_PATH, "term_level_ortho_diakritik_errors.png"))
plt.close()

plt.figure(figsize=(8, 5))
error_counts = df_terms["Orthographic_Diff"].value_counts()
sns.barplot(
    x=error_counts.index,
    y=error_counts.values,
    hue=error_counts.index,
    palette="rocket",
    legend=False
)
plt.title("Term Count by Orthographic Error Type")
plt.ylabel("Term Count")
plt.xlabel("Orthographic Error Type")
plt.tight_layout()
plt.savefig(os.path.join(STATS_PATH, "ortho_error_type_counts.png"))
plt.close()

# === Explanation Flag Frequency ===
flag_counts = Counter(df_terms['Explanation_Flag'])
flag_df = pd.DataFrame(flag_counts.items(), columns=["Explanation_Flag", "Count"]).sort_values("Count", ascending=False)
plt.figure(figsize=(10, 5))
sns.barplot(
    data=flag_df,
    x="Explanation_Flag",
    y="Count",
    hue="Explanation_Flag",
    palette="magma",
    legend=False
)
plt.title("Explanation Flag Frequency")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig(os.path.join(STATS_PATH, "explanation_flag_frequency.png"))
plt.close()

# === Top 20 Most Missing Terms ===
plt.figure(figsize=(12, 6))
top_terms = term_freq.nlargest(20, 'Frequency')
sns.barplot(
    data=top_terms,
    x="Missing_Term",
    y="Frequency",
    hue="Missing_Term",
    palette="viridis",
    legend=False
)
plt.title("Top 20 Most Frequently Missing Terms")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig(os.path.join(STATS_PATH, "missing_term_frequency_top20.png"))
plt.close()

# === Statistics Summary ===
summary = {
    "Total Missing Terms": len(df_terms),
    "Unique Missing Terms": df_terms['Missing_Term'].nunique(),
    "Total Sentences Evaluated": len(df_sentences),
    "Average Terms per Sentence": round(df_terms.shape[0] / df_sentences.shape[0], 2),
    "Term Acceptable %": round((df_terms['Missing_Term_Classification'] == 'Acceptable').mean() * 100, 2),
    "Sentence Acceptable %": round((df_sentences['Sentence_Classification_Mean'] == 'Acceptable').mean() * 100, 2),
    "Term-Level Mean CASS": round(df_terms['CASS_Score'].mean(), 4),
    "Sentence-Level Mean CASS": round(df_sentences['Mean_CASS'].mean(), 4),
    "Diacritic Error Count": df_terms["Orthographic_Diff"].eq("Diacritic").sum(),
    "Orthographic Error Count": df_terms["Orthographic_Diff"].eq("Yes").sum(),
    "No Error Count": df_terms["Orthographic_Diff"].eq("No").sum(),
}
pd.DataFrame([summary]).to_csv(os.path.join(STATS_PATH, "cass_summary_report.csv"), index=False)

print("✅ All CASS visualizations and summary report saved to:")
print(f"→ {STATS_PATH}")


✅ All CASS visualizations and summary report saved to:
→ C:/Users/brike/CASS/DeepL\output_cass\stats
