In [None]:
# Core NLP and ML libraries
!pip install -q pandas numpy tqdm scikit-learn matplotlib seaborn fpdf

# Stanza for tokenization & lemmatization
!pip install -q stanza

# Sentence-Transformers (for ConfliBERT)
!pip install -q sentence-transformers

# HuggingFace Transformers for tokenizer compatibility
!pip install -q transformers

# spaCy for POS tagging (optional, but included if needed later)
!pip install -q spacy
!python -m spacy download en_core_web_lg

# NLTK for WordNet and lemmatization
!pip install -q nltk


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import nltk

# NLTK data packages fetched for the cells below: the tokenizer packages
# (punkt, punkt_tab) and the WordNet-related packages (wordnet, omw-1.4).
NLTK_PACKAGES = ("punkt", "wordnet", "omw-1.4", "punkt_tab")

# nltk.download returns a success flag per package; collect them in order.
download_results = [nltk.download(package) for package in NLTK_PACKAGES]

# Show the status of the last download as the cell's result.
download_results[-1]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# === Final Lexical Divergence Evaluation Script (Corrected Divergence Metric) ===

import os
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from difflib import SequenceMatcher
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
import seaborn as sns

# === Setup Paths ===
# All inputs are read from DATA_PATH and every artefact produced by this cell
# is written below OUTPUT_PATH, which is created if it does not exist yet.
# NOTE(review): the path assumes Google Drive is already mounted at
# /content/drive; this cell does not mount it.
DATA_PATH = "/content/drive/MyDrive/Summer/CASS"
OUTPUT_PATH = os.path.join(DATA_PATH, "output_lexical_gmm")
os.makedirs(OUTPUT_PATH, exist_ok=True)

INPUT_PAIRS = os.path.join(DATA_PATH, "sentence_pairs.csv")
OUTPUT_MASTER = os.path.join(OUTPUT_PATH, "lexical_divergence_results.csv")
OUTPUT_HIST = os.path.join(OUTPUT_PATH, "lexical_divergence_distribution.png")
OUTPUT_SUMMARY = os.path.join(OUTPUT_PATH, "lexical_divergence_summary.csv")
OUTPUT_FLAG_PLOT = os.path.join(OUTPUT_PATH, "lexical_flag_distribution.png")
OUTPUT_CLASS_PLOT = os.path.join(OUTPUT_PATH, "lexical_class_comparison.png")

# === NLTK Setup ===
# Fetch every NLTK resource this cell uses, so it does not depend on another
# cell having downloaded them first: tokenizer data for nltk.word_tokenize,
# the stop-word list, and the WordNet data behind WordNetLemmatizer.
for nltk_resource in ("punkt", "punkt_tab", "stopwords", "wordnet", "omw-1.4"):
    nltk.download(nltk_resource, quiet=True)
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# === Load Data ===
df_pairs = pd.read_csv(INPUT_PAIRS)

# Fail early, with a readable message, if the sentence columns used by the
# divergence computation are absent from the input file.
missing_columns = {"Original_EN", "MT_EN"} - set(df_pairs.columns)
if missing_columns:
    raise KeyError(
        f"{INPUT_PAIRS} is missing required column(s): {sorted(missing_columns)}"
    )

# === Calculate Lexical Divergence ===
def content_lemmas(text):
    """Return the lower-cased, lemmatized content words of ``text``.

    Tokens that are not purely alphabetic, or whose lower-cased form is in
    ``stop_words``, are dropped. A value that is not a string (for example the
    NaN pandas produces for an empty CSV cell) yields an empty list instead of
    crashing the tokenizer.
    """
    if not isinstance(text, str):
        return []
    return [
        lemmatizer.lemmatize(token.lower())
        for token in nltk.word_tokenize(text)
        if token.isalpha() and token.lower() not in stop_words
    ]


def lexical_divergence(src_tokens, mt_tokens):
    """Return ``1 - matched / longest`` for two token lists.

    ``matched`` is the total size of the matching blocks SequenceMatcher finds
    between the lists and ``longest`` is the length of the longer list. The
    denominator is floored at 1, so two empty lists score 1.0.
    """
    matcher = SequenceMatcher(None, src_tokens, mt_tokens)
    matched = sum(block.size for block in matcher.get_matching_blocks())
    longest = max(len(src_tokens), len(mt_tokens), 1)
    return 1 - (matched / longest)


records = []
sentence_pairs = zip(df_pairs['Original_EN'], df_pairs['MT_EN'])
# IDs are numbered by row position (S001, S002, ...), independent of the
# DataFrame's index labels.
for position, (original_en, mt_en) in enumerate(sentence_pairs, start=1):
    divergence = lexical_divergence(content_lemmas(original_en), content_lemmas(mt_en))
    records.append({
        "Sentence_ID": f"S{position:03d}",
        "Original_EN": original_en,
        "MT_EN": mt_en,
        "Lexical_Divergence": round(divergence, 4)
    })

# === Collect Per-Sentence Results ===
# Only builds the table; it is written to disk after classification.
# Explicit columns keep the schema stable even when the input has no rows.
df = pd.DataFrame(
    records,
    columns=["Sentence_ID", "Original_EN", "MT_EN", "Lexical_Divergence"],
)

# === GMM Thresholding ===
# Fit a two-component Gaussian mixture to the 1-D divergence scores and use
# the midpoint of the two component means as the data-driven cut-off.
# random_state fixes the fit's random initialisation so the threshold, and
# every label derived from it, is identical on each run.
GMM_RANDOM_STATE = 42
gmm = GaussianMixture(n_components=2, random_state=GMM_RANDOM_STATE).fit(
    df['Lexical_Divergence'].values.reshape(-1, 1))
means = gmm.means_.flatten()
thresh_gmm = np.mean(means)

# === Static Threshold ===
thresh_static = 0.35
# Scores above this value are flagged as 'HighDivergence'.
thresh_high = 0.6


def classify_score(score, threshold):
    """Label a score 'Acceptable' when it is at or below ``threshold``, else 'Divergence'."""
    return 'Acceptable' if score <= threshold else 'Divergence'


def explanation_flag(score):
    """Bucket a score: above thresh_high, above thresh_static, or 'Low'."""
    if score > thresh_high:
        return 'HighDivergence'
    if score > thresh_static:
        return 'Moderate'
    return 'Low'


df['Classification_GMM'] = df['Lexical_Divergence'].apply(classify_score, threshold=thresh_gmm)
df['Classification_Static'] = df['Lexical_Divergence'].apply(classify_score, threshold=thresh_static)

df['Explanation_Flag'] = df['Lexical_Divergence'].apply(explanation_flag)

# === Save Output ===
# Per-sentence scores, labels and flags.
df.to_csv(OUTPUT_MASTER, index=False)

# === Save Summary Stats ===
# One-row table: sentence total, label counts per thresholding method, and
# both thresholds rounded to three decimals.
summary = {"Total Sentences": len(df)}
for method_name, label_column in (("GMM", "Classification_GMM"),
                                  ("Static", "Classification_Static")):
    for class_label in ("Acceptable", "Divergence"):
        summary[f"{class_label} ({method_name})"] = (df[label_column] == class_label).sum()
summary["GMM Threshold Used"] = round(thresh_gmm, 3)
summary["Static Threshold Used"] = round(thresh_static, 3)

summary_table = pd.DataFrame([summary])
summary_table.to_csv(OUTPUT_SUMMARY, index=False)

# === Plot Lexical Divergence Distribution ===
# Histogram (with KDE) of the scores, with one dashed vertical line per
# threshold; drawn on an explicit figure/axes pair and saved to OUTPUT_HIST.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Lexical_Divergence'], bins=20, kde=True, color='skyblue', ax=ax)
threshold_lines = (
    (thresh_gmm, 'red', f'GMM Threshold = {thresh_gmm:.2f}'),
    (thresh_static, 'green', f'Static Threshold = {thresh_static:.2f}'),
)
for position, line_color, line_label in threshold_lines:
    ax.axvline(position, color=line_color, linestyle='--', label=line_label)
ax.set_title("Lexical Divergence Score Distribution")
ax.set_xlabel("Lexical Divergence")
ax.set_ylabel("Frequency")
ax.legend()
fig.savefig(OUTPUT_HIST)
plt.close(fig)

# === Plot Classification Count ===
# Count both labels for each thresholding method. A label that never occurs
# is counted as 0, so selecting the two columns cannot raise a KeyError.
class_labels = ['Acceptable', 'Divergence']
label_counts = {
    'GMM': df['Classification_GMM'].value_counts(),
    'Static': df['Classification_Static'].value_counts(),
}
# Rows are the methods, columns are the labels.
class_data = pd.DataFrame({
    label: {method: int(counts.get(label, 0)) for method, counts in label_counts.items()}
    for label in class_labels
})

# Draw on an explicit axes. DataFrame.plot opens a figure of its own when no
# axes is passed, which previously left an extra empty figure open.
fig, ax = plt.subplots(figsize=(8, 5))
class_data[class_labels].plot(kind='bar', color=['skyblue', 'salmon'], ax=ax)
ax.set_title("Acceptable vs Divergence (GMM vs Static)")
ax.set_ylabel("Count")
plt.xticks(rotation=0)
fig.tight_layout()
fig.savefig(OUTPUT_CLASS_PLOT)
plt.close(fig)

# === Plot Explanation Flags ===
# Bars are ordered from most to least frequent flag.
flag_order = df['Explanation_Flag'].value_counts().index

fig, ax = plt.subplots(figsize=(10, 6))
# Passing `palette` without `hue` is deprecated in seaborn (see the warning
# this cell used to emit). Map the x variable to hue and hide the redundant
# legend; hue_order mirrors `order` so each bar keeps its colour.
sns.countplot(
    data=df,
    x='Explanation_Flag',
    hue='Explanation_Flag',
    order=flag_order,
    hue_order=flag_order,
    palette='muted',
    legend=False,
    ax=ax,
)
ax.set_title("Top Lexical Divergence Explanation Flags")
ax.set_xlabel("Flag")
ax.set_ylabel("Frequency")
fig.tight_layout()
fig.savefig(OUTPUT_FLAG_PLOT)
plt.close(fig)

# Completion banner followed by the locations of the main artefacts,
# one per line.
print(
    "✅ Final Lexical Divergence Evaluation Completed",
    f"→ Score Distribution: {OUTPUT_HIST}",
    f"→ Summary Saved: {OUTPUT_SUMMARY}",
    f"→ Output File: {OUTPUT_MASTER}",
    sep="\n",
)


✅ Final Lexical Divergence Evaluation Completed
→ Score Distribution: /content/drive/MyDrive/Summer/CASS/output_lexical_gmm/lexical_divergence_distribution.png
→ Summary Saved: /content/drive/MyDrive/Summer/CASS/output_lexical_gmm/lexical_divergence_summary.csv
→ Output File: /content/drive/MyDrive/Summer/CASS/output_lexical_gmm/lexical_divergence_results.csv



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(data=df, x='Explanation_Flag', order=df['Explanation_Flag'].value_counts().index, palette='muted')


<Figure size 800x600 with 0 Axes>

In [None]:
# === Lexical Divergence Visualization and Summary Statistics ===

import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# === Paths ===
# Here DATA_PATH is the results folder itself; the evaluation cell above uses
# the same name for its parent folder.
DATA_PATH = "/content/drive/MyDrive/Summer/CASS/output_lexical_gmm"
INPUT_CSV, SUMMARY_CSV = (
    os.path.join(DATA_PATH, file_name)
    for file_name in ("lexical_divergence_results.csv", "lexical_divergence_summary.csv")
)

# === Load Data ===
# Per-sentence results plus the one-row summary holding both thresholds.
cand_df = pd.read_csv(INPUT_CSV)
summary_df = pd.read_csv(SUMMARY_CSV)
gmm_threshold, static_threshold = (
    summary_df[threshold_column].iloc[0]
    for threshold_column in ('GMM Threshold Used', 'Static Threshold Used')
)



# === Plot 1: Distribution of Lexical Divergence ===
# Histogram (with KDE) of the saved scores, annotated with the GMM and static
# thresholds read from the summary file.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(cand_df['Lexical_Divergence'], bins=20, kde=True, color='skyblue', ax=ax)

# One dashed vertical line per threshold.
threshold_markers = (
    (gmm_threshold, 'red', f'GMM Threshold = {gmm_threshold:.2f}'),
    (static_threshold, 'green', f'Static Threshold = {static_threshold:.2f}'),
)
for position, line_color, line_label in threshold_markers:
    ax.axvline(position, color=line_color, linestyle='--', label=line_label)

ax.set_title("Lexical Divergence Score Distribution with GMM and Static Thresholds")
ax.set_xlabel("Lexical Divergence")
ax.set_ylabel("Frequency")
ax.legend()
fig.tight_layout()
# Separate file name so the single-cell histogram is not overwritten.
fig.savefig(os.path.join(DATA_PATH, "lexical_divergence_distribution_dual_threshold.png"))
plt.close(fig)



# === Plot 2: Classification Comparison (GMM vs Static) ===
# Divergence is a "lower is better" score: a pair is Acceptable when its score
# is at or below the threshold, the same rule that produced
# Classification_GMM. The static labels are re-derived with the static
# threshold recorded in the summary file, so this chart agrees with the saved
# results. (The earlier `x >= 0.62` rule labelled the most divergent pairs
# Acceptable and used a different cut-off.)
cand_df['Classification_Static'] = cand_df['Lexical_Divergence'].apply(
    lambda score: 'Acceptable' if score <= static_threshold else 'Divergence')
gmm_counts = cand_df['Classification_GMM'].value_counts()
static_counts = cand_df['Classification_Static'].value_counts()

# Rows are the methods, columns are the labels. A label that never occurs is
# counted as 0 so the column selection below cannot raise a KeyError.
class_labels = ['Acceptable', 'Divergence']
class_df = pd.DataFrame({
    label: {
        'GMM': int(gmm_counts.get(label, 0)),
        'Static': int(static_counts.get(label, 0)),
    }
    for label in class_labels
})

fig, ax = plt.subplots()
class_df[class_labels].plot(kind='bar', color=['green', 'red'], ax=ax)
ax.set_title("Acceptable vs Divergence (GMM vs Static)")
ax.set_ylabel("Count")
plt.xticks(rotation=0)
fig.tight_layout()
fig.savefig(os.path.join(DATA_PATH, "classification_comparison.png"))
plt.close(fig)

# === Plot 3: Boxplot of Lexical Score by GMM Classification ===
fig, ax = plt.subplots(figsize=(10, 6))
# Passing `palette` without `hue` is deprecated in seaborn (see the warning
# this cell used to emit). Map the x variable to hue and hide the redundant
# legend to keep one colour per class.
sns.boxplot(
    data=cand_df,
    x='Classification_GMM',
    y='Lexical_Divergence',
    hue='Classification_GMM',
    palette='Set2',
    legend=False,
    ax=ax,
)
ax.set_title("Lexical Divergence Distribution by Classification (GMM)")
ax.set_ylabel("Lexical Divergence")
fig.tight_layout()
fig.savefig(os.path.join(DATA_PATH, "boxplot_gmm_classification.png"))
plt.close(fig)

# === Plot 4: Most Frequent Explanation Flags ===
# Uses the 'Explanation_Flag' column written by the evaluation cell; the ten
# most frequent values are saved as a CSV and drawn as a bar chart.
flag_frequencies = cand_df['Explanation_Flag'].value_counts()
top_flags = flag_frequencies.nlargest(10)
top_flags.to_csv(os.path.join(DATA_PATH, "top_explanation_flags.csv"))

fig, ax = plt.subplots(figsize=(10, 5))
top_flags.plot(kind='bar', color='purple', ax=ax)
ax.set_title("Top Lexical Divergence Explanation Flags")
ax.set_ylabel("Frequency")
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
fig.savefig(os.path.join(DATA_PATH, "top_explanation_flags.png"))
plt.close(fig)

# Report where every artefact of this cell was written.
print("✅ Lexical Divergence Visualizations & Stats Saved to:", DATA_PATH)


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='Classification_GMM', y='Lexical_Divergence', data=cand_df, palette='Set2')


✅ Lexical Divergence Visualizations & Stats Saved to: /content/drive/MyDrive/Summer/CASS/output_lexical_gmm
