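Jaccard distance measures how dissimilar two sentences are in terms of the words they contain, ignoring word order and repetition. It is computed as 1 − |A ∩ B| / |A ∪ B|, where A and B are the sets of words in each sentence, so it ranges from 0 (both sentences use exactly the same words) to 1 (no words in common). We use it to evaluate how well the Lexical Divergence score reflects actual word overlap.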
By comparing the two scores for each sentence pair, we can check how closely they agree. A small difference means both scores detect similar changes. A large difference shows that they respond to different types of variation: Jaccard distance only reacts to changes in word content, never to word order. This helps us assess the reliability of the Lexical Divergence model.
Result: 91.94% accuracy for the Lexical Divergence score.

In [4]:
# b.l. import libraries for data, text processing, and nltk tools
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# b.l. using the same preprocessing tools as Afraa: download the stopwords
# corpus from nltk (prints a status message to the cell output)
nltk.download('stopwords')

# b.l. load the file with aligned sentence pairs that has the results of the lexdiv;
# the code below reads its 'Original_EN', 'MT_EN' and 'Lexical_Divergence' columns
df = pd.read_csv("lexical_divergence_results.csv")

# b.l. get the list of english stopwords from nltk, stored as a set because
# the tokenizer below checks every token for membership in it
nltk_stop_words = set(stopwords.words("english"))

# b.l. define a tokenizer similar to the one used in lexical divergence
def match_original_tokenizer(text, stop_words=None):
    """Lowercase *text*, extract alphabetic tokens, and drop stopwords.

    Args:
        text: Sentence to tokenize. Non-string values (e.g. the float NaN
            that pandas produces for an empty CSV cell, or None) yield no
            tokens instead of raising.
        stop_words: Collection of words to remove. Defaults to the
            module-level ``nltk_stop_words`` set.

    Returns:
        List of lowercase tokens (runs of the letters a-z delimited by word
        boundaries) in their original order, with stopwords removed.
    """
    if not isinstance(text, str):
        # Missing cells would otherwise crash on .lower(); returning no
        # tokens lets the caller treat the sentence as an empty word set.
        return []
    if stop_words is None:
        stop_words = nltk_stop_words
    tokens = re.findall(r'\b[a-z]+\b', text.lower())
    return [w for w in tokens if w not in stop_words]

# b.l. jaccard distance per sentence pair: 1 - |shared| / |combined|, taken
# over the stopword-filtered token sets of the original and the MT sentence
jaccard_distances = []
for original_text, mt_text in zip(df['Original_EN'], df['MT_EN']):
    original_vocab = set(match_original_tokenizer(original_text))
    mt_vocab = set(match_original_tokenizer(mt_text))
    combined = original_vocab | mt_vocab
    if combined:
        shared = original_vocab & mt_vocab
        distance = 1 - len(shared) / len(combined)
    else:
        # b.l. both sentences have no tokens left: score the pair as identical
        distance = 0
    jaccard_distances.append(round(distance, 4))

# b.l. store the score next to the existing lexical divergence results
df['Jaccard_Distance'] = jaccard_distances

# b.l. absolute gap between the two scores (small gap = the scores agree)
df['Divergence_Jaccard_Diff'] = df['Lexical_Divergence'].sub(df['Jaccard_Distance']).abs()

# b.l. write the dataframe, including both new columns, to a new csv file
df.to_csv("lexical_divergence_with_jaccard.csv", index=False)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\brike\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
