In [None]:
# Root directory that every dataset path in this notebook is built from.
datapath = "./"

# Column order of the unified dataset form.
column_order = [
  'text',
  'label',
  'length',
  'source',
  'language',
  'domain',
  'topic',
]

# Two-letter codes of the selected languages.
selected = [
  'ar', 'ca', 'cs', 'de', 'en', 'es',
  'nl', 'pt', 'ru', 'uk', 'zh',
]

In [None]:
# Obfuscator identifiers exactly as they appear in the dataset file names
# (dataset/multitude_obfuscated_<obfuscator>*.csv.gz).
obfuscators = [
  'backtranslated-m2m100-1.2B',
  'backtranslated-nllb-200-distilled-1.3B',
  'pegasus-paraphrase',
  'dipper',
  'paraphrased-ChatGPT',
  'gptzzzs',
  'gptzerobypasser',
  'HomoglyphAttack',
  'alison',
  'dftfooler',
]

In [None]:
# Maps the file-name identifier of each obfuscator to the label used in the
# final results table.
rename_obfuscators = {
  'backtranslated-m2m100-1.2B': 'm2m100-1.2B',
  'backtranslated-nllb-200-distilled-1.3B': 'nllb-200-distilled-1.3B',
  'pegasus-paraphrase': 'Pegasus-paraphrase',
  'dipper': 'DIPPER',
  'paraphrased-ChatGPT': 'ChatGPT',
  'gptzzzs': 'GPTZzzs',
  'gptzerobypasser': 'GPTZeroBypasser',
  'HomoglyphAttack': 'HomoglyphAttack',
  'alison': 'ALISON',
  'dftfooler': 'DFTFooler',
}

In [None]:
import os
import pandas as pd
import numpy as np
from ftlangdetect import detect
from tqdm import tqdm
from collections import Counter
from langcodes import *
from polyglot.text import Text, Word
import regex
import tensorflow_text
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import ngram
import editdistance
from nltk.translate import meteor
from nltk import word_tokenize

# NLTK resources; get_meteor below calls nltk's word_tokenize and meteor.
# presumably 'punkt' backs word_tokenize and 'wordnet' backs meteor -- TODO confirm
import nltk
nltk.download('punkt')
nltk.download('wordnet')

# Alternative BERTScore loader via the `evaluate` package (disabled; the
# `bertscore` object is created in a later cell through `datasets.load_metric`).
#from evaluate import load
#bertscore = load("bertscore")

# Multilingual Universal Sentence Encoder fetched from TF Hub; called as
# `embed(...)` in get_use_cosine_similarity.
import tensorflow_hub as hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

# Show up to 100 rows when displaying frames and register tqdm's
# `progress_apply` helpers on pandas objects.
pd.set_option('display.max_rows', 100)
tqdm.pandas()

In [None]:
# Load the BERTScore metric through the `datasets` metric registry.
# NOTE(review): `list_metrics` / `load_metric` may be unavailable in recent
# `datasets` releases -- verify against the pinned version.
from datasets import list_metrics, load_metric

# Print the registered metric names so that "bertscore" can be checked by eye.
metrics_list = list_metrics()
print(metrics_list)

# Module-level metric object used by get_bertscore.
bertscore = load_metric("bertscore")

In [None]:
def clear_dataset(df):
  """Strip leading/trailing whitespace from every object-dtype column.

  The frame is modified in place and also returned, so the call can be used
  either as a statement or as an expression.
  """
  text_columns = df.select_dtypes(['object']).columns
  df[text_columns] = df[text_columns].apply(lambda column: column.str.strip())
  return df

def remove_bad_chars(text):
  """Delete control (Cc) and surrogate (Cs) characters from `text`.

  These characters cause problems in polyglot, see
  https://github.com/aboSamoor/polyglot/issues/71#issuecomment-707997790
  """
  return regex.sub(r"[\p{Cc}\p{Cs}]+", "", text)

def remove_prompts(row):
  """Remove the generation prompt and the headline from `row.generated`.

  Reads `row.language`, `row.title` and `row.generated`. The following
  fragments are removed in order, stripping surrounding whitespace after each
  step: the alpaca-style prompt, the same prompt with all whitespace removed,
  the plain prompt, the plain prompt with all whitespace removed, the quoted
  headline and finally the bare headline.

  The previous version carried a second clean-up stage (removing '###',
  'Instruction:', 'Task:', ...) placed after an unconditional `return`, so it
  never ran. That unreachable code is dropped here and the effective behaviour
  is unchanged.

  Returns the cleaned text as a string.
  """
  language_name = Language.make(language=row.language).display_name()
  headline = row.title
  instruction = (
    'You are a multilingual journalist.\n\n'
    f'Task: Write a news article in {language_name} for the following headline: "{headline}". '
    'Leave out the instructions, return just the text of the article.'
  )
  prompt = instruction + '\n\nOutput:'
  #alpaca prompt
  prompt2 = '<unk>### Instruction:\n' + instruction + '\n\n\n\n### Response:'

  fragments = (
    prompt2,
    ''.join(prompt2.split()),  # prompt echoed back without any whitespace
    prompt,
    ''.join(prompt.split()),
    f'"{headline}"',
    headline,
  )
  text = str(row.generated).strip()
  for fragment in fragments:
    text = text.replace(fragment, '').strip()
  return text

def remove_unended_sentence(row):
  """Drop an unfinished final sentence from `row.generated`.

  The text is split into sentences with polyglot (using `row.language` as a
  hint). When there is more than one sentence and the last word of the final
  sentence is not one of the terminators below, that final sentence is cut
  off the end of the text; otherwise the text is returned unchanged.
  """
  generated = row.generated
  parsed = Text(generated, hint_language_code=row.language)
  if generated == '':
    return generated

  sentences = parsed.sentences
  if len(sentences) <= 1:
    return generated

  last_sentence = sentences[-1]
  sentence_terminators = ('。', '؟', '!', '?', '.')
  if last_sentence.words[-1] in sentence_terminators:
    return generated
  return generated.removesuffix(str(last_sentence))

def fasttext_detect_language(dataset):
  """Detect the language of every generated text with fastText.

  Returns a list with one language code per row of `dataset`. Rows whose
  `generated` value is missing (its string form is "nan") get the row's own
  `language` value instead of a detection result.

  Detection runs on the whole text with newlines replaced by spaces. A
  per-line detection with majority voting was considered but is not used,
  because it is unclear how per-line results should be combined.
  """
  generated_languages = []
  for _, row in tqdm(dataset.iterrows(), total=dataset.shape[0]):
    if str(row.generated) == "nan":
      generated_languages.append(row.language)
      continue
    single_line = row.generated.replace('\n', ' ')
    detection = detect(text=single_line, low_memory=False)
    generated_languages.append(detection['lang'])
  return generated_languages

def shorten_generated(row):
  """Trim trailing sentences of `row.generated` to approach the length of `row.text`.

  Lengths are counted in whitespace-separated tokens, except for Chinese
  (`row.language == 'zh'`), where polyglot's word segmentation is used. While
  the generated text is more than 5 words longer than the human text, its
  last sentence is removed. A single remaining sentence is never removed.

  Changes compared to the previous version:
  * word counts are taken from the normalised string, not from the raw
    `row.generated`, so non-string input no longer raises AttributeError;
  * missing values (None / NaN) are treated as empty text;
  * the loop stops when removing the last sentence does not change the text
    (the sentence string is not a literal suffix), which used to spin forever.

  Returns the (possibly shortened) generated text as a stripped string.
  """
  def _as_text(value):
    # None and float NaN mean "no text"; NaN is the only float where x != x.
    if value is None or (isinstance(value, float) and value != value):
      return ''
    return str(value).strip()

  def _count_words(text):
    # Chinese has no whitespace word boundaries, so use polyglot there.
    if row.language == 'zh':
      return len(Text(text, hint_language_code=row.language).words)
    return len(text.split())

  generated = _as_text(row.generated)
  if generated == '':
    return generated

  human = _as_text(row.text)
  human_length = _count_words(human) if human != '' else 0
  if human_length == 0:
    return generated

  generated_length = _count_words(generated)
  while human_length < (generated_length - 5): #remove last sentence while more than 5 words longer
    sentences = Text(generated, hint_language_code=row.language).sentences
    if len(sentences) < 2: #single sentence will not be removed
      return generated
    shortened = generated.removesuffix(str(sentences[-1])).strip()
    if shortened == generated:
      # The sentence text is not a literal suffix; nothing more can be cut.
      return generated
    generated = shortened
    generated_length = _count_words(generated)
  return generated

def unify_form(dataset, model):
  """Bring a generated-text dataset into the unified column form.

  After whitespace clean-up via `clear_dataset`, the frame gets:
  * `label`  - the `model` value for every row,
  * `text`   - a copy of the `generated` column,
  * `length` - word count of `text` (whitespace tokens; polyglot words for
               non-empty Chinese texts),
  * `source` - the existing source value prefixed with 'MULTITuDE_'.

  Returns the frame handed back by `clear_dataset`, with the columns above
  assigned on it. Helper columns are not dropped and the columns are not
  reordered here.
  """
  def _word_count(text, language):
    # Whitespace splitting is enough unless the text is non-empty Chinese.
    if language != 'zh' or text == '':
      return len(text.split())
    return len(Text(text, hint_language_code=language).words)

  dataset = clear_dataset(dataset)
  dataset['label'] = model
  dataset['text'] = dataset['generated']
  dataset['length'] = [
    _word_count(text, language)
    for text, language in zip(dataset['text'], dataset['language'])
  ]
  dataset['source'] = [f'MULTITuDE_{origin}' for origin in dataset['source']]
  return dataset

def unique_sentences(row):
  """Uniqueness/repetitiveness: share of distinct sentences in `row.text`.

  Sentences come from polyglot with `row.language` as a hint. The result is
  `len(set(sentences)) / len(sentences)`, i.e. a ratio in (0, 1], not a count.
  Returns 0 for an empty text and also when no sentence is found, which
  previously raised ZeroDivisionError.
  """
  if row.text == '':
    return 0
  sentences = Text(row.text, hint_language_code=row.language).sentences
  if len(sentences) == 0:
    return 0
  return len(set(sentences)) / len(sentences)

def unique_words(row):
  """Uniqueness/repetitiveness: share of distinct words in `row.text`.

  Words come from polyglot with `row.language` as a hint. The result is
  `len(set(words)) / len(words)`, i.e. a ratio in (0, 1], not a count.
  Returns 0 for an empty text and also when no word is found, which
  previously raised ZeroDivisionError.
  """
  if row.text == '':
    return 0
  words = Text(row.text, hint_language_code=row.language).words
  if len(words) == 0:
    return 0
  return len(set(words)) / len(words)

def get_ngram(dataset, n=3):
  """Character n-gram similarity between `text` and `generated` for each row.

  Returns a list aligned with the row order of `dataset`. If the frame has a
  `metric` column, non-missing values from it are reused instead of being
  recomputed. A row whose comparison raises gets 0.0.

  Results are stored by row position rather than by index label, so frames
  with a non-default index (e.g. after filtering) work. The missing-value
  check no longer uses the `np.NaN` alias, which newer NumPy removed; the
  string comparison already covers NaN.
  """
  metric = [""] * len(dataset)
  rows = tqdm(dataset.iterrows(), total=dataset.shape[0])
  for position, (_, row) in enumerate(rows):
    if ("metric" in row.index) and (str(row['metric']) != "nan"):
      metric[position] = row['metric']
      continue
    original = row.text
    obfuscated = row.generated
    try:
      metric[position] = round(ngram.NGram.compare(original, obfuscated, N=n), 4)
    except Exception:
      # e.g. a missing (non-string) text; counted as no similarity
      metric[position] = 0.0
  return metric

def custom_tokenizer(text):
  """Tokenize `text` into a plain list of polyglot words.

  nltk's `word_tokenize` would be a drop-in alternative.
  """
  return [word for word in Text(text).words]

def get_tf_cosine_similarity(dataset):
  """Term-frequency cosine similarity between `text` and `generated` per row.

  Both texts are tokenized with `custom_tokenizer`; the union of their tokens
  is the vocabulary of a CountVectorizer, and the cosine similarity of the two
  count vectors (rounded to 4 digits) is the metric.

  Returns a list aligned with the row order of `dataset`. If the frame has a
  `metric` column, non-missing values from it are reused. A row whose
  obfuscated side fails to process gets 0.0.

  Results are stored by row position rather than by index label, so frames
  with a non-default index work. The missing-value check no longer uses the
  `np.NaN` alias, which newer NumPy removed.
  """
  metric = [""] * len(dataset)
  rows = tqdm(dataset.iterrows(), total=dataset.shape[0])
  for position, (_, row) in enumerate(rows):
    if ("metric" in row.index) and (str(row['metric']) != "nan"):
      metric[position] = row['metric']
      continue
    original = row.text
    obfuscated = row.generated
    original_tokens = custom_tokenizer(original)
    try:
      obfuscated_tokens = custom_tokenizer(obfuscated)
      words = set(original_tokens).union(set(obfuscated_tokens))
      # NOTE(review): CountVectorizer lowercases documents by default while
      # this vocabulary keeps the original casing, so capitalised tokens
      # presumably never match. Left unchanged to keep reported numbers
      # stable; confirm and consider lowercase=False.
      vectorizer = CountVectorizer(tokenizer = custom_tokenizer, vocabulary = words)
      original_vector = vectorizer.transform([original])
      obfuscated_vector = vectorizer.transform([obfuscated])
      metric[position] = round(cosine_similarity(original_vector.toarray(), obfuscated_vector.toarray())[0][0], 4)
    except Exception:
      # e.g. a missing (non-string) obfuscated text; counted as no similarity
      metric[position] = 0.0
  return metric

def get_meteor(dataset):
  """METEOR score of `generated` against `text` for each row.

  Both sides are tokenized with nltk's `word_tokenize`; the original text is
  the single reference. Scores are rounded to 4 digits.

  Returns a list aligned with the row order of `dataset`. If the frame has a
  `metric` column, non-missing values from it are reused. Errors from the
  tokenizer or scorer are not caught here.

  Results are stored by row position rather than by index label, so frames
  with a non-default index work. The missing-value check no longer uses the
  `np.NaN` alias, which newer NumPy removed.
  """
  metric = [""] * len(dataset)
  rows = tqdm(dataset.iterrows(), total=dataset.shape[0])
  for position, (_, row) in enumerate(rows):
    if ("metric" in row.index) and (str(row['metric']) != "nan"):
      metric[position] = row['metric']
      continue
    original = row.text
    obfuscated = row.generated
    metric[position] = round(meteor([word_tokenize(original)], word_tokenize(obfuscated)), 4)
  return metric

def get_bertscore(dataset):
  """BERTScore F1 of `generated` against `text` for each row.

  Uses the module-level `bertscore` metric with the
  "bert-base-multilingual-cased" model, one prediction/reference pair per
  call; the stored value is the mean of the returned `f1` list.

  Returns a list aligned with the row order of `dataset`. If the frame has a
  `metric` column, non-missing values from it are reused. Errors from the
  metric are not caught here.

  Results are stored by row position rather than by index label, so frames
  with a non-default index work. The missing-value check no longer uses the
  `np.NaN` alias, which newer NumPy removed.
  """
  metric = [""] * len(dataset)
  rows = tqdm(dataset.iterrows(), total=dataset.shape[0])
  for position, (_, row) in enumerate(rows):
    if ("metric" in row.index) and (str(row['metric']) != "nan"):
      metric[position] = row['metric']
      continue
    original = row.text
    obfuscated = row.generated
    results = bertscore.compute(predictions=[obfuscated], references=[original], model_type="bert-base-multilingual-cased")
    metric[position] = sum(results['f1']) / len(results['f1'])
  return metric

def get_use_cosine_similarity(dataset):
  """Embedding cosine similarity between `text` and `generated` for each row.

  Both texts are embedded with the module-level `embed` model and compared
  with scikit-learn's `cosine_similarity`; the stored value is the mean of
  the resulting similarity matrix.

  Returns a list aligned with the row order of `dataset`. If the frame has a
  `metric` column, non-missing values from it are reused. Errors from the
  embedding model are not caught here.

  Results are stored by row position rather than by index label, so frames
  with a non-default index work. The missing-value check no longer uses the
  `np.NaN` alias, which newer NumPy removed.
  """
  metric = [""] * len(dataset)
  rows = tqdm(dataset.iterrows(), total=dataset.shape[0])
  for position, (_, row) in enumerate(rows):
    if ("metric" in row.index) and (str(row['metric']) != "nan"):
      metric[position] = row['metric']
      continue
    original = row.text
    obfuscated = row.generated
    # NOTE(review): each text is passed to `embed` as a bare string, not a
    # list -- presumably yields a single-row embedding; confirm.
    results = cosine_similarity(embed(original), embed(obfuscated), dense_output=False)
    metric[position] = results.mean()
  return metric

def get_editdistance(dataset):
  """Edit distance between `text` and `generated` for each row.

  Uses `editdistance.eval` on the two raw values; the distance is not
  normalised here.

  Returns a list aligned with the row order of `dataset`. If the frame has a
  `metric` column, non-missing values from it are reused. A row whose
  comparison raises gets 0.0.
  NOTE(review): 0.0 as the failure value is indistinguishable from
  "identical texts"; kept for backward compatibility.

  Results are stored by row position rather than by index label, so frames
  with a non-default index work. The missing-value check no longer uses the
  `np.NaN` alias, which newer NumPy removed.
  """
  metric = [""] * len(dataset)
  rows = tqdm(dataset.iterrows(), total=dataset.shape[0])
  for position, (_, row) in enumerate(rows):
    if ("metric" in row.index) and (str(row['metric']) != "nan"):
      metric[position] = row['metric']
      continue
    original = row.text
    obfuscated = row.generated
    try:
      metric[position] = editdistance.eval(original, obfuscated)
    except Exception:
      # e.g. a missing (non-string) text
      metric[position] = 0.0
  return metric

#evaluate similarity of generated text to original human text
def get_diff_charlen(dataset):
  df = dataset
  prev = ''
  text_charlength = [len(''.join([y if (y != prev) & ((prev:=y) == y) else '' for y in x]).strip()) for x in df['text']]
  generated_charlength = [len(''.join([y if (y != prev) & ((prev:=y) == y) else '' for y in x]).strip()) for x in df['generated']]
  #result = np.divide(text_charlength, generated_charlength)
  generated_charlength_inv = np.array([1/i if i!=0 else 0 for i in generated_charlength])
  result = text_charlength * generated_charlength_inv
  return result

## Obfuscated Texts

In [None]:
%%time

# Compute METEOR, BERTScore and USE similarity for every obfuscator whose
# quality-metrics file does not exist yet.
for obfuscator in obfuscators:
  source_path = datapath + f'dataset/multitude_obfuscated_{obfuscator}.csv.gz'
  target_path = datapath + f'dataset/multitude_obfuscated_{obfuscator}_quality-metrics.csv.gz'
  if os.path.isfile(target_path):
    continue  # already computed in an earlier run

  dataset = pd.read_csv(source_path)

  # Keep rows up to and including the last one that has a generated text.
  last_generated = dataset[dataset.generated.notna()].iloc[-1].name
  dataset = dataset[:last_generated + 1]

  print(f'Processing {obfuscator}')

  dataset['meteor'] = get_meteor(dataset)
  dataset['bertscore'] = get_bertscore(dataset)
  dataset['use'] = get_use_cosine_similarity(dataset)

  dataset.to_csv(target_path, index=False)

In [None]:
%%time
# Language check (fastText), character 3-gram similarity and TF cosine
# similarity for every obfuscator whose quality-metrics2 file does not exist
# yet; `stat` collects one column of summary values per processed obfuscator.
stat = pd.DataFrame()
for obfuscator in obfuscators:
  base_path = datapath + f'dataset/multitude_obfuscated_{obfuscator}'
  if os.path.isfile(base_path + '_quality-metrics2.csv.gz'): continue

  # Prefer the plain CSV and fall back to the gzipped one. Selecting by file
  # existence (not a bare try/except) lets real read errors surface.
  if os.path.isfile(base_path + '.csv'):
    source_path = base_path + '.csv'
  elif os.path.isfile(base_path + '.csv.gz'):
    source_path = base_path + '.csv.gz'
  else:
    continue
  dataset = pd.read_csv(source_path)

  # Keep rows up to and including the last one that has a generated text;
  # copy so that the new columns are not assigned on a slice.
  dataset = dataset[:dataset[~dataset.generated.isna()].iloc[-1].name +1].copy()

  print(f'Processing {obfuscator}')
  dataset['fasttext'] = fasttext_detect_language(dataset)
  dataset['ngram'] = get_ngram(dataset, n=3)
  dataset['tf'] = get_tf_cosine_similarity(dataset)

  dataset.to_csv(base_path + '_quality-metrics2.csv.gz', index=False)

  # rows: share of texts whose detected language changed, mean ngram, mean tf
  stat[obfuscator] = [len(dataset[dataset.language != dataset.fasttext]) / len(dataset), dataset['ngram'].mean(), dataset['tf'].mean()]
stat

In [None]:
# One row per obfuscator, with named summary columns.
# NOTE: re-running this cell transposes `stat` again; run it once after the
# cell that builds `stat`.
stat = stat.T.rename(columns={0: 'Changed Language', 1: 'ngram', 2: 'tf'})
stat

In [None]:
%%time
# Edit distance for every obfuscator whose editdistance file does not exist
# yet; `stat` collects the mean distance per processed obfuscator.
stat = pd.DataFrame()
for obfuscator in obfuscators:
  base_path = datapath + f'dataset/multitude_obfuscated_{obfuscator}'
  if os.path.isfile(base_path + '_editdistance.csv.gz'): continue

  # Prefer the plain CSV and fall back to the gzipped one. Selecting by file
  # existence (not a bare try/except) lets real read errors surface.
  if os.path.isfile(base_path + '.csv'):
    source_path = base_path + '.csv'
  elif os.path.isfile(base_path + '.csv.gz'):
    source_path = base_path + '.csv.gz'
  else:
    continue
  dataset = pd.read_csv(source_path)

  # Keep rows up to and including the last one that has a generated text;
  # copy so that the new column is not assigned on a slice.
  dataset = dataset[:dataset[~dataset.generated.isna()].iloc[-1].name +1].copy()

  print(f'Processing {obfuscator}')
  dataset['editdistance'] = get_editdistance(dataset)

  dataset.to_csv(base_path + '_editdistance.csv.gz', index=False)

  stat[obfuscator] = [dataset['editdistance'].mean()]
stat

In [None]:
# Edit distance normalised by the character length of the original text.
# NOTE(review): `dataset` here is whatever the last executed loop iteration
# left behind; if every editdistance file already existed, the loop above
# skipped all obfuscators and `dataset` may lack the 'editdistance' column.
# An empty `text` divides by zero.
dataset['ED-norm'] = dataset['editdistance'] / [len(x) for x in dataset['text']]

In [None]:
# Summary statistics of the normalised edit distance computed in the previous cell.
dataset['ED-norm'].describe()

In [None]:
%%time
# TF cosine similarity and character-length ratio for every obfuscator whose
# `_tf` file does not exist yet; `stat` collects the two means per processed
# obfuscator.
stat = pd.DataFrame()
for obfuscator in obfuscators:
  base_path = datapath + f'dataset/multitude_obfuscated_{obfuscator}'
  if os.path.isfile(base_path + '_tf.csv.gz'): continue

  # Prefer the plain CSV and fall back to the gzipped one. Selecting by file
  # existence (not a bare try/except) lets real read errors surface.
  if os.path.isfile(base_path + '.csv'):
    source_path = base_path + '.csv'
  elif os.path.isfile(base_path + '.csv.gz'):
    source_path = base_path + '.csv.gz'
  else:
    continue
  dataset = pd.read_csv(source_path)

  # Keep rows up to and including the last one that has a generated text;
  # copy so that the new columns are not assigned on a slice.
  dataset = dataset[:dataset[~dataset.generated.isna()].iloc[-1].name +1].copy()

  print(f'Processing {obfuscator}')
  dataset['tf'] = get_tf_cosine_similarity(dataset)
  dataset['diff_charlen'] = get_diff_charlen(dataset)

  dataset.to_csv(base_path + '_tf.csv.gz', index=False)

  stat[obfuscator] = [dataset['tf'].mean(), dataset['diff_charlen'].mean()]
stat

In [None]:
%%time
# Character-length ratio for every obfuscator whose `_diff` file does not
# exist yet; `stat` collects the mean ratio per processed obfuscator.
stat = pd.DataFrame()
for obfuscator in obfuscators:
  base_path = datapath + f'dataset/multitude_obfuscated_{obfuscator}'
  if os.path.isfile(base_path + '_diff.csv.gz'): continue

  # Prefer the plain CSV and fall back to the gzipped one. Selecting by file
  # existence (not a bare try/except) lets real read errors surface.
  if os.path.isfile(base_path + '.csv'):
    source_path = base_path + '.csv'
  elif os.path.isfile(base_path + '.csv.gz'):
    source_path = base_path + '.csv.gz'
  else:
    continue
  dataset = pd.read_csv(source_path)

  # Keep rows up to and including the last one that has a generated text;
  # copy so that the new column is not assigned on a slice.
  dataset = dataset[:dataset[~dataset.generated.isna()].iloc[-1].name +1].copy()

  print(f'Processing {obfuscator}')
  dataset['diff_charlen'] = get_diff_charlen(dataset)

  dataset.to_csv(base_path + '_diff.csv.gz', index=False)

  stat[obfuscator] = [dataset['diff_charlen'].mean()]
stat

## Analyze the resulting metrics

In [None]:
# Combine the per-obfuscator metric files into
#   * `results` - mean/std of every metric per obfuscator, and
#   * `data`    - a small sample per (multi_label, language) for manual checking.
results = pd.DataFrame()
data = pd.DataFrame()
for obfuscator in tqdm(obfuscators, total=len(obfuscators)):
  # The base quality-metrics file is required; obfuscators without it are skipped.
  if not os.path.isfile(datapath + f'dataset/multitude_obfuscated_{obfuscator}_quality-metrics.csv.gz'): continue
  dataset = pd.read_csv(datapath + f'dataset/multitude_obfuscated_{obfuscator}_quality-metrics.csv.gz')
  # The remaining metric files are optional and are left-joined on 'text'.
  # NOTE(review): joining on 'text' assumes texts are unique within a file;
  # duplicated texts would multiply rows -- TODO confirm.
  if os.path.isfile(datapath + f'dataset/multitude_obfuscated_{obfuscator}_quality-metrics2.csv.gz'):
    dataset2 = pd.read_csv(datapath + f'dataset/multitude_obfuscated_{obfuscator}_quality-metrics2.csv.gz')
    dataset = pd.merge(dataset, dataset2[['text', 'fasttext', 'ngram', 'tf']], how='left', on=['text'])
  if os.path.isfile(datapath + f'dataset/multitude_obfuscated_{obfuscator}_editdistance.csv.gz'):
    dataset2 = pd.read_csv(datapath + f'dataset/multitude_obfuscated_{obfuscator}_editdistance.csv.gz')
    dataset = pd.merge(dataset, dataset2[['text', 'editdistance']], how='left', on=['text'])
  if os.path.isfile(datapath + f'dataset/multitude_obfuscated_{obfuscator}_tf.csv.gz'):
    dataset2 = pd.read_csv(datapath + f'dataset/multitude_obfuscated_{obfuscator}_tf.csv.gz')
    dataset = pd.merge(dataset, dataset2[['text', 'tf']], how='left', on=['text'])
  if os.path.isfile(datapath + f'dataset/multitude_obfuscated_{obfuscator}_diff.csv.gz'):
    dataset2 = pd.read_csv(datapath + f'dataset/multitude_obfuscated_{obfuscator}_diff.csv.gz')
    dataset = pd.merge(dataset, dataset2[['text', 'diff_charlen']], how='left', on=['text'])
  print(f'Processing {obfuscator}')
  dataset['obfuscator'] = obfuscator
  # The lines below need 'editdistance', 'diff_charlen', 'fasttext', 'ngram'
  # and 'tf', so in practice the optional files above must exist as well.
  dataset['ED-norm'] = dataset['editdistance'] / [len(x) for x in dataset['text']]
  # Replace each stored ratio by its reciprocal (0 stays 0).
  dataset['diff_charlen'] = np.array([1/i if i!=0 else 0 for i in dataset['diff_charlen']])
  # When 'tf' was merged twice, pandas presumably names the copies 'tf_x' and
  # 'tf_y'; the later one is used as 'tf' -- TODO confirm.
  dataset.rename(columns={'tf_y': 'tf'}, inplace=True)
  # Sample up to 10 rows per (multi_label, language) group with a fixed seed.
  temp = dataset.reset_index().groupby(['multi_label', 'language']).apply(lambda x: x.sample(min(10, len(x)), random_state = 0)).reset_index(drop=True)
  temp['multi_label'] = temp['multi_label'] + '_' + obfuscator
  data = pd.concat([data, temp], ignore_index=True)
  # 'obfuscator' is constant within the frame, so this yields a single row
  # with (metric, mean/std) columns.
  temp = dataset.groupby(['obfuscator'])[['meteor', 'bertscore', 'use', 'ngram', 'tf', 'ED-norm', 'diff_charlen']].agg(['mean', 'std'])
  # Share of rows whose fastText-detected language differs from the declared one.
  temp['changed_language'] = len(dataset[dataset.language != dataset.fasttext]) / len(dataset)
  results = pd.concat([results, temp])
# Show the table with the per-column maximum of every mean (and of changed_language) in bold.
display(results.style.format(na_rep=0, precision=4).highlight_max(props='font-weight: bold;', axis=0, subset=[(x, 'mean') if x != 'changed_language' else (x,'') for x in results.columns.get_level_values(0).unique()]))

In [None]:
# Write `data` (the per-(multi_label, language) sample assembled in the
# analysis cell above) to CSV for a manual human check.
data.to_csv(datapath + f'dataset/multitude_obfuscated_human_check.csv', index=False)

In [None]:
# Build the presentation table: one "mean (±std)" string per metric and obfuscator.
temp = results.copy()
temp2 = results.copy()
# All top-level column names except the last one (`changed_language`, which
# has no mean/std pair) are formatted as "mean (±std)".
for col in temp.columns.get_level_values(0).unique()[:-1]:
  temp2[col] = [f"{str('%.3f' % x)} (±{str('%.2f' % y)})" for x,y in zip(temp[(col, 'mean')], temp[(col, 'std')])]
# Drop the mean/std level; presumably both sub-columns of a metric now hold the
# same string, so removing duplicated columns leaves one per metric -- TODO confirm.
temp2.columns = temp2.columns.droplevel(level=1)
temp2 = temp2.T.drop_duplicates().T
temp2.rename(index=rename_obfuscators, inplace=True)
# Column labels in the order of the aggregated metrics, followed by the language check.
temp2.columns = ['METEOR', 'BERTScore', 'USE', 'ngram', 'TF', 'LD', 'CharLenDiff', 'LangCheck']
# Bold the per-column maximum and render LangCheck as a percentage.
temp2.style.highlight_max(props='font-weight: bold;', axis=0).format({'LangCheck': '{:,.2%}'.format}, na_rep=0, precision=4)

Unnamed: 0_level_0,METEOR,BERTScore,USE,ngram,TF,LD,CharLenDiff,LangCheck
obfuscator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
m2m100-1.2B,0.452 (±0.22),0.853 (±0.07),0.842 (±0.13),0.485 (±0.18),0.810 (±0.16),0.467 (±0.21),0.678 (±0.24),0.55%
nllb-200-distilled-1.3B,0.398 (±0.23),0.833 (±0.08),0.797 (±0.17),0.431 (±0.20),0.775 (±0.18),0.542 (±0.33),0.638 (±0.39),0.30%
Pegasus-paraphrase,0.331 (±0.24),0.708 (±0.15),0.575 (±0.34),0.324 (±0.23),0.646 (±0.28),0.698 (±0.40),0.556 (±0.49),28.17%
DIPPER,0.276 (±0.23),0.760 (±0.10),0.683 (±0.26),0.282 (±0.23),0.528 (±0.34),0.704 (±0.28),0.756 (±0.32),51.79%
ChatGPT,0.566 (±0.22),0.867 (±0.07),0.884 (±0.11),0.546 (±0.18),0.819 (±0.16),0.418 (±0.22),0.920 (±0.24),1.38%
GPTZzzs,0.968 (±0.06),0.974 (±0.03),0.988 (±0.02),0.918 (±0.09),0.986 (±0.02),0.046 (±0.05),1.017 (±0.02),2.78%
GPTZeroBypasser,0.131 (±0.10),0.651 (±0.21),0.375 (±0.18),0.168 (±0.14),0.130 (±0.17),0.495 (±0.17),1.238 (±0.03),37.33%
HomoglyphAttack,0.568 (±0.10),0.778 (±0.05),0.762 (±0.11),0.596 (±0.06),0.179 (±0.16),0.094 (±0.02),1.003 (±0.00),2.74%
ALISON,0.987 (±0.06),0.991 (±0.02),0.993 (±0.01),0.971 (±0.04),0.968 (±0.07),0.009 (±0.01),1.005 (±0.01),2.77%
DFTFooler,0.948 (±0.07),0.977 (±0.02),0.990 (±0.02),0.920 (±0.08),0.963 (±0.06),0.033 (±0.04),1.004 (±0.01),2.78%


In [None]:
# Export the results table as LaTeX: column maxima, row labels and column
# labels in bold, LangCheck as a percentage.
styled = (
  temp2.style
  .highlight_max(props='font-weight: bold;', axis=0)
  .format({'LangCheck': '{:,.2%}'.format}, na_rep=0, precision=4)
  .applymap_index(lambda v: "font-weight: bold;", axis=0)
  .applymap_index(lambda v: "font-weight: bold;", axis=1)
)
# '%' starts a comment in LaTeX, so it is escaped. '\\%' is the same runtime
# string as the former '\%', without the invalid-escape-sequence warning.
print(styled.to_latex(convert_css=True).replace('%', '\\%'))

\begin{tabular}{lllllllll}
 & \bfseries METEOR & \bfseries BERTScore & \bfseries USE & \bfseries ngram & \bfseries TF & \bfseries LD & \bfseries CharLenDiff & \bfseries LangCheck \\
obfuscator &  &  &  &  &  &  &  &  \\
\bfseries m2m100-1.2B & 0.452 (±0.22) & 0.853 (±0.07) & 0.842 (±0.13) & 0.485 (±0.18) & 0.810 (±0.16) & 0.467 (±0.21) & 0.678 (±0.24) & 0.55\% \\
\bfseries nllb-200-distilled-1.3B & 0.398 (±0.23) & 0.833 (±0.08) & 0.797 (±0.17) & 0.431 (±0.20) & 0.775 (±0.18) & 0.542 (±0.33) & 0.638 (±0.39) & 0.30\% \\
\bfseries Pegasus-paraphrase & 0.331 (±0.24) & 0.708 (±0.15) & 0.575 (±0.34) & 0.324 (±0.23) & 0.646 (±0.28) & 0.698 (±0.40) & 0.556 (±0.49) & 28.17\% \\
\bfseries DIPPER & 0.276 (±0.23) & 0.760 (±0.10) & 0.683 (±0.26) & 0.282 (±0.23) & 0.528 (±0.34) & \bfseries 0.704 (±0.28) & 0.756 (±0.32) & \bfseries 51.79\% \\
\bfseries ChatGPT & 0.566 (±0.22) & 0.867 (±0.07) & 0.884 (±0.11) & 0.546 (±0.18) & 0.819 (±0.16) & 0.418 (±0.22) & 0.920 (±0.24) & 1.38\% \\
\bfseries GPTZzzs 