In [1]:
DATAPATH = "./results/" #directory with the detectors results
optimize_threshold = False #whether to find optimal classification threshold or to use a default 0.5

# Installs and imports

In [2]:
#mount GDrive if DATAPATH is on it
from google.colab import drive
drive.mount('/content/drive')

In [3]:
import pandas as pd
import numpy as np
import glob
import shutil
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, f1_score, accuracy_score
import pickle
import os
from tqdm import tqdm
import random

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

# Data loading

In [4]:
#casting dtypes to reduce RAM usage
CAST_DICT = {'text': 'string', 'language': 'string', 'label': 'string', 'length': 'int16', 'source': 'string', 'domain': 'string', 'topic': 'string', 'split': 'string', 'multi_label': 'string', 'predictions': 'string', 'prediction_probs': np.float16}
#modification of labels from textual to numeric form (and backwards) for evaluation purpose
label_names = ["human", "machine"] #0, 1
id2label = {idx:label for idx, label in enumerate(label_names)}
label2id = {v:k for k,v in id2label.items()}

test_results = []

In [5]:
#Load results from finetuned detectors
files = glob.glob(DATAPATH + '*.csv.gz')
for f in tqdm(files, total= len(files)):
  df = pd.read_csv(f, dtype = CAST_DICT)
  df.drop(columns=['text', 'domain', 'topic', 'split'], inplace=True)
  test_results.append({f.split('/')[-1] : df})

In [6]:
multitude_test = pd.read_csv(DATAPATH + 'multitude.csv', dtype = CAST_DICT)
multitude_test = multitude_test[multitude_test.split == 'test']
multitude_test.drop(columns=['text', 'domain', 'topic', 'split'], inplace=True)

#Add results from statistical detectors
files = glob.glob(DATAPATH + "statistical/*.csv")
for f in tqdm(files, total= len(files)):
  df = pd.read_csv(f)
  temp = multitude_test.copy().reset_index()
  temp['predictions'] = [id2label[x] for x in df['Predictions']]
  temp['predictions'] = temp['predictions'].astype('string')
  temp['label'] = [id2label[int(x)] for x in temp['label']]
  temp['label'] = temp['label'].astype('string')
  test_results.append({f.split('/')[-1].replace('predictions_', 'statistical-').replace('.csv', '') : temp})

#Add results from black-box detectors
files = glob.glob(DATAPATH + "blackbox/*.csv")
for f in tqdm(files, total= len(files)):
  df = pd.read_csv(f)
  temp = multitude_test.copy().reset_index()
  temp['predictions'] = [id2label[x] for x in df['predictions']]
  temp['predictions'] = temp['predictions'].astype('string')
  temp['label'] = [id2label[int(x)] for x in temp['label']]
  temp['label'] = temp['label'].astype('string')
  test_results.append({f.split('/')[-1].replace('predictions_', 'blackbox-').replace('.csv', '') : temp})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  multitude_test.drop(columns=['text', 'domain', 'topic', 'split'], inplace=True)
100%|██████████| 7/7 [00:00<00:00, 20.25it/s]
100%|██████████| 2/2 [00:00<00:00, 21.12it/s]


In [7]:
#limit detectors to those in EMNLP paper
EMNLP_detectors = ['bert-base-multilingual-cased-finetuned',
                   'roberta-large-openai-detector-finetuned',
                   'mdeberta-v3-base-finetuned',
                   'xlm-roberta-large-finetuned',
                   'electra-large-discriminator-finetuned',
                   'gpt2-medium-finetuned',
                   'mGPT-finetuned',
                   'statistical-entropy',
                   'statistical-rank',
                   'statistical-log_rank',
                   'statistical-rank_GLTR',
                   'statistical-likelihood',
                   'statistical-detectgpt',
                   'statistical-entropy_RF-tuned',
                   'blackbox-GPTZero',
                   'blackbox-ZeroGPT']
def is_EMNLP_detector(detector):
  for k,v in detector.items():
    #print(k);
    temp = k.split('-finetuned-')[0]
    if len(k.split('-finetuned-')) > 1:
      temp += '-finetuned'
    return temp in EMNLP_detectors

test_results = [x for x in test_results if is_EMNLP_detector(x)]

#Results analysis

In [8]:
def cr2df(labels, predictions, detector):
  cr = classification_report(labels, predictions, digits=4, output_dict=True, zero_division=0)
  cm = confusion_matrix(labels, predictions)

  #based on https://stackoverflow.com/questions/31324218/scikit-learn-how-to-obtain-true-positive-true-negative-false-positive-and-fal
  TN = cm[0][0]
  FN = cm[1][0]
  TP = cm[1][1]
  FP = cm[0][1]
  # Fall out or false positive rate
  FPR = FP/(FP+TN) if (FP+TN) > 0 else 0
  # False negative rate
  FNR = FN/(TP+FN) if (TP+FN) > 0 else 0

  train_language = 'N/A'
  train_llm = 'N/A'
  if 'finetuned' in detector:
    train_language = detector.split('-finetuned-')[1].split('-')[0]
    train_llm = detector.split(f'-{train_language}-')[1].replace('.csv.gz', '')
  return pd.DataFrame({'Model': detector.split('-finetuned-')[0], 'Train Language': train_language, 'Train LLM': train_llm, 'Detector': detector, 'Macro avg F1-score': cr['macro avg']['f1-score'], 'Weighted avg F1-score': cr['weighted avg']['f1-score'], 'Weighted avg Precision': cr['weighted avg']['precision'], 'Weighted avg Recall': cr['weighted avg']['recall'], 'Accuracy': cr['accuracy'], 'FPR': FPR, 'FNR': FNR, 'Human samples': cr['human']['support'], 'Machine samples': cr['machine']['support']}, index=[0])

In [83]:
def analyze(results_list):
  results = pd.DataFrame()
  for detector in tqdm(results_list, total=len(results_list)):
    for detector_name, detector_data in detector.items():
        temp = detector_data
        if len(temp.label.unique()) < 2: continue
        optimal_threshold = 0.5
        if optimize_threshold and 'prediction_probs' in temp.columns:
          labels = [label2id[x] for x in temp['label']]
          predictions = [label2id[x] for x in temp['predictions']]
          temp = temp.fillna(0.0)
          temp['prediction_probs'] = temp['prediction_probs'].astype(float)
          temp.loc[temp.predictions == 'human', 'prediction_probs'] = 1 - temp['prediction_probs']
          if (optimize_threshold):
            fpr, tpr, thresholds = roc_curve(labels, temp['prediction_probs'])
            optimal_threshold = thresholds[np.argmax(tpr - fpr)]
            #optimal_threshold = thresholds[fpr <= 0.05][-1] #get threshold for 5% FPR
          preds = ["machine" if ((y > optimal_threshold)) else "human" for x,y in zip(temp['predictions'],temp['prediction_probs'])]
        else:
          preds = temp['predictions']
          if 'prediction_probs' in temp.columns:
            temp = temp.fillna(0.0)
            temp['prediction_probs'] = temp['prediction_probs'].astype(float)
            temp.loc[temp.predictions == 'human', 'prediction_probs'] = 1 - temp['prediction_probs']
        scores = cr2df(temp['label'], preds, detector_name)
        try:
          scores['AUC_preds'] = roc_auc_score([label2id[x] for x in temp['label']], [label2id[x] for x in preds])
          scores['AUC_probs'] = roc_auc_score([label2id[x] for x in temp['label']], temp['prediction_probs'])
          scores['threshold'] = optimal_threshold
        except:
          pass
        results = pd.concat([results, scores], copy=False, ignore_index=True)
  temp = results.sort_values(by=['Macro avg F1-score'], ascending=False).reset_index(drop=True)
  return temp


## General Test

In [84]:
all = analyze(test_results)
all

100%|██████████| 324/324 [09:00<00:00,  1.67s/it]


Unnamed: 0,Model,Train Language,Train LLM,Detector,Macro avg F1-score,Weighted avg F1-score,Weighted avg Precision,Weighted avg Recall,Accuracy,FPR,FNR,Human samples,Machine samples,AUC_preds,AUC_probs,threshold
0,mdeberta-v3-base,all,all,mdeberta-v3-base-finetuned-all-all.csv.gz,0.848011,0.939955,0.940286,0.939648,0.939648,0.261434,0.035381,3236,26059,0.851592,0.960706,0.5
1,xlm-roberta-large,all,all,xlm-roberta-large-finetuned-all-all.csv.gz,0.824012,0.935233,0.935675,0.939819,0.939819,0.4178,0.015772,3236,26059,0.783214,0.965781,0.5
2,xlm-roberta-large,es,all,xlm-roberta-large-finetuned-es-all.csv.gz,0.811016,0.925465,0.925731,0.925209,0.925209,0.330655,0.043018,3236,26059,0.813164,0.92226,0.5
3,xlm-roberta-large,ru,all,xlm-roberta-large-finetuned-ru-all.csv.gz,0.798669,0.923438,0.921867,0.926062,0.926062,0.408838,0.03235,3236,26059,0.779406,0.944882,0.5
4,mdeberta-v3-base,es,all,mdeberta-v3-base-finetuned-es-all.csv.gz,0.796051,0.922763,0.921114,0.925755,0.925755,0.419345,0.03139,3236,26059,0.774632,0.930882,0.5
5,bert-base-multilingual-cased,all,all,bert-base-multilingual-cased-finetuned-all-all.csv.gz,0.756344,0.907265,0.905064,0.91036,0.91036,0.478059,0.041406,3236,26059,0.740267,0.918754,0.5
6,mdeberta-v3-base,ru,all,mdeberta-v3-base-finetuned-ru-all.csv.gz,0.746595,0.910254,0.912071,0.920737,0.920737,0.584672,0.016501,3236,26059,0.699413,0.926895,0.5
7,bert-base-multilingual-cased,es,all,bert-base-multilingual-cased-finetuned-es-all.csv.gz,0.740026,0.894183,0.898589,0.89063,0.89063,0.413782,0.071568,3236,26059,0.757325,0.881894,0.5
8,roberta-large-openai-detector,all,all,roberta-large-openai-detector-finetuned-all-all.csv.gz,0.736044,0.893278,0.896759,0.890357,0.890357,0.430779,0.069765,3236,26059,0.749728,0.86447,0.5
9,mGPT,ru,all,mGPT-finetuned-ru-all.csv.gz,0.721899,0.897587,0.894128,0.904762,0.904762,0.575093,0.03565,3236,26059,0.694629,0.878017,0.5


In [85]:
#only best of each base model
temp = all.drop_duplicates(subset=['Model'], keep='first')
temp.loc[(~temp['Model'].str.contains('blackbox-') & ~temp['Model'].str.contains('statistical-')), 'Model'] = 'finetuned-' + temp['Model']
#multilingual = ['mdeberta-v3-base', 'xlm-roberta-large', 'mGPT', 'bert-base-multilingual-cased']
#temp['Model'] = [f'{x}*' if x in multilingual else x for x in temp['Model']]
category = ['S' if 'statistical-' in x else 'B' if 'blackbox-' in x else 'F' for x in temp['Model']]
temp.insert(1, 'Category', category)
temp['Model'] = temp['Model'].str.replace('statistical-', '', regex=False).str.replace('blackbox-', '', regex=False).str.replace('finetuned-', '', regex=False)
names_dic = { 'gptzero': 'GPTZero',
              'ZeroGPT': 'ZeroGPT*',
              'sapling': 'Sapling',
              'gptkit': 'GPTKit',
              'likelihood': 'Log-likelihood*',
              'rank': 'Rank*',
              'log_rank': 'Log-Rank*',
              'entropy': 'Entropy*',
              'entropy_RF-tuned': 'Entropy + RandomForest*',
              'rank_GLTR': 'GLTR Test-2 (Rank)*',
              'detectgpt': 'DetectGPT*',
              'roberta-large-openai-detector': 'RoBERTa-large-OpenAI-detector',
              'gpt2-medium': 'GPT-2 Medium',
              'xlm-roberta-large' : 'XLM-RoBERTa-large*',
              'bert-base-multilingual-cased': 'BERT-base-multilingual-cased*',
              'mdeberta-v3-base': 'MDeBERTa-v3-base*',
              'electra-large-discriminator': 'ELECTRA-large',
              'mGPT': 'mGPT*'
}
temp['Model'] = temp['Model'].replace(dict(names_dic), regex=False)
temp = temp.rename(columns={'Model': 'Detector Model'})
temp = temp.set_index('Detector Model')
temp.drop(columns=['Detector', 'Human samples', 'Machine samples'], inplace=True)
if 'AUC_preds' in temp.columns:
  temp.drop(columns=['threshold', 'AUC_preds'], inplace=True)
print(temp.style.format(na_rep="N/A", precision=4).applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True))
temp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['Model'] = temp['Model'].str.replace('statistical-', '', regex=False).str.replace('blackbox-', '', regex=False).str.replace('finetuned-', '', regex=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['Model'] = temp['Model'].replace(dict(names_dic), regex=False)


\begin{tabular}{llllrrrrrrrr}
 & \bfseries Category & \bfseries Train Language & \bfseries Train LLM & \bfseries Macro avg F1-score & \bfseries Weighted avg F1-score & \bfseries Weighted avg Precision & \bfseries Weighted avg Recall & \bfseries Accuracy & \bfseries FPR & \bfseries FNR & \bfseries AUC_probs \\
Detector Model &  &  &  &  &  &  &  &  &  &  &  \\
\bfseries MDeBERTa-v3-base* & F & all & all & 0.8480 & 0.9400 & 0.9403 & 0.9396 & 0.9396 & 0.2614 & 0.0354 & 0.9607 \\
\bfseries XLM-RoBERTa-large* & F & all & all & 0.8240 & 0.9352 & 0.9357 & 0.9398 & 0.9398 & 0.4178 & 0.0158 & 0.9658 \\
\bfseries BERT-base-multilingual-cased* & F & all & all & 0.7563 & 0.9073 & 0.9051 & 0.9104 & 0.9104 & 0.4781 & 0.0414 & 0.9188 \\
\bfseries RoBERTa-large-OpenAI-detector & F & all & all & 0.7360 & 0.8933 & 0.8968 & 0.8904 & 0.8904 & 0.4308 & 0.0698 & 0.8645 \\
\bfseries mGPT* & F & ru & all & 0.7219 & 0.8976 & 0.8941 & 0.9048 & 0.9048 & 0.5751 & 0.0356 & 0.8780 \\
\bfseries GPT-2 Medium & F & al

Unnamed: 0_level_0,Category,Train Language,Train LLM,Macro avg F1-score,Weighted avg F1-score,Weighted avg Precision,Weighted avg Recall,Accuracy,FPR,FNR,AUC_probs
Detector Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
MDeBERTa-v3-base*,F,all,all,0.848011,0.939955,0.940286,0.939648,0.939648,0.261434,0.035381,0.960706
XLM-RoBERTa-large*,F,all,all,0.824012,0.935233,0.935675,0.939819,0.939819,0.4178,0.015772,0.965781
BERT-base-multilingual-cased*,F,all,all,0.756344,0.907265,0.905064,0.91036,0.91036,0.478059,0.041406,0.918754
RoBERTa-large-OpenAI-detector,F,all,all,0.736044,0.893278,0.896759,0.890357,0.890357,0.430779,0.069765,0.86447
mGPT*,F,ru,all,0.721899,0.897587,0.894128,0.904762,0.904762,0.575093,0.03565,0.878017
GPT-2 Medium,F,all,all,0.664612,0.866769,0.868235,0.86537,0.86537,0.584981,0.078706,0.789929
ELECTRA-large,F,en,all,0.555896,0.795225,0.830961,0.768391,0.768391,0.652967,0.179285,0.605279
Entropy + RandomForest*,S,,,0.48633,0.833501,0.805047,0.872913,0.872913,0.975587,0.02172,
Rank*,S,,,0.47077,0.837535,0.791277,0.889537,0.889537,1.0,0.0,
DetectGPT*,S,,,0.47077,0.837535,0.791277,0.889537,0.889537,1.0,0.0,


## RQ1 Zero Shot

In [16]:
#How do zero-shot detectors perform on specified language?
def analyze_language(results_list, test_language):
  results = pd.DataFrame()
  for detector in tqdm(results_list, total=len(results_list)):
    for detector_name, detector_data in detector.items():
        #skip fine-tuned detectors
        if 'statistical-' not in detector_name and 'blackbox-' not in detector_name: continue
        temp = detector_data[detector_data.language.str.contains(test_language)]
        if len(temp.label.unique()) < 2: continue
        if optimize_threshold and 'prediction_probs' in temp.columns:
          optimal_threshold = 0.5
          labels = [label2id[x] for x in temp['label']]
          predictions = [label2id[x] for x in temp['predictions']]
          temp = temp.fillna(0.0)
          temp['prediction_probs'] = temp['prediction_probs'].astype(float)
          temp.loc[temp.predictions == 'human', 'prediction_probs'] = 1 - temp['prediction_probs']
          if (optimize_threshold):
            fpr, tpr, thresholds = roc_curve(labels, temp['prediction_probs'])
            optimal_threshold = thresholds[np.argmax(tpr - fpr)]
            #optimal_threshold = thresholds[fpr <= 0.05][-1] #get threshold for 5% FPR
          preds = ["machine" if ((y > optimal_threshold)) else "human" for x,y in zip(temp['predictions'],temp['prediction_probs'])]
        else:
          preds = temp['predictions']
        scores = cr2df(temp['label'], preds, detector_name)
        results = pd.concat([results, scores], copy=False, ignore_index=True)
  temp = results.sort_values(by=['Macro avg F1-score'], ascending=False).reset_index(drop=True)
  return temp

In [17]:
%%time
results_all = pd.DataFrame()
for test_language in ['ar', 'ca', 'cs', 'de', 'en', 'es', 'nl', 'pt', 'ru', 'uk', 'zh']:
    temp = analyze_language(test_results, test_language)
    temp = temp[['Model', 'Macro avg F1-score']]
    temp = temp.set_index(['Model'])
    temp.rename(columns={'Macro avg F1-score': test_language}, inplace=True)
    if len(results_all) > 0: temp = temp[test_language]
    results_all = pd.concat([results_all, temp], copy=False, axis=1)

100%|██████████| 324/324 [00:01<00:00, 297.11it/s] 
100%|██████████| 324/324 [00:01<00:00, 303.03it/s] 
100%|██████████| 324/324 [00:01<00:00, 305.85it/s] 
100%|██████████| 324/324 [00:01<00:00, 308.18it/s] 
100%|██████████| 324/324 [00:00<00:00, 333.91it/s] 
100%|██████████| 324/324 [00:01<00:00, 246.52it/s] 
100%|██████████| 324/324 [00:01<00:00, 163.14it/s] 
100%|██████████| 324/324 [00:01<00:00, 167.59it/s] 
100%|██████████| 324/324 [00:01<00:00, 287.85it/s] 
100%|██████████| 324/324 [00:01<00:00, 314.27it/s] 
100%|██████████| 324/324 [00:01<00:00, 315.37it/s] 

CPU times: user 13.6 s, sys: 79.8 ms, total: 13.6 s
Wall time: 13.8 s





In [18]:
def rename_models(temp):
  temp = temp.reset_index()
  temp['Model'] = temp['Model'].str.replace('statistical-', '', regex=False).str.replace('blackbox-', '', regex=False).str.replace('finetuned-', '', regex=False)
  names_dic = { 'gptzero': 'GPTZero',
              'ZeroGPT': 'ZeroGPT',
              'sapling': 'Sapling',
              'gptkit': 'GPTKit',
              'likelihood': 'Log-likelihood',
              'rank': 'Rank',
              'log_rank': 'Log-Rank',
              'entropy': 'Entropy',
              'entropy_RF-tuned': 'Entropy + RandomForest',
              'rank_GLTR': 'GLTR Test-2 (Rank)',
              'detectgpt': 'DetectGPT',
              'roberta-large-openai-detector': 'RoBERTa-large-OpenAI-detector',
              'gpt2-medium': 'GPT-2 Medium',
              'xlm-roberta-large' : 'XLM-RoBERTa-large',
              'bert-base-multilingual-cased': 'BERT-base-multilingual-cased',
              'mdeberta-v3-base': 'MDeBERTa-v3-base',
              'electra-large-discriminator': 'ELECTRA-large',
              'mGPT': 'mGPT'
  }
  temp['Model'] = temp['Model'].replace(dict(names_dic), regex=False)
  return temp.set_index('Model')

In [19]:
results_all = rename_models(results_all)

In [20]:
multilingual = ['mdeberta-v3-base', 'xlm-roberta-large', 'mGPT', 'bert-base-multilingual-cased']
sorted_languages = ['en', 'de', 'nl', 'es', 'pt', 'ca', 'cs', 'ru', 'uk','ar', 'zh']
bg_cmap = 'PuBu'
bg_vmin = 0.0
bg_vmax = 2.0
bg_text_color_threshold = 0

In [21]:
temp = results_all.style.format(na_rep=0, precision=4)
display(temp.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None))

Unnamed: 0_level_0,ar,ca,cs,de,en,es,nl,pt,ru,uk,zh
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Entropy + RandomForest,0.486,0.4721,0.4732,0.4729,0.4703,0.4697,0.4692,0.4702,0.5202,0.504,0.4663
Rank,0.4704,0.4705,0.4705,0.4712,0.4706,0.472,0.4706,0.4716,0.4702,0.4704,0.4704
Entropy,0.4704,0.4705,0.4705,0.4712,0.4706,0.472,0.4706,0.4716,0.4702,0.4704,0.4704
DetectGPT,0.4704,0.4705,0.4705,0.4712,0.4706,0.472,0.4706,0.4716,0.4702,0.4704,0.4704
Log-Rank,0.4702,0.4705,0.4705,0.4712,0.4706,0.472,0.4706,0.4716,0.4698,0.4703,0.4644
Log-likelihood,0.4702,0.4705,0.4705,0.4712,0.4706,0.472,0.4706,0.4716,0.4699,0.4703,0.4662
GLTR Test-2 (Rank),0.4239,0.4702,0.47,0.4701,0.4706,0.472,0.4703,0.4711,0.4697,0.4697,0.4653
ZeroGPT,0.3055,0.4807,0.4509,0.4019,0.5979,0.475,0.4625,0.451,0.4194,0.4267,0.1398
GPTZero,0.1128,0.1057,0.104,0.0999,0.5626,0.0973,0.1044,0.101,0.1042,0.1014,0.1189


In [22]:
print(temp.highlight_max(props='font-weight: bold;', axis=0).applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('_','\_'))

\begin{tabular}{lrrrrrrrrrrr}
 & \bfseries ar & \bfseries ca & \bfseries cs & \bfseries de & \bfseries en & \bfseries es & \bfseries nl & \bfseries pt & \bfseries ru & \bfseries uk & \bfseries zh \\
Model &  &  &  &  &  &  &  &  &  &  &  \\
\bfseries Entropy + RandomForest & {\cellcolor[HTML]{D2D2E7}} \color[HTML]{000000} \bfseries 0.4860 & {\cellcolor[HTML]{D3D4E7}} \color[HTML]{000000} 0.4721 & {\cellcolor[HTML]{D3D4E7}} \color[HTML]{000000} \bfseries 0.4732 & {\cellcolor[HTML]{D3D4E7}} \color[HTML]{000000} \bfseries 0.4729 & {\cellcolor[HTML]{D3D4E7}} \color[HTML]{000000} 0.4703 & {\cellcolor[HTML]{D3D4E7}} \color[HTML]{000000} 0.4697 & {\cellcolor[HTML]{D3D4E7}} \color[HTML]{000000} 0.4692 & {\cellcolor[HTML]{D3D4E7}} \color[HTML]{000000} 0.4702 & {\cellcolor[HTML]{CDD0E5}} \color[HTML]{000000} \bfseries 0.5202 & {\cellcolor[HTML]{D0D1E6}} \color[HTML]{000000} \bfseries 0.5040 & {\cellcolor[HTML]{D4D4E8}} \color[HTML]{000000} 0.4663 \\
\bfseries Rank & {\cellcolor[HTML]{D3D4E7}} \c

## RQ2 Monolingual Generalization

In [23]:
#How do detectors perform on individual language and LLM when trained on specified language?
def analyze_language_for_train_language_per_llm(results_list, train_language, test_language):
  results = pd.DataFrame()
  for detector in tqdm(results_list, total=len(results_list)):
    for detector_name, detector_data in detector.items():
      for llm in ['text-davinci-003', 'gpt-3.5-turbo', 'gpt-4', 'alpaca-lora-30b', 'vicuna-13b', 'llama-65b', 'opt-66b', 'opt-iml-max-1.3b']:
        if f'-{train_language}-' not in detector_name: continue
        if f'-{llm}.' not in detector_name: continue
        temp = detector_data[(detector_data.language.str.contains(test_language)) & (detector_data.multi_label.str.contains(llm) | detector_data.multi_label.str.contains('human'))]
        if len(temp.label.unique()) < 2: continue
        if optimize_threshold and 'prediction_probs' in temp.columns:
          optimal_threshold = 0.5
          labels = [label2id[x] for x in temp['label']]
          predictions = [label2id[x] for x in temp['predictions']]
          temp = temp.fillna(0.0)
          temp['prediction_probs'] = temp['prediction_probs'].astype(float)
          temp.loc[temp.predictions == 'human', 'prediction_probs'] = 1 - temp['prediction_probs']
          if (optimize_threshold):
            fpr, tpr, thresholds = roc_curve(labels, temp['prediction_probs'])
            optimal_threshold = thresholds[np.argmax(tpr - fpr)]
            #optimal_threshold = thresholds[fpr <= 0.05][-1] #get threshold for 5% FPR
          preds = ["machine" if ((y > optimal_threshold)) else "human" for x,y in zip(temp['predictions'],temp['prediction_probs'])]
        else:
          preds = temp['predictions']
        scores = cr2df(temp['label'], preds, detector_name)
        results = pd.concat([results, scores], copy=False, ignore_index=True)
  temp = results.sort_values(by=['Macro avg F1-score'], ascending=False).reset_index(drop=True)
  return temp

In [24]:
%%time
#How do detectors perform on individual language (only corresponding LLM machine data) when trained on specified language and LLM?
results_all = pd.DataFrame()
for train_language in ['en', 'es', 'ru']:
  results = pd.DataFrame()
  for test_language in ['ar', 'ca', 'cs', 'de', 'en', 'es', 'nl', 'pt', 'ru', 'uk', 'zh']:
    temp = analyze_language_for_train_language_per_llm(test_results, train_language, test_language)
    temp = temp[~temp['Train LLM'].str.contains('all')]
    temp = temp[['Train Language', 'Train LLM', 'Model', 'Macro avg F1-score']]
    temp = temp.sort_values(by=['Train Language', 'Train LLM', 'Model'])
    temp = temp.set_index(['Train Language', 'Train LLM', 'Model'])
    temp.rename(columns={'Macro avg F1-score': test_language}, inplace=True)
    if len(results) > 0: temp = temp[test_language]
    results = pd.concat([results, temp], copy=False, axis=1)
  results_all = pd.concat([results_all, results], copy=False)

100%|██████████| 324/324 [00:03<00:00, 87.93it/s]
100%|██████████| 324/324 [00:04<00:00, 79.08it/s]
100%|██████████| 324/324 [00:05<00:00, 62.25it/s]
100%|██████████| 324/324 [00:03<00:00, 90.94it/s]
100%|██████████| 324/324 [00:03<00:00, 91.02it/s]
100%|██████████| 324/324 [00:05<00:00, 58.38it/s]
100%|██████████| 324/324 [00:03<00:00, 88.12it/s]
100%|██████████| 324/324 [00:03<00:00, 92.48it/s]
100%|██████████| 324/324 [00:04<00:00, 80.17it/s]
100%|██████████| 324/324 [00:05<00:00, 61.72it/s]
100%|██████████| 324/324 [00:03<00:00, 90.96it/s]
100%|██████████| 324/324 [00:03<00:00, 90.22it/s] 
100%|██████████| 324/324 [00:05<00:00, 57.54it/s]
100%|██████████| 324/324 [00:03<00:00, 87.47it/s] 
100%|██████████| 324/324 [00:03<00:00, 90.56it/s] 
100%|██████████| 324/324 [00:04<00:00, 80.64it/s]
100%|██████████| 324/324 [00:05<00:00, 62.67it/s] 
100%|██████████| 324/324 [00:03<00:00, 90.85it/s] 
100%|██████████| 324/324 [00:03<00:00, 91.02it/s] 
100%|██████████| 324/324 [00:05<00:00, 57.16

CPU times: user 2min 14s, sys: 921 ms, total: 2min 15s
Wall time: 2min 17s





In [25]:
#results_all.style.background_gradient(axis=1).format(na_rep=0, precision=4)
temp = results_all.style.format(na_rep=0, precision=4)
display(temp.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ar,ca,cs,de,en,es,nl,pt,ru,uk,zh
Train Language,Train LLM,Model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
en,alpaca-lora-30b,bert-base-multilingual-cased,0.5375,0.8271,0.8546,0.8917,0.9567,0.7517,0.8563,0.8055,0.8374,0.8091,0.5537
en,alpaca-lora-30b,electra-large-discriminator,0.527,0.4929,0.4819,0.3956,0.9783,0.5856,0.5448,0.5196,0.5153,0.4215,0.4642
en,alpaca-lora-30b,gpt2-medium,0.7438,0.4252,0.4513,0.3928,0.9657,0.4063,0.4233,0.3719,0.5547,0.5326,0.3742
en,alpaca-lora-30b,mGPT,0.4024,0.8089,0.6132,0.8763,0.9639,0.739,0.8791,0.8333,0.8162,0.821,0.4626
en,alpaca-lora-30b,mdeberta-v3-base,0.208,0.8592,0.7691,0.9003,0.9439,0.7744,0.8977,0.857,0.7988,0.7563,0.3099
en,alpaca-lora-30b,roberta-large-openai-detector,0.3484,0.6258,0.543,0.5133,0.9238,0.4809,0.7787,0.4348,0.4029,0.4838,0.4487
en,alpaca-lora-30b,xlm-roberta-large,0.4474,0.8488,0.8713,0.9324,0.9801,0.6319,0.7706,0.713,0.8733,0.8319,0.4474
en,gpt-3.5-turbo,bert-base-multilingual-cased,0.9215,0.8904,0.915,0.902,0.9783,0.8545,0.9348,0.9124,0.9183,0.8962,0.8933
en,gpt-3.5-turbo,electra-large-discriminator,0.3985,0.8409,0.3333,0.3576,0.9765,0.8243,0.7355,0.781,0.3849,0.3526,0.3399
en,gpt-3.5-turbo,gpt2-medium,0.3673,0.3407,0.3443,0.3369,0.9838,0.3264,0.3432,0.3378,0.3689,0.371,0.3428


In [26]:
#display(results_all.reset_index().drop(columns=['Train LLM', 'Model']).groupby(['Train Language']).agg(['mean','std']).style.background_gradient(axis=1).format(na_rep=0, precision=4))
temp = results_all.reset_index().drop(columns=['Train LLM', 'Model']).groupby(['Train Language']).agg(['mean','std']).style.format(na_rep=0, precision=4)
display(temp.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None))

Unnamed: 0_level_0,ar,ar,ca,ca,cs,cs,de,de,en,en,es,es,nl,nl,pt,pt,ru,ru,uk,uk,zh,zh
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
Train Language,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
en,0.5016,0.1664,0.6412,0.2122,0.5909,0.1862,0.6578,0.2074,0.9361,0.0577,0.6284,0.1681,0.6703,0.2113,0.6273,0.1955,0.5976,0.1835,0.5832,0.1649,0.4902,0.1374
es,0.648,0.2083,0.8413,0.1173,0.73,0.2195,0.785,0.1922,0.579,0.25,0.9259,0.0607,0.7689,0.1811,0.8749,0.0947,0.6804,0.2151,0.68,0.2146,0.6082,0.207
ru,0.7421,0.1789,0.5274,0.2069,0.6171,0.2644,0.5914,0.2392,0.4795,0.2328,0.5614,0.2256,0.5524,0.2223,0.5369,0.2087,0.887,0.1195,0.8557,0.143,0.6183,0.1899


In [27]:
temp_mean = results_all.reset_index().drop(columns=['Train LLM', 'Model']).groupby(['Train Language']).agg('mean')

In [28]:
temp_std = results_all.reset_index().drop(columns=['Train LLM', 'Model']).groupby(['Train Language']).agg('std')

In [29]:
temp = temp_mean.copy()
for col in temp_mean.columns:
  temp[col] = [f"{str('%.4f' % x)} (±{str('%.2f' % y)})" for x,y in zip(temp_mean[col], temp_std[col])]

In [30]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import colors

def b_g(s, cmap='PuBu', low=0, high=0):
    #if s.name in temp_mean.columns:
    #  a = temp_mean.loc[:,s.name].copy()
    #else:
    a = temp_mean.T.loc[:,s.name].copy() #for axis=1 must be T used
    rng = a.max() - a.min()
    norm = colors.Normalize(a.min() - (rng * low),
                        a.max()+0.1 + (rng * high))
    normed = norm(a.values)
    c = [colors.rgb2hex(x) for x in matplotlib.colormaps[cmap](normed)]
    return ['background-color: %s' % color for color in c]

temp.style.apply(b_g,cmap='PuBu', axis=1)

Unnamed: 0_level_0,ar,ca,cs,de,en,es,nl,pt,ru,uk,zh
Train Language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
en,0.5016 (±0.17),0.6412 (±0.21),0.5909 (±0.19),0.6578 (±0.21),0.9361 (±0.06),0.6284 (±0.17),0.6703 (±0.21),0.6273 (±0.20),0.5976 (±0.18),0.5832 (±0.16),0.4902 (±0.14)
es,0.6480 (±0.21),0.8413 (±0.12),0.7300 (±0.22),0.7850 (±0.19),0.5790 (±0.25),0.9259 (±0.06),0.7689 (±0.18),0.8749 (±0.09),0.6804 (±0.22),0.6800 (±0.21),0.6082 (±0.21)
ru,0.7421 (±0.18),0.5274 (±0.21),0.6171 (±0.26),0.5914 (±0.24),0.4795 (±0.23),0.5614 (±0.23),0.5524 (±0.22),0.5369 (±0.21),0.8870 (±0.12),0.8557 (±0.14),0.6183 (±0.19)


In [31]:
#results_all.reset_index().drop(columns=['Train LLM', 'Model']).groupby(['Train Language']).agg('max').style.background_gradient(axis=1).format(na_rep=0, precision=4)
temp = results_all.reset_index().drop(columns=['Train LLM', 'Model']).groupby(['Train Language']).agg('max').style.format(na_rep=0, precision=4)
display(temp.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None))

Unnamed: 0_level_0,ar,ca,cs,de,en,es,nl,pt,ru,uk,zh
Train Language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
en,0.9215,0.9733,1.0,0.9609,0.9928,0.9435,0.9615,0.9319,0.9471,0.957,0.8933
es,0.9931,0.9916,0.9933,0.9932,0.9801,0.9914,0.9799,0.983,0.9949,0.9931,0.9528
ru,0.9931,0.96,0.995,0.9442,0.9765,0.9435,0.9633,0.9368,0.9966,0.9948,0.9338


In [32]:
print(results_all.reset_index().groupby(['Train Language']).agg('mean').style.format(na_rep=0, precision=4).highlight_max(props='font-weight: bold;', axis=1).applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True))

\begin{tabular}{lrrrrrrrrrrr}
 & \bfseries ar & \bfseries ca & \bfseries cs & \bfseries de & \bfseries en & \bfseries es & \bfseries nl & \bfseries pt & \bfseries ru & \bfseries uk & \bfseries zh \\
Train Language &  &  &  &  &  &  &  &  &  &  &  \\
\bfseries en & 0.5016 & 0.6412 & 0.5909 & 0.6578 & \bfseries 0.9361 & 0.6284 & 0.6703 & 0.6273 & 0.5976 & 0.5832 & 0.4902 \\
\bfseries es & 0.6480 & 0.8413 & 0.7300 & 0.7850 & 0.5790 & \bfseries 0.9259 & 0.7689 & 0.8749 & 0.6804 & 0.6800 & 0.6082 \\
\bfseries ru & 0.7421 & 0.5274 & 0.6171 & 0.5914 & 0.4795 & 0.5614 & 0.5524 & 0.5369 & \bfseries 0.8870 & 0.8557 & 0.6183 \\
\end{tabular}



  print(results_all.reset_index().groupby(['Train Language']).agg('mean').style.format(na_rep=0, precision=4).highlight_max(props='font-weight: bold;', axis=1).applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True))


In [33]:
#Results just for English fine-tuned models
temp = results_all.loc['en',:]
display(temp.style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None).format(na_rep=0, precision=4))
print(temp.style.format(na_rep=0, precision=4).highlight_max(props='font-weight: bold;', axis=1).applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True))

means = pd.DataFrame()
means = pd.concat([means, temp.agg('mean')], copy=False, axis=1)
means.rename(columns={0:'All Detectors Mean'}, inplace=True)
means = pd.concat([means, temp.loc[[x in multilingual for x in temp.reset_index().Model], :].agg('mean')], copy=False, axis=1)
means.rename(columns={0:'Multilingual Base Models Mean'}, inplace=True)
means = pd.concat([means, temp.loc[[x not in multilingual for x in temp.reset_index().Model], :].agg('mean')], copy=False, axis=1)
means.rename(columns={0:'Monolingual Base Models Mean'}, inplace=True)
means = means.T.style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None).format(na_rep=0, precision=4)
display(means)
temp = means.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True)
#little hack to properly format latex table
print(temp.replace('\\color[HTML]{F1F1F1} ', '\\textcolor{white}{').replace('\\color[HTML]{000000} ', '\\textcolor{black}{').replace(' & {\\cellcolor', '} & {\\cellcolor').replace(' \\\\', '} \\\\').replace('\n\\bfseries', '\n\\multicolumn{2}{r|}{\\bfseries').replace('zh} \\\\', 'zh \\\\'))

Unnamed: 0_level_0,Unnamed: 1_level_0,ar,ca,cs,de,en,es,nl,pt,ru,uk,zh
Train LLM,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
alpaca-lora-30b,bert-base-multilingual-cased,0.5375,0.8271,0.8546,0.8917,0.9567,0.7517,0.8563,0.8055,0.8374,0.8091,0.5537
alpaca-lora-30b,electra-large-discriminator,0.527,0.4929,0.4819,0.3956,0.9783,0.5856,0.5448,0.5196,0.5153,0.4215,0.4642
alpaca-lora-30b,gpt2-medium,0.7438,0.4252,0.4513,0.3928,0.9657,0.4063,0.4233,0.3719,0.5547,0.5326,0.3742
alpaca-lora-30b,mGPT,0.4024,0.8089,0.6132,0.8763,0.9639,0.739,0.8791,0.8333,0.8162,0.821,0.4626
alpaca-lora-30b,mdeberta-v3-base,0.208,0.8592,0.7691,0.9003,0.9439,0.7744,0.8977,0.857,0.7988,0.7563,0.3099
alpaca-lora-30b,roberta-large-openai-detector,0.3484,0.6258,0.543,0.5133,0.9238,0.4809,0.7787,0.4348,0.4029,0.4838,0.4487
alpaca-lora-30b,xlm-roberta-large,0.4474,0.8488,0.8713,0.9324,0.9801,0.6319,0.7706,0.713,0.8733,0.8319,0.4474
gpt-3.5-turbo,bert-base-multilingual-cased,0.9215,0.8904,0.915,0.902,0.9783,0.8545,0.9348,0.9124,0.9183,0.8962,0.8933
gpt-3.5-turbo,electra-large-discriminator,0.3985,0.8409,0.3333,0.3576,0.9765,0.8243,0.7355,0.781,0.3849,0.3526,0.3399
gpt-3.5-turbo,gpt2-medium,0.3673,0.3407,0.3443,0.3369,0.9838,0.3264,0.3432,0.3378,0.3689,0.371,0.3428


\begin{tabular}{llrrrrrrrrrrr}
 &  & \bfseries ar & \bfseries ca & \bfseries cs & \bfseries de & \bfseries en & \bfseries es & \bfseries nl & \bfseries pt & \bfseries ru & \bfseries uk & \bfseries zh \\
Train LLM & Model &  &  &  &  &  &  &  &  &  &  &  \\
\multirow[c]{7}{*}{\bfseries alpaca-lora-30b} & \bfseries bert-base-multilingual-cased & 0.5375 & 0.8271 & 0.8546 & 0.8917 & \bfseries 0.9567 & 0.7517 & 0.8563 & 0.8055 & 0.8374 & 0.8091 & 0.5537 \\
\bfseries  & \bfseries electra-large-discriminator & 0.5270 & 0.4929 & 0.4819 & 0.3956 & \bfseries 0.9783 & 0.5856 & 0.5448 & 0.5196 & 0.5153 & 0.4215 & 0.4642 \\
\bfseries  & \bfseries gpt2-medium & 0.7438 & 0.4252 & 0.4513 & 0.3928 & \bfseries 0.9657 & 0.4063 & 0.4233 & 0.3719 & 0.5547 & 0.5326 & 0.3742 \\
\bfseries  & \bfseries mGPT & 0.4024 & 0.8089 & 0.6132 & 0.8763 & \bfseries 0.9639 & 0.7390 & 0.8791 & 0.8333 & 0.8162 & 0.8210 & 0.4626 \\
\bfseries  & \bfseries mdeberta-v3-base & 0.2080 & 0.8592 & 0.7691 & 0.9003 & \bfseries 0.9439

Unnamed: 0,ar,ca,cs,de,en,es,nl,pt,ru,uk,zh
All Detectors Mean,0.5016,0.6412,0.5909,0.6578,0.9361,0.6284,0.6703,0.6273,0.5976,0.5832,0.4902
Multilingual Base Models Mean,0.5448,0.7335,0.6793,0.8104,0.9292,0.7018,0.7508,0.7362,0.7148,0.6746,0.558
Monolingual Base Models Mean,0.444,0.5182,0.473,0.4544,0.9454,0.5304,0.5629,0.4822,0.4412,0.4613,0.3999


\begin{tabular}{lrrrrrrrrrrr}
 & \bfseries ar & \bfseries ca & \bfseries cs & \bfseries de & \bfseries en & \bfseries es & \bfseries nl & \bfseries pt & \bfseries ru & \bfseries uk & \bfseries zh \\
\multicolumn{2}{r|}{\bfseries All Detectors Mean} & {\cellcolor[HTML]{D0D1E6}} \textcolor{black}{0.5016} & {\cellcolor[HTML]{B8C6E0}} \textcolor{black}{0.6412} & {\cellcolor[HTML]{C1CAE2}} \textcolor{black}{0.5909} & {\cellcolor[HTML]{B5C4DF}} \textcolor{black}{0.6578} & {\cellcolor[HTML]{81AED2}} \textcolor{black}{0.9361} & {\cellcolor[HTML]{BBC7E0}} \textcolor{black}{0.6284} & {\cellcolor[HTML]{B4C4DF}} \textcolor{black}{0.6703} & {\cellcolor[HTML]{BBC7E0}} \textcolor{black}{0.6273} & {\cellcolor[HTML]{C0C9E2}} \textcolor{black}{0.5976} & {\cellcolor[HTML]{C2CBE2}} \textcolor{black}{0.5832} & {\cellcolor[HTML]{D2D2E7}} \textcolor{black}{0.4902} \\
\multicolumn{2}{r|}{\bfseries Multilingual Base Models Mean} & {\cellcolor[HTML]{C9CEE4}} \textcolor{black}{0.5448} & {\cellcolor[HTML]{A9BFDC}

In [34]:
#Results just for Spanish fine-tuned models
temp = results_all.loc['es',:]
display(temp.style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None).format(na_rep=0, precision=4))
print(temp.style.format(na_rep=0, precision=4).highlight_max(props='font-weight: bold;', axis=1).applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True))

means = pd.DataFrame()
means = pd.concat([means, temp.agg('mean')], copy=False, axis=1)
means.rename(columns={0:'All Detectors Mean'}, inplace=True)
means = pd.concat([means, temp.loc[[x in multilingual for x in temp.reset_index().Model], :].agg('mean')], copy=False, axis=1)
means.rename(columns={0:'Multilingual Base Models Mean'}, inplace=True)
means = pd.concat([means, temp.loc[[x not in multilingual for x in temp.reset_index().Model], :].agg('mean')], copy=False, axis=1)
means.rename(columns={0:'Monolingual Base Models Mean'}, inplace=True)
means = means.T.style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None).format(na_rep=0, precision=4)
display(means)
temp = means.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True)
#little hack to properly format latex table
print(temp.replace('\\color[HTML]{F1F1F1} ', '\\textcolor{white}{').replace('\\color[HTML]{000000} ', '\\textcolor{black}{').replace(' & {\\cellcolor', '} & {\\cellcolor').replace(' \\\\', '} \\\\').replace('\n\\bfseries', '\n\\multicolumn{2}{r|}{\\bfseries').replace('zh} \\\\', 'zh \\\\'))

Unnamed: 0_level_0,Unnamed: 1_level_0,ar,ca,cs,de,en,es,nl,pt,ru,uk,zh
Train LLM,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
alpaca-lora-30b,bert-base-multilingual-cased,0.6019,0.895,0.8108,0.8765,0.7517,0.9092,0.8941,0.8943,0.86,0.8182,0.7713
alpaca-lora-30b,electra-large-discriminator,0.5116,0.4754,0.337,0.3361,0.3325,0.9348,0.5286,0.8665,0.3891,0.382,0.3986
alpaca-lora-30b,gpt2-medium,0.6937,0.809,0.6564,0.8226,0.7332,0.8935,0.8291,0.8602,0.5735,0.5679,0.4391
alpaca-lora-30b,mGPT,0.698,0.8639,0.6989,0.9001,0.6589,0.9452,0.7088,0.9403,0.8366,0.888,0.5691
alpaca-lora-30b,mdeberta-v3-base,0.8255,0.7338,0.911,0.9171,0.8771,0.8256,0.8038,0.7777,0.9317,0.9096,0.7954
alpaca-lora-30b,roberta-large-openai-detector,0.3578,0.7052,0.6789,0.8414,0.2717,0.9295,0.8296,0.8671,0.3587,0.3634,0.7537
alpaca-lora-30b,xlm-roberta-large,0.6999,0.8349,0.9008,0.9493,0.9295,0.9281,0.9532,0.9301,0.8845,0.8278,0.6164
gpt-3.5-turbo,bert-base-multilingual-cased,0.8896,0.9365,0.8997,0.89,0.9169,0.9263,0.9232,0.8858,0.8949,0.8722,0.8765
gpt-3.5-turbo,electra-large-discriminator,0.5268,0.7374,0.3333,0.334,0.3333,0.9743,0.429,0.9215,0.539,0.4383,0.3333
gpt-3.5-turbo,gpt2-medium,0.4313,0.8889,0.39,0.3827,0.3669,0.897,0.4874,0.832,0.4949,0.4526,0.3355


\begin{tabular}{llrrrrrrrrrrr}
 &  & \bfseries ar & \bfseries ca & \bfseries cs & \bfseries de & \bfseries en & \bfseries es & \bfseries nl & \bfseries pt & \bfseries ru & \bfseries uk & \bfseries zh \\
Train LLM & Model &  &  &  &  &  &  &  &  &  &  &  \\
\multirow[c]{7}{*}{\bfseries alpaca-lora-30b} & \bfseries bert-base-multilingual-cased & 0.6019 & 0.8950 & 0.8108 & 0.8765 & 0.7517 & \bfseries 0.9092 & 0.8941 & 0.8943 & 0.8600 & 0.8182 & 0.7713 \\
\bfseries  & \bfseries electra-large-discriminator & 0.5116 & 0.4754 & 0.3370 & 0.3361 & 0.3325 & \bfseries 0.9348 & 0.5286 & 0.8665 & 0.3891 & 0.3820 & 0.3986 \\
\bfseries  & \bfseries gpt2-medium & 0.6937 & 0.8090 & 0.6564 & 0.8226 & 0.7332 & \bfseries 0.8935 & 0.8291 & 0.8602 & 0.5735 & 0.5679 & 0.4391 \\
\bfseries  & \bfseries mGPT & 0.6980 & 0.8639 & 0.6989 & 0.9001 & 0.6589 & \bfseries 0.9452 & 0.7088 & 0.9403 & 0.8366 & 0.8880 & 0.5691 \\
\bfseries  & \bfseries mdeberta-v3-base & 0.8255 & 0.7338 & 0.9110 & 0.9171 & 0.8771 & 0.8256 

Unnamed: 0,ar,ca,cs,de,en,es,nl,pt,ru,uk,zh
All Detectors Mean,0.648,0.8413,0.73,0.785,0.579,0.9259,0.7689,0.8749,0.6804,0.68,0.6082
Multilingual Base Models Mean,0.7857,0.8747,0.8016,0.8812,0.7322,0.9314,0.8143,0.8944,0.8375,0.8299,0.7216
Monolingual Base Models Mean,0.4644,0.7967,0.6346,0.6568,0.3748,0.9187,0.7082,0.8488,0.471,0.4801,0.4569


\begin{tabular}{lrrrrrrrrrrr}
 & \bfseries ar & \bfseries ca & \bfseries cs & \bfseries de & \bfseries en & \bfseries es & \bfseries nl & \bfseries pt & \bfseries ru & \bfseries uk & \bfseries zh \\
\multicolumn{2}{r|}{\bfseries All Detectors Mean} & {\cellcolor[HTML]{B8C6E0}} \textcolor{black}{0.6480} & {\cellcolor[HTML]{94B6D7}} \textcolor{black}{0.8413} & {\cellcolor[HTML]{A9BFDC}} \textcolor{black}{0.7300} & {\cellcolor[HTML]{9FBAD9}} \textcolor{black}{0.7850} & {\cellcolor[HTML]{C2CBE2}} \textcolor{black}{0.5790} & {\cellcolor[HTML]{83AFD3}} \textcolor{black}{0.9259} & {\cellcolor[HTML]{A2BCDA}} \textcolor{black}{0.7689} & {\cellcolor[HTML]{8EB3D5}} \textcolor{black}{0.8749} & {\cellcolor[HTML]{B1C2DE}} \textcolor{black}{0.6804} & {\cellcolor[HTML]{B1C2DE}} \textcolor{black}{0.6800} & {\cellcolor[HTML]{BFC9E1}} \textcolor{black}{0.6082} \\
\multicolumn{2}{r|}{\bfseries Multilingual Base Models Mean} & {\cellcolor[HTML]{9FBAD9}} \textcolor{black}{0.7857} & {\cellcolor[HTML]{8EB3D5}

In [35]:
#Results just for Russian fine-tuned models
temp = results_all.loc['ru',:]
display(temp.style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None).format(na_rep=0, precision=4))
print(temp.style.format(na_rep=0, precision=4).highlight_max(props='font-weight: bold;', axis=1).applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True))

means = pd.DataFrame()
means = pd.concat([means, temp.agg('mean')], copy=False, axis=1)
means.rename(columns={0:'All Detectors Mean'}, inplace=True)
means = pd.concat([means, temp.loc[[x in multilingual for x in temp.reset_index().Model], :].agg('mean')], copy=False, axis=1)
means.rename(columns={0:'Multilingual Base Models Mean'}, inplace=True)
means = pd.concat([means, temp.loc[[x not in multilingual for x in temp.reset_index().Model], :].agg('mean')], copy=False, axis=1)
means.rename(columns={0:'Monolingual Base Models Mean'}, inplace=True)
means = means.T.style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None).format(na_rep=0, precision=4)
display(means)
temp = means.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True)
#little hack to properly format latex table
print(temp.replace('\\color[HTML]{F1F1F1} ', '\\textcolor{white}{').replace('\\color[HTML]{000000} ', '\\textcolor{black}{').replace(' & {\\cellcolor', '} & {\\cellcolor').replace(' \\\\', '} \\\\').replace('\n\\bfseries', '\n\\multicolumn{2}{r|}{\\bfseries').replace('zh} \\\\', 'zh \\\\'))

Unnamed: 0_level_0,Unnamed: 1_level_0,ar,ca,cs,de,en,es,nl,pt,ru,uk,zh
Train LLM,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
alpaca-lora-30b,bert-base-multilingual-cased,0.6026,0.6327,0.8067,0.7409,0.4634,0.6572,0.6705,0.6567,0.9099,0.8829,0.6696
alpaca-lora-30b,electra-large-discriminator,0.4945,0.368,0.3443,0.3516,0.3829,0.5214,0.4081,0.3884,0.5506,0.51,0.4514
alpaca-lora-30b,gpt2-medium,0.7291,0.4784,0.4665,0.4094,0.6639,0.6505,0.4265,0.5331,0.8599,0.8249,0.395
alpaca-lora-30b,mGPT,0.5635,0.4123,0.6725,0.8716,0.5272,0.7714,0.5325,0.792,0.935,0.9129,0.4906
alpaca-lora-30b,mdeberta-v3-base,0.9716,0.8123,0.985,0.9409,0.855,0.8885,0.8868,0.8199,0.9617,0.9615,0.8237
alpaca-lora-30b,roberta-large-openai-detector,0.7062,0.3326,0.3407,0.3363,0.2381,0.3394,0.337,0.3421,0.8599,0.8464,0.5544
alpaca-lora-30b,xlm-roberta-large,0.918,0.8983,0.97,0.9442,0.9169,0.9058,0.9431,0.8993,0.9433,0.9397,0.8522
gpt-3.5-turbo,bert-base-multilingual-cased,0.9416,0.868,0.9045,0.8538,0.781,0.8754,0.8783,0.8378,0.9567,0.9331,0.8666
gpt-3.5-turbo,electra-large-discriminator,0.6636,0.3333,0.3333,0.3303,0.3457,0.3272,0.333,0.3276,0.6336,0.5592,0.3333
gpt-3.5-turbo,gpt2-medium,0.5384,0.3443,0.3407,0.3369,0.3423,0.3264,0.3403,0.335,0.8716,0.8061,0.3333


\begin{tabular}{llrrrrrrrrrrr}
 &  & \bfseries ar & \bfseries ca & \bfseries cs & \bfseries de & \bfseries en & \bfseries es & \bfseries nl & \bfseries pt & \bfseries ru & \bfseries uk & \bfseries zh \\
Train LLM & Model &  &  &  &  &  &  &  &  &  &  &  \\
\multirow[c]{7}{*}{\bfseries alpaca-lora-30b} & \bfseries bert-base-multilingual-cased & 0.6026 & 0.6327 & 0.8067 & 0.7409 & 0.4634 & 0.6572 & 0.6705 & 0.6567 & \bfseries 0.9099 & 0.8829 & 0.6696 \\
\bfseries  & \bfseries electra-large-discriminator & 0.4945 & 0.3680 & 0.3443 & 0.3516 & 0.3829 & 0.5214 & 0.4081 & 0.3884 & \bfseries 0.5506 & 0.5100 & 0.4514 \\
\bfseries  & \bfseries gpt2-medium & 0.7291 & 0.4784 & 0.4665 & 0.4094 & 0.6639 & 0.6505 & 0.4265 & 0.5331 & \bfseries 0.8599 & 0.8249 & 0.3950 \\
\bfseries  & \bfseries mGPT & 0.5635 & 0.4123 & 0.6725 & 0.8716 & 0.5272 & 0.7714 & 0.5325 & 0.7920 & \bfseries 0.9350 & 0.9129 & 0.4906 \\
\bfseries  & \bfseries mdeberta-v3-base & 0.9716 & 0.8123 & \bfseries 0.9850 & 0.9409 & 0.8550

Unnamed: 0,ar,ca,cs,de,en,es,nl,pt,ru,uk,zh
All Detectors Mean,0.7421,0.5274,0.6171,0.5914,0.4795,0.5614,0.5524,0.5369,0.887,0.8557,0.6183
Multilingual Base Models Mean,0.8487,0.6532,0.7924,0.7591,0.576,0.6884,0.6915,0.6626,0.9522,0.9387,0.7294
Monolingual Base Models Mean,0.6001,0.3596,0.3835,0.3677,0.3508,0.392,0.3669,0.3692,0.8,0.7451,0.4701


\begin{tabular}{lrrrrrrrrrrr}
 & \bfseries ar & \bfseries ca & \bfseries cs & \bfseries de & \bfseries en & \bfseries es & \bfseries nl & \bfseries pt & \bfseries ru & \bfseries uk & \bfseries zh \\
\multicolumn{2}{r|}{\bfseries All Detectors Mean} & {\cellcolor[HTML]{A8BEDC}} \textcolor{black}{0.7421} & {\cellcolor[HTML]{CCCFE5}} \textcolor{black}{0.5274} & {\cellcolor[HTML]{BDC8E1}} \textcolor{black}{0.6171} & {\cellcolor[HTML]{C1CAE2}} \textcolor{black}{0.5914} & {\cellcolor[HTML]{D2D3E7}} \textcolor{black}{0.4795} & {\cellcolor[HTML]{C6CCE3}} \textcolor{black}{0.5614} & {\cellcolor[HTML]{C8CDE4}} \textcolor{black}{0.5524} & {\cellcolor[HTML]{CACEE5}} \textcolor{black}{0.5369} & {\cellcolor[HTML]{8BB2D4}} \textcolor{black}{0.8870} & {\cellcolor[HTML]{91B5D6}} \textcolor{black}{0.8557} & {\cellcolor[HTML]{BCC7E1}} \textcolor{black}{0.6183} \\
\multicolumn{2}{r|}{\bfseries Multilingual Base Models Mean} & {\cellcolor[HTML]{93B5D6}} \textcolor{black}{0.8487} & {\cellcolor[HTML]{B7C5DF}

In [36]:
temp = results_all[['en', 'de', 'nl', 'es', 'pt', 'ca', 'cs', 'ru', 'uk','ar', 'zh']].corr().style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None).format(na_rep=0, precision=4)
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True))

Unnamed: 0,en,de,nl,es,pt,ca,cs,ru,uk,ar,zh
en,1.0,0.3586,0.4236,0.1826,0.2428,0.2954,0.0899,-0.0791,-0.1245,-0.003,0.0994
de,0.3586,1.0,0.783,0.7144,0.7724,0.7467,0.7077,0.3471,0.35,0.3736,0.5491
nl,0.4236,0.783,1.0,0.6816,0.7527,0.8655,0.5376,0.1788,0.1623,0.3141,0.5523
es,0.1826,0.7144,0.6816,1.0,0.9586,0.8215,0.4784,0.058,0.0733,0.2244,0.3091
pt,0.2428,0.7724,0.7527,0.9586,1.0,0.8666,0.4834,0.1055,0.1045,0.2535,0.3519
ca,0.2954,0.7467,0.8655,0.8215,0.8666,1.0,0.5308,0.1499,0.1478,0.3016,0.4097
cs,0.0899,0.7077,0.5376,0.4784,0.4834,0.5308,1.0,0.357,0.4382,0.4438,0.6073
ru,-0.0791,0.3471,0.1788,0.058,0.1055,0.1499,0.357,1.0,0.9317,0.7726,0.5683
uk,-0.1245,0.35,0.1623,0.0733,0.1045,0.1478,0.4382,0.9317,1.0,0.7883,0.5773
ar,-0.003,0.3736,0.3141,0.2244,0.2535,0.3016,0.4438,0.7726,0.7883,1.0,0.6829


\begin{tabular}{lrrrrrrrrrrr}
 & \bfseries en & \bfseries de & \bfseries nl & \bfseries es & \bfseries pt & \bfseries ca & \bfseries cs & \bfseries ru & \bfseries uk & \bfseries ar & \bfseries zh \\
\bfseries en & {\cellcolor[HTML]{73A9CF}} \color[HTML]{000000} 1.0000 & {\cellcolor[HTML]{E0DEED}} \color[HTML]{000000} 0.3586 & {\cellcolor[HTML]{D9D8EA}} \color[HTML]{000000} 0.4236 & {\cellcolor[HTML]{F1EBF5}} \color[HTML]{000000} 0.1826 & {\cellcolor[HTML]{EDE7F2}} \color[HTML]{000000} 0.2428 & {\cellcolor[HTML]{E7E3F0}} \color[HTML]{000000} 0.2954 & {\cellcolor[HTML]{F8F1F8}} \color[HTML]{000000} 0.0899 & {\cellcolor[HTML]{FFF7FB}} \color[HTML]{000000} -0.0791 & {\cellcolor[HTML]{FFF7FB}} \color[HTML]{000000} -0.1245 & {\cellcolor[HTML]{FFF7FB}} \color[HTML]{000000} -0.0030 & {\cellcolor[HTML]{F8F1F8}} \color[HTML]{000000} 0.0994 \\
\bfseries de & {\cellcolor[HTML]{E0DEED}} \color[HTML]{000000} 0.3586 & {\cellcolor[HTML]{73A9CF}} \color[HTML]{000000} 1.0000 & {\cellcolor[HTML]{9FBAD9}}

## RQ3 Multilingual Generalization

In [37]:
%%time
#How do detectors perform on individual language (only corresponding LLM machine data) when trained on all train languages per specified LLM?
results_all = pd.DataFrame()
for train_language in ['en', 'es', 'ru', 'all', 'en3']:
  results = pd.DataFrame()
  for test_language in ['ar', 'ca', 'cs', 'de', 'en', 'es', 'nl', 'pt', 'ru', 'uk', 'zh']:
    temp = analyze_language_for_train_language_per_llm(test_results, train_language, test_language)
    temp = temp[~temp['Train LLM'].str.contains('all')]
    temp = temp[['Train Language', 'Train LLM', 'Model', 'Macro avg F1-score']]
    temp = temp.sort_values(by=['Train Language', 'Train LLM', 'Model'])
    temp = temp.set_index(['Train Language', 'Train LLM', 'Model'])
    temp.rename(columns={'Macro avg F1-score': test_language}, inplace=True)
    if len(results) > 0: temp = temp[test_language]
    results = pd.concat([results, temp], copy=False, axis=1)
  results_all = pd.concat([results_all, results], copy=False)

100%|██████████| 324/324 [00:05<00:00, 56.00it/s]
100%|██████████| 324/324 [00:03<00:00, 88.28it/s]
100%|██████████| 324/324 [00:03<00:00, 89.70it/s]
100%|██████████| 324/324 [00:05<00:00, 61.53it/s]
100%|██████████| 324/324 [00:03<00:00, 81.46it/s]
100%|██████████| 324/324 [00:03<00:00, 91.27it/s]
100%|██████████| 324/324 [00:03<00:00, 84.58it/s]
100%|██████████| 324/324 [00:05<00:00, 58.97it/s]
100%|██████████| 324/324 [00:03<00:00, 88.70it/s]
100%|██████████| 324/324 [00:03<00:00, 90.52it/s]
100%|██████████| 324/324 [00:05<00:00, 59.77it/s]
100%|██████████| 324/324 [00:03<00:00, 84.50it/s] 
100%|██████████| 324/324 [00:03<00:00, 90.97it/s] 
100%|██████████| 324/324 [00:03<00:00, 82.92it/s]
100%|██████████| 324/324 [00:05<00:00, 60.13it/s] 
100%|██████████| 324/324 [00:03<00:00, 93.96it/s] 
100%|██████████| 324/324 [00:03<00:00, 91.10it/s] 
100%|██████████| 324/324 [00:05<00:00, 60.62it/s]
100%|██████████| 324/324 [00:03<00:00, 82.62it/s] 
100%|██████████| 324/324 [00:03<00:00, 91.06

CPU times: user 3min 47s, sys: 1.55 s, total: 3min 48s
Wall time: 3min 53s





In [38]:
temp_mean = results_all.reset_index().drop(columns=['Train LLM', 'Model']).groupby(['Train Language']).agg('mean')
temp_std = results_all.reset_index().drop(columns=['Train LLM', 'Model']).groupby(['Train Language']).agg('std')
temp = temp_mean.copy()
for col in temp_mean.columns:
  temp[col] = [f"{str('%.4f' % x)} (±{str('%.2f' % y)})" for x,y in zip(temp_mean[col], temp_std[col])]
sort_key = {'en': 0, 'es': 1, 'ru': 2, 'all': 3, 'en3': 4}
temp = temp.sort_index(key=lambda x: x.map(sort_key))

display(temp.style.apply(b_g,cmap='PuBu', axis=1))
print(temp.style.apply(b_g,cmap='PuBu', axis=1).applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True))

Unnamed: 0_level_0,ar,ca,cs,de,en,es,nl,pt,ru,uk,zh
Train Language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
en,0.5016 (±0.17),0.6412 (±0.21),0.5909 (±0.19),0.6578 (±0.21),0.9361 (±0.06),0.6284 (±0.17),0.6703 (±0.21),0.6273 (±0.20),0.5976 (±0.18),0.5832 (±0.16),0.4902 (±0.14)
es,0.6480 (±0.21),0.8413 (±0.12),0.7300 (±0.22),0.7850 (±0.19),0.5790 (±0.25),0.9259 (±0.06),0.7689 (±0.18),0.8749 (±0.09),0.6804 (±0.22),0.6800 (±0.21),0.6082 (±0.21)
ru,0.7421 (±0.18),0.5274 (±0.21),0.6171 (±0.26),0.5914 (±0.24),0.4795 (±0.23),0.5614 (±0.23),0.5524 (±0.22),0.5369 (±0.21),0.8870 (±0.12),0.8557 (±0.14),0.6183 (±0.19)
all,0.7463 (±0.19),0.8765 (±0.09),0.7918 (±0.22),0.8300 (±0.16),0.9472 (±0.04),0.9375 (±0.05),0.8407 (±0.13),0.9099 (±0.05),0.8592 (±0.20),0.8296 (±0.21),0.6558 (±0.18)
en3,0.4931 (±0.20),0.6568 (±0.21),0.6270 (±0.21),0.6871 (±0.21),0.9529 (±0.06),0.6476 (±0.17),0.6800 (±0.22),0.6497 (±0.19),0.5759 (±0.20),0.5716 (±0.21),0.5299 (±0.16)


\begin{tabular}{llllllllllll}
 & \bfseries ar & \bfseries ca & \bfseries cs & \bfseries de & \bfseries en & \bfseries es & \bfseries nl & \bfseries pt & \bfseries ru & \bfseries uk & \bfseries zh \\
Train Language &  &  &  &  &  &  &  &  &  &  &  \\
\bfseries en & {\cellcolor[HTML]{FCF4FA}} 0.5016 (±0.17) & {\cellcolor[HTML]{C8CDE4}} 0.6412 (±0.21) & {\cellcolor[HTML]{DFDDEC}} 0.5909 (±0.19) & {\cellcolor[HTML]{BDC8E1}} 0.6578 (±0.21) & {\cellcolor[HTML]{04649D}} 0.9361 (±0.06) & {\cellcolor[HTML]{D0D1E6}} 0.6284 (±0.17) & {\cellcolor[HTML]{B5C4DF}} 0.6703 (±0.21) & {\cellcolor[HTML]{D0D1E6}} 0.6273 (±0.20) & {\cellcolor[HTML]{DCDAEB}} 0.5976 (±0.18) & {\cellcolor[HTML]{E2DFEE}} 0.5832 (±0.16) & {\cellcolor[HTML]{FFF7FB}} 0.4902 (±0.14) \\
\bfseries es & {\cellcolor[HTML]{E6E2EF}} 0.6480 (±0.21) & {\cellcolor[HTML]{4897C4}} 0.8413 (±0.12) & {\cellcolor[HTML]{B3C3DE}} 0.7300 (±0.22) & {\cellcolor[HTML]{83AFD3}} 0.7850 (±0.19) & {\cellcolor[HTML]{FFF7FB}} 0.5790 (±0.25) & {\cellcolor[HTM

In [39]:
#using the same scale (the same color representing the same value), i.e., not per row min-max
temp = temp.style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, gmap=temp_mean, axis=None)
display(temp)
temp = temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True)
#little hack to properly format latex table
print(temp.replace('\\color[HTML]{F1F1F1} ', '\\textcolor{white}{').replace('\\color[HTML]{000000} ', '\\textcolor{black}{').replace(') & {\\cellcolor', ')} & {\\cellcolor').replace(') \\', ')} \\'))

Unnamed: 0_level_0,ar,ca,cs,de,en,es,nl,pt,ru,uk,zh
Train Language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
en,0.5016 (±0.17),0.6412 (±0.21),0.5909 (±0.19),0.6578 (±0.21),0.9361 (±0.06),0.6284 (±0.17),0.6703 (±0.21),0.6273 (±0.20),0.5976 (±0.18),0.5832 (±0.16),0.4902 (±0.14)
es,0.6480 (±0.21),0.8413 (±0.12),0.7300 (±0.22),0.7850 (±0.19),0.5790 (±0.25),0.9259 (±0.06),0.7689 (±0.18),0.8749 (±0.09),0.6804 (±0.22),0.6800 (±0.21),0.6082 (±0.21)
ru,0.7421 (±0.18),0.5274 (±0.21),0.6171 (±0.26),0.5914 (±0.24),0.4795 (±0.23),0.5614 (±0.23),0.5524 (±0.22),0.5369 (±0.21),0.8870 (±0.12),0.8557 (±0.14),0.6183 (±0.19)
all,0.7463 (±0.19),0.8765 (±0.09),0.7918 (±0.22),0.8300 (±0.16),0.9472 (±0.04),0.9375 (±0.05),0.8407 (±0.13),0.9099 (±0.05),0.8592 (±0.20),0.8296 (±0.21),0.6558 (±0.18)
en3,0.4931 (±0.20),0.6568 (±0.21),0.6270 (±0.21),0.6871 (±0.21),0.9529 (±0.06),0.6476 (±0.17),0.6800 (±0.22),0.6497 (±0.19),0.5759 (±0.20),0.5716 (±0.21),0.5299 (±0.16)


\begin{tabular}{llllllllllll}
 & \bfseries ar & \bfseries ca & \bfseries cs & \bfseries de & \bfseries en & \bfseries es & \bfseries nl & \bfseries pt & \bfseries ru & \bfseries uk & \bfseries zh \\
Train Language &  &  &  &  &  &  &  &  &  &  &  \\
\bfseries en & {\cellcolor[HTML]{D0D1E6}} \textcolor{black}{0.5016 (±0.17)} & {\cellcolor[HTML]{B8C6E0}} \textcolor{black}{0.6412 (±0.21)} & {\cellcolor[HTML]{C1CAE2}} \textcolor{black}{0.5909 (±0.19)} & {\cellcolor[HTML]{B5C4DF}} \textcolor{black}{0.6578 (±0.21)} & {\cellcolor[HTML]{81AED2}} \textcolor{black}{0.9361 (±0.06)} & {\cellcolor[HTML]{BBC7E0}} \textcolor{black}{0.6284 (±0.17)} & {\cellcolor[HTML]{B4C4DF}} \textcolor{black}{0.6703 (±0.21)} & {\cellcolor[HTML]{BBC7E0}} \textcolor{black}{0.6273 (±0.20)} & {\cellcolor[HTML]{C0C9E2}} \textcolor{black}{0.5976 (±0.18)} & {\cellcolor[HTML]{C2CBE2}} \textcolor{black}{0.5832 (±0.16)} & {\cellcolor[HTML]{D2D2E7}} \textcolor{black}{0.4902 (±0.14)} \\
\bfseries es & {\cellcolor[HTML]{B8C6E0}}

In [40]:
bg_cmap = 'PuBu'
bg_vmin = 0.0
bg_vmax = 2.0
bg_text_color_threshold = 0

In [41]:
#Only multilingual detectors
multilingual = ['mdeberta-v3-base', 'xlm-roberta-large', 'mGPT', 'bert-base-multilingual-cased']
sorted_languages = ['en', 'de', 'nl', 'es', 'pt', 'ca', 'cs', 'ru', 'uk','ar', 'zh']

results_all_multilingual = results_all.loc[[x in multilingual for x in results_all.reset_index().Model], :]
temp_mean = results_all_multilingual.reset_index().drop(columns=['Train LLM', 'Model']).groupby(['Train Language']).agg('mean')
temp_std = results_all_multilingual.reset_index().drop(columns=['Train LLM', 'Model']).groupby(['Train Language']).agg('std')
temp = temp_mean.copy()
for col in temp_mean.columns:
  temp[col] = [f"{str('%.4f' % x)} (±{str('%.2f' % y)})" for x,y in zip(temp_mean[col], temp_std[col])]
sort_key = {'en': 0, 'es': 1, 'ru': 2, 'all': 3, 'en3': 4}
temp = temp.sort_index(key=lambda x: x.map(sort_key))

temp = temp.style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, gmap=temp_mean, axis=None)
display(temp)
temp = temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True)
#little hack to properly format latex table
print(temp.replace('\\color[HTML]{F1F1F1} ', '\\textcolor{white}{').replace('\\color[HTML]{000000} ', '\\textcolor{black}{').replace(') & {\\cellcolor', ')} & {\\cellcolor').replace(') \\', ')} \\').replace('\\bfseries all', '\\hline\n\\bfseries all'))

temp = results_all_multilingual[sorted_languages].corr().style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None).format(na_rep=0, precision=4)
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True))

Unnamed: 0_level_0,ar,ca,cs,de,en,es,nl,pt,ru,uk,zh
Train Language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
en,0.5448 (±0.19),0.7335 (±0.21),0.6793 (±0.18),0.8104 (±0.11),0.9292 (±0.07),0.7018 (±0.13),0.7508 (±0.20),0.7362 (±0.15),0.7148 (±0.14),0.6746 (±0.14),0.5580 (±0.14)
es,0.7857 (±0.15),0.8747 (±0.08),0.8016 (±0.20),0.8812 (±0.09),0.7322 (±0.20),0.9314 (±0.06),0.8143 (±0.18),0.8944 (±0.09),0.8375 (±0.11),0.8299 (±0.13),0.7216 (±0.17)
ru,0.8487 (±0.14),0.6532 (±0.19),0.7924 (±0.21),0.7591 (±0.17),0.5760 (±0.25),0.6884 (±0.21),0.6915 (±0.20),0.6626 (±0.19),0.9522 (±0.03),0.9387 (±0.04),0.7294 (±0.15)
all,0.8537 (±0.10),0.8977 (±0.08),0.8604 (±0.20),0.9073 (±0.06),0.9420 (±0.04),0.9372 (±0.05),0.8808 (±0.12),0.9253 (±0.04),0.9560 (±0.03),0.9374 (±0.05),0.7659 (±0.12)
en3,0.5605 (±0.24),0.7484 (±0.18),0.7289 (±0.19),0.8244 (±0.10),0.9392 (±0.08),0.7156 (±0.11),0.7778 (±0.18),0.7508 (±0.14),0.7092 (±0.16),0.7118 (±0.16),0.6160 (±0.14)


\begin{tabular}{llllllllllll}
 & \bfseries ar & \bfseries ca & \bfseries cs & \bfseries de & \bfseries en & \bfseries es & \bfseries nl & \bfseries pt & \bfseries ru & \bfseries uk & \bfseries zh \\
Train Language &  &  &  &  &  &  &  &  &  &  &  \\
\bfseries en & {\cellcolor[HTML]{C9CEE4}} \textcolor{black}{0.5448 (±0.19)} & {\cellcolor[HTML]{A9BFDC}} \textcolor{black}{0.7335 (±0.21)} & {\cellcolor[HTML]{B3C3DE}} \textcolor{black}{0.6793 (±0.18)} & {\cellcolor[HTML]{9AB8D8}} \textcolor{black}{0.8104 (±0.11)} & {\cellcolor[HTML]{83AFD3}} \textcolor{black}{0.9292 (±0.07)} & {\cellcolor[HTML]{AFC1DD}} \textcolor{black}{0.7018 (±0.13)} & {\cellcolor[HTML]{A5BDDB}} \textcolor{black}{0.7508 (±0.20)} & {\cellcolor[HTML]{A8BEDC}} \textcolor{black}{0.7362 (±0.15)} & {\cellcolor[HTML]{ACC0DD}} \textcolor{black}{0.7148 (±0.14)} & {\cellcolor[HTML]{B3C3DE}} \textcolor{black}{0.6746 (±0.14)} & {\cellcolor[HTML]{C6CCE3}} \textcolor{black}{0.5580 (±0.14)} \\
\bfseries es & {\cellcolor[HTML]{9FBAD9}}

Unnamed: 0,en,de,nl,es,pt,ca,cs,ru,uk,ar,zh
en,1.0,0.5248,0.5249,0.3774,0.4949,0.424,-0.087,-0.1903,-0.3074,-0.1778,0.0412
de,0.5248,1.0,0.6765,0.7511,0.8161,0.7022,0.2435,0.3652,0.2556,0.3026,0.2963
nl,0.5249,0.6765,1.0,0.5792,0.7243,0.8492,0.0985,0.3641,0.2168,0.389,0.51
es,0.3774,0.7511,0.5792,1.0,0.9243,0.7478,0.2131,0.4184,0.3518,0.4588,0.3148
pt,0.4949,0.8161,0.7243,0.9243,1.0,0.8457,0.1508,0.3831,0.2955,0.3898,0.3068
ca,0.424,0.7022,0.8492,0.7478,0.8457,1.0,0.2092,0.4019,0.2981,0.4311,0.4134
cs,-0.087,0.2435,0.0985,0.2131,0.1508,0.2092,1.0,0.3783,0.4723,0.4136,0.4772
ru,-0.1903,0.3652,0.3641,0.4184,0.3831,0.4019,0.3783,1.0,0.8976,0.7622,0.5121
uk,-0.3074,0.2556,0.2168,0.3518,0.2955,0.2981,0.4723,0.8976,1.0,0.7574,0.5292
ar,-0.1778,0.3026,0.389,0.4588,0.3898,0.4311,0.4136,0.7622,0.7574,1.0,0.7178


\begin{tabular}{lrrrrrrrrrrr}
 & \bfseries en & \bfseries de & \bfseries nl & \bfseries es & \bfseries pt & \bfseries ca & \bfseries cs & \bfseries ru & \bfseries uk & \bfseries ar & \bfseries zh \\
\bfseries en & {\cellcolor[HTML]{73A9CF}} \color[HTML]{000000} 1.0000 & {\cellcolor[HTML]{CCCFE5}} \color[HTML]{000000} 0.5248 & {\cellcolor[HTML]{CCCFE5}} \color[HTML]{000000} 0.5249 & {\cellcolor[HTML]{DEDCEC}} \color[HTML]{000000} 0.3774 & {\cellcolor[HTML]{D1D2E6}} \color[HTML]{000000} 0.4949 & {\cellcolor[HTML]{D9D8EA}} \color[HTML]{000000} 0.4240 & {\cellcolor[HTML]{FFF7FB}} \color[HTML]{000000} -0.0870 & {\cellcolor[HTML]{FFF7FB}} \color[HTML]{000000} -0.1903 & {\cellcolor[HTML]{FFF7FB}} \color[HTML]{000000} -0.3074 & {\cellcolor[HTML]{FFF7FB}} \color[HTML]{000000} -0.1778 & {\cellcolor[HTML]{FCF4FA}} \color[HTML]{000000} 0.0412 \\
\bfseries de & {\cellcolor[HTML]{CCCFE5}} \color[HTML]{000000} 0.5248 & {\cellcolor[HTML]{73A9CF}} \color[HTML]{000000} 1.0000 & {\cellcolor[HTML]{B3C3DE}

In [42]:
results_all_multilingual

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ar,ca,cs,de,en,es,nl,pt,ru,uk,zh
Train Language,Train LLM,Model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
en,alpaca-lora-30b,bert-base-multilingual-cased,0.537542,0.827078,0.85456,0.891714,0.956665,0.751676,0.856264,0.805481,0.837419,0.809057,0.55369
en,alpaca-lora-30b,mGPT,0.402388,0.808877,0.613177,0.876305,0.963891,0.738976,0.879058,0.833282,0.816224,0.821046,0.462625
en,alpaca-lora-30b,mdeberta-v3-base,0.207984,0.859243,0.769121,0.900331,0.943947,0.774384,0.897682,0.857008,0.798806,0.756265,0.309948
en,alpaca-lora-30b,xlm-roberta-large,0.447388,0.848775,0.871277,0.932405,0.980144,0.631892,0.770589,0.712993,0.873298,0.831866,0.447399
en,gpt-3.5-turbo,bert-base-multilingual-cased,0.921465,0.89038,0.914981,0.902026,0.978339,0.854525,0.934819,0.91243,0.918328,0.896247,0.893291
en,gpt-3.5-turbo,mGPT,0.700916,0.926614,0.337026,0.907073,0.989169,0.850593,0.943226,0.914732,0.901331,0.907969,0.561038
en,gpt-3.5-turbo,mdeberta-v3-base,0.787647,0.559878,0.780318,0.805155,0.717787,0.602325,0.779041,0.678157,0.648425,0.764939,0.673301
en,gpt-3.5-turbo,xlm-roberta-large,0.72182,0.973319,0.72115,0.939164,0.983754,0.786081,0.922878,0.928099,0.883877,0.629046,0.515536
en,gpt-4,bert-base-multilingual-cased,0.841926,0.926529,0.759176,0.816308,0.976534,0.826112,0.926544,0.870419,0.792921,0.710398,0.836037
en,gpt-4,mGPT,0.694497,0.788771,0.386602,0.836477,0.990975,0.79537,0.83337,0.865765,0.800774,0.784926,0.473412


In [43]:
#Results just for models fine-tuned on English, on English vs Non-English
temp = results_all.loc['en',:]
means = pd.DataFrame()
means = pd.concat([means, temp.agg('sum')], copy=False, axis=1)
means.rename(columns={0:'All Detectors Mean'}, inplace=True)
means = pd.concat([means, temp.loc[[x in multilingual for x in temp.reset_index().Model], :].agg('sum')], copy=False, axis=1)
means.rename(columns={0:'Multilingual Base Models Mean'}, inplace=True)
means = pd.concat([means, temp.loc[[x not in multilingual for x in temp.reset_index().Model], :].agg('sum')], copy=False, axis=1)
means.rename(columns={0:'Monolingual Base Models Mean'}, inplace=True)
means = means.T

#!!!numbers need update for new detectors
#en: 56, 32, 24
#others: 560, 320, 24
counts = pd.DataFrame([[56,32,24], [560,320,240]], index=['English', 'Non-English'], columns=['All Detectors Mean',	'Multilingual Base Models Mean',	'Monolingual Base Models Mean'])
temp = means.T.reset_index()
temp['others'] = ~temp['index'].str.contains('en')
stat = temp.groupby('others').agg('sum')
stat.rename(index={False:'English', True:'Non-English'}, inplace=True)
stat.div(counts).T

  stat = temp.groupby('others').agg('sum')


others,English,Non-English
All Detectors Mean,0.936122,0.598855
Multilingual Base Models Mean,0.929195,0.690418
Monolingual Base Models Mean,0.945359,0.476772


In [44]:
#Results just for models fine-tuned on all languages
temp = results_all.loc['all',:]
display(temp.style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None).format(na_rep=0, precision=4))
print(temp.style.format(na_rep=0, precision=4).highlight_max(props='font-weight: bold;', axis=1).applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True))

means = pd.DataFrame()
means = pd.concat([means, temp.agg('mean')], copy=False, axis=1)
means.rename(columns={0:'All Detectors Mean'}, inplace=True)
means = pd.concat([means, temp.loc[[x in multilingual for x in temp.reset_index().Model], :].agg('mean')], copy=False, axis=1)
means.rename(columns={0:'Multilingual Base Models Mean'}, inplace=True)
means = pd.concat([means, temp.loc[[x not in multilingual for x in temp.reset_index().Model], :].agg('mean')], copy=False, axis=1)
means.rename(columns={0:'Monolingual Base Models Mean'}, inplace=True)
means = means.T.style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None).format(na_rep=0, precision=4)
display(means)
temp = means.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True)
#little hack to properly format latex table
print(temp.replace('\\color[HTML]{F1F1F1} ', '\\textcolor{white}{').replace('\\color[HTML]{000000} ', '\\textcolor{black}{').replace(' & {\\cellcolor', '} & {\\cellcolor').replace(' \\\\', '} \\\\').replace('\n\\bfseries', '\n\\multicolumn{2}{r|}{\\bfseries').replace('zh} \\\\', 'zh \\\\'))

Unnamed: 0_level_0,Unnamed: 1_level_0,ar,ca,cs,de,en,es,nl,pt,ru,uk,zh
Train LLM,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
alpaca-lora-30b,bert-base-multilingual-cased,0.5096,0.9133,0.8879,0.9138,0.9657,0.9401,0.9161,0.9113,0.92,0.8928,0.7472
alpaca-lora-30b,electra-large-discriminator,0.3367,0.7431,0.4389,0.5051,0.9856,0.9502,0.7716,0.8824,0.3407,0.3326,0.3999
alpaca-lora-30b,gpt2-medium,0.8118,0.7896,0.6623,0.7747,0.9819,0.8919,0.8064,0.8307,0.8463,0.8251,0.4282
alpaca-lora-30b,mGPT,0.6955,0.8833,0.648,0.929,0.9585,0.9315,0.8215,0.9334,0.9383,0.9129,0.5555
alpaca-lora-30b,mdeberta-v3-base,0.8438,0.8062,0.8906,0.8727,0.9075,0.8643,0.8044,0.8364,0.9283,0.9331,0.7118
alpaca-lora-30b,roberta-large-openai-detector,0.804,0.701,0.8484,0.8629,0.9348,0.9363,0.8641,0.8969,0.8817,0.8487,0.7252
alpaca-lora-30b,xlm-roberta-large,0.7742,0.9283,0.96,0.9578,0.9856,0.9589,0.9481,0.9522,0.9366,0.9246,0.82
gpt-3.5-turbo,bert-base-multilingual-cased,0.8483,0.975,0.9633,0.9391,0.9819,0.9777,0.9816,0.9506,0.955,0.9313,0.9148
gpt-3.5-turbo,electra-large-discriminator,0.3501,0.7954,0.3333,0.334,0.9819,0.9657,0.4354,0.8583,0.3326,0.3326,0.3901
gpt-3.5-turbo,gpt2-medium,0.6911,0.918,0.5441,0.6283,0.9874,0.9452,0.7647,0.8586,0.848,0.6995,0.3333


\begin{tabular}{llrrrrrrrrrrr}
 &  & \bfseries ar & \bfseries ca & \bfseries cs & \bfseries de & \bfseries en & \bfseries es & \bfseries nl & \bfseries pt & \bfseries ru & \bfseries uk & \bfseries zh \\
Train LLM & Model &  &  &  &  &  &  &  &  &  &  &  \\
\multirow[c]{7}{*}{\bfseries alpaca-lora-30b} & \bfseries bert-base-multilingual-cased & 0.5096 & 0.9133 & 0.8879 & 0.9138 & \bfseries 0.9657 & 0.9401 & 0.9161 & 0.9113 & 0.9200 & 0.8928 & 0.7472 \\
\bfseries  & \bfseries electra-large-discriminator & 0.3367 & 0.7431 & 0.4389 & 0.5051 & \bfseries 0.9856 & 0.9502 & 0.7716 & 0.8824 & 0.3407 & 0.3326 & 0.3999 \\
\bfseries  & \bfseries gpt2-medium & 0.8118 & 0.7896 & 0.6623 & 0.7747 & \bfseries 0.9819 & 0.8919 & 0.8064 & 0.8307 & 0.8463 & 0.8251 & 0.4282 \\
\bfseries  & \bfseries mGPT & 0.6955 & 0.8833 & 0.6480 & 0.9290 & \bfseries 0.9585 & 0.9315 & 0.8215 & 0.9334 & 0.9383 & 0.9129 & 0.5555 \\
\bfseries  & \bfseries mdeberta-v3-base & 0.8438 & 0.8062 & 0.8906 & 0.8727 & 0.9075 & 0.8643 

Unnamed: 0,ar,ca,cs,de,en,es,nl,pt,ru,uk,zh
All Detectors Mean,0.7463,0.8765,0.7918,0.83,0.9472,0.9375,0.8407,0.9099,0.8592,0.8296,0.6558
Multilingual Base Models Mean,0.8537,0.8977,0.8604,0.9073,0.942,0.9372,0.8808,0.9253,0.956,0.9374,0.7659
Monolingual Base Models Mean,0.6031,0.8482,0.7003,0.7269,0.9542,0.938,0.7872,0.8895,0.7301,0.6859,0.509


\begin{tabular}{lrrrrrrrrrrr}
 & \bfseries ar & \bfseries ca & \bfseries cs & \bfseries de & \bfseries en & \bfseries es & \bfseries nl & \bfseries pt & \bfseries ru & \bfseries uk & \bfseries zh \\
\multicolumn{2}{r|}{\bfseries All Detectors Mean} & {\cellcolor[HTML]{A7BDDB}} \textcolor{black}{0.7463} & {\cellcolor[HTML]{8CB3D5}} \textcolor{black}{0.8765} & {\cellcolor[HTML]{9EBAD9}} \textcolor{black}{0.7918} & {\cellcolor[HTML]{96B6D7}} \textcolor{black}{0.8300} & {\cellcolor[HTML]{7EADD1}} \textcolor{black}{0.9472} & {\cellcolor[HTML]{80AED2}} \textcolor{black}{0.9375} & {\cellcolor[HTML]{94B6D7}} \textcolor{black}{0.8407} & {\cellcolor[HTML]{86B0D3}} \textcolor{black}{0.9099} & {\cellcolor[HTML]{91B5D6}} \textcolor{black}{0.8592} & {\cellcolor[HTML]{96B6D7}} \textcolor{black}{0.8296} & {\cellcolor[HTML]{B7C5DF}} \textcolor{black}{0.6558} \\
\multicolumn{2}{r|}{\bfseries Multilingual Base Models Mean} & {\cellcolor[HTML]{91B5D6}} \textcolor{black}{0.8537} & {\cellcolor[HTML]{89B1D4}

In [45]:
#Results just for English fine-tuned models with 3x more samples available
temp = results_all.loc['en3',:]
display(temp.style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None).format(na_rep=0, precision=4))
print(temp.style.format(na_rep=0, precision=4).highlight_max(props='font-weight: bold;', axis=1).applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True))

means = pd.DataFrame()
means = pd.concat([means, temp.agg('mean')], copy=False, axis=1)
means.rename(columns={0:'All Detectors Mean'}, inplace=True)
means = pd.concat([means, temp.loc[[x in multilingual for x in temp.reset_index().Model], :].agg('mean')], copy=False, axis=1)
means.rename(columns={0:'Multilingual Base Models Mean'}, inplace=True)
means = pd.concat([means, temp.loc[[x not in multilingual for x in temp.reset_index().Model], :].agg('mean')], copy=False, axis=1)
means.rename(columns={0:'Monolingual Base Models Mean'}, inplace=True)
means = means.T.style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None).format(na_rep=0, precision=4)
display(means)
temp = means.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True)
#little hack to properly format latex table
print(temp.replace('\\color[HTML]{F1F1F1} ', '\\textcolor{white}{').replace('\\color[HTML]{000000} ', '\\textcolor{black}{').replace(' & {\\cellcolor', '} & {\\cellcolor').replace(' \\\\', '} \\\\').replace('\n\\bfseries', '\n\\multicolumn{2}{r|}{\\bfseries').replace('zh} \\\\', 'zh \\\\'))

Unnamed: 0_level_0,Unnamed: 1_level_0,ar,ca,cs,de,en,es,nl,pt,ru,uk,zh
Train LLM,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
alpaca-lora-30b,bert-base-multilingual-cased,0.4951,0.8743,0.86,0.9053,0.9838,0.7617,0.9079,0.8579,0.8461,0.8144,0.5719
alpaca-lora-30b,electra-large-discriminator,0.377,0.4539,0.4389,0.3827,0.9892,0.5344,0.4379,0.4684,0.3363,0.3333,0.4711
alpaca-lora-30b,gpt2-medium,0.3656,0.442,0.4544,0.4038,0.9928,0.4063,0.4297,0.3814,0.3854,0.4195,0.4141
alpaca-lora-30b,mGPT,0.293,0.7623,0.4958,0.885,0.9801,0.736,0.8594,0.8655,0.8209,0.8043,0.4268
alpaca-lora-30b,mdeberta-v3-base,0.1764,0.6872,0.8096,0.8529,0.9385,0.6356,0.7888,0.734,0.7332,0.6824,0.4177
alpaca-lora-30b,roberta-large-openai-detector,0.3448,0.4963,0.4613,0.4216,0.9711,0.3945,0.6572,0.3721,0.3571,0.434,0.4346
alpaca-lora-30b,xlm-roberta-large,0.3306,0.8378,0.8782,0.9425,0.9874,0.6985,0.9007,0.7872,0.8294,0.8524,0.5984
gpt-3.5-turbo,bert-base-multilingual-cased,0.9449,0.9178,0.9383,0.9257,0.9819,0.8718,0.9499,0.9298,0.91,0.8946,0.9083
gpt-3.5-turbo,electra-large-discriminator,0.4179,0.4001,0.3407,0.3524,0.9856,0.6747,0.4557,0.5561,0.4058,0.3972,0.3132
gpt-3.5-turbo,gpt2-medium,0.369,0.3407,0.337,0.345,0.991,0.3301,0.3432,0.3313,0.3658,0.3539,0.3333


\begin{tabular}{llrrrrrrrrrrr}
 &  & \bfseries ar & \bfseries ca & \bfseries cs & \bfseries de & \bfseries en & \bfseries es & \bfseries nl & \bfseries pt & \bfseries ru & \bfseries uk & \bfseries zh \\
Train LLM & Model &  &  &  &  &  &  &  &  &  &  &  \\
\multirow[c]{7}{*}{\bfseries alpaca-lora-30b} & \bfseries bert-base-multilingual-cased & 0.4951 & 0.8743 & 0.8600 & 0.9053 & \bfseries 0.9838 & 0.7617 & 0.9079 & 0.8579 & 0.8461 & 0.8144 & 0.5719 \\
\bfseries  & \bfseries electra-large-discriminator & 0.3770 & 0.4539 & 0.4389 & 0.3827 & \bfseries 0.9892 & 0.5344 & 0.4379 & 0.4684 & 0.3363 & 0.3333 & 0.4711 \\
\bfseries  & \bfseries gpt2-medium & 0.3656 & 0.4420 & 0.4544 & 0.4038 & \bfseries 0.9928 & 0.4063 & 0.4297 & 0.3814 & 0.3854 & 0.4195 & 0.4141 \\
\bfseries  & \bfseries mGPT & 0.2930 & 0.7623 & 0.4958 & 0.8850 & \bfseries 0.9801 & 0.7360 & 0.8594 & 0.8655 & 0.8209 & 0.8043 & 0.4268 \\
\bfseries  & \bfseries mdeberta-v3-base & 0.1764 & 0.6872 & 0.8096 & 0.8529 & \bfseries 0.9385

Unnamed: 0,ar,ca,cs,de,en,es,nl,pt,ru,uk,zh
All Detectors Mean,0.4931,0.6568,0.627,0.6871,0.9529,0.6476,0.68,0.6497,0.5759,0.5716,0.5299
Multilingual Base Models Mean,0.5605,0.7484,0.7289,0.8244,0.9392,0.7156,0.7778,0.7508,0.7092,0.7118,0.616
Monolingual Base Models Mean,0.4033,0.5346,0.4912,0.504,0.9712,0.557,0.5497,0.515,0.3981,0.3847,0.4152


\begin{tabular}{lrrrrrrrrrrr}
 & \bfseries ar & \bfseries ca & \bfseries cs & \bfseries de & \bfseries en & \bfseries es & \bfseries nl & \bfseries pt & \bfseries ru & \bfseries uk & \bfseries zh \\
\multicolumn{2}{r|}{\bfseries All Detectors Mean} & {\cellcolor[HTML]{D1D2E6}} \textcolor{black}{0.4931} & {\cellcolor[HTML]{B5C4DF}} \textcolor{black}{0.6568} & {\cellcolor[HTML]{BBC7E0}} \textcolor{black}{0.6270} & {\cellcolor[HTML]{B1C2DE}} \textcolor{black}{0.6871} & {\cellcolor[HTML]{7EADD1}} \textcolor{black}{0.9529} & {\cellcolor[HTML]{B8C6E0}} \textcolor{black}{0.6476} & {\cellcolor[HTML]{B1C2DE}} \textcolor{black}{0.6800} & {\cellcolor[HTML]{B7C5DF}} \textcolor{black}{0.6497} & {\cellcolor[HTML]{C4CBE3}} \textcolor{black}{0.5759} & {\cellcolor[HTML]{C4CBE3}} \textcolor{black}{0.5716} & {\cellcolor[HTML]{CCCFE5}} \textcolor{black}{0.5299} \\
\multicolumn{2}{r|}{\bfseries Multilingual Base Models Mean} & {\cellcolor[HTML]{C6CCE3}} \textcolor{black}{0.5605} & {\cellcolor[HTML]{A7BDDB}

### Statistics

In [46]:
import scipy.stats as stats
import itertools
from statsmodels.stats.anova import AnovaRM

In [47]:
#in the submitted paper (correlation also from all and en3)
results_all_multilingual[sorted_languages].corr().style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None).format(na_rep=0, precision=4)

Unnamed: 0,en,de,nl,es,pt,ca,cs,ru,uk,ar,zh
en,1.0,0.5248,0.5249,0.3774,0.4949,0.424,-0.087,-0.1903,-0.3074,-0.1778,0.0412
de,0.5248,1.0,0.6765,0.7511,0.8161,0.7022,0.2435,0.3652,0.2556,0.3026,0.2963
nl,0.5249,0.6765,1.0,0.5792,0.7243,0.8492,0.0985,0.3641,0.2168,0.389,0.51
es,0.3774,0.7511,0.5792,1.0,0.9243,0.7478,0.2131,0.4184,0.3518,0.4588,0.3148
pt,0.4949,0.8161,0.7243,0.9243,1.0,0.8457,0.1508,0.3831,0.2955,0.3898,0.3068
ca,0.424,0.7022,0.8492,0.7478,0.8457,1.0,0.2092,0.4019,0.2981,0.4311,0.4134
cs,-0.087,0.2435,0.0985,0.2131,0.1508,0.2092,1.0,0.3783,0.4723,0.4136,0.4772
ru,-0.1903,0.3652,0.3641,0.4184,0.3831,0.4019,0.3783,1.0,0.8976,0.7622,0.5121
uk,-0.3074,0.2556,0.2168,0.3518,0.2955,0.2981,0.4723,0.8976,1.0,0.7574,0.5292
ar,-0.1778,0.3026,0.389,0.4588,0.3898,0.4311,0.4136,0.7622,0.7574,1.0,0.7178


In [48]:
#there are just small differences when only monolingual generalization correlated
temp = results_all_multilingual.reset_index()
temp[~temp['Train Language'].isin(["en3", "all"])][sorted_languages].corr().style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None).format(na_rep=0, precision=4)

Unnamed: 0,en,de,nl,es,pt,ca,cs,ru,uk,ar,zh
en,1.0,0.542,0.5551,0.3794,0.496,0.4237,-0.1613,-0.3235,-0.4988,-0.2672,-0.0033
de,0.542,1.0,0.6006,0.7657,0.8022,0.6491,0.2176,0.199,0.0784,0.1954,0.1656
nl,0.5551,0.6006,1.0,0.5585,0.6905,0.8342,0.0626,0.2403,0.0503,0.3516,0.4694
es,0.3794,0.7657,0.5585,1.0,0.9317,0.7331,0.157,0.1774,0.1224,0.2989,0.2015
pt,0.496,0.8022,0.6905,0.9317,1.0,0.8251,0.0893,0.1321,0.0528,0.2483,0.185
ca,0.4237,0.6491,0.8342,0.7331,0.8251,1.0,0.1521,0.2103,0.0826,0.3345,0.316
cs,-0.1613,0.2176,0.0626,0.157,0.0893,0.1521,1.0,0.369,0.4489,0.4264,0.45
ru,-0.3235,0.199,0.2403,0.1774,0.1321,0.2103,0.369,1.0,0.8606,0.7378,0.4463
uk,-0.4988,0.0784,0.0503,0.1224,0.0528,0.0826,0.4489,0.8606,1.0,0.7398,0.4664
ar,-0.2672,0.1954,0.3516,0.2989,0.2483,0.3345,0.4264,0.7378,0.7398,1.0,0.7249


In [49]:
tex_temp = temp[~temp['Train Language'].isin(["en3", "all"])][sorted_languages].corr().style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None).format(na_rep=0, precision=4)
print(tex_temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True))

\begin{tabular}{lrrrrrrrrrrr}
 & \bfseries en & \bfseries de & \bfseries nl & \bfseries es & \bfseries pt & \bfseries ca & \bfseries cs & \bfseries ru & \bfseries uk & \bfseries ar & \bfseries zh \\
\bfseries en & {\cellcolor[HTML]{73A9CF}} \color[HTML]{000000} 1.0000 & {\cellcolor[HTML]{C9CEE4}} \color[HTML]{000000} 0.5420 & {\cellcolor[HTML]{C6CCE3}} \color[HTML]{000000} 0.5551 & {\cellcolor[HTML]{DEDCEC}} \color[HTML]{000000} 0.3794 & {\cellcolor[HTML]{D1D2E6}} \color[HTML]{000000} 0.4960 & {\cellcolor[HTML]{D9D8EA}} \color[HTML]{000000} 0.4237 & {\cellcolor[HTML]{FFF7FB}} \color[HTML]{000000} -0.1613 & {\cellcolor[HTML]{FFF7FB}} \color[HTML]{000000} -0.3235 & {\cellcolor[HTML]{FFF7FB}} \color[HTML]{000000} -0.4988 & {\cellcolor[HTML]{FFF7FB}} \color[HTML]{000000} -0.2672 & {\cellcolor[HTML]{FFF7FB}} \color[HTML]{000000} -0.0033 \\
\bfseries de & {\cellcolor[HTML]{C9CEE4}} \color[HTML]{000000} 0.5420 & {\cellcolor[HTML]{73A9CF}} \color[HTML]{000000} 1.0000 & {\cellcolor[HTML]{C0C9E2

In [50]:
df = temp[sorted_languages]

In [51]:
langs = sorted_languages
df = temp[~temp['Train Language'].isin(["en3", "all"])][sorted_languages]

In [52]:
res_df = pd.DataFrame()
for (src, trg) in itertools.combinations_with_replacement(langs, 2):
  print(src, trg)

  res = stats.pearsonr(df[src], df[trg])

  print(res)
  print(res.confidence_interval(0.95))

  res_df = pd.concat([res_df, pd.DataFrame({'src':src, 'trg':trg, 'PearsonR p-value':res[1], '0.95 Confidence Intrval low':res.confidence_interval(0.95)[0], '0.95 Confidence Intrval high':res.confidence_interval(0.95)[1]}, index=[0])], ignore_index=True)

en en
PearsonRResult(statistic=1.0, pvalue=0.0)
ConfidenceInterval(low=1.0, high=1.0)
en de
PearsonRResult(statistic=0.5420339360665481, pvalue=1.17353745159646e-08)
ConfidenceInterval(low=0.383189237088439, high=0.6697391976407968)
en nl
PearsonRResult(statistic=0.555080654338777, pvalue=4.383673414223362e-09)
ConfidenceInterval(low=0.3989974926376806, high=0.6799030022255345)
en es
PearsonRResult(statistic=0.3793888761599111, pvalue=0.0001377950137024026)
ConfidenceInterval(low=0.19363091270041108, high=0.5388859998238579)
en pt
PearsonRResult(statistic=0.4959527564251869, pvalue=2.77425475348252e-07)
ConfidenceInterval(low=0.3280892353651722, high=0.6334534392972023)
en ca
PearsonRResult(statistic=0.4237135259461428, pvalue=1.695622254234994e-05)
ConfidenceInterval(low=0.24395086769690397, high=0.5753263027746047)
en cs
PearsonRResult(statistic=-0.16132211381149508, pvalue=0.11636259493873008)
ConfidenceInterval(low=-0.3504727815031259, high=0.04047283619885538)
en ru
PearsonRResult

In [53]:
#res_df[res_df['PearsonR p-value'] >0.05]
res_df#.style.format(na_rep=0, precision=4)

Unnamed: 0,src,trg,PearsonR p-value,0.95 Confidence Intrval low,0.95 Confidence Intrval high
0,en,en,0.0,1.0,1.0
1,en,de,1.173537e-08,0.383189,0.669739
2,en,nl,4.383673e-09,0.398997,0.679903
3,en,es,0.000137795,0.193631,0.538886
4,en,pt,2.774255e-07,0.328089,0.633453
5,en,ca,1.695622e-05,0.243951,0.575326
6,en,cs,0.1163626,-0.350473,0.040473
7,en,ru,0.00130209,-0.492111,-0.131597
8,en,uk,2.307428e-07,-0.635739,-0.331499
9,en,ar,0.008500537,-0.443886,-0.070465


In [54]:
#result only for not statistically significant correlations (whether these are high or low)
for (src, trg) in itertools.combinations_with_replacement(langs, 2):
  res = stats.pearsonr(df[src], df[trg])
  if (res.pvalue < 0.05): continue #or (res.statistic < 0.1)
  print(src, trg)
  print(res)
  print(res.confidence_interval(0.95))

en cs
PearsonRResult(statistic=-0.16132211381149508, pvalue=0.11636259493873008)
ConfidenceInterval(low=-0.3504727815031259, high=0.04047283619885538)
en zh
PearsonRResult(statistic=-0.0033334191839927835, pvalue=0.9742862799054467)
ConfidenceInterval(low=-0.20368324142318447, high=0.19728437119169254)
de ru
PearsonRResult(statistic=0.19897529354311413, pvalue=0.051956517078766494)
ConfidenceInterval(low=-0.0015734173288190019, high=0.3841373279247955)
de uk
PearsonRResult(statistic=0.0784088274615922, pvalue=0.4476414481117586)
ConfidenceInterval(low=-0.12402680197468402, high=0.2745784352519565)
de ar
PearsonRResult(statistic=0.19538045105555943, pvalue=0.056432378613650254)
ConfidenceInterval(low=-0.005313633985096929, high=0.38094440905631666)
de zh
PearsonRResult(statistic=0.16558345384819556, pvalue=0.10690142122444854)
ConfidenceInterval(low=-0.03610093913051338, high=0.3543074006052856)
nl cs
PearsonRResult(statistic=0.06262311443806762, pvalue=0.544424113712105)
ConfidenceInte

In [55]:
temp['subject_id'] = temp['Model'] + '_' + temp['Train LLM']
temp['within'] = temp['Train Language']

In [56]:
print(AnovaRM(data=temp, depvar='en', subject='subject_id', within=['within']).fit())

                Anova
       F Value Num DF  Den DF  Pr > F
-------------------------------------
within 49.4569 4.0000 124.0000 0.0000



In [57]:
for lang in sorted_languages:
  print(lang)
  display(AnovaRM(data=temp, depvar=lang, subject='subject_id', within=['within']).fit().anova_table.style.format(na_rep=0, precision=25))

en


Unnamed: 0,F Value,Num DF,Den DF,Pr > F
within,49.45686753965068,4.0,124.0,0.0


de


Unnamed: 0,F Value,Num DF,Den DF,Pr > F
within,16.6094980607195,4.0,124.0,6.34022e-11


nl


Unnamed: 0,F Value,Num DF,Den DF,Pr > F
within,12.387269278473237,4.0,124.0,1.65894866e-08


es


Unnamed: 0,F Value,Num DF,Den DF,Pr > F
within,49.21631443668045,4.0,124.0,0.0


pt


Unnamed: 0,F Value,Num DF,Den DF,Pr > F
within,35.09087324122684,4.0,124.0,0.0


ca


Unnamed: 0,F Value,Num DF,Den DF,Pr > F
within,20.654338117009488,4.0,124.0,4.605e-13


cs


Unnamed: 0,F Value,Num DF,Den DF,Pr > F
within,12.757869612795695,4.0,124.0,9.9871265e-09


ru


Unnamed: 0,F Value,Num DF,Den DF,Pr > F
within,48.87566275986906,4.0,124.0,0.0


uk


Unnamed: 0,F Value,Num DF,Den DF,Pr > F
within,40.47846642920844,4.0,124.0,0.0


ar


Unnamed: 0,F Value,Num DF,Den DF,Pr > F
within,38.51972652184127,4.0,124.0,0.0


zh


Unnamed: 0,F Value,Num DF,Den DF,Pr > F
within,19.20151694674624,4.0,124.0,2.5877e-12


In [58]:
temp

Unnamed: 0,Train Language,Train LLM,Model,ar,ca,cs,de,en,es,nl,pt,ru,uk,zh,subject_id,within
0,en,alpaca-lora-30b,bert-base-multilingual-cased,0.537542,0.827078,0.85456,0.891714,0.956665,0.751676,0.856264,0.805481,0.837419,0.809057,0.55369,bert-base-multilingual-cased_alpaca-lora-30b,en
1,en,alpaca-lora-30b,mGPT,0.402388,0.808877,0.613177,0.876305,0.963891,0.738976,0.879058,0.833282,0.816224,0.821046,0.462625,mGPT_alpaca-lora-30b,en
2,en,alpaca-lora-30b,mdeberta-v3-base,0.207984,0.859243,0.769121,0.900331,0.943947,0.774384,0.897682,0.857008,0.798806,0.756265,0.309948,mdeberta-v3-base_alpaca-lora-30b,en
3,en,alpaca-lora-30b,xlm-roberta-large,0.447388,0.848775,0.871277,0.932405,0.980144,0.631892,0.770589,0.712993,0.873298,0.831866,0.447399,xlm-roberta-large_alpaca-lora-30b,en
4,en,gpt-3.5-turbo,bert-base-multilingual-cased,0.921465,0.89038,0.914981,0.902026,0.978339,0.854525,0.934819,0.91243,0.918328,0.896247,0.893291,bert-base-multilingual-cased_gpt-3.5-turbo,en
5,en,gpt-3.5-turbo,mGPT,0.700916,0.926614,0.337026,0.907073,0.989169,0.850593,0.943226,0.914732,0.901331,0.907969,0.561038,mGPT_gpt-3.5-turbo,en
6,en,gpt-3.5-turbo,mdeberta-v3-base,0.787647,0.559878,0.780318,0.805155,0.717787,0.602325,0.779041,0.678157,0.648425,0.764939,0.673301,mdeberta-v3-base_gpt-3.5-turbo,en
7,en,gpt-3.5-turbo,xlm-roberta-large,0.72182,0.973319,0.72115,0.939164,0.983754,0.786081,0.922878,0.928099,0.883877,0.629046,0.515536,xlm-roberta-large_gpt-3.5-turbo,en
8,en,gpt-4,bert-base-multilingual-cased,0.841926,0.926529,0.759176,0.816308,0.976534,0.826112,0.926544,0.870419,0.792921,0.710398,0.836037,bert-base-multilingual-cased_gpt-4,en
9,en,gpt-4,mGPT,0.694497,0.788771,0.386602,0.836477,0.990975,0.79537,0.83337,0.865765,0.800774,0.784926,0.473412,mGPT_gpt-4,en


In [59]:
#T-test for all combinations of train languages
for (src, trg) in itertools.combinations_with_replacement(['en', 'es', 'ru', 'all', 'en3'], 2):
  for lang in sorted_languages:
   if src == trg: continue
   print(f"Test language: {lang}, ({trg}, {src})")
   res = stats.ttest_rel(temp[temp["Train Language"] == trg][lang], temp[temp["Train Language"] == src][lang])
   print(stats.bayes_mvs(temp[temp["Train Language"] == trg][lang])[0])
   print(stats.bayes_mvs(temp[temp["Train Language"] == src][lang])[0])
   print(res)

Test language: en, (es, en)
Mean(statistic=0.7321654431554827, minmax=(0.6716763103740118, 0.7926545759369537))
Mean(statistic=0.9291952750903931, minmax=(0.9094486936553241, 0.9489418565254621))
TtestResult(statistic=-5.579616536059947, pvalue=4.0759617309541465e-06, df=31)
Test language: de, (es, en)
Mean(statistic=0.881243868426579, minmax=(0.8551394981190868, 0.9073482387340711))
Mean(statistic=0.8104183690222857, minmax=(0.7773688548598876, 0.8434678831846838))
TtestResult(statistic=3.749296999557661, pvalue=0.0007294359059912015, df=31)
Test language: nl, (es, en)
Mean(statistic=0.8143354015523292, minmax=(0.7608956217257862, 0.8677751813788723))
Mean(statistic=0.7508473921257937, minmax=(0.692139277952657, 0.8095555062989304))
TtestResult(statistic=2.4678731210511757, pvalue=0.01931370019530848, df=31)
Test language: es, (es, en)
Mean(statistic=0.9313660269095141, minmax=(0.9124199669949031, 0.950312086824125))
Mean(statistic=0.7018062196286831, minmax=(0.6628381900645105, 0.740

In [60]:
#T-test results for only combinations with pvalue >= 0.05 ~ differences between combinations not statistically significant
for (src, trg) in itertools.combinations_with_replacement(['en', 'es', 'ru', 'all', 'en3'], 2):
  for lang in sorted_languages:
   if src == trg: continue
   res = stats.ttest_rel(temp[temp["Train Language"] == trg][lang], temp[temp["Train Language"] == src][lang])
   if (res.pvalue < 0.05): continue
   print(f"Test language: {lang}, ({trg}, {src})")
   print(stats.bayes_mvs(temp[temp["Train Language"] == trg][lang])[0])
   print(stats.bayes_mvs(temp[temp["Train Language"] == src][lang])[0])
   print(res)

Test language: de, (ru, en)
Mean(statistic=0.7591294340362715, minmax=(0.7074880498822876, 0.8107708181902554))
Mean(statistic=0.8104183690222857, minmax=(0.7773688548598876, 0.8434678831846838))
TtestResult(statistic=-2.016026017954804, pvalue=0.05253412814807076, df=31)
Test language: nl, (ru, en)
Mean(statistic=0.6915329002535896, minmax=(0.6320598881927483, 0.751005912314431))
Mean(statistic=0.7508473921257937, minmax=(0.692139277952657, 0.8095555062989304))
TtestResult(statistic=-1.5580379318232784, pvalue=0.1293759666006121, df=31)
Test language: es, (ru, en)
Mean(statistic=0.6884135445954014, minmax=(0.6267347019692713, 0.7500923872215316))
Mean(statistic=0.7018062196286831, minmax=(0.6628381900645105, 0.7407742491928557))
TtestResult(statistic=-0.37635146769881617, pvalue=0.7092192433603035, df=31)
Test language: ca, (ru, en)
Mean(statistic=0.653202923386915, minmax=(0.5962775897825391, 0.7101282569912909))
Mean(statistic=0.7335083001418823, minmax=(0.6719409589258563, 0.795075

## RQ4 Cross-Generator Generalization

In [61]:
#How does a detectors trained on one LLM perform on different LLMs?
def analyze_llm_for_train_language_per_llm(results_list, train_language, test_llm):
  results = pd.DataFrame()
  for detector in tqdm(results_list, total=len(results_list)):
    for detector_name, detector_data in detector.items():
      for llm in ['text-davinci-003', 'gpt-3.5-turbo', 'gpt-4', 'alpaca-lora-30b', 'vicuna-13b', 'llama-65b', 'opt-66b', 'opt-iml-max-1.3b', 'all']:
        if f'-{train_language}-' not in detector_name: continue
        if f'-{llm}.' not in detector_name and 'all' not in llm: continue
        if 'all' in test_llm:
          temp = detector_data
        else:
          temp = detector_data[(detector_data.multi_label.str.contains(test_llm) | detector_data.multi_label.str.contains('human'))]
        if len(temp.label.unique()) < 2: continue
        #results = pd.concat([results, cr2df(temp['label'], temp['predictions'], detector_name)], copy=False, ignore_index=True)
        if optimize_threshold and 'prediction_probs' in temp.columns:
          optimal_threshold = 0.5
          labels = [label2id[x] for x in temp['label']]
          predictions = [label2id[x] for x in temp['predictions']]
          temp = temp.fillna(0.0)
          temp['prediction_probs'] = temp['prediction_probs'].astype(float)
          temp.loc[temp.predictions == 'human', 'prediction_probs'] = 1 - temp['prediction_probs']
          if (optimize_threshold):
            fpr, tpr, thresholds = roc_curve(labels, temp['prediction_probs'])
            optimal_threshold = thresholds[np.argmax(tpr - fpr)]
            #optimal_threshold = thresholds[fpr <= 0.05][-1] #get threshold for 5% FPR
          preds = ["machine" if ((y > optimal_threshold)) else "human" for x,y in zip(temp['predictions'],temp['prediction_probs'])]
        else:
          preds = temp['predictions']
        scores = cr2df(temp['label'], preds, detector_name)
        results = pd.concat([results, scores], copy=False, ignore_index=True)
  temp = results.sort_values(by=['Macro avg F1-score'], ascending=False).reset_index(drop=True)
  return temp

In [62]:
%%time
#How does a detectorstrained on one LLM perform on different LLMs?
results_all = pd.DataFrame()
for train_language in ['en', 'es', 'ru', 'all', 'en3']:
  results = pd.DataFrame()
  for test_llm in ['text-davinci-003', 'gpt-3.5-turbo', 'gpt-4', 'alpaca-lora-30b', 'vicuna-13b', 'llama-65b', 'opt-66b', 'opt-iml-max-1.3b', 'all']:
    temp = analyze_llm_for_train_language_per_llm(test_results, train_language, test_llm)
    #temp = temp[~temp['Train LLM'].str.contains('all')]
    temp = temp[['Train Language', 'Train LLM', 'Model', 'Macro avg F1-score']]
    temp = temp.sort_values(by=['Train Language', 'Train LLM', 'Model'])
    temp = temp.set_index(['Train Language', 'Train LLM', 'Model'])
    temp.rename(columns={'Macro avg F1-score': test_llm}, inplace=True)
    if len(results) > 0: temp = temp[test_llm]
    results = pd.concat([results, temp], copy=False, axis=1)
    #break
  results_all = pd.concat([results_all, results], copy=False)
  #break

100%|██████████| 324/324 [00:40<00:00,  7.99it/s]
100%|██████████| 324/324 [00:41<00:00,  7.80it/s]
100%|██████████| 324/324 [00:41<00:00,  7.81it/s]
100%|██████████| 324/324 [00:41<00:00,  7.73it/s]
100%|██████████| 324/324 [00:41<00:00,  7.79it/s]
100%|██████████| 324/324 [00:40<00:00,  7.98it/s]
100%|██████████| 324/324 [00:43<00:00,  7.43it/s]
100%|██████████| 324/324 [00:40<00:00,  8.01it/s]
100%|██████████| 324/324 [03:03<00:00,  1.77it/s]
100%|██████████| 324/324 [00:41<00:00,  7.86it/s]
100%|██████████| 324/324 [00:40<00:00,  8.05it/s]
100%|██████████| 324/324 [00:41<00:00,  7.77it/s]
100%|██████████| 324/324 [00:44<00:00,  7.34it/s]
100%|██████████| 324/324 [00:41<00:00,  7.75it/s]
100%|██████████| 324/324 [00:41<00:00,  7.77it/s]
100%|██████████| 324/324 [00:41<00:00,  7.82it/s]
100%|██████████| 324/324 [00:39<00:00,  8.16it/s]
100%|██████████| 324/324 [03:05<00:00,  1.75it/s]
100%|██████████| 324/324 [00:41<00:00,  7.84it/s]
100%|██████████| 324/324 [00:41<00:00,  7.78it/s]


CPU times: user 42min 21s, sys: 8.63 s, total: 42min 29s
Wall time: 43min





In [63]:
rq4_results_all = results_all

In [64]:
results_all = rq4_results_all

In [65]:
new_order = ['gpt-4', 'gpt-3.5-turbo', 'text-davinci-003', 'vicuna-13b', 'alpaca-lora-30b', 'opt-iml-max-1.3b', 'llama-65b', 'opt-66b', 'all']
results_all = results_all.reset_index().set_index(['Train Language', 'Train LLM', 'Model'])
results_all = results_all[new_order]

In [66]:
results_all

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gpt-4,gpt-3.5-turbo,text-davinci-003,vicuna-13b,alpaca-lora-30b,opt-iml-max-1.3b,llama-65b,opt-66b,all
Train Language,Train LLM,Model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
en,all,bert-base-multilingual-cased,0.61307,0.613582,0.605591,0.604411,0.591141,0.521586,0.60525,0.545696,0.628257
en,all,electra-large-discriminator,0.572043,0.524899,0.499486,0.569871,0.524444,0.597828,0.62745,0.560828,0.555896
en,all,gpt2-medium,0.490022,0.487222,0.485969,0.53565,0.471993,0.567807,0.634057,0.555311,0.484911
en,all,mGPT,0.517999,0.519259,0.515067,0.521795,0.512592,0.455756,0.523695,0.496428,0.572694
en,all,mdeberta-v3-base,0.575883,0.57515,0.566313,0.568594,0.543716,0.509931,0.568337,0.520285,0.614824
en,all,roberta-large-openai-detector,0.427121,0.427305,0.42724,0.426098,0.426897,0.424688,0.424856,0.425894,0.554132
en,all,xlm-roberta-large,0.447902,0.447416,0.447934,0.447804,0.447578,0.44057,0.445515,0.437878,0.567889
en,alpaca-lora-30b,bert-base-multilingual-cased,0.796355,0.823652,0.810178,0.762162,0.788933,0.486282,0.433616,0.431465,0.540083
en,alpaca-lora-30b,bert-base-multilingual-cased,0.796355,0.823652,0.810178,0.762162,0.788933,0.486282,0.433616,0.431465,0.540083
en,alpaca-lora-30b,electra-large-discriminator,0.500101,0.524322,0.53558,0.545989,0.569812,0.516792,0.405988,0.459469,0.314809


In [67]:
results_all = results_all.reset_index()

In [68]:
results_all.style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None).format(na_rep=0, precision=4)

Unnamed: 0,Train Language,Train LLM,Model,gpt-4,gpt-3.5-turbo,text-davinci-003,vicuna-13b,alpaca-lora-30b,opt-iml-max-1.3b,llama-65b,opt-66b,all
0,en,all,bert-base-multilingual-cased,0.6131,0.6136,0.6056,0.6044,0.5911,0.5216,0.6052,0.5457,0.6283
1,en,all,electra-large-discriminator,0.572,0.5249,0.4995,0.5699,0.5244,0.5978,0.6275,0.5608,0.5559
2,en,all,gpt2-medium,0.49,0.4872,0.486,0.5357,0.472,0.5678,0.6341,0.5553,0.4849
3,en,all,mGPT,0.518,0.5193,0.5151,0.5218,0.5126,0.4558,0.5237,0.4964,0.5727
4,en,all,mdeberta-v3-base,0.5759,0.5751,0.5663,0.5686,0.5437,0.5099,0.5683,0.5203,0.6148
5,en,all,roberta-large-openai-detector,0.4271,0.4273,0.4272,0.4261,0.4269,0.4247,0.4249,0.4259,0.5541
6,en,all,xlm-roberta-large,0.4479,0.4474,0.4479,0.4478,0.4476,0.4406,0.4455,0.4379,0.5679
7,en,alpaca-lora-30b,bert-base-multilingual-cased,0.7964,0.8237,0.8102,0.7622,0.7889,0.4863,0.4336,0.4315,0.5401
8,en,alpaca-lora-30b,bert-base-multilingual-cased,0.7964,0.8237,0.8102,0.7622,0.7889,0.4863,0.4336,0.4315,0.5401
9,en,alpaca-lora-30b,electra-large-discriminator,0.5001,0.5243,0.5356,0.546,0.5698,0.5168,0.406,0.4595,0.3148


In [69]:
results_all.drop(columns=['all'], inplace=True)

In [70]:
results_all.corr().style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None).format(na_rep=0, precision=4)

  results_all.corr().style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None).format(na_rep=0, precision=4)


Unnamed: 0,gpt-4,gpt-3.5-turbo,text-davinci-003,vicuna-13b,alpaca-lora-30b,opt-iml-max-1.3b,llama-65b,opt-66b
gpt-4,1.0,0.9695,0.9075,0.883,0.8052,-0.4841,-0.4342,-0.4331
gpt-3.5-turbo,0.9695,1.0,0.9633,0.9016,0.8706,-0.4759,-0.4858,-0.4592
text-davinci-003,0.9075,0.9633,1.0,0.926,0.9366,-0.3625,-0.4321,-0.3408
vicuna-13b,0.883,0.9016,0.926,1.0,0.9301,-0.1327,-0.1718,-0.1032
alpaca-lora-30b,0.8052,0.8706,0.9366,0.9301,1.0,-0.1316,-0.2379,-0.13
opt-iml-max-1.3b,-0.4841,-0.4759,-0.3625,-0.1327,-0.1316,1.0,0.7839,0.921
llama-65b,-0.4342,-0.4858,-0.4321,-0.1718,-0.2379,0.7839,1.0,0.8419
opt-66b,-0.4331,-0.4592,-0.3408,-0.1032,-0.13,0.921,0.8419,1.0


In [71]:
results_all = results_all[~results_all['Train Language'].isin(['en3', 'all'])]

In [72]:
results_all = results_all[results_all['Train LLM'] != 'all']

In [73]:
results_all.corr().style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None).format(na_rep=0, precision=4)

  results_all.corr().style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None).format(na_rep=0, precision=4)


Unnamed: 0,gpt-4,gpt-3.5-turbo,text-davinci-003,vicuna-13b,alpaca-lora-30b,opt-iml-max-1.3b,llama-65b,opt-66b
gpt-4,1.0,0.9712,0.9005,0.8786,0.7781,-0.5218,-0.4868,-0.4779
gpt-3.5-turbo,0.9712,1.0,0.9585,0.9056,0.8562,-0.4872,-0.5131,-0.4805
text-davinci-003,0.9005,0.9585,1.0,0.9381,0.9357,-0.3574,-0.4537,-0.3444
vicuna-13b,0.8786,0.9056,0.9381,1.0,0.9268,-0.1632,-0.2221,-0.1273
alpaca-lora-30b,0.7781,0.8562,0.9357,0.9268,1.0,-0.1226,-0.287,-0.1261
opt-iml-max-1.3b,-0.5218,-0.4872,-0.3574,-0.1632,-0.1226,1.0,0.699,0.9011
llama-65b,-0.4868,-0.5131,-0.4537,-0.2221,-0.287,0.699,1.0,0.7721
opt-66b,-0.4779,-0.4805,-0.3444,-0.1273,-0.1261,0.9011,0.7721,1.0


In [74]:
tex_temp = results_all.corr().style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None).format(na_rep=0, precision=4)
print(tex_temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True))

\begin{tabular}{lrrrrrrrr}
 & \bfseries gpt-4 & \bfseries gpt-3.5-turbo & \bfseries text-davinci-003 & \bfseries vicuna-13b & \bfseries alpaca-lora-30b & \bfseries opt-iml-max-1.3b & \bfseries llama-65b & \bfseries opt-66b \\
\bfseries gpt-4 & {\cellcolor[HTML]{73A9CF}} \color[HTML]{000000} 1.0000 & {\cellcolor[HTML]{79ABD0}} \color[HTML]{000000} 0.9712 & {\cellcolor[HTML]{88B1D4}} \color[HTML]{000000} 0.9005 & {\cellcolor[HTML]{8CB3D5}} \color[HTML]{000000} 0.8786 & {\cellcolor[HTML]{A1BBDA}} \color[HTML]{000000} 0.7781 & {\cellcolor[HTML]{FFF7FB}} \color[HTML]{000000} -0.5218 & {\cellcolor[HTML]{FFF7FB}} \color[HTML]{000000} -0.4868 & {\cellcolor[HTML]{FFF7FB}} \color[HTML]{000000} -0.4779 \\
\bfseries gpt-3.5-turbo & {\cellcolor[HTML]{79ABD0}} \color[HTML]{000000} 0.9712 & {\cellcolor[HTML]{73A9CF}} \color[HTML]{000000} 1.0000 & {\cellcolor[HTML]{7DACD1}} \color[HTML]{000000} 0.9585 & {\cellcolor[HTML]{88B1D4}} \color[HTML]{000000} 0.9056 & {\cellcolor[HTML]{91B5D6}} \color[HTML]{00

  tex_temp = results_all.corr().style.background_gradient(cmap=bg_cmap, vmin=bg_vmin, vmax=bg_vmax, text_color_threshold=bg_text_color_threshold, axis=None).format(na_rep=0, precision=4)


In [75]:
df = results_all
for (src, trg) in itertools.combinations_with_replacement(new_order[:-1], 2):
  if src == trg: continue
  res = stats.pearsonr(df[src], df[trg])
  if (res.pvalue < 0.05): continue #or (res.statistic < 0.1)
  print(src, trg)
  print(res)
  print(res.confidence_interval(0.95))