# Evaluation

In this notebook we evaluate our two models (RoBERTa-Sentinel and T5-Sentinel) together with two baseline detectors (ZeroGPT and the OpenAI detector) on three test sets: OpenGPTText-Final, OpenGPTText-Original and GPT2-Output.

In [1]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path
from transformers import RobertaModel, RobertaTokenizer

from tqdm.notebook import tqdm

from Model import SentinelNonLinear, Sentinel, T5Sentinel
from Dataset import OpenGPTDataset, GPT2_OutputDataset, download_gpt2
from memoizer import memoize, load_cache

# Identifier passed to `memoize` as its second argument by the cells below;
# the cache messages in the outputs report it as the generator of each cache.
SELF_NAME = "evaluate_experiment.ipynb"

In [2]:
# Project root, taken as three levels above the current working directory.
PATH_ROOT = Path.cwd().parent.parent.parent

# Dataset locations.
PATH_WEBTEXT = PATH_ROOT / "data" / "open-web-text-split"
PATH_GPTTEXT = PATH_ROOT / "data" / "open-gpt-text-split"
PATH_GPT2_OUTPUT = PATH_ROOT / "data" / "gpt2-output"

# Memoized evaluation results and model checkpoints live in the same directory.
PATH_CACHE = PATH_ROOT / "result" / "cache"

PATH_CHECKPOINT_ROBERTA = PATH_CACHE / "roberta.base.0425.pt"
PATH_CHECKPOINT_T5 = PATH_CACHE / "t5.small.0422.pt"

In [3]:
# Make sure the target directory exists: on a fresh checkout iterdir() would
# otherwise raise FileNotFoundError before the download could be triggered.
PATH_GPT2_OUTPUT.mkdir(parents=True, exist_ok=True)

# any() stops at the first directory entry instead of listing everything.
if any(PATH_GPT2_OUTPUT.iterdir()):
    print("GPT2 output dataset already downloaded.")
else:
    download_gpt2(PATH_GPT2_OUTPUT)

GPT2 output dataset already downloaded.


In [4]:
# Test split evaluated below under the "OpenGPTText-Final" headings.
clean_opengpt_test = OpenGPTDataset(
    str(PATH_WEBTEXT / "test.jsonl"),
    str(PATH_GPTTEXT / "test.jsonl"),
    force_match=False,
)

# Test split evaluated below under the "OpenGPTText-Original" headings.
dirty_opengpt_test = OpenGPTDataset(
    str(PATH_WEBTEXT / "test-dirty.jsonl"),
    str(PATH_GPTTEXT / "test-dirty.jsonl"),
    force_match=False,
)

# GPT2-Output test set: the xl-1542M file first, the webtext file second.
gpt2xl_output_test = GPT2_OutputDataset(
    PATH_GPT2_OUTPUT / "xl-1542M.test.jsonl",
    PATH_GPT2_OUTPUT / "webtext.test.jsonl",
)

Loading Human text: 100%|███████████████████████████████████| 5000/5000 [00:00<00:00, 100064.03it/s]
Loading GPT text: 100%|██████████████████████████████████████| 5000/5000 [00:00<00:00, 90077.66it/s]

<All data loaded>





In [5]:
def quick_statistics(prediction, threshold=0.5):
    """Count confusion-matrix entries for a set of per-sample predictions.

    The positive class is "GPT-generated".

    Args:
        prediction: Mapping from sample key to an indexable score container.
            Only index 0 is read, and it is interpreted as the GPT score.
            A key ending in "gpt" marks a sample whose true class is GPT;
            every other key is treated as the negative (web) class.
        threshold: A sample is predicted as GPT when its GPT score is
            strictly greater than this value.

    Returns:
        Tuple ``(TP, TN, FP, FN)`` of integer counts.
    """
    TP, TN, FP, FN = 0, 0, 0, 0
    for key, scores in prediction.items():
        pred_gpt = scores[0] > threshold
        real_gpt = key.endswith("gpt")

        if pred_gpt and real_gpt:
            TP += 1
        elif (not pred_gpt) and (not real_gpt):
            TN += 1
        elif pred_gpt and (not real_gpt):
            FP += 1
        else:
            FN += 1

    return TP, TN, FP, FN

def report_statistics(TP, TN, FP, FN):
    """Print classification metrics derived from confusion-matrix counts.

    Prints the raw counts, the four rates, accuracy and F1 score, followed by
    a single ``&``-separated row that can be pasted into a LaTeX table.
    Percent signs are emitted as ``\\%`` so the text is LaTeX-safe.

    Args:
        TP: Number of true positives.
        TN: Number of true negatives.
        FP: Number of false positives.
        FN: Number of false negatives.

    Returns:
        None. All results are written to standard output.
    """
    def ratio(numerator, denominator):
        # A class with no samples would otherwise raise ZeroDivisionError;
        # report the undefined metric as NaN instead.
        return numerator / denominator if denominator else float("nan")

    def percent(value):
        # "\\%" is a literal backslash followed by a percent sign.
        return f"{round(value * 100, 2)}\\%"

    TPR = ratio(TP, TP + FN)
    TNR = ratio(TN, TN + FP)
    FPR = ratio(FP, FP + TN)
    FNR = ratio(FN, FN + TP)
    accuracy = ratio(TP + TN, TP + TN + FP + FN)
    f1_score = ratio(TP, TP + 0.5 * (FP + FN))

    print(f"True Positive: {TP} \t| True Negative: {TN}")
    print(f"False Positive:{FP} \t| False Negative:{FN}")
    print(f"True Positive Rate:  {percent(TPR)}")
    print(f"True Negative Rate:  {percent(TNR)}")
    print(f"False Positive Rate: {percent(FPR)}")
    print(f"False Negative Rate: {percent(FNR)}")
    print(f"Accuracy: {percent(accuracy)}")
    print(f"F1 Score: {round(f1_score, 2)}")

    print("LaTeX Usable-version\n")

    # Every column separator is its own argument so print() puts a space on
    # both sides of each "&".
    print(
        percent(accuracy), "&",
        f"{percent(TPR)}, ({TP})", "&",
        f"{percent(TNR)}, ({TN})", "&",
        f"{percent(FPR)}, ({FP})", "&",
        f"{percent(FNR)}, ({FN})", "\\\\"
    )

## Evaluate T5-Sentinel

In [6]:
# Deserialize onto the CPU so the checkpoint dict does not keep its own copy
# of the weights on the GPU next to the model that is moved there below.
# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
checkpoint = torch.load(PATH_CHECKPOINT_T5, map_location="cpu")
t5_model = T5Sentinel()
t5_model.load_state_dict(checkpoint["model"])
t5_model = t5_model.to("cuda")
t5_model.eval()  # evaluation mode
print("Load Finish")

Load Finish


### T5-Sentinel on OpenGPTText-Final

In [7]:
@memoize(PATH_CACHE / "eval_t5_opengpt_final.pt", SELF_NAME)
def calculate_t5_final():
    """Score every sample of `clean_opengpt_test` with T5-Sentinel.

    Returns:
        Dict keyed by the sample uid with "web" (label 0) or "gpt" (any other
        label) appended. Each value is the first element returned by the
        model, moved to the CPU and converted to a NumPy array.
    """
    scores_by_uid = {}
    for position in tqdm(range(len(clean_opengpt_test))):
        (text, label), (uid, _) = clean_opengpt_test[position]
        scores, _ = t5_model(text)
        # The key suffix encodes the true class for quick_statistics.
        suffix = "gpt" if label != 0 else "web"
        scores_by_uid[uid + suffix] = scores.cpu().numpy()
    return scores_by_uid

t5_prediction_clean = calculate_t5_final()

Reusing existing cache from d:\Projects\GPT-Sentinel\result\cache\eval_t5_opengpt_final.pt
Cache is generated by evaluate_experiment.ipynb


In [8]:
# Confusion-matrix counts and derived metrics for T5 on OpenGPTText-Final.
print("T5 on OpenGPTText-Final")
t5_statistics = quick_statistics(t5_prediction_clean)
report_statistics(*t5_statistics)

T5 on OpenGPTText-Final
True Positive: 2906 	| True Negative: 2863
False Positive:81 	| False Negative:38
True Positive Rate:  98.71\%
True Negative Rate:  97.25\%
False Positive Rate: 2.75\%
False Negative Rate: 1.29\%
Accuracy: 97.98\%
F1 Score: 0.98
LaTeX Usable-version

97.98\% &98.71\%, (2906) & 97.25\%, (2863) & 2.75\%, (81) & 1.29\%, (38) \\


### T5-Sentinel on OpenGPTText-Original

In [9]:
@memoize(PATH_CACHE / "eval_t5_opengpt_original.pt", SELF_NAME)
def calculate_t5_original():
    """Score every sample of `dirty_opengpt_test` with T5-Sentinel.

    Returns:
        Dict keyed by the sample uid with "web" (label 0) or "gpt" (any other
        label) appended. Each value is the first element returned by the
        model, moved to the CPU and converted to a NumPy array.
    """
    scores_by_uid = {}
    for position in tqdm(range(len(dirty_opengpt_test))):
        (text, label), (uid, _) = dirty_opengpt_test[position]
        scores, _ = t5_model(text)
        # The key suffix encodes the true class for quick_statistics.
        suffix = "gpt" if label != 0 else "web"
        scores_by_uid[uid + suffix] = scores.cpu().numpy()
    return scores_by_uid

t5_prediction_orig = calculate_t5_original()

Reusing existing cache from d:\Projects\GPT-Sentinel\result\cache\eval_t5_opengpt_original.pt
Cache is generated by evaluate_experiment.ipynb


In [10]:
# Confusion-matrix counts and derived metrics for T5 on OpenGPTText-Original.
print("T5 on OpenGPTText-Original")
t5_statistics = quick_statistics(t5_prediction_orig)
report_statistics(*t5_statistics)

T5 on OpenGPTText-Original
True Positive: 2907 	| True Negative: 2842
False Positive:102 	| False Negative:37
True Positive Rate:  98.74\%
True Negative Rate:  96.54\%
False Positive Rate: 3.46\%
False Negative Rate: 1.26\%
Accuracy: 97.64\%
F1 Score: 0.98
LaTeX Usable-version

97.64\% &98.74\%, (2907) & 96.54\%, (2842) & 3.46\%, (102) & 1.26\%, (37) \\


### T5-Sentinel on GPT2-Output

In [11]:
@memoize(PATH_CACHE / "eval_t5_gpt2_output.pt", SELF_NAME)
def calculate_t5_gpt2():
    """Score every sample of `gpt2xl_output_test` with T5-Sentinel.

    This dataset yields only ``(text, label)``, so the dataset index is used
    as the uid.

    Returns:
        Dict keyed by the index with "web" (label 0) or "gpt" (any other
        label) appended. Each value is the first element returned by the
        model, moved to the CPU and converted to a NumPy array.
    """
    scores_by_uid = {}
    for position in tqdm(range(len(gpt2xl_output_test))):
        text, label = gpt2xl_output_test[position]
        scores, _ = t5_model(text)
        # The key suffix encodes the true class for quick_statistics.
        suffix = "gpt" if label != 0 else "web"
        scores_by_uid[str(position) + suffix] = scores.cpu().numpy()
    return scores_by_uid

t5_prediction_gpt2 = calculate_t5_gpt2()

Reusing existing cache from d:\Projects\GPT-Sentinel\result\cache\eval_t5_gpt2_output.pt
Cache is generated by evaluate_experiment.ipynb


In [12]:
# Confusion-matrix counts and derived metrics for T5 on GPT2-Output.
print("T5 on GPT2-Output")
t5_statistics = quick_statistics(t5_prediction_gpt2)
report_statistics(*t5_statistics)

T5 on GPT2-Output
True Positive: 165 	| True Negative: 4703
False Positive:297 	| False Negative:4835
True Positive Rate:  3.3\%
True Negative Rate:  94.06\%
False Positive Rate: 5.94\%
False Negative Rate: 96.7\%
Accuracy: 48.68\%
F1 Score: 0.06
LaTeX Usable-version

48.68\% &3.3\%, (165) & 94.06\%, (4703) & 5.94\%, (297) & 96.7\%, (4835) \\


In [13]:
# Drop the T5 model and the checkpoint dict it was loaded from. Any tensors
# the checkpoint still holds stay allocated while the name is alive, so
# empty_cache() alone could not return that memory.
del t5_model, checkpoint
torch.cuda.empty_cache()

## Evaluate RoBERTa-Sentinel

In [14]:
# Deserialize onto the CPU so the checkpoint dict does not keep its own copy
# of the weights on the GPU next to the model that is moved there below.
# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
checkpoint = torch.load(PATH_CHECKPOINT_ROBERTA, map_location="cpu")
roberta_model = SentinelNonLinear(
    RobertaModel.from_pretrained("roberta-base"),
    RobertaTokenizer.from_pretrained("roberta-base")
)
roberta_model.load_state_dict(checkpoint["model"])
roberta_model = roberta_model.to("cuda")
roberta_model.eval()  # evaluation mode
print("Load Finish")

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Load Finish


### RoBERTa on OpenGPTText-Final

In [15]:
@memoize(PATH_CACHE / "eval_roberta_opengpt_final.pt", SELF_NAME)
def calculate_roberta_final():
    """Score every sample of `clean_opengpt_test` with RoBERTa-Sentinel.

    Returns:
        Dict keyed by the sample uid with "web" (label 0) or "gpt" (any other
        label) appended. Each value is a two-element NumPy array holding the
        softmax outputs in swapped order (model index 1 first).
    """
    scores_by_uid = {}
    with torch.inference_mode():
        for position in tqdm(range(len(clean_opengpt_test))):
            (text, label), (uid, _) = clean_opengpt_test[position]
            logits = roberta_model(text).detach().squeeze(dim=0)
            probs = torch.nn.functional.softmax(logits, dim=-1).cpu()
            # Swap the two entries so that index 0 is the value
            # quick_statistics reads as the GPT score.
            swapped = torch.Tensor([probs[1], probs[0]])
            suffix = "gpt" if label != 0 else "web"
            scores_by_uid[uid + suffix] = swapped.numpy()
    return scores_by_uid

roberta_prediction_clean = calculate_roberta_final()


Reusing existing cache from d:\Projects\GPT-Sentinel\result\cache\eval_roberta_opengpt_final.pt
Cache is generated by evaluate_experiment.ipynb


In [16]:
# Confusion-matrix counts and derived metrics for RoBERTa on OpenGPTText-Final.
print("RoBERTa on OpenGPTText-Final")
roberta_statistics = quick_statistics(roberta_prediction_clean)
report_statistics(*roberta_statistics)

RoBERTa on OpenGPTText-Final
True Positive: 2850 	| True Negative: 2680
False Positive:264 	| False Negative:94
True Positive Rate:  96.81\%
True Negative Rate:  91.03\%
False Positive Rate: 8.97\%
False Negative Rate: 3.19\%
Accuracy: 93.92\%
F1 Score: 0.94
LaTeX Usable-version

93.92\% &96.81\%, (2850) & 91.03\%, (2680) & 8.97\%, (264) & 3.19\%, (94) \\


### RoBERTa on OpenGPTText-Original

In [17]:
@memoize(Path(PATH_CACHE, "eval_roberta_opengpt_original.pt"), SELF_NAME)
def calculate_roberta_original():
    """Score every sample of `dirty_opengpt_test` with RoBERTa-Sentinel.

    Returns:
        Dict keyed by the sample uid with "web" (label 0) or "gpt" (any other
        label) appended. Each value is a two-element NumPy array holding the
        softmax outputs in swapped order (model index 1 first).
    """
    roberta_prediction_orig = dict()
    # Evaluation only: run without autograd tracking, matching
    # calculate_roberta_final, so no graph is built for each forward pass.
    with torch.inference_mode():
        for index in tqdm(range(len(dirty_opengpt_test))):
            (text, label), (uid, _) = dirty_opengpt_test[index]
            logits = roberta_model(text).detach().squeeze(dim=0)
            prediction = torch.nn.functional.softmax(logits, dim=-1).cpu()
            # Swap the two entries so that index 0 is the value
            # quick_statistics reads as the GPT score.
            prediction = torch.Tensor([prediction[1], prediction[0]])
            uid = uid + ("web" if label == 0 else "gpt")
            roberta_prediction_orig[uid] = prediction.numpy()
    return roberta_prediction_orig

roberta_prediction_orig = calculate_roberta_original()

Reusing existing cache from d:\Projects\GPT-Sentinel\result\cache\eval_roberta_opengpt_original.pt
Cache is generated by evaluate_experiment.ipynb


In [18]:
# Confusion-matrix counts and derived metrics for RoBERTa on OpenGPTText-Original.
print("RoBERTa on OpenGPTText-Original")
roberta_statistics = quick_statistics(roberta_prediction_orig)
report_statistics(*roberta_statistics)

RoBERTa on OpenGPTText-Original
True Positive: 2889 	| True Negative: 2309
False Positive:635 	| False Negative:55
True Positive Rate:  98.13\%
True Negative Rate:  78.43\%
False Positive Rate: 21.57\%
False Negative Rate: 1.87\%
Accuracy: 88.28\%
F1 Score: 0.89
LaTeX Usable-version

88.28\% &98.13\%, (2889) & 78.43\%, (2309) & 21.57\%, (635) & 1.87\%, (55) \\


### RoBERTa on GPT2-Output

In [19]:
@memoize(Path(PATH_CACHE, "eval_roberta_gpt2_output.pt"), SELF_NAME)
def calculate_roberta_gpt2():
    """Score every sample of `gpt2xl_output_test` with RoBERTa-Sentinel.

    This dataset yields only ``(text, label)``, so the dataset index is used
    as the uid.

    Returns:
        Dict keyed by the index with "web" (label 0) or "gpt" (any other
        label) appended. Each value is a two-element NumPy array holding the
        softmax outputs in swapped order (model index 1 first).
    """
    roberta_prediction_gpt2 = dict()
    # Evaluation only: run without autograd tracking, matching
    # calculate_roberta_final, so no graph is built for each forward pass.
    with torch.inference_mode():
        for index in tqdm(range(len(gpt2xl_output_test))):
            text, label = gpt2xl_output_test[index]
            logits = roberta_model(text).detach().squeeze(dim=0)
            prediction = torch.nn.functional.softmax(logits, dim=-1).cpu()
            # Swap the two entries so that index 0 is the value
            # quick_statistics reads as the GPT score.
            prediction = torch.Tensor([prediction[1], prediction[0]])
            uid = str(index) + ("web" if label == 0 else "gpt")
            # The tensor is already on the CPU; no second .cpu() is needed.
            roberta_prediction_gpt2[uid] = prediction.numpy()
    return roberta_prediction_gpt2

roberta_prediction_gpt2 = calculate_roberta_gpt2()

Reusing existing cache from d:\Projects\GPT-Sentinel\result\cache\eval_roberta_gpt2_output.pt
Cache is generated by evaluate_experiment.ipynb


In [20]:
# Confusion-matrix counts and derived metrics for RoBERTa on GPT2-Output.
print("RoBERTa on GPT2-Output")
roberta_statistics = quick_statistics(roberta_prediction_gpt2)
report_statistics(*roberta_statistics)

RoBERTa on GPT2-Output
True Positive: 518 	| True Negative: 4138
False Positive:862 	| False Negative:4482
True Positive Rate:  10.36\%
True Negative Rate:  82.76\%
False Positive Rate: 17.24\%
False Negative Rate: 89.64\%
Accuracy: 46.56\%
F1 Score: 0.16
LaTeX Usable-version

46.56\% &10.36\%, (518) & 82.76\%, (4138) & 17.24\%, (862) & 89.64\%, (4482) \\


## Evaluate ZeroGPT

### ZeroGPT on OpenGPTText-Final

In [21]:
# ZeroGPT predictions are not computed in this notebook; they are read from
# a cache file and fed to the same statistics helpers as the other models.
zero_prediction_final = load_cache(PATH_CACHE / "eval_zerogpt_opengpt_final.pt")
report_statistics(
    *quick_statistics(zero_prediction_final)
)

Loading cache from ./data/utility_fns.py
True Positive: 1030 	| True Negative: 2171
False Positive:773 	| False Negative:1914
True Positive Rate:  34.99\%
True Negative Rate:  73.74\%
False Positive Rate: 26.26\%
False Negative Rate: 65.01\%
Accuracy: 54.36\%
F1 Score: 0.43
LaTeX Usable-version

54.36\% &34.99\%, (1030) & 73.74\%, (2171) & 26.26\%, (773) & 65.01\%, (1914) \\


### ZeroGPT on OpenGPTText-Original

In [22]:
# ZeroGPT predictions on OpenGPTText-Original, read from a cache file.
zero_prediction_original = load_cache(PATH_CACHE / "eval_zerogpt_opengpt_original.pt")
report_statistics(
    *quick_statistics(zero_prediction_original)
)

Loading cache from ./data/utility_fns.py
True Positive: 844 	| True Negative: 2459
False Positive:485 	| False Negative:2100
True Positive Rate:  28.67\%
True Negative Rate:  83.53\%
False Positive Rate: 16.47\%
False Negative Rate: 71.33\%
Accuracy: 56.1\%
F1 Score: 0.4
LaTeX Usable-version

56.1\% &28.67\%, (844) & 83.53\%, (2459) & 16.47\%, (485) & 71.33\%, (2100) \\


### ZeroGPT on GPT2-Output

In [23]:
# ZeroGPT predictions on GPT2-Output, read from a cache file.
zero_prediction_gpt2 = load_cache(PATH_CACHE / "eval_zerogpt_gpt2_output.pt")
report_statistics(
    *quick_statistics(zero_prediction_gpt2)
)

Loading cache from ./data/utility_fns.py
True Positive: 476 	| True Negative: 3832
False Positive:1168 	| False Negative:4522
True Positive Rate:  9.52\%
True Negative Rate:  76.64\%
False Positive Rate: 23.36\%
False Negative Rate: 90.48\%
Accuracy: 43.09\%
F1 Score: 0.14
LaTeX Usable-version

43.09\% &9.52\%, (476) & 76.64\%, (3832) & 23.36\%, (1168) & 90.48\%, (4522) \\


## Evaluate OpenAI Detector

### OpenAI on OpenGPTText-Final

In [28]:
# OpenAI detector predictions on OpenGPTText-Final, read from a cache file
# rather than computed in this notebook.
openai_prediction_final = load_cache(PATH_CACHE / "eval_openai_opengpt_final.pt")
report_statistics(
    *quick_statistics(openai_prediction_final)
)

Loading cache from ./data/utility_fns.py
True Positive: 596 	| True Negative: 2800
False Positive:144 	| False Negative:2348
True Positive Rate:  20.24\%
True Negative Rate:  95.11\%
False Positive Rate: 4.89\%
False Negative Rate: 79.76\%
Accuracy: 57.68\%
F1 Score: 0.32
LaTeX Usable-version

57.68\% &20.24\%, (596) & 95.11\%, (2800) & 4.89\%, (144) & 79.76\%, (2348) \\


In [27]:
# OpenAI detector predictions on OpenGPTText-Original, read from a cache file.
openai_prediction_original = load_cache(PATH_CACHE / "eval_openai_opengpt_original.pt")
report_statistics(
    *quick_statistics(openai_prediction_original)
)

Loading cache from ./data/utility_fns.py
True Positive: 437 	| True Negative: 2898
False Positive:46 	| False Negative:2507
True Positive Rate:  14.84\%
True Negative Rate:  98.44\%
False Positive Rate: 1.56\%
False Negative Rate: 85.16\%
Accuracy: 56.64\%
F1 Score: 0.26
LaTeX Usable-version

56.64\% &14.84\%, (437) & 98.44\%, (2898) & 1.56\%, (46) & 85.16\%, (2507) \\


In [26]:
# OpenAI detector predictions on GPT2-Output, read from a cache file.
openai_prediction_gpt2 = load_cache(PATH_CACHE / "eval_openai_gpt2_output.pt")
report_statistics(
    *quick_statistics(openai_prediction_gpt2)
)

Loading cache from ./data/utility_fns.py
True Positive: 2801 	| True Negative: 4321
False Positive:679 	| False Negative:2199
True Positive Rate:  56.02\%
True Negative Rate:  86.42\%
False Positive Rate: 13.58\%
False Negative Rate: 43.98\%
Accuracy: 71.22\%
F1 Score: 0.66
LaTeX Usable-version

71.22\% &56.02\%, (2801) & 86.42\%, (4321) & 13.58\%, (679) & 43.98\%, (2199) \\
