In [None]:
from datasets import get_dfs
from collections import Counter, defaultdict
import numpy as np
import difflib
import heapq
import pandas as pd
import ngram
import re
from spellchecker import SpellChecker
from tqdm import tqdm
import matplotlib.pyplot as plt
import json
import random

In [None]:
# Load the corpora used in this notebook. The mapping is dataset name -> directory
# passed to `get_dfs`; later cells index the result by name (e.g. dfs["drcat_v3"]).
# "train" and "test" point at the same directory
# (presumably that folder holds both splits — TODO confirm).
dfs = get_dfs(
    dict(
        drcat="./daigt-v2-train-dataset",
        drcat_v3="./daigt-v3-train-dataset",
        train="./llm-detect-ai-generated-text/",
        test="./llm-detect-ai-generated-text/",
    )
)

In [None]:
# Token pattern: runs of word characters, "/" and "'". Written as a raw string so
# that "\w" reaches the regex engine verbatim instead of being parsed as an invalid
# string escape (DeprecationWarning, SyntaxWarning on newer Python versions).
words_re = re.compile(r"[/\w']+")
spell = SpellChecker()
# Candidate cache loaded from JSON through the project's `ngram` module.
memoize_candidates = ngram.load_memoize_candidates("./persuade_match/memoize_candidates.json")
# First CSV column becomes the index; later cells use these labels as lookup keys.
match_df = pd.read_csv("persuade_match/full_persuade_match.csv", index_col=0)
# Index label -> (text, text_orig) pair for direct lookup by row label.
match_texts = {
    i: (text, source)
    for i, text, source in match_df[["text", "text_orig"]].itertuples(index=True, name=None)
}

In [None]:
def _load_int_keyed_json(path):
    """Read the JSON object stored at `path` and return it as a dict whose top-level keys are cast to int."""
    with open(path) as f:
        return {int(k): v for k, v in json.load(f).items()}


# JSON object keys are always strings; casting them back to int lets the dicts be
# indexed with integer labels (as `replacements[i]` is in the comparison cell).
replacements = _load_int_keyed_json("persuade_match/replacements.json")
results = _load_int_keyed_json("persuade_match/results.json")

In [None]:
# Word frequencies are computed over the "text" column of the drcat_v3 dataset,
# using the project's preprocessing function and the token regex.
reference_texts = dfs["drcat_v3"]["text"]
word_freqs = ngram.get_word_freqs(reference_texts, ngram.preprocess, words_re)
# Frequency lookup object built from those counts and the loaded candidate cache.
freq_getter = ngram.FreqGetter(word_freqs, memoize_candidates)

In [None]:
# Dedicated generator so augmentation does not depend on the global `random` state.
rand = random.Random(42)


def augment(text):
    """Run `ngram.augment_random` on `text` using this notebook's shared resources.

    Forwards the preprocess/postprocess hooks, the token regex, the spell checker,
    the frequency getter, the candidate cache and the seeded `rand` generator.
    Returns whatever `ngram.augment_random` returns; downstream code expands that
    result into "text" and "replace" columns.
    """
    return ngram.augment_random(
        text,
        ngram.preprocess,
        ngram.postprocess,
        words_re,
        spell,
        freq_getter,
        memoize_candidates,
        rand,
    )

In [None]:
# Re-seed so that re-running this cell reproduces the same augmentations.
rand.seed(42)
# Each `augment` result is expanded into a "text" and a "replace" column,
# keeping the row labels of `match_df`.
aug_df = pd.DataFrame(
    match_df["text_orig"].apply(augment).tolist(),
    columns=["text", "replace"],
    index=match_df.index,
)
# Number of rows whose augmented text exactly equals the stored `text` column.
int((aug_df["text"] == match_df["text"]).sum())

In [None]:
# Per-row intersection-over-union between the replacements produced in this
# notebook and the ones stored in `replacements`.
ious = []
for row_id, generated in aug_df["replace"].items():
    stored = replacements[row_id]
    generated_set = set(generated)
    # Stored entries are converted to tuples so they are hashable and comparable.
    stored_set = {tuple(entry) for entry in stored}
    union_size = len(generated_set | stored_set)
    # Two empty sets are scored as perfect agreement (1).
    ious.append(len(generated_set & stored_set) / union_size if union_size > 0 else 1)

ious = np.array(ious)
ious.mean()

In [None]:
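# Disabled alternative: apply the same augmentation to the drcat_v3 texts instead
# of `match_df["text_orig"]`. Note that uncommenting this overwrites `aug_df`.
# rand.seed(42)
# aug_df = dfs["drcat_v3"]["text"].apply(augment)
# aug_df = pd.DataFrame(aug_df.tolist(), columns=["text", "replace"], index=dfs["drcat_v3"].index)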