In [None]:
from datasets import get_dfs
import numpy as np
import pandas as pd
import ngram
import re
from spellchecker import SpellChecker
import json

In [None]:
# Load the datasets used by this notebook. Keys are the names the frames are
# looked up by below; values are the paths handed to get_dfs.
dfs = get_dfs(
    {
        "drcat": "./daigt-v2-train-dataset",
        "drcat_v3": "./daigt-v3-train-dataset",
        "train": "./llm-detect-ai-generated-text/",
        "test": "./llm-detect-ai-generated-text/",
    }
)

# Keep only the "drcat" rows whose prompt_name is one of the two listed prompts.
persuade = dfs["drcat"].loc[
    lambda frame: frame["prompt_name"].isin(
        ["Car-free cities", "Does the electoral college work?"]
    )
]

# Rows of the "train" set whose `generated` column equals 0.
train = dfs["train"]
human_train = train.loc[train["generated"].eq(0)]

In [None]:
# Token pattern: runs of word characters, apostrophes and forward slashes.
# Written as a raw string so "\w" reaches the regex engine as-is instead of
# being an invalid string escape (a SyntaxWarning on recent Python versions).
words_re = re.compile(r"[/\w']+")
# Spell checker built with its default arguments.
spell = SpellChecker()
# Loaded via ngram.load_memoize_candidates; presumably a cache of earlier
# candidate lookups -- confirm against the ngram module.
memoize_candidates = ngram.load_memoize_candidates("./persuade_match/memoize_candidates.json")
# The first CSV column becomes the DataFrame index.
match_df = pd.read_csv("persuade_match/full_persuade_match.csv", index_col=0)

In [None]:
# Map each match_df index label to its (text, text_orig) pair.
match_texts = dict(
    zip(
        match_df.index,
        zip(match_df["text"], match_df["text_orig"]),
    )
)

In [None]:
def _candidate_score(x, y):
    """Return the negated ngram.l1_error of the two arguments."""
    return -ngram.l1_error(x, y)


def _candidate_ngrams(x):
    """Return ngram.ngrams of the argument with n=4."""
    return ngram.ngrams(x, n=4)


# Collect spell-check replacement candidates for every entry of match_texts.
# memoize_candidates is passed in as well; presumably it is read and updated
# as a cache -- confirm against ngram.find_spell_check_candidates.
replace_candidates = ngram.find_spell_check_candidates(
    match_texts,
    _candidate_score,
    _candidate_ngrams,
    ngram.preprocess,
    ngram.postprocess,
    words_re,
    spell,
    memoize_candidates,
)

In [None]:
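# Optional: uncomment to write memoize_candidates back to the file it was loaded from
# (presumably only needed after new candidates have been computed -- confirm before enabling).
# ngram.save_memoize_candidates(memoize_candidates, "./persuade_match/memoize_candidates.json")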

In [None]:
def _prune_score(x, y):
    """Return the negated ngram.l1_error of the two arguments."""
    return -ngram.l1_error(x, y)


def _prune_ngrams(x):
    """Return ngram.ngrams of the argument with n=10."""
    return ngram.ngrams(x, n=10)


# Prune the previously collected replace_candidates; the call yields three
# objects that are unpacked in the order they are returned.
replacements, results, scores_dict = ngram.prune_spell_check_candidates(
    match_texts,
    _prune_score,
    _prune_ngrams,
    ngram.preprocess,
    ngram.postprocess,
    replace_candidates,
)

In [None]:
# Print floats in fixed-point rather than scientific notation (global NumPy setting).
np.set_printoptions(suppress=True)

# One (index, similarity) row per entry of match_texts. Texts identical to
# their result get 1.0 directly, without calling ngram.text_similarity.
scores = np.array(
    [
        (i, 1.0 if text == results[i] else ngram.text_similarity(text, results[i]))
        for i, (text, _) in match_texts.items()
    ]
).reshape(-1, 2)  # stays 2-D (0 rows) even when match_texts is empty

# Show only the rows whose similarity is below 1.0. A boolean mask keeps the
# result 2-D; indexing with the tuple returned by .nonzero() in the first axis
# position would add a spurious leading axis of length 1.
scores[scores[:, 1] < 1.0]

In [None]:
# Write the three pruning outputs as JSON files, one file per object, in the
# same order as before.
for out_path, payload in (
    ("persuade_match/replacements.json", replacements),
    ("persuade_match/results.json", results),
    ("persuade_match/scores_dict.json", scores_dict),
):
    with open(out_path, "w") as f:
        json.dump(payload, f)