In [2]:
from datasets import get_dfs
from collections import Counter, defaultdict
import numpy as np
import difflib
import heapq
import pandas as pd
import ngram
import re
from spellchecker import SpellChecker
from tqdm import tqdm

In [3]:
# Load the datasets through the project helper; the result is indexed by the
# short names used as keys below.
dfs = get_dfs(
    {
        "drcat": "./daigt-v2-train-dataset",
        "train": "./llm-detect-ai-generated-text/",
        "test": "./llm-detect-ai-generated-text/",
    }
)

# Restrict the "drcat" frame to the two prompts of interest.
persuade = dfs["drcat"].loc[
    lambda frame: frame["prompt_name"].isin(
        ["Car-free cities", "Does the electoral college work?"]
    )
]

# Training frame, plus the subset of its rows where generated == 0.
train = dfs["train"]
human_train = train.loc[train["generated"] == 0]

In [4]:
# Match table; the first CSV column becomes the row index. Later cells unpack
# exactly three values per row and read a "drcat_index" column from it.
persuade_match_df = pd.read_csv(
    "./persuade_match/persuade_match.csv",
    index_col=0,
)

In [5]:
# Lookup tables filled from the match table, one entry per row:
#   matches:        train_index -> drcat_index
#   matches_inv:    drcat_index -> train_index
#   index_to_train: row label   -> train_index
matches, matches_inv, index_to_train = {}, {}, {}
for i, row in persuade_match_df.iterrows():
    # Each row must hold exactly three values; train_id is not used here.
    train_index, train_id, drcat_index = row
    index_to_train[i] = train_index
    matches[train_index] = drcat_index
    matches_inv[drcat_index] = train_index

In [6]:
# Keep only the persuade rows whose index label appears in the match table's
# "drcat_index" column; .copy() gives an independent frame.
persuade_train = persuade.loc[
    persuade.index.isin(persuade_match_df["drcat_index"])
].copy()

## Allowed Chars

In [7]:
def get_all_chars(df, col):
    """Return the set of distinct characters found in one text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text.
    col : str
        Name of a column whose values are all strings.

    Returns
    -------
    set[str]
        Every character that occurs in at least one value of ``df[col]``;
        empty when the column has no rows or only empty strings.

    Raises
    ------
    TypeError
        If the column contains a non-string value (raised by ``str.join``).
    """
    # A str already iterates character by character, so set() can consume the
    # joined text directly; no intermediate list of characters is needed.
    return set("".join(df[col].to_list()))

In [8]:
# Characters that occur in the persuade texts but in none of the train texts.
missing_chars = get_all_chars(persuade, "text").difference(
    get_all_chars(train, "text")
)
print(*sorted(missing_chars), sep="")

#$()*+-/;<=>[\]^`{|}~ £©®°²¶¹ÓÖáåéóадезопрс‍–—‘’“”─♀️�🌎🌱🎉🏋🏼💚💡💤💥💦💪💸📚📱😁😃😅😌😴🚀🤒🤔🤯🤷🧦


In [9]:
# Characters that occur in both the persuade texts and the train texts.
allowed_chars = get_all_chars(persuade, "text").intersection(
    get_all_chars(train, "text")
)
print(*sorted(allowed_chars), sep="")


 !"%&',.0123456789:?ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz


## Check line cleaning

Rules:
1. Lines are separated by paragraph breaks (two newlines).
2. Each line must end in proper punctuation.
3. Each line must contain at least one punctuation mark.
4. Spaces are stripped from each line.
5. The whole essay must end with a period.

In [10]:
def check_lines(df):
    """Flag, per row, which formatting patterns occur in the ``text`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a "text" column whose values are strings.

    Returns
    -------
    dict[str, numpy.ndarray]
        One boolean array per check name, in the order the checks are declared
        and aligned with the row order of ``df``. Element ``j`` is True when
        that check's regex finds a match anywhere in row ``j``'s text.
        Every check name is present even when ``df`` has no rows (the arrays
        are then empty), so callers can always index by check name.
    """
    checks = {
        # A span starting at a line start, free of lowercase letters, that
        # ends in ':' at a line end.
        "non_lower": re.compile("^[^a-z]+:$", flags=re.MULTILINE),
        # "search": re.compile("[A-Z]+[A-Z][A-Z][A-Z]"),
        # A span from a line start to a line end containing none of '.', '!', '?'.
        "non_punctuated": re.compile("^[^.!?]+$", flags=re.MULTILINE),
        # A line whose last non-space character is not '.', '!' or '?'.
        "non_punctuated_break": re.compile("[^ .!?\n] *$", flags=re.MULTILINE),
        # A line whose last non-space character is not '.'.
        "non_period": re.compile("[^ .\n] *$", flags=re.MULTILINE),
        # Two consecutive spaces.
        "double_space": re.compile("  "),
        # A line that starts with a space.
        "break_space": re.compile("\n "),
        # A line that ends with a space.
        "space_break": re.compile(" \n"),
        # Three consecutive newlines.
        "triple_break": re.compile("\n\n\n"),
        # The whole text's last non-whitespace character is not '.'.
        "non_period_ending": re.compile("[^.\\s]\\s*$"),
        # A space directly before a period.
        "space_before": re.compile(" [.]")
    }

    # Only the text column is needed, so read it once instead of building a
    # Series per row with iterrows().
    texts = df["text"].tolist()

    # Building one array per check (rather than appending row by row) means
    # every key exists with dtype bool even when there are no rows.
    return {
        name: np.array(
            [pattern.search(text) is not None for text in texts], dtype=bool
        )
        for name, pattern in checks.items()
    }

In [11]:
# Per check, the number of train rows whose text triggers it.
results = check_lines(train)
{name: flags.sum() for name, flags in results.items()}

{'non_lower': 0,
 'non_punctuated': 0,
 'non_punctuated_break': 0,
 'non_period': 202,
 'double_space': 0,
 'break_space': 0,
 'space_break': 0,
 'triple_break': 0,
 'non_period_ending': 0,
 'space_before': 16}

In [12]:
# Per check, the number of persuade_train rows whose text triggers it.
results = check_lines(persuade_train)
{name: flags.sum() for name, flags in results.items()}

{'non_lower': 0,
 'non_punctuated': 374,
 'non_punctuated_break': 888,
 'non_period': 980,
 'double_space': 1372,
 'break_space': 0,
 'space_break': 0,
 'triple_break': 0,
 'non_period_ending': 256,
 'space_before': 110}

Observations:
1. 1774: A line with ',' gets concatenated to the next line; a line in all caps gets dropped.
2. 1560: Lines after the last period get stripped.
3. 20012: A line with a period gets kept, even if it is all caps.
4. 20170: A line not ending in punctuation gets concatenated. A line with all caps + '!' gets kept. A line with letters only gets dropped.
5. 1310: A line with '"' gets concatenated.
6. 1343: A line with ':' gets dropped.
7. 2756: A line with all caps + '-' gets kept.
8. 2345: A line with only letters gets kept.

Therefore, the rules:
1. Split lines and words, then join them with "\n" and " ".
2. Any line that does not end in punctuation ('!.?') and contains only insignificant characters (including '[a-zA-Z_0-9:]') gets dropped.
3. Any line that does not end in punctuation and contains at least one significant character (including '[",-]') gets concatenated to the next line.
4. Other characters ("%&") are undecided, but likely get dropped.
5. After joining, all characters after the last period get stripped.

This process most likely happens after the spell check phase, because disallowed characters get stripped after spell check, not before (otherwise "xxx`s" would not result in "s" getting spell checked to "i"). Character filtering happens after 3, because "-" is enough to preserve the line, but before 4, since some characters get stripped before concatenation.

A few other rules:
1. "''" gets replaced with '"'
2. " ." gets replaced with "." This happens before the above rules, so " ." can still show up if affected by line joining. Doesn't happen for other characters (" ?", " !").

In [17]:
# Use to make observations: show the persuade_train text with index label `i`,
# a separator, then the train text that `matches_inv` maps `i` to.
i = 2345
print(persuade_train.loc[i, "text"], "-----------", sep="\n")
print(train.loc[matches_inv[i], "text"])

Car alarms, car horns, and engines are basically the only thing people hear nowadays. The number of cars in the streets are simply begining to get out of hand.

Citizens all around the world, we all should really try to limit the amount of time that we are spending using our vehicals.

If you really take a moment to think about it, this could honestly turn out to be a really good thing.

Cars are not neccisarily a need, they are a want. I can undertand if you are going to be traveling a far distance that two feet cant get you, but certianlty and ten minute walk to the grocery store/market cant hurt.

Limiting car usage is very important. Most families tend to spend about about $20-50.00 on gas a week. One advantage to limiting car usuage is saving a ton of money on gas that could easily be spent of food, to feed your family.

" "

When I had a car I was always tense. I'm much happier this way" " People who have decided to limit the amount of time they spent using their car, are much ha

## Preprocess line check

In [16]:
# Newline count of every human_train text.
lines1 = np.array([text.count("\n") for text in human_train["text"]])

# Newline count of the matched persuade_train text after ngram.preprocess,
# looked up through `matches` by each human_train index label.
lines2 = np.array(
    [
        ngram.preprocess(persuade_train.loc[matches[idx], "text"]).count("\n")
        for idx in human_train.index
    ]
)

# Fraction of rows where the two newline counts agree.
(lines1 == lines2).mean()

1.0