In [1]:
import pandas as pd

# Root directory that holds the input dataset folders. The trailing slash is
# required because file paths below are built by plain string concatenation
# (PATH + '<folder>/<file>.csv').
PATH = 'data/'


In [2]:
# Read train_essays.csv and show the frame as the cell output.
train_essays_csv = PATH + 'llm-detect-ai-generated-text/train_essays.csv'
train_essays = pd.read_csv(train_essays_csv)
train_essays

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0
...,...,...,...,...
1373,fe6ff9a5,1,There has been a fuss about the Elector Colleg...,0
1374,ff669174,0,Limiting car usage has many advantages. Such a...,0
1375,ffa247e0,0,There's a new trend that has been developing f...,0
1376,ffc237e9,0,As we all know cars are a big part of our soci...,0


In [3]:
# Two CSVs from the llm-generated-essays folder; the file names suggest
# GPT-3.5 and GPT-4 output respectively.
gpt_3_5_csv = PATH + 'llm-generated-essays/ai_generated_train_essays.csv'
gpt_4_csv = PATH + 'llm-generated-essays/ai_generated_train_essays_gpt-4.csv'
Radek_data_gpt_3_5, Radek_data_gpt_4 = (pd.read_csv(csv) for csv in (gpt_3_5_csv, gpt_4_csv))

In [4]:
# Keep only the essay body from the PERSUADE CSV, expose it under the shared
# 'text' column name and label every row generated=0 (human data).
PERSUADE_corpus = (
    pd.read_csv(PATH + "persaude-corpus-2/persuade_2.0_human_scores_demo_id_github.csv")[["full_text"]]
    .rename(columns={'full_text': 'text'})
    .assign(generated=0)
)
PERSUADE_corpus.shape

(25996, 2)

In [5]:
def _load_generated_text(csv_path):
    """Read ``csv_path`` and return its 'generated_text' column as 'text', labelled generated=1."""
    frame = pd.read_csv(csv_path)[["generated_text"]]
    frame = frame.rename(columns={'generated_text': 'text'})
    frame["generated"] = 1  # AI-LLM data
    return frame


llama_70b_dataset = _load_generated_text(PATH + "daigt-data-llama-70b-and-falcon180b/llama_70b_v1.csv")
falcon_180b_dataset = _load_generated_text(PATH + "daigt-data-llama-70b-and-falcon180b/falcon_180b_v1.csv")


In [6]:
# Keep the 'text' column of the external dataset and label every row generated=1.
# NOTE(review): confirm against the dataset's column descriptions that 'text'
# (and not another column of this CSV) is the LLM-written essay; if it is the
# human original, these rows are mislabelled.
daigt_external_dataset = (
    pd.read_csv(PATH + "daigt-external-dataset/daigt_external_dataset.csv")[['text']]
    .assign(generated=1)
)
daigt_external_dataset.shape

(2421, 2)

In [7]:
import pandas as pd
import re
import random
import numpy as np
from tqdm import tqdm 


# Bracket characters that normalize_text() pads with a space on each side.
BRACKET_SYMBOL = ['[', ']', '(', ')', '{', '}']
# Characters whose presence makes is_word() reject a token; every entry is a
# regular-expression metacharacter. Note that '.' is included, so a token with
# a trailing full stop is rejected as well.
SPECIAL_CHARACTERS = ['.', '+', '*', '?', '^', '$', '(', ')', '[', ']', '{', '}', '|', '\\']
# Lowercase ASCII alphabet (not referenced by the code visible in this notebook).
CHARACTERS = 'abcdefghijklmnopqrstuvwxyz'
# Vowels that the Noise methods insert or substitute.
VOWEL = 'ueoai'
# Consonants that the Noise methods remove or substitute. 'y' appears in
# neither VOWEL nor CONSONANTS, so it is never touched by those methods.
CONSONANTS = 'bcdfghjklmnpqrstvwxz'



def normalize_text(text):
    """Delete hyphens, pad each bracket symbol with spaces and squeeze space runs.

    Every '-' is removed outright (so hyphenated words are fused), each
    character listed in BRACKET_SYMBOL is surrounded by single spaces, and any
    run of consecutive spaces is then collapsed to one space.
    """
    padded = text.replace('-', '')
    for bracket in BRACKET_SYMBOL:
        # split/join is the same substitution as str.replace(bracket, ' bracket ')
        padded = f' {bracket} '.join(padded.split(bracket))
    return re.sub(' +', ' ', padded)

def is_word(word):
    """Return True when ``word`` contains none of the SPECIAL_CHARACTERS."""
    return not any(special in word for special in SPECIAL_CHARACTERS)

class Noise:
    """Typo-style augmentations for essays tokenised on whitespace.

    Every method takes a list of tokens and returns a new list; the caller's
    list is never modified. All edits are applied to individual token
    positions, so changing one token never rewrites look-alike substrings
    elsewhere in the essay. Randomness comes from both the stdlib ``random``
    module and NumPy's global generator; seed both for reproducible output.
    """

    def remove_consonant(self, words, rate=0.2):
        """Delete one random lowercase consonant from some tokens.

        For each token accepted by ``is_word`` a consonant is drawn from
        CONSONANTS and, with probability ``rate``, every occurrence of it is
        removed from that token. A token may become an empty string.
        """
        noisy = list(words)
        for i, word in enumerate(noisy):
            if is_word(word):
                consonant = random.choice(CONSONANTS)
                if np.random.uniform(0, 1) < rate:
                    noisy[i] = word.replace(consonant, '')
        return noisy

    def replace_consonant(self, words):
        """Swap one random consonant for one random vowel across the essay.

        A single (consonant, vowel) pair is drawn per call; in every token
        accepted by ``is_word`` the consonant is replaced by the vowel, with
        the uppercase consonant mapped to the uppercase vowel.
        """
        vowel = random.choice(VOWEL)
        consonant = random.choice(CONSONANTS)
        return [
            word.replace(consonant, vowel).replace(consonant.upper(), vowel.upper())
            if is_word(word) else word
            for word in words
        ]

    def remove_space(self, words, n=3):
        """Fuse up to ``n`` randomly chosen pairs of adjacent tokens.

        ``n`` positions are sampled with replacement, so fewer than ``n``
        distinct joins may happen. Inputs with fewer than two tokens are
        returned unchanged instead of raising.
        """
        words = list(words)
        if len(words) < 2 or n <= 0:
            return words
        # Index i means "join words[i] with words[i + 1]". randint's upper
        # bound is exclusive, so len(words) - 1 makes every adjacent pair
        # (including the last one) eligible.
        join_after = set(np.random.randint(0, len(words) - 1, n).tolist())
        merged = []
        glue_next = False
        for i, word in enumerate(words):
            if glue_next:
                merged[-1] += word
            else:
                merged.append(word)
            glue_next = i in join_after
        return merged

    def insert_vowel(self, words, rate=0.4):
        """Insert one random vowel into a sample of the distinct tokens.

        ``int(rate * number_of_distinct_tokens)`` tokens are sampled (with
        replacement) and the same vowel is inserted at a uniformly random
        position of each; every occurrence of a sampled token in the essay
        receives the same edit.
        """
        words = list(words)
        unique_words = list(set(words))
        n_chosen = int(rate * len(unique_words))
        if n_chosen <= 0:
            return words
        vowel = random.choice(VOWEL)
        edited = {}
        for word in np.random.choice(unique_words, n_chosen):
            word = str(word)
            # random.randint is inclusive on both ends: 0..len(word) covers
            # each of the len(word) + 1 insertion points exactly once.
            pos = random.randint(0, len(word))
            edited[word] = word[:pos] + vowel + word[pos:]
        return [edited.get(word, word) for word in words]

    def randomly_lower(self, words, rate=0.3):
        """Lowercase some tokens that follow a token rejected by ``is_word``.

        Candidates are tokens (other than the first) whose first character
        equals its own uppercase form and whose predecessor contains a special
        character (e.g. a trailing full stop). ``int(rate * candidates + 1)``
        positions are sampled with replacement; if that count exceeds the
        number of candidates the tokens are returned unchanged.
        """
        words = list(words)
        candidates = []
        for i in range(1, len(words)):
            first = words[i][:1]  # slice, not index, so empty tokens are safe
            if first and first == first.upper() and not is_word(words[i - 1]):
                candidates.append(i)
        n = int(rate * len(candidates) + 1)
        if n > len(candidates):
            return words
        for i in np.random.choice(candidates, n):
            words[i] = words[i].lower()
        return words

    def remove_apostrophe(self, words, rate=0.2):
        """Strip apostrophes from some of the tokens that contain one.

        ``int(rate * candidates + 1)`` candidate positions are sampled with
        replacement; if that count exceeds the number of candidates the tokens
        are returned unchanged. Tokens left empty by the edit are dropped.
        """
        words = list(words)
        candidates = [i for i, word in enumerate(words) if "'" in word]
        n = int(rate * len(candidates) + 1)
        if n > len(candidates):
            return words
        for i in np.random.choice(candidates, n):
            words[i] = words[i].replace("'", "")
        return [word for word in words if word]

    def add_noise_to_corpus(self, corpus, rate=0.9):
        """Return a new list in which roughly ``rate`` of the texts are noised.

        Each selected text is split on whitespace, each of the six noise
        methods is switched on independently with probability 0.5, the enabled
        ones are applied in a fixed order, and the tokens are re-joined with
        single spaces. Unselected texts, and entries that are not strings
        (e.g. NaN left by a concat), are passed through unchanged.
        """
        # True -> keep the original text; with rate=0.9 about 10% stay clean.
        keep_clean = np.random.uniform(0, 1, len(corpus)) > rate
        steps = (
            self.replace_consonant,
            self.remove_space,
            self.insert_vowel,
            self.randomly_lower,
            self.remove_apostrophe,
            self.remove_consonant,
        )
        new_corpus = []
        for i in tqdm(range(len(corpus))):
            text = corpus[i]
            if keep_clean[i] or not isinstance(text, str):
                new_corpus.append(text)
                continue
            words = text.split()
            switches = np.random.randint(0, 2, len(steps))
            for step, enabled in zip(steps, switches):
                if enabled == 1:
                    words = step(words)
            new_corpus.append(' '.join(words))
        return new_corpus

In [8]:
# Stack every source frame row-wise with a fresh 0..n-1 index, then show the
# class balance of the combined data.
source_frames = [
    train_essays,
    Radek_data_gpt_3_5,
    Radek_data_gpt_4,
    PERSUADE_corpus,
    llama_70b_dataset,
    falcon_180b_dataset,
    daigt_external_dataset,
]
dataset = pd.concat(source_frames, ignore_index=True)

dataset["generated"].value_counts()

generated
0    27371
1     5351
Name: count, dtype: int64

In [16]:
# Noise draws from both the stdlib `random` module and NumPy's global
# generator; seed both so a fresh Restart & Run All reproduces the same
# noisy corpus (and therefore the same saved CSV).
NOISE_SEED = 42
random.seed(NOISE_SEED)
np.random.seed(NOISE_SEED)

noise = Noise()
corpus = noise.add_noise_to_corpus(dataset["text"].tolist())
dataset["text"] = corpus
# Optional extra step, currently disabled: pad brackets / drop hyphens as well.
# dataset["text"] = [normalize_text(c) for c in corpus]

100%|██████████| 32722/32722 [00:21<00:00, 1525.79it/s]


In [9]:
# very rudimentary cleaning
def cleaning(dataset):
    """Apply a few crude clean-ups to ``dataset['text']`` and return the frame.

    Steps, in order:
      1. Strip leading/trailing whitespace.
      2. Replace every literal backslash-n sequence (the two characters
         ``\\`` and ``n``) inside the text with a space.
      3. Keep only what follows the last ``'ubject: '`` marker, then the last
         ``'Zip'``, then the last ``'ZIP'``, stripping whitespace each time.

    NOTE(review): step 3 presumably removes letter-style headers (subject
    line, address block) -- confirm. It also discards everything before the
    marker in any essay that merely mentions one of these substrings.

    Args:
        dataset: DataFrame with a string ``text`` column. The column is
            overwritten on the object passed in.

    Returns:
        The same DataFrame object, for convenient reassignment.
    """
    text = dataset["text"].str.strip()
    # Series.replace() only swaps cells whose *entire* value equals the
    # pattern, so it never touched an essay; .str.replace() works on
    # substrings. regex=False keeps the pattern a literal backslash + 'n'.
    text = text.str.replace('\\n', ' ', regex=False)
    for marker in ('ubject: ', 'Zip', 'ZIP'):
        text = text.str.split(marker).str[-1].str.strip()
    dataset["text"] = text
    return dataset

In [10]:
# Run the text clean-up and write the result to disk without the index column.
OUTPUT_CSV = 'train_essays_3.0.csv'
dataset = cleaning(dataset)
dataset.to_csv(OUTPUT_CSV, index=False)