In [55]:
#!pip install nltk pandas nlpaug

In [56]:
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.char as nac
import pandas as pd
from pandas import isnull
from collections import defaultdict
import nltk, os, re

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, TreebankWordDetokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cwwojin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [57]:
# Build the profanity spelling dictionary: a text file with one line per
# canonical form, followed by the spellings listed for it in profanity_en.csv.
# The resulting path (fname_profanity) is handed to naw.SpellingAug(dict_path=...)
# further down.
data_path = "datasets/"
spelling_path = "spelling/"

train_df = pd.read_csv(
    data_path + "2020-12-31-DynamicallyGeneratedHateDataset-entries-v0.1.csv",
    sep=',',
)[['text', 'label']]
profanity_df = pd.read_csv(data_path + "profanity_en.csv", sep=',')
labels = ['canonical_form_1', 'canonical_form_2', 'canonical_form_3']
spelling_dict = defaultdict(list)
fname_profanity = spelling_path + "spelling_en_profanity.txt"

# Group each row's 'text' under every non-null canonical form of that row.
for idx, row in profanity_df.iterrows():
    text = row['text']
    for word in row[labels]:
        if not isnull(word):
            spelling_dict[word].append(text)

# Make sure the output directory exists so a fresh checkout does not fail on open().
os.makedirs(spelling_path, exist_ok=True)

# Mode "w" truncates any previous file, so no explicit os.remove() is needed.
with open(fname_profanity, "w", encoding="UTF-8") as spelling_dict_file:
    for k, v in spelling_dict.items():
        spelling_dict_file.write(" ".join([k] + v) + "\n")

In [58]:
def ascii_range(text):
    """Return True when every character of ``text`` has a code point in 0..127.

    An empty ``text`` yields True because there is no offending character.
    """
    return all(ord(ch) in range(128) for ch in text)

In [59]:
def contains_prof(text, profanities):
    """Return True if at least one token of ``text`` is listed in ``profanities``.

    ``text`` is split with ``word_tokenize``; matching is exact (case-sensitive)
    set membership between the tokens and ``profanities``.
    """
    tokens = set(word_tokenize(text))
    shared = tokens & set(profanities)
    return len(shared) > 0

#Modifying Original Dataset
# Keep only the training rows whose text (a) contains a listed profanity,
# (b) is at most 256 characters long and (c) is pure ASCII, then save them.
profanities = list(pd.read_csv("datasets/profanity_en_list.csv", sep=',')['word'])    #list(spelling_dict.keys())
print(len(profanities))

kept_rows = [
    (text, label)
    for text, label in zip(train_df['text'], train_df['label'])
    if contains_prof(text, profanities) and len(text) <= 256 and ascii_range(text)
]
new_texts = [text for text, _ in kept_rows]
new_labels = [label for _, label in kept_rows]

train_df_prof = pd.DataFrame(kept_rows, columns=['text', 'label'])
train_df_prof.to_csv(data_path + "2020-12-31-DynamicallyGeneratedHateDataset-entries-v0.1-ProfanityOnly.csv")
print(len(train_df_prof))

1052
10424


In [60]:
def augment_sent(sentence, aug, n, profanities):
    """Create ``n`` augmented copies of ``sentence``, perturbing only profanity tokens.

    Args:
        sentence: Text to augment; it is split with ``word_tokenize``.
        aug: Object exposing ``augment(word, n=1)`` (the notebook passes
            nlpaug augmenters).
        n: Number of augmented sentences to produce.
        profanities: Iterable of tokens that are eligible for augmentation;
            every other token is copied through unchanged.

    Returns:
        A list of ``n`` strings, each re-joined with ``TreebankWordDetokenizer``.
    """
    # Hoisted out of the loops: the original rebuilt this set for every token.
    profanity_set = set(profanities)
    detokenizer = TreebankWordDetokenizer()
    sent_tokenized = word_tokenize(sentence)

    def _augment_word(word):
        # Depending on the installed nlpaug release, augment(..., n=1) yields
        # either a plain str or a list of str; normalise to a single str.
        result = aug.augment(word, n=1)
        if isinstance(result, str):
            return result
        # Empty / None result: fall back to the untouched word.
        return result[0] if result else word

    augmented_sents = []
    for _ in range(n):
        tokens = [
            _augment_word(word) if word in profanity_set else word
            for word in sent_tokenized
        ]
        augmented_sents.append(detokenizer.detokenize(tokens))
    return augmented_sents

In [61]:
from random import sample

def run_aug_multiple(n, N, train_df, augs, R, output_fname, profanities, path='', add_original=True, random=False) :  #Using Multiple Augmenters
    """Sample ``N`` rows, augment their text and write the result to CSV.

    Two files are written under ``path``:
      * ``output_fname[:-4] + "original.csv"`` - the sampled, un-augmented rows
        (the slice assumes ``output_fname`` ends in a 4-character extension
        such as ``.csv``);
      * ``output_fname`` - the augmented sentences with their labels.

    Args:
        n: Number of augmented sentences per row in non-random mode; must
            equal ``sum(R)``.
        N: Number of rows to sample from ``train_df``.
        train_df: DataFrame with ``'text'`` and ``'label'`` columns.
        augs: Augmenters forwarded to ``augment_sent``.
        R: Per-augmenter repeat counts, parallel to ``augs``.
        output_fname: File name of the augmented CSV.
        profanities: Tokens eligible for augmentation (forwarded to
            ``augment_sent``).
        path: Prefix concatenated (not joined) in front of both file names.
        add_original: When True, each original sentence is emitted ahead of
            its augmentations.
        random: When True, a single augmenter is drawn per row and exactly one
            augmented sentence is produced, ignoring ``R`` for generation.

    Returns:
        ``path + output_fname``.

    Raises:
        AssertionError: if ``sum(R) != n`` or ``len(R) != len(augs)``.
    """
    # Validate first so a bad call does not leave a stray sample file behind.
    assert(sum(R)==n)
    assert(len(R)==len(augs))

    train_df_sample = train_df.sample(n=N, ignore_index=True)
    #save sample as file
    train_df_sample.to_csv(path + output_fname[:-4] + "original.csv", sep=",")

    augmented_data_sentences = []
    augmented_data_labels = []

    for sentence, label in zip(train_df_sample['text'], train_df_sample['label']) :
        if random :
            random_aug = sample(augs, 1)[0]
            augmented = list(augment_sent(sentence, random_aug, n=1, profanities=profanities))
        else :
            augmented = []
            for r, aug in zip(R, augs) :
                augmented += augment_sent(sentence, aug, n=r, profanities=profanities)

        #label-preserving
        emitted = ([sentence] + augmented) if add_original else augmented
        augmented_data_sentences += emitted
        # One label per emitted sentence. The previous fixed count of n / n+1
        # produced surplus labels in random mode, and zip() then silently
        # paired sentences with labels belonging to other rows.
        augmented_data_labels += [label] * len(emitted)

    #save new dataset to csv-file
    output_df = pd.DataFrame(list(zip(augmented_data_sentences, augmented_data_labels)), columns=['text', 'label'])
    output_df.to_csv(path + output_fname, sep=",")

    return path + output_fname

In [62]:
#Data augmentation : Multiple Augmenters
stop_words = list(set(stopwords.words('english')))

#5 different augmenters (all told to skip English stopwords)
aug_spelling_prof = naw.SpellingAug(dict_path=fname_profanity, aug_max=None, aug_p=0.5, stopwords=stop_words)
aug_spelling_base = naw.SpellingAug(stopwords=stop_words)
aug_char_keyboard = nac.KeyboardAug(stopwords=stop_words)
aug_char_ocr = nac.OcrAug(stopwords=stop_words)
aug_char_random = nac.RandomCharAug(stopwords=stop_words)

augs = [aug_spelling_prof, aug_spelling_base, aug_char_keyboard, aug_char_ocr, aug_char_random]
# One repetition per augmenter.
R = [1] * len(augs)

output_path = "augmented_data/"
fnames = [("Dataset_aug_complex_{i}_.csv").format(i=size) for size in [len(train_df_prof)]]

# The first augmenter (aug_spelling_prof) is left out of this run.
active_augs = augs[1:]
active_R = R[1:]

for fname in fnames :
    # The sample size is encoded as the 4th "_"-separated field of the file name.
    tokens = fname.split("_")
    N = int(tokens[3])
    print("Augmenting (Multiple-Augmenters) - N : {N}".format(N=N))
    run_aug_multiple(
        len(active_augs), N, train_df_prof, active_augs, active_R,
        fname, profanities,
        path=output_path, add_original=False, random=True,
    )

Augmenting (Multiple-Augmenters) - N : 10424


In [63]:
# Read back the augmented CSV and the matching "original" sample CSV, then
# print both row counts (original first) as a quick size check.
n_prof_rows = len(train_df_prof)
sample_aug_df = pd.read_csv(
    'augmented_data/Dataset_aug_complex_{i}_.csv'.format(i=n_prof_rows),
    sep=',',
)
sample_original_df = pd.read_csv(
    'augmented_data/Dataset_aug_complex_{i}_original.csv'.format(i=n_prof_rows),
    sep=',',
)
print(len(sample_original_df), len(sample_aug_df))

10424 10424


In [64]:
# Character-level augmenters sharing the same limits (aug_char_max=3, aug_char_p=0.2).
# NOTE: aug_char_keyboard and aug_char_ocr are re-bound here, replacing the
# stopword-aware instances created in the earlier multi-augmenter cell.
char_aug_limits = {'aug_char_max': 3, 'aug_char_p': 0.2}

aug_char_keyboard = nac.KeyboardAug(**char_aug_limits)
aug_char_ocr = nac.OcrAug(**char_aug_limits)

# RandomCharAug, one instance per action: 'insert', 'substitute', 'swap', 'delete'.
aug_char_random_in, aug_char_random_sub, aug_char_random_swap, aug_char_random_del = (
    nac.RandomCharAug(action=action, **char_aug_limits)
    for action in ('insert', 'substitute', 'swap', 'delete')
)

In [65]:
augs = [aug_char_keyboard, aug_char_ocr, aug_char_random_in, aug_char_random_sub, aug_char_random_swap, aug_char_random_del]

#profanities -> augmented profanities
# Each ASCII-only profanity gets exactly one augmented spelling, produced by
# an augmenter drawn at random from `augs`.
profanities = [p for p in profanities if ascii_range(p)]
prof_augs = []
for p in profanities :
    aug = sample(augs, 1)[0]
    p_aug = aug.augment(p, n=1)
    # Depending on the installed nlpaug release, augment(..., n=1) yields
    # either a plain str or a list of str; normalise to a single str and fall
    # back to the original word if nothing came back.
    if not isinstance(p_aug, str) :
        p_aug = p_aug[0] if p_aug else p
    # Drop any non-ASCII character the augmenter introduced.
    p_aug = "".join(c for c in p_aug if ascii_range(c))
    prof_augs.append(p_aug)


Char_Aug_df = pd.DataFrame(list(zip(prof_augs, profanities)), columns=['text','original'])

print(Char_Aug_df)
Char_Aug_df.to_csv("augmented_data/Dataset_aug_char_{i}.csv".format(i=len(Char_Aug_df)),sep=',')

                      text              original
0                  sidicks             shitdicks
1              baresitcles           breasticles
2     shiafucumo0herfucker  shitfuckmotherfucker
3                 6itcheks              bitchers
4                  cokcsku               cocksuk
...                    ...                   ...
1046          muddekforker          mudderfukker
1047               bstindo             bastinado
1048           cuntboAlonW           cuntbollock
1049             octoupsys             octopussy
1050                  nigu                 niguh

[1051 rows x 2 columns]


In [66]:
# For every ASCII-only profanity, request N augmentations from each augmenter
# in `augs` and record the source word alongside every variant.
profanities = [word for word in profanities if ascii_range(word)]
N = 2

new_prof = []
prof_augs = []
for p in profanities :
    # Flatten the per-augmenter results into one list of variants for this word.
    variants = [variant for aug in augs for variant in aug.augment(p, n=N)]
    prof_augs.extend(variants)
    new_prof.extend([p] * len(variants))

Char_Aug_df = pd.DataFrame(zip(prof_augs, new_prof), columns=['text','original'])
print(Char_Aug_df)

Char_Aug_df.to_csv("augmented_data/Dataset_aug_char_{i}.csv".format(i=len(Char_Aug_df)),sep=',')

              text   original
0        sbitdifks  shitdicks
1        shitdKckQ  shitdicks
2        shitdicr8  shitdicks
3        8hitdicrs  shitdicks
4      shCiztdicks  shitdicks
...            ...        ...
12607        jiguh      niguh
12608        ngiuh      niguh
12609        niugh      niguh
12610         niuh      niguh
12611         nigu      niguh

[12612 rows x 2 columns]
