In [12]:
#!pip install nltk pandas nlpaug

In [13]:
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.char as nac
import pandas as pd
from pandas import isnull
from collections import defaultdict
import nltk, os, re

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cwwojin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
#Creating Profanity-spelling-error-dictionary (txt-file)
# Builds a mapping "canonical form -> spellings seen in the lexicon" from the profanity
# CSV and writes it as a text file with one line per canonical form: the canonical word
# first, then every spelling, separated by single spaces.
data_path = "datasets/"
spelling_path = "spelling/"

train_df = pd.read_csv(data_path + "2020-12-31-DynamicallyGeneratedHateDataset-entries-v0.1.csv", sep=',')[['text', 'label']]
profanity_df = pd.read_csv(data_path + "profanity_en.csv", sep=',')
labels = ['canonical_form_1', 'canonical_form_2', 'canonical_form_3']
spelling_dict = defaultdict(list)
fname_profanity = spelling_path + "spelling_en_profanity.txt"

# Each lexicon row contributes its 'text' to every non-null canonical form it lists.
for text, *canons in profanity_df[['text'] + labels].itertuples(index=False, name=None) :
    for word in canons :
        if not isnull(word) :
            spelling_dict[word].append(text)

# A fresh checkout may not have the output folder yet; without this open() below fails.
os.makedirs(spelling_path, exist_ok=True)

# Write mode truncates an existing file, so no prior delete is needed.
# NOTE(review): entries that themselves contain spaces cannot be told apart from the
# separator in this format - confirm the lexicon has none or how the consumer splits lines.
with open(fname_profanity, "w", encoding="UTF-8") as spelling_dict_file :
    for k, v in spelling_dict.items() :
        spelling_dict_file.write(" ".join([k] + v) + "\n")

In [15]:
def contains_prof(text, profanities) :
    """Tell whether `text` has at least one token that appears in `profanities`.

    `text` is split with `word_tokenize`; tokens are compared to the entries of
    `profanities` exactly as they are (no case folding or other normalisation).

    Returns a bool.
    """
    tokens = set(word_tokenize(text))
    known = set(profanities)
    return len(tokens & known) > 0

#Modifying Original Dataset
# Keep only the rows whose text contains at least one canonical profanity and whose
# tokenised length does not exceed 100 tokens, then save that subset next to the source data.
profanities = list(spelling_dict.keys())

kept_rows = [
    (sample_text, sample_label)
    for sample_text, sample_label in zip(train_df['text'], train_df['label'])
    if contains_prof(sample_text, profanities) and len(word_tokenize(sample_text)) <= 100
]
new_texts = [pair[0] for pair in kept_rows]
new_labels = [pair[1] for pair in kept_rows]

train_df_prof = pd.DataFrame(kept_rows, columns=['text', 'label'])
train_df_prof.to_csv(data_path + "2020-12-31-DynamicallyGeneratedHateDataset-entries-v0.1-ProfanityOnly.csv")

In [16]:
def run_aug(n, N, train_df, aug, output_fname, path='', random_state=None) :
    """Sample N rows, augment every sampled sentence with `aug`, and save the result as CSV.

    Two files are written, both named with `path` used as a plain string prefix:
      * `output_fname` without its last four characters, followed by "original.csv":
        the untouched sample;
      * `output_fname`: each sampled sentence followed by its augmented variants, every
        variant carrying the label of the sentence it was derived from.

    Parameters
    ----------
    n : int
        Number of variants requested from `aug` for each sentence.
    N : int
        Number of rows to sample from `train_df`.
    train_df : pandas.DataFrame
        Source data; the 'text' and 'label' columns are read.
    aug : object
        Anything exposing `augment(text, n=...)`. The result may be a single string,
        a list of strings, or None; a list shorter than `n` is accepted.
    output_fname : str
        File name of the augmented CSV.
    path : str, optional
        Prefix prepended verbatim to both file names (include the trailing separator).
    random_state : optional
        Forwarded to `DataFrame.sample` so the sample can be reproduced. The default
        (None) keeps the sampling unseeded.

    Returns
    -------
    str
        `path + output_fname`, the location of the augmented CSV.
    """
    train_df_sample = train_df.sample(n=N, ignore_index=True, random_state=random_state)
    #save sample as file
    train_df_sample.to_csv(path + output_fname[:-4] + "original.csv", sep=",")

    augmented_data_sentences = []
    augmented_data_labels = []

    for sentence, label in zip(train_df_sample['text'], train_df_sample['label']) :
        result = aug.augment(sentence, n=n)

        # Normalise the result: some augmenters hand back a bare string for a single
        # variant, others always hand back a list (possibly shorter than n).
        if result is None :
            augmented = []
        elif isinstance(result, str) :
            augmented = [result]
        else :
            augmented = list(result)

        #label-preserving: emit exactly one label per sentence actually produced, so the
        #two columns cannot drift apart when fewer than n variants come back
        produced = [sentence] + augmented
        augmented_data_sentences += produced
        augmented_data_labels += [label] * len(produced)

    #save new dataset to csv-file
    output_df = pd.DataFrame(list(zip(augmented_data_sentences, augmented_data_labels)), columns=['text', 'label'])
    output_df.to_csv(path + output_fname, sep=",")

    return path + output_fname

In [17]:
def run_aug_multiple(n, N, train_df, augs, R, output_fname, path='', random_state=None) :  #Using Multiple Augmenters
    """Sample N rows, augment every sampled sentence with several augmenters, and save as CSV.

    Augmenter `augs[i]` is asked for `R[i]` variants of each sentence. Two files are
    written, both named with `path` used as a plain string prefix:
      * `output_fname` without its last four characters, followed by "original.csv":
        the untouched sample;
      * `output_fname`: each sampled sentence followed by all variants from all
        augmenters (flattened, in augmenter order), every variant carrying the label of
        the sentence it was derived from.

    Parameters
    ----------
    n : int
        Total number of variants requested per sentence; must equal `sum(R)`.
    N : int
        Number of rows to sample from `train_df`.
    train_df : pandas.DataFrame
        Source data; the 'text' and 'label' columns are read.
    augs : sequence
        Objects exposing `augment(text, n=...)`. Each result may be a single string,
        a list of strings, or None.
    R : sequence of int
        Variants to request from each augmenter; same length as `augs`.
    output_fname : str
        File name of the augmented CSV.
    path : str, optional
        Prefix prepended verbatim to both file names (include the trailing separator).
    random_state : optional
        Forwarded to `DataFrame.sample` so the sample can be reproduced. The default
        (None) keeps the sampling unseeded.

    Returns
    -------
    str
        `path + output_fname`, the location of the augmented CSV.

    Raises
    ------
    AssertionError
        If `sum(R) != n` or `len(R) != len(augs)`. Checked before anything is written.
    """
    # Validate first so that bad arguments do not leave a stray sample file behind.
    assert(sum(R)==n)
    assert(len(R)==len(augs))

    train_df_sample = train_df.sample(n=N, ignore_index=True, random_state=random_state)
    #save sample as file
    train_df_sample.to_csv(path + output_fname[:-4] + "original.csv", sep=",")

    augmented_data_sentences = []
    augmented_data_labels = []

    for sentence, label in zip(train_df_sample['text'], train_df_sample['label']) :
        augmented = []

        for r, aug in zip(R, augs) :
            result = aug.augment(sentence, n=r)
            # Flatten: a list result contributes each of its sentences, a bare string
            # contributes itself. Appending the list itself would put a list object in
            # the text column.
            if result is None :
                continue
            if isinstance(result, str) :
                augmented.append(result)
            else :
                augmented.extend(result)

        #label-preserving: one label per sentence actually produced keeps both columns aligned
        produced = [sentence] + augmented
        augmented_data_sentences += produced
        augmented_data_labels += [label] * len(produced)

    #save new dataset to csv-file
    output_df = pd.DataFrame(list(zip(augmented_data_sentences, augmented_data_labels)), columns=['text', 'label'])
    output_df.to_csv(path + output_fname, sep=",")

    return path + output_fname

In [18]:
#Data augmentation : Single
# stop_words = list(set(stopwords.words('english')))
# aug_spelling_prof = naw.SpellingAug(dict_path=fname_profanity, aug_max=None, aug_p=0.5, stopwords=stop_words)

# fnames = [("Dataset_aug_profanity_{i}_.csv").format(i=i) for i in [500, 1000, 5000, len(train_df_prof)]]
# output_path = "augmented_data/"

# for fname in fnames :
#     tokens = fname.split("_")
#     N = int(tokens[3])
#     print("Augmenting - N : {N}".format(N=N))
#     run_aug(5, N, train_df_prof, aug_spelling_prof, fname, path=output_path)

In [19]:
#Data augmentation : Multiple Augmenters
# Produces one augmented dataset per sample size, asking each of the five augmenters
# below for a single variant of every sampled sentence.
stop_words = list(set(stopwords.words('english')))

#5 different augmenters
aug_spelling_prof = naw.SpellingAug(dict_path=fname_profanity, aug_max=None, aug_p=0.5, stopwords=stop_words)
aug_spelling_base = naw.SpellingAug(stopwords=stop_words)
aug_char_keyboard = nac.KeyboardAug(stopwords=stop_words)
aug_char_ocr = nac.OcrAug(stopwords=stop_words)
aug_char_random = nac.RandomCharAug(stopwords=stop_words)

augs = [aug_spelling_prof, aug_spelling_base, aug_char_keyboard, aug_char_ocr, aug_char_random]
R = [1] * len(augs)

# The last size is the whole profanity-only dataset.
sample_sizes = [500, 1000, 5000, len(train_df_prof)]
fnames = [("Dataset_aug_complex_{i}_.csv").format(i=size) for size in sample_sizes]
output_path = "augmented_data/"

for N, fname in zip(sample_sizes, fnames) :
    print("Augmenting (Multiple-Augmenters) - N : {N}".format(N=N))
    run_aug_multiple(5, N, train_df_prof, augs, R, fname, path=output_path)

Augmenting (Multiple-Augmenters) - N : 500
Augmenting (Multiple-Augmenters) - N : 1000
Augmenting (Multiple-Augmenters) - N : 5000
Augmenting (Multiple-Augmenters) - N : 6513
