In [None]:
# Mount Google Drive at /content/drive (Colab-only) so later cells can read
# and write files under that path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import random
import re
import time
from collections import defaultdict
from typing import Dict

import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.tokenize.toktok import ToktokTokenizer
from tqdm import tqdm
# Download the NLTK 'stopwords' and 'punkt' resources (the stop-word list is
# read inside clean_dataset).
nltk.download('stopwords')
nltk.download('punkt')

# Seed Python's `random` module; prepare_dataset shuffles and samples with it.
SEED = 1946614
random.seed(SEED)

def clean_dataset(old_pd:pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``old_pd`` with ``cleaned`` and ``label`` columns added.

    ``cleaned`` is built from the ``review`` column in this order:

    1. lower-case the text,
    2. strip HTML markup,
    3. drop ``[...]`` bracketed spans,
    4. tokenize and drop English stop words.

    HTML is stripped *before* stop-word removal, and tags are replaced by a
    space: a word glued to a tag (``"it.<br /><br />the rest"``) is otherwise
    never recognised as a stop word, and the text on either side of the tag
    fuses into one token (``"it.the"``).

    ``label`` holds the pandas category codes of the ``sentiment`` column.

    Args:
        old_pd: frame with string columns ``review`` and ``sentiment``.
            It is not modified.

    Returns:
        A new DataFrame with the original columns plus ``cleaned`` and
        ``label``.
    """
    data = old_pd.copy()  # never mutate the caller's frame
    tokenizer = ToktokTokenizer()
    # A set gives O(1) membership tests; the NLTK list is scanned linearly.
    stopword_set = set(nltk.corpus.stopwords.words('english'))

    def _remove_html(text):
        # separator=' ' keeps the words on either side of a tag apart.
        return BeautifulSoup(text, "html.parser").get_text(separator=' ')

    def _remove_brackets(text):
        # Raw string: '\[' is an invalid escape in a normal string literal.
        return re.sub(r'\[[^]]*\]', '', text)

    def _remove_stopwords(text):
        tokens = (token.strip() for token in tokenizer.tokenize(text))
        return ' '.join(token for token in tokens if token not in stopword_set)

    cleaned = data['review'].str.lower()
    for step in (_remove_html, _remove_brackets, _remove_stopwords):
        cleaned = cleaned.apply(step)

    data['cleaned'] = cleaned
    data['label'] = data['sentiment'].astype('category').cat.codes
    return data

def prepare_dataset(df:pd.DataFrame, mode='full') -> Dict[str,list]:
    """Shuffle ``df`` and split it 70/15/15 into train/val/test lists.

    The shuffle and the subsampling use the module-level ``random`` state, so
    seed ``random`` beforehand for a repeatable split.

    Args:
        df: frame with ``cleaned`` (text) and ``label`` columns.
        mode: ``'full'`` keeps the whole training split; ``'small'`` keeps a
            random subset of it, ``int(0.7 * len(df) / 5)`` rows drawn
            without replacement. Validation and test splits are the same
            size in both modes.

    Returns:
        Dict with keys ``train_sentences``, ``train_labels``,
        ``val_sentences``, ``val_labels``, ``test_sentences`` and
        ``test_labels``, each a list.

    Raises:
        ValueError: if ``mode`` is neither ``'full'`` nor ``'small'``.
    """
    if mode not in ('full', 'small'):
        raise ValueError("mode must be 'full' or 'small', got %r" % (mode,))

    data = list(zip(df['cleaned'], df['label']))
    random.shuffle(data)
    datasize = len(df)
    train_end, val_end = int(0.7*datasize), int(0.85*datasize)

    splits = {
        'train': data[:train_end],
        'val': data[train_end:val_end],
        'test': data[val_end:],
    }
    if mode == 'small':
        # random.sample draws without replacement, so no review is duplicated
        # in the reduced training set (random.choices would repeat rows).
        # NOTE(review): this keeps one fifth of the training split; the old
        # comment said 3500 rows (one tenth of 35000) - confirm the intended size.
        splits['train'] = random.sample(splits['train'], k=int(0.7*datasize/5))

    outputs = {}
    for name, pairs in splits.items():
        outputs['%s_sentences' % name] = [sentence for sentence, _ in pairs]
        outputs['%s_labels' % name] = [label for _, label in pairs]
    return outputs

In [None]:
# Load the raw IMDB review CSV from the mounted Drive folder.
# NOTE(review): absolute, user-specific path (it contains a space before the
# final '/' in "Group project /") - confirm it matches the Drive layout.
data = pd.read_csv('/content/drive/MyDrive/Y3S1/CZ4042 Neural network and deep learning/Group project /data/IMDB Dataset.csv')

In [None]:
# Add the 'cleaned' text and integer 'label' columns to the raw reviews.
cleaned_data = clean_dataset(data)
# Disabled alternative kept from an earlier run: build the reduced ('small') split here.
# cleaned_data = prepare_dataset(cleaned_data,mode='small')

In [None]:
!pip install nlpaug
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf

from nlpaug.util import Action

In [None]:
# Split the cleaned reviews once into train / val / test.
original_output = prepare_dataset(cleaned_data)
train_sentences_full = original_output['train_sentences']
train_labels_full = original_output['train_labels']
val_sentences_full = np.array(original_output['val_sentences'])
val_labels_full = np.array(original_output['val_labels'])
test_sentences_full = np.array(original_output['test_sentences'])
test_labels_full = np.array(original_output['test_labels'])

# The augmentation cells below read `train_sentences` / `train_labels`, which
# no cell in this notebook defined. Draw that smaller training set from the
# full training split (without replacement) so it cannot overlap the
# validation or test reviews above.
# NOTE(review): the recorded progress bars show 3500 rows, i.e. one tenth of
# the training split - confirm this is the intended subset size.
SMALL_TRAIN_SIZE = len(train_sentences_full) // 10
small_train_idx = random.sample(range(len(train_sentences_full)), k=SMALL_TRAIN_SIZE)
train_sentences = [train_sentences_full[i] for i in small_train_idx]
train_labels = [train_labels_full[i] for i in small_train_idx]

## Character augmentation

#### Check augmenter

OCR

In [None]:
# Preview the OCR character augmenter on the first training review.
aug_ocr = nac.OcrAug()
text = train_sentences[0]
augmented_texts = aug_ocr.augment(text, n=5)
print("Original:", text, "Augmented Texts:", augmented_texts, sep="\n")

Original:
level high expectation sit watch comedy cast headed cary grant , jayne mansfield , ray walston werner klemperer. expectations buoyed film directed stanley donen , whose comic touch evident , among others , damn yankees ! , bedazzled charade. first five minutes , , seems expectations might met . nothing. supposed light comedy , plunges leaden , heavy handed melodrama , nary chuckle had.relative newcomer suzy parker often criticized performance , lack one , film , movie even great cary grant frequently appears flat wooden , attacking parker seems unfair. even bright light audrey hepburn doris day could changed fortunes meandering , dreary wholly pointless script , drags lamely along drags viewer ' interest patience it.the rest cast , especially ray walston , keep trying breath life proceedings , horrible script beyond resuscitation. desperate , inane effort drag half hearted laugh numbed audience film ' final moments serves add insult injury.this film nothing major disappointme

Keyboard augmenter

In [None]:
# Preview the keyboard character augmenter on the sample review.
aug_key = nac.KeyboardAug()
augmented_text = aug_key.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
level high expectation sit watch comedy cast headed cary grant , jayne mansfield , ray walston werner klemperer. expectations buoyed film directed stanley donen , whose comic touch evident , among others , damn yankees ! , bedazzled charade. first five minutes , , seems expectations might met . nothing. supposed light comedy , plunges leaden , heavy handed melodrama , nary chuckle had.relative newcomer suzy parker often criticized performance , lack one , film , movie even great cary grant frequently appears flat wooden , attacking parker seems unfair. even bright light audrey hepburn doris day could changed fortunes meandering , dreary wholly pointless script , drags lamely along drags viewer ' interest patience it.the rest cast , especially ray walston , keep trying breath life proceedings , horrible script beyond resuscitation. desperate , inane effort drag half hearted laugh numbed audience film ' final moments serves add insult injury.this film nothing major disappointme

Random character augmenter

Insertion

In [None]:
# Preview random character insertion on the sample review.
aug_rand_ins = nac.RandomCharAug(action="insert")
augmented_text = aug_rand_ins.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
level high expectation sit watch comedy cast headed cary grant , jayne mansfield , ray walston werner klemperer. expectations buoyed film directed stanley donen , whose comic touch evident , among others , damn yankees ! , bedazzled charade. first five minutes , , seems expectations might met . nothing. supposed light comedy , plunges leaden , heavy handed melodrama , nary chuckle had.relative newcomer suzy parker often criticized performance , lack one , film , movie even great cary grant frequently appears flat wooden , attacking parker seems unfair. even bright light audrey hepburn doris day could changed fortunes meandering , dreary wholly pointless script , drags lamely along drags viewer ' interest patience it.the rest cast , especially ray walston , keep trying breath life proceedings , horrible script beyond resuscitation. desperate , inane effort drag half hearted laugh numbed audience film ' final moments serves add insult injury.this film nothing major disappointme

Substitute

In [None]:
# Preview random character substitution on the sample review.
aug_rand_sub = nac.RandomCharAug(action="substitute")
augmented_text = aug_rand_sub.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
level high expectation sit watch comedy cast headed cary grant , jayne mansfield , ray walston werner klemperer. expectations buoyed film directed stanley donen , whose comic touch evident , among others , damn yankees ! , bedazzled charade. first five minutes , , seems expectations might met . nothing. supposed light comedy , plunges leaden , heavy handed melodrama , nary chuckle had.relative newcomer suzy parker often criticized performance , lack one , film , movie even great cary grant frequently appears flat wooden , attacking parker seems unfair. even bright light audrey hepburn doris day could changed fortunes meandering , dreary wholly pointless script , drags lamely along drags viewer ' interest patience it.the rest cast , especially ray walston , keep trying breath life proceedings , horrible script beyond resuscitation. desperate , inane effort drag half hearted laugh numbed audience film ' final moments serves add insult injury.this film nothing major disappointme

Swap character randomly

In [None]:
# Preview random character swapping on the sample review.
aug_rand_swap = nac.RandomCharAug(action="swap")
augmented_text = aug_rand_swap.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
level high expectation sit watch comedy cast headed cary grant , jayne mansfield , ray walston werner klemperer. expectations buoyed film directed stanley donen , whose comic touch evident , among others , damn yankees ! , bedazzled charade. first five minutes , , seems expectations might met . nothing. supposed light comedy , plunges leaden , heavy handed melodrama , nary chuckle had.relative newcomer suzy parker often criticized performance , lack one , film , movie even great cary grant frequently appears flat wooden , attacking parker seems unfair. even bright light audrey hepburn doris day could changed fortunes meandering , dreary wholly pointless script , drags lamely along drags viewer ' interest patience it.the rest cast , especially ray walston , keep trying breath life proceedings , horrible script beyond resuscitation. desperate , inane effort drag half hearted laugh numbed audience film ' final moments serves add insult injury.this film nothing major disappointme

Delete character randomly

In [None]:
# Preview random character deletion on the sample review.
aug_rand_del = nac.RandomCharAug(action="delete")
augmented_text = aug_rand_del.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
level high expectation sit watch comedy cast headed cary grant , jayne mansfield , ray walston werner klemperer. expectations buoyed film directed stanley donen , whose comic touch evident , among others , damn yankees ! , bedazzled charade. first five minutes , , seems expectations might met . nothing. supposed light comedy , plunges leaden , heavy handed melodrama , nary chuckle had.relative newcomer suzy parker often criticized performance , lack one , film , movie even great cary grant frequently appears flat wooden , attacking parker seems unfair. even bright light audrey hepburn doris day could changed fortunes meandering , dreary wholly pointless script , drags lamely along drags viewer ' interest patience it.the rest cast , especially ray walston , keep trying breath life proceedings , horrible script beyond resuscitation. desperate , inane effort drag half hearted laugh numbed audience film ' final moments serves add insult injury.this film nothing major disappointme

#### Augment all data

In [None]:
# Combine every character-level augmenter built above into one `Sometimes` flow.
char_level_augmenters = [
    aug_ocr,
    aug_key,
    aug_rand_ins,
    aug_rand_sub,
    aug_rand_del,
    aug_rand_swap,
]
aug = naf.Sometimes(char_level_augmenters)

# Show five augmented variants of the sample review.
augmented_text = aug.augment(text, n=5)
augmented_text

["level high expectation sit watch comedy cast headed cary grant, jayne mansfield, ray walston werner klemperer. expectations buoyed film directed stanley donen, whose comic touch vidnt, among others, damn yankees! , bedazzled charade. first five minutes, , seems expectations might met. nothing. supposed light comedy, plunges leaden, heavy handed melodrama, nary chuckle had. relative newcomer suzy parker often criticized performance, lack one, film, movie even great cary grant frequently appears flat wooden, attacking parker seems unfair. eve bright light audrey hepburn doris day could changed fortunes meandering, dreary wholl pointless scipt, drags lamely along drags viewer ' interest patience it. the res cast, especially ray walston, keep trying breath life ocedings, horrible script beyond esusciatin. desperate, inane effort drag half hearted lagh numbed audience film ' final moments serves add insult injury. tis flm nothing major disappointment levels.",
 "level high expectation sit

In [None]:
# Start from the un-augmented training data, then append the character-level
# variants of every review (n=4 requested per review) with the review's label.
aug_train_sent = list(train_sentences)
aug_train_label = list(train_labels)

seconds = time.time()
for sentence, label in zip(tqdm(train_sentences), train_labels):
    for variant in aug.augment(sentence, n=4):
        aug_train_sent.append(variant)
        aug_train_label.append(label)
end = time.time()
print("Time taken for chracter level augmentation: {} s".format(end - seconds))

100%|██████████| 3500/3500 [00:54<00:00, 64.75it/s]

Time taken for chracter level augmentation: 54.072290658950806 s





In [None]:
# Write the original + character-augmented training set to CSV (no index column).
char_aug_columns = {'cleaned': aug_train_sent, 'label': aug_train_label}
char_aug_df = pd.DataFrame(char_aug_columns)
char_aug_df.to_csv('char_aug_data.csv', index=False)

## Non-contextual word augmenter

Spelling

In [None]:
# Preview the spelling augmenter on the sample review.
aug_spell = naw.SpellingAug()
# Call aug_spell itself: `aug` is a different augmenter, so using it here
# would not demonstrate spelling augmentation at all.
augmented_texts = aug_spell.augment(text, n=3)
print("Original:")
print(text)
print("Augmented Texts:")
print(augmented_texts)

Original:
level high expectation sit watch comedy cast headed cary grant , jayne mansfield , ray walston werner klemperer. expectations buoyed film directed stanley donen , whose comic touch evident , among others , damn yankees ! , bedazzled charade. first five minutes , , seems expectations might met . nothing. supposed light comedy , plunges leaden , heavy handed melodrama , nary chuckle had.relative newcomer suzy parker often criticized performance , lack one , film , movie even great cary grant frequently appears flat wooden , attacking parker seems unfair. even bright light audrey hepburn doris day could changed fortunes meandering , dreary wholly pointless script , drags lamely along drags viewer ' interest patience it.the rest cast , especially ray walston , keep trying breath life proceedings , horrible script beyond resuscitation. desperate , inane effort drag half hearted laugh numbed audience film ' final moments serves add insult injury.this film nothing major disappointme

### TFIDF augmenter

In [None]:
import sklearn.datasets
import re

import nlpaug.augmenter.word as naw
import nlpaug.model.word_stats as nmw

def _tokenizer(text, token_pattern=r"(?u)\b\w\w+\b"):
    token_pattern = re.compile(token_pattern)
    return token_pattern.findall(text)

# Fit the TF-IDF statistics on the tokenized training sentences.
train_x = train_sentences.copy()
train_x_tokens = [_tokenizer(x) for x in train_x]

# Directory on Drive where the trained TF-IDF statistics are saved and read back.
TFIDF_MODEL_DIR = '/content/drive/MyDrive/Y3S1/CZ4042 Neural network and deep learning/Group project /'

tfidf_model = nmw.TfIdf()
tfidf_model.train(train_x_tokens)
tfidf_model.save(TFIDF_MODEL_DIR)

# Bound to its own name so the character-level `aug` flow defined earlier in
# the notebook is not silently replaced by this augmenter.
aug_tfidf = naw.TfIdfAug(model_path=TFIDF_MODEL_DIR, tokenizer=_tokenizer)

texts = [
    'The quick brown fox jumps over the lazy dog',
    'asdasd test apple dog asd asd'
]

# The loop variable is deliberately not `text`: that name holds the sample
# review that the other preview cells print as "Original".
for sample_text in texts:
    augmented_text = aug_tfidf.augment(sample_text)

    print('-'*20)
    print('Original Input:{}'.format(sample_text))
    print('Agumented Output:{}'.format(augmented_text))

--------------------
Original Input:The quick brown fox jumps over the lazy dog
Agumented Output:The quick brown fox jumps coconut the our dog
--------------------
Original Input:asdasd test apple dog asd asd
Agumented Output:asdasd test apple triggered asd asd


In [None]:
# TF-IDF augmenter configured with action="insert".
# model_path is the directory the TF-IDF model was saved to when it was trained.
# NOTE(review): nothing in this notebook writes '/content/tfidfaug_w2idf.txt',
# the path used before - confirm the Drive directory is the intended source.
aug_tfidf_ins = naw.TfIdfAug(
    model_path='/content/drive/MyDrive/Y3S1/CZ4042 Neural network and deep learning/Group project /',
    action="insert")
# Call the augmenter built in this cell; `aug` is a different augmenter and
# would not show the insert action.
augmented_text = aug_tfidf_ins.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
asdasd test apple dog asd asd
Augmented Text:
asdasd test apple avjo asd asd


TF-IDF augmenter: substitute

In [None]:
# TF-IDF augmenter configured with action="substitute".
# model_path is the directory the TF-IDF model was saved to when it was trained.
# NOTE(review): nothing in this notebook writes '/content/tfidfaug_w2idf.txt',
# the path used before - confirm the Drive directory is the intended source.
aug_tfidf_sub = naw.TfIdfAug(
    model_path='/content/drive/MyDrive/Y3S1/CZ4042 Neural network and deep learning/Group project /',
    action="substitute")
# Call the augmenter built in this cell rather than the unrelated `aug`.
augmented_text = aug_tfidf_sub.augment(text, n=5)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
asdasd test apple dog asd asd
Augmented Text:
['asdasd test apple grapple asd asd', 'asdasd brainwashing apple dog asd asd', 'asdasd melts apple dog asd asd', 'asdasd test apple travesty asd asd', 'asdasd test pitiable dog asd asd']


In [None]:
# Combine the spelling and both TF-IDF augmenters into one `Sometimes` flow.
word_level_augmenters = [aug_spell, aug_tfidf_ins, aug_tfidf_sub]
aug_word = naf.Sometimes(word_level_augmenters)

# Show four augmented variants of `text`.
augmented_text = aug_word.augment(text, n=4)
augmented_text

['Blasé asdasd test apple dog asd asd',
 'asdasd mecca apple dog asd asd',
 'asdasd test apple denmark dog asd asd',
 'asdasd test apple dadg asd asd']

In [None]:
N_WORD_AUGMENTATIONS = 4  # augmented variants kept per training sentence


def _pad_augmentations(candidates, fallback, size):
    """Return exactly ``size`` texts, repeating ``candidates`` in order if short.

    ``fallback`` is used when the augmenter produced nothing. Doubling an
    empty result in a ``while len(...) < size`` loop would never terminate.
    A bare string is treated as a single candidate, not a sequence of
    characters.
    """
    if isinstance(candidates, str):
        candidates = [candidates] if candidates else []
    pool = list(candidates or []) or [fallback]
    repeats = -(-size // len(pool))  # ceiling division
    return (pool * repeats)[:size]


# Start from the un-augmented training data, then append the word-level
# variants of every sentence with that sentence's label.
aug_train_sent = train_sentences.copy()
aug_train_label = train_labels.copy()

start = time.time()
for i in tqdm(range(len(train_sentences))):
    augmented_texts = _pad_augmentations(
        aug_word.augment(train_sentences[i], n=N_WORD_AUGMENTATIONS),
        fallback=train_sentences[i],
        size=N_WORD_AUGMENTATIONS,
    )

    for j in augmented_texts:
        aug_train_sent.append(j)
        aug_train_label.append(train_labels[i])
end = time.time()
print("Time taken for word level non-contextual augmentation: {} s".format(end - start))

100%|██████████| 3500/3500 [04:40<00:00, 12.48it/s]

Time taken for word level non-contextual augmentation: 280.37355756759644 s





In [None]:
# Write the original + word-augmented training set to CSV (no index column).
non_cont_word_columns = {'cleaned': aug_train_sent, 'label': aug_train_label}
non_cont_word_aug_df = pd.DataFrame(non_cont_word_columns)
non_cont_word_aug_df.to_csv('non_cont_word_aug_df.csv', index=False)

synonym and antonym

In [None]:
# Preview WordNet synonym substitution on the first training review.
aug_syn_sub = naw.SynonymAug(aug_src='wordnet')
# Define the sample here: this cell runs before the only other cell that
# assigns `_text`, so reading it undefined fails on a fresh kernel.
_text = train_sentences[0]
augmented_text = aug_syn_sub.augment(_text)
print("Original:")
print(_text)
print("Augmented Text:")
print(augmented_text)

In [None]:
# Preview antonym substitution on the first training review.
_text = train_sentences[0]
aug_ant = naw.AntonymAug()
augmented_text = aug_ant.augment(_text)
print("Original:", _text, "Augmented Text:", augmented_text, sep="\n")

In [None]:
# Start from the un-augmented training data.
aug_train_sent = list(train_sentences)
aug_train_label = list(train_labels)

# Synonym substitution (n=2 per sentence): each variant keeps the sentence's label.
for sentence, label in zip(tqdm(train_sentences), train_labels):
    for variant in aug_syn_sub.augment(sentence, n=2):
        aug_train_sent.append(variant)
        aug_train_label.append(label)

# Antonym substitution (n=2 per sentence): each variant gets the opposite
# label (truthy label -> 0, falsy label -> 1).
for sentence, label in zip(tqdm(train_sentences), train_labels):
    flipped_label = 0 if label else 1
    for variant in aug_ant.augment(sentence, n=2):
        aug_train_sent.append(variant)
        aug_train_label.append(flipped_label)