# Library

In [1]:
import itertools
import multiprocessing
import platform
import re
from importlib import metadata

import pandas as pd
import textattack
import textda
import jieba




In [2]:
# Report the interpreter and library versions this notebook runs against.
print('Python version:', platform.python_version())
print('Pandas version:', pd.__version__)
# TextAttack and textda were previously printed from hard-coded strings, which
# go stale as soon as the environment changes. Read the versions of the
# installed distributions from package metadata instead.
print('TextAttack version:', metadata.version('textattack'))
print('textda version:', metadata.version('textda'))
print('Jieba version:', jieba.__version__)


Python version: 3.8.3
Pandas version: 1.0.5
TextAttack version: 0.2.2
textda version: 0.1.0.6
Jieba version: 0.42.1


# Dataset

In [3]:
# The files have no header row, so the text lands in column 0.
# dtype=str keeps every cell a string (a purely numeric line would otherwise be
# parsed as a number and break the text clean-up); rows pandas reads as missing
# are dropped.
df_tcn = pd.read_csv('./data/txt/train_tcn.txt', header=None, dtype=str).dropna()
df_en = pd.read_csv('./data/txt/train_en.txt', header=None, dtype=str).dropna()
# Lowercase BEFORE applying the a-z/space whitelist: filtering first would
# delete capital letters outright ("Hello" -> "ello") instead of folding them.
# regex=True is spelled out because the pattern is a character class.
df_en[0] = df_en[0].str.lower().str.replace('[^a-z ]', '', regex=True)


In [4]:
# Pull the text column (label 0) out of each frame as a plain Python list.
list_tcn, list_en = (frame[0].tolist() for frame in (df_tcn, df_en))


# Augment

In [5]:
# data_expansion is the textda function that aug_tcn calls to generate
# variants of a sentence.
from textda.data_expansion import data_expansion
# Create the TextAttack WordNet augmenter once at notebook level; aug_en
# reuses this single instance for every sentence it processes.
wn_augmenter = textattack.augmentation.WordNetAugmenter()


Building prefix dict from /home/ilos-vigil/.local/lib/python3.8/site-packages/synonyms/data/vocab.txt ...
Building prefix dict from /home/ilos-vigil/.local/lib/python3.8/site-packages/synonyms/data/vocab.txt ...
>> Synonyms load wordseg dict [/home/ilos-vigil/.local/lib/python3.8/site-packages/synonyms/data/vocab.txt] ... 
Dumping model to file cache /tmp/jieba.u8c5535dea22e9d74e4ba244d4b27c629.cache
Dumping model to file cache /tmp/jieba.u8c5535dea22e9d74e4ba244d4b27c629.cache
Loading model cost 1.567 seconds.
Loading model cost 1.567 seconds.
Prefix dict has been built successfully.
Prefix dict has been built successfully.
>> Synonyms on loading stopwords [/home/ilos-vigil/.local/lib/python3.8/site-packages/synonyms/data/stopwords.txt] ...
>> Synonyms on loading vectors [/home/ilos-vigil/.local/lib/python3.8/site-packages/synonyms/data/words.vector] ...


In [6]:
def flatten_2d(old_list):
    """Flatten exactly one level of nesting.

    Takes an iterable of iterables and returns a new list containing every
    inner item in encounter order. Deeper nesting is left untouched.
    """
    return [element for inner in old_list for element in inner]

def split_tcn(text, mode='default'):
    """Tokenize ``text`` with jieba and re-join the tokens with single spaces.

    ``mode`` is forwarded unchanged to ``jieba.tokenize``. Only the first
    field of each yielded entry (the token text) is kept. If tokenization
    fails for any reason, the offending text and the exception are printed
    and ``text`` is returned unchanged.
    """
    try:
        words = (entry[0] for entry in jieba.tokenize(text, mode=mode))
        return ' '.join(words)
    except Exception as ex:
        print(f'Text : {text}')
        print(ex)
        return text

def aug_tcn(sentence):
    """Augment one sentence with textda and tokenize every variant.

    Each variant returned by ``data_expansion`` is tokenized twice with
    ``split_tcn``: once in 'default' mode and once in 'search' mode, in that
    order, so the result holds two entries per variant.

    Returns:
        list: the tokenized variants, or ``[sentence]`` (untokenized) if
        augmentation fails.
    """
    try:
        # NOTE(review): alpha_sr=1.0 with the other rates at 0 presumably
        # enables only one of textda's operations -- confirm against the
        # textda documentation.
        augmented_sentences = data_expansion(
            sentence, alpha_sr=1.0, alpha_ri=0, alpha_rs=0, p_rd=0, num_aug=10
        )
        return [
            split_tcn(variant, mode=mode)
            for variant in augmented_sentences
            for mode in ('default', 'search')
        ]
    except Exception as ex:
        # Best-effort fallback, but no longer silent: a bare ``except`` also
        # swallowed KeyboardInterrupt/SystemExit and hid every failure.
        print(f'Sentence: {sentence}')
        print(ex)
        return [sentence]

def aug_en(sentence):
    """Clean one English sentence and return it with its augmented variants.

    The input is converted to ``str``, lowercased, and stripped of every
    character other than a-z and space. Sentences containing a space are run
    through ``wn_augmenter.augment`` ten times; single-word sentences are not
    augmented.

    Returns:
        list[str]: the cleaned sentence followed by its distinct lowercase
        variants, in first-seen order.
    """
    # Fold case before filtering; filtering first would delete capitals
    # ("Hello" -> "ello") rather than lowercase them.
    sentence = re.sub('[^a-z ]', '', str(sentence).lower())
    candidates = [sentence]
    try:
        if ' ' in sentence:
            for _ in range(10):
                # augment() returns a list of variants for this round.
                candidates.extend(wn_augmenter.augment(sentence))
    except Exception as ex:
        # Keep whatever was collected so far, but report what went wrong
        # (the exception used to be captured and never shown).
        print(f'Sentence: {sentence} | {ex}')

    # Lowercase first and de-duplicate second, so variants that differ only
    # in case collapse into one entry. dict.fromkeys keeps first-seen order,
    # which makes the output reproducible (set() order is not).
    return list(dict.fromkeys(s.lower() for s in candidates))


In [None]:
# Use all cores but one, and never fewer than one worker: on a single-core
# machine cpu_count() - 1 is 0 and Pool(0) raises ValueError.
n_workers = max(1, multiprocessing.cpu_count() - 1)
with multiprocessing.Pool(n_workers) as pool:
    # aug_en returns a list of variants per input sentence.
    nested_aug_en = pool.map(aug_en, list_en)
# pool.map has already returned every result, so flatten outside the pool.
list_aug_en = list(itertools.chain.from_iterable(nested_aug_en))


In [None]:
# Write the augmented English sentences as a single column, one row per
# sentence, without a header line or an index column.
df_aug_en = pd.DataFrame(data={'en': list_aug_en})
df_aug_en.to_csv(
    path_or_buf='./data/txt/train_en_aug.txt',
    header=False,
    index=False,
)


In [None]:
# Use all cores but one, and never fewer than one worker: on a single-core
# machine cpu_count() - 1 is 0 and Pool(0) raises ValueError.
n_workers = max(1, multiprocessing.cpu_count() - 1)
with multiprocessing.Pool(n_workers) as pool:
    # aug_tcn returns a list of tokenized variants per input sentence.
    nested_aug_tcn = pool.map(aug_tcn, list_tcn)
# pool.map has already returned every result, so flatten outside the pool.
list_aug_tcn = flatten_2d(nested_aug_tcn)


In [None]:
# Write the augmented, tokenized sentences as a single column, one row per
# sentence, without a header line or an index column.
df_aug_tcn = pd.DataFrame(data={'tcn': list_aug_tcn})
df_aug_tcn.to_csv(
    path_or_buf='./data/txt/train_tcn_aug.txt',
    header=False,
    index=False,
)

