## Text sampling

In [1]:
from random import sample, seed
from utils import lcqmcLoader
from os.path import join

In [2]:
# Load the 'train' split through the project loader. Presumably a sequence of
# (text_a, text_b, label) examples -- TODO confirm against utils.lcqmcLoader.
train = lcqmcLoader('train')
# Seed the `random` module so that the sample() draws made by the sampling
# helpers are reproducible across runs.
seed(3245)

In [3]:
def text_sampling(out_num):
    """Randomly draw `out_num` examples from the global `train` data.

    Sampling is without replacement (random.sample). The result is a new
    list, so `train` itself is never modified.
    """
    picked = sample(train, out_num)
    return picked


def saveTextFile(data, filepath):
    """Write examples to `filepath` as a tab-separated file with a header row.

    Args:
        data: iterable of indexable examples. The first, second and last
            item of each example are written as text_a, text_b and label.
        filepath: destination path; an existing file is overwritten.

    The file does not end with a newline: every example line is preceded by
    a newline instead of being followed by one.
    """
    row = "\n{}\t{}\t{}"
    # The context manager flushes and closes the handle even when formatting
    # or writing an example raises; the manual open()/close() pair it replaces
    # left the handle open in that case.
    with open(filepath, 'w') as f:
        f.write("text_a\ttext_b\tlabel")
        for example in data:
            f.write(row.format(example[0], example[1], example[-1]))
    print(filepath + " has been saved!")
    
def saveSampledTexts(out_num, dire):
    """Sample `out_num` training examples and save them inside `dire`.

    The output file is named ``train_<K>k.txt`` where K is `out_num` divided
    by 1000 and truncated to an integer (e.g. 5000 -> ``train_5k.txt``).
    """
    sampled = text_sampling(out_num)
    file_name = "train_{}k.txt".format(int(out_num / 1000))
    saveTextFile(sampled, join(dire, file_name))

#### Dataset size to sample: 5k, 10k, 25k, 50k, 75k, 100k, 125k, 150k, 175k, 200k, full set

In [4]:
# Output directory shared by every sampled subset and the full training set.
dire = '../data/ablation_data/'

# Subsets are drawn in ascending size order (5k, 10k, then 25k steps up to
# 200k). The order is kept fixed because every draw consumes the seeded
# random state.
for i in (5000, 10000, *range(25000, 200001, 25000)):
    saveSampledTexts(i, dire)

# The full training set is written as-is, without sampling.
saveTextFile(train, join(dire, 'train_full.txt'))

../data/ablation_data/train_5k.txt has been saved!
../data/ablation_data/train_10k.txt has been saved!
../data/ablation_data/train_25k.txt has been saved!
../data/ablation_data/train_50k.txt has been saved!
../data/ablation_data/train_75k.txt has been saved!
../data/ablation_data/train_100k.txt has been saved!
../data/ablation_data/train_125k.txt has been saved!
../data/ablation_data/train_150k.txt has been saved!
../data/ablation_data/train_175k.txt has been saved!
../data/ablation_data/train_200k.txt has been saved!
../data/ablation_data/train_full.txt has been saved!


## Text Augmentation

For every text pair, we augment each of the two texts separately and pair every augmented text with the original version of the other text (cross pairing).

## DA models combined

The two DA models (REDA alone, and REDA with n-gram language-model selection) are run together for two reasons: (1) efficiency; (2) better control, since both models then pick their augmented texts from the same candidate pool.

In [5]:
from ngramLM import NgramLM
from reda import REDA
from itertools import groupby
from random import sample, shuffle


class AugTextsWithTwoModels:
    """Generate augmented texts once and select them in two different ways.

    Every public edit method builds one pool of candidate texts with the
    wrapped ``REDA`` editor and returns two selections from that same pool:
    a random sample ("reda") and the candidates chosen by
    ``NgramLM.pickBestSent`` ("ngram").
    """

    def __init__(self, syn_path=None):
        # `syn_path` is forwarded to REDA unchanged (presumably the location
        # of a synonym dictionary -- confirm against reda.REDA).
        self._reda = REDA(syn_path)
        self._lm = NgramLM()

    @staticmethod
    def _set_choice_num(edit_num, choice_num):
        """Return how many candidates to generate for `edit_num` edits.

        A truthy `choice_num` is returned as is. Otherwise the size is
        20 / 50 / 100 for 1 / 2 / 3 edits, 150 when `edit_num` is None, and
        ``edit_num * 50`` for anything else.
        """
        if choice_num:
            return choice_num
        if edit_num == 1:
            return 20
        if edit_num == 2:
            return 50
        if edit_num == 3:
            return 100
        if edit_num is None:
            return 150
        return edit_num * 50

    @staticmethod
    def _out_num(edit_num, choice_num=None):
        """Same table as `_set_choice_num`, but without the None case.

        Not called anywhere inside this class; kept for external callers.
        """
        if choice_num:
            return choice_num
        if edit_num == 1:
            return 20
        if edit_num == 2:
            return 50
        if edit_num == 3:
            return 100
        return edit_num * 50

    @staticmethod
    def deduplicate(ori, lst):
        """Return the sorted, distinct items of `lst` with `ori` removed.

        `lst` is not modified. The previous in-place append/sort changed the
        caller's list, which could be the shared candidate pool itself.
        """
        return [item for item, _ in groupby(sorted(lst)) if item != ori]

    @staticmethod
    def _filter_output(item, words, out_str):
        """Normalise the raw result of an edit function to a list of candidates.

        Some operations fail to augment the text (e.g., too short, no
        synonyms) and hand back something that is not a list of candidates:

        * a plain str                     -> no candidates
        * an empty result                 -> no candidates
        * (out_str False) one token list  -> wrapped into a one-item list,
          unless it is identical to the original `words`
        """
        if isinstance(item, str) or not item:
            return []
        if not out_str and isinstance(item[0], str):
            # A flat list of tokens is a single text. Only discard it when
            # the whole text is unchanged; comparing position by position and
            # rejecting on the first equal token would throw away almost
            # every genuine same-length edit.
            if list(item) == list(words):
                return []
            return [item]
        return item

    def _textEdit(self, editFunc, words, edit_rate, out_num, out_str, choice_num):
        """Run `editFunc` on `words` and return ``(reda_out, ngram_out)``.

        Args:
            editFunc: callable invoked as
                ``editFunc(words, edit_num, choice_num, out_str)``.
            words: the text, either a str (tokenised with REDA) or a list of
                tokens.
            edit_rate: an int is used directly as the number of edits; any
                other number is multiplied by the token count and rounded.
            out_num: number of texts requested from each selection method.
            out_str: forwarded to the edit function and to the language
                model; when true the random sample is joined into strings.
            choice_num: size of the candidate pool; when falsy it is derived
                from the number of edits.

        Returns:
            Two lists (random sample, language-model pick), each sorted,
            de-duplicated and free of the original text. Both are empty when
            no edit is possible.

        Raises:
            TypeError: if `words` is neither a str nor a list.
        """
        if isinstance(words, str):
            words = self._reda.tokenize(words)
        elif not isinstance(words, list):
            raise TypeError("The input text must be either a str or a list")

        words_num = len(words)
        if isinstance(edit_rate, int):
            edit_num = edit_rate
        else:
            edit_num = round(words_num * edit_rate)
        if not edit_num:
            return [], []

        choice_num = self._set_choice_num(edit_num, choice_num)
        raw = editFunc(words, edit_num, choice_num, out_str)
        out = self._filter_output(raw, words, out_str)
        if not out:
            return [], []

        # Both selections draw from the same candidate pool `out`.
        if len(out) >= out_num:
            reda_out = sample(out, out_num)
        else:
            reda_out = list(out)
        ngram_out = self._lm.pickBestSent(out, out_num=out_num, out_str=out_str)

        if out_str:
            reda_out = [''.join(sent) for sent in reda_out]
        # Deduplicate the outputs and ensure that the original text is not returned.
        original = self._reda._out_str(words, out_str)

        reda_out = self.deduplicate(original, reda_out)
        ngram_out = self.deduplicate(original, ngram_out)

        return reda_out, ngram_out

    def replace_syn(self, words, rpl_rate=0.2, out_num=1, out_str=True, choice_num=None):
        """Apply REDA's `replace_syn` edit; see `_textEdit` for the return value."""
        return self._textEdit(self._reda.replace_syn, words, rpl_rate, out_num, out_str, choice_num)

    def swap_words(self, words, swap_rate=0.2, out_num=1, out_str=True, choice_num=None):
        """Apply REDA's `swap_words` edit; see `_textEdit` for the return value."""
        return self._textEdit(self._reda.swap_words, words, swap_rate, out_num, out_str, choice_num)

    def insert_words(self, words, insert_rate=0.1, out_num=1, out_str=True, choice_num=None):
        """Apply REDA's `insert_words` edit; see `_textEdit` for the return value."""
        return self._textEdit(self._reda.insert_words, words, insert_rate, out_num, out_str, choice_num)

    def delete_words(self, words, delete_rate=0.1, out_num=1, out_str=True, choice_num=None):
        """Apply REDA's `delete_words` edit; see `_textEdit` for the return value."""
        return self._textEdit(self._reda.delete_words, words, delete_rate, out_num, out_str, choice_num)

    def mixed_edits(self, words, max_mix=2, out_num=1, out_str=True, choice_num=None):
        """Apply REDA's `mixed_edits` edit; `max_mix` is passed on as the edit count."""
        return self._textEdit(self._reda.mixed_edits, words, max_mix, out_num, out_str, choice_num)

In [6]:
def textAugmentation(data_size, editFunc, editName, out_num=2):
    """Augment one sampled training file and save both augmented variants.

    Reads ``../data/ablation_data/train_<data_size>.txt`` (tab-separated,
    header on the first line), augments text_a and text_b of every example
    with `editFunc`, and writes two files next to the input:
    ``train_<data_size>_<editName>_reda.txt`` and
    ``train_<data_size>_<editName>_reda_ngram.txt``.

    Args:
        data_size: size tag used in the file name, e.g. '5k' or 'full'.
        editFunc: callable invoked as ``editFunc(text, out_num=...)`` that
            returns a pair of lists ``(reda_texts, ngram_texts)``.
        editName: short tag of the edit type, used in the output file names.
        out_num: number of augmented texts requested per text. It is forced
            to 1 for every `data_size` other than '5k', '10k', '25k', '50k'.
    """
    if data_size not in ['5k', '10k', '25k', '50k']:
        out_num = 1

    def augment(text_a, text_b, label):
        # Each list starts with the original pair; every augmented text is
        # then paired with the untouched partner text.
        out_reda = [(text_a, text_b, label)]
        out_ngram = [(text_a, text_b, label)]

        aug_reda, aug_ngram = editFunc(text_a, out_num=out_num)
        out_reda.extend([(t, text_b, label) for t in aug_reda])
        out_ngram.extend([(t, text_b, label) for t in aug_ngram])

        aug_reda, aug_ngram = editFunc(text_b, out_num=out_num)
        out_reda.extend([(text_a, t, label) for t in aug_reda])
        out_ngram.extend([(text_a, t, label) for t in aug_ngram])
        return out_reda, out_ngram

    tmp = '../data/ablation_data/train_{}.txt'
    # Close the input file deterministically; drop the header and any blank
    # line (a blank line has no columns to index).
    with open(tmp.format(data_size)) as f:
        data = [line for line in f.readlines()[1:] if line.strip()]

    outputs_reda, outputs_gram = [], []
    for line in data:
        # The columns are separated by tabs. Splitting on generic whitespace
        # would break any text that itself contains a space (including a
        # full-width space) into the wrong columns.
        example = line.rstrip('\n').split('\t')
        out_reda, out_ngram = augment(example[0], example[1], int(example[-1]))
        outputs_reda.extend(out_reda)
        outputs_gram.extend(out_ngram)
    print('Texts augmented.')
    print(f'Before (reda): {len(data)}. Now: {len(outputs_reda)}')
    print(f'Before (ngram): {len(data)}. Now: {len(outputs_gram)}')

    # Release the raw lines before the (large) outputs are written.
    del data

    saveTextFile(outputs_reda, tmp.format(f"{data_size}_{editName}_reda"))
    saveTextFile(outputs_gram, tmp.format(f"{data_size}_{editName}_reda_ngram"))

In [7]:
# One augmenter instance, built with the default `syn_path`, shared by all
# augmentation cells that follow.
aug = AugTextsWithTwoModels(syn_path=None)

In [8]:
# Size tags of the sampled training files to augment: 5k, 10k, then every
# 25k step up to 200k, and finally the full training set.
data_size = ['5k', '10k'] + [f'{n}k' for n in range(25, 201, 25)] + ['full']

### Synonym Replacement

In [9]:
# Synonym replacement ('sr'): augment every sampled training file.
for size in data_size:
    textAugmentation(size, editFunc=aug.replace_syn, editName='sr')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/w9/d_nplhzj4qx35xxlgljgdtjh0000gn/T/jieba.cache
Loading model cost 0.786 seconds.
Prefix dict has been built successfully.


Texts augmented.
Before (reda): 5000. Now: 24402
Before (ngram): 5000. Now: 24402
../data/ablation_data/train_5k_sr_reda.txt has been saved!
../data/ablation_data/train_5k_sr_reda_ngram.txt has been saved!
Texts augmented.
Before (reda): 10000. Now: 48807
Before (ngram): 10000. Now: 48807
../data/ablation_data/train_10k_sr_reda.txt has been saved!
../data/ablation_data/train_10k_sr_reda_ngram.txt has been saved!
Texts augmented.
Before (reda): 25000. Now: 122358
Before (ngram): 25000. Now: 122358
../data/ablation_data/train_25k_sr_reda.txt has been saved!
../data/ablation_data/train_25k_sr_reda_ngram.txt has been saved!
Texts augmented.
Before (reda): 50000. Now: 244577
Before (ngram): 50000. Now: 244576
../data/ablation_data/train_50k_sr_reda.txt has been saved!
../data/ablation_data/train_50k_sr_reda_ngram.txt has been saved!
Texts augmented.
Before (reda): 75000. Now: 220843
Before (ngram): 75000. Now: 220842
../data/ablation_data/train_75k_sr_reda.txt has been saved!
../data/ablati

### Random Swap

In [10]:
# Random swap ('rs'): augment every sampled training file.
for size in data_size:
    textAugmentation(size, editFunc=aug.swap_words, editName='rs')

Texts augmented.
Before (reda): 5000. Now: 24758
Before (ngram): 5000. Now: 23549
../data/ablation_data/train_5k_rs_reda.txt has been saved!
../data/ablation_data/train_5k_rs_reda_ngram.txt has been saved!
Texts augmented.
Before (reda): 10000. Now: 49575
Before (ngram): 10000. Now: 47268
../data/ablation_data/train_10k_rs_reda.txt has been saved!
../data/ablation_data/train_10k_rs_reda_ngram.txt has been saved!
Texts augmented.
Before (reda): 25000. Now: 124040
Before (ngram): 25000. Now: 118154
../data/ablation_data/train_25k_rs_reda.txt has been saved!
../data/ablation_data/train_25k_rs_reda_ngram.txt has been saved!
Texts augmented.
Before (reda): 50000. Now: 248074
Before (ngram): 50000. Now: 235953
../data/ablation_data/train_50k_rs_reda.txt has been saved!
../data/ablation_data/train_50k_rs_reda_ngram.txt has been saved!
Texts augmented.
Before (reda): 75000. Now: 223497
Before (ngram): 75000. Now: 208583
../data/ablation_data/train_75k_rs_reda.txt has been saved!
../data/ablati

### Random Insertion

In [11]:
# Random insertion ('ri'): augment every sampled training file.
for size in data_size:
    textAugmentation(size, editFunc=aug.insert_words, editName='ri')

Texts augmented.
Before (reda): 5000. Now: 16733
Before (ngram): 5000. Now: 16733
../data/ablation_data/train_5k_ri_reda.txt has been saved!
../data/ablation_data/train_5k_ri_reda_ngram.txt has been saved!
Texts augmented.
Before (reda): 10000. Now: 33090
Before (ngram): 10000. Now: 33090
../data/ablation_data/train_10k_ri_reda.txt has been saved!
../data/ablation_data/train_10k_ri_reda_ngram.txt has been saved!
Texts augmented.
Before (reda): 25000. Now: 83329
Before (ngram): 25000. Now: 83329
../data/ablation_data/train_25k_ri_reda.txt has been saved!
../data/ablation_data/train_25k_ri_reda_ngram.txt has been saved!
Texts augmented.
Before (reda): 50000. Now: 166839
Before (ngram): 50000. Now: 166839
../data/ablation_data/train_50k_ri_reda.txt has been saved!
../data/ablation_data/train_50k_ri_reda_ngram.txt has been saved!
Texts augmented.
Before (reda): 75000. Now: 162563
Before (ngram): 75000. Now: 162563
../data/ablation_data/train_75k_ri_reda.txt has been saved!
../data/ablation

### Random Delete

In [12]:
# Random deletion ('rd'): augment every sampled training file.
for size in data_size:
    textAugmentation(size, editFunc=aug.delete_words, editName='rd')

Texts augmented.
Before (reda): 5000. Now: 16780
Before (ngram): 5000. Now: 16780
../data/ablation_data/train_5k_rd_reda.txt has been saved!
../data/ablation_data/train_5k_rd_reda_ngram.txt has been saved!
Texts augmented.
Before (reda): 10000. Now: 33208
Before (ngram): 10000. Now: 33208
../data/ablation_data/train_10k_rd_reda.txt has been saved!
../data/ablation_data/train_10k_rd_reda_ngram.txt has been saved!
Texts augmented.
Before (reda): 25000. Now: 83592
Before (ngram): 25000. Now: 83592
../data/ablation_data/train_25k_rd_reda.txt has been saved!
../data/ablation_data/train_25k_rd_reda_ngram.txt has been saved!
Texts augmented.
Before (reda): 50000. Now: 167296
Before (ngram): 50000. Now: 167296
../data/ablation_data/train_50k_rd_reda.txt has been saved!
../data/ablation_data/train_50k_rd_reda_ngram.txt has been saved!
Texts augmented.
Before (reda): 75000. Now: 162972
Before (ngram): 75000. Now: 162972
../data/ablation_data/train_75k_rd_reda.txt has been saved!
../data/ablation

### Random Mix

In [13]:
# Random mix ('rm'): augment every sampled training file.
for size in data_size:
    textAugmentation(size, editFunc=aug.mixed_edits, editName='rm')

Texts augmented.
Before (reda): 5000. Now: 24859
Before (ngram): 5000. Now: 24464
../data/ablation_data/train_5k_rm_reda.txt has been saved!
../data/ablation_data/train_5k_rm_reda_ngram.txt has been saved!
Texts augmented.
Before (reda): 10000. Now: 49652
Before (ngram): 10000. Now: 48783
../data/ablation_data/train_10k_rm_reda.txt has been saved!
../data/ablation_data/train_10k_rm_reda_ngram.txt has been saved!
Texts augmented.
Before (reda): 25000. Now: 124237
Before (ngram): 25000. Now: 122106
../data/ablation_data/train_25k_rm_reda.txt has been saved!
../data/ablation_data/train_25k_rm_reda_ngram.txt has been saved!
Texts augmented.
Before (reda): 50000. Now: 248539
Before (ngram): 50000. Now: 244371
../data/ablation_data/train_50k_rm_reda.txt has been saved!
../data/ablation_data/train_50k_rm_reda_ngram.txt has been saved!
Texts augmented.
Before (reda): 75000. Now: 224026
Before (ngram): 75000. Now: 219342
../data/ablation_data/train_75k_rm_reda.txt has been saved!
../data/ablati