# Initialization

In [1]:
from random import sample, seed
from reda_augmenter import Augmenter
from utils_data import *
from os.path import join


# Fix the seed of the `random` module's global generator so that values drawn
# from it are the same on every fresh run of the notebook.
# NOTE(review): presumably text_sampling (from utils_data) draws from `random` — confirm.
seed(852)

In [2]:
def train_aug_all(sizes, size_names, dire):
    """Sample training subsets of several sizes and save each one with its augmented version.

    The training data is loaded once with ``load_dataset('train.txt', dire)``.
    For every (size, name) pair a subset is drawn with ``text_sampling``, saved
    to ``<dire>/train_<name>.txt``, augmented with ``Augmenter().do_all_aug``
    and saved again to ``<dire>/train_<name>_aug_all.txt``.

    Args:
        sizes: Iterable of subset sizes, each passed to ``text_sampling``.
        size_names: Iterable of labels used in the output file names; must
            contain exactly one label per entry in ``sizes``.
        dire: Directory passed to ``load_dataset`` and used as the parent of
            every output file.

    Raises:
        ValueError: If ``sizes`` and ``size_names`` differ in length. The check
            runs before any data is loaded or written.
    """
    # Materialise both inputs so that plain iterators are accepted and their
    # lengths can be compared before any work is done.
    sizes = list(sizes)
    size_names = list(size_names)

    # zip() would silently drop the unmatched tail, which would skip subsets
    # without any warning; fail loudly instead.
    if len(sizes) != len(size_names):
        raise ValueError(
            f"sizes and size_names must have the same length, "
            f"got {len(sizes)} and {len(size_names)}"
        )

    train = load_dataset('train.txt', dire)

    for size, name in zip(sizes, size_names):
        subset = text_sampling(train, size)
        fpath = join(dire, f"train_{name}")
        saveTextFile(subset, fpath + ".txt")

        aug_data = Augmenter().do_all_aug(subset)
        # NOTE(review): presumably raises when the augmented data contains an
        # empty line — confirm against utils_data.check_empty_line.
        check_empty_line(aug_data, raise_err=True)
        saveTextFile(aug_data, f"{fpath}_aug_all.txt")

In [3]:
# Each pair couples a subset size with the label used in its output file names;
# the pairs are transposed into the two parallel lists consumed by train_aug_all.
# NOTE(review): 'full' presumably means 50000 covers the whole train.txt — confirm.
sizes, size_names = (
    list(column)
    for column in zip(
        (1000, '1k'),
        (5000, '5k'),
        (10000, '10k'),
        (20000, '20k'),
        (30000, '30k'),
        (40000, '40k'),
        (50000, 'full'),
    )
)

# Quora

In [4]:
# Write the sampled and augmented training files for every configured size under Quora/.
train_aug_all(sizes, size_names, 'Quora/')

Quora/train_1k.txt has been saved!


100%|█████████████████████████████████████| 1000/1000 [00:00<00:00, 6258.88it/s]


Texts augmented.
Before (reda): 1000. Now: 15584
Quora/train_1k_aug_all.txt has been saved!
Quora/train_5k.txt has been saved!


100%|█████████████████████████████████████| 5000/5000 [00:00<00:00, 7104.59it/s]


Texts augmented.
Before (reda): 5000. Now: 76771
Quora/train_5k_aug_all.txt has been saved!
Quora/train_10k.txt has been saved!


100%|███████████████████████████████████| 10000/10000 [00:01<00:00, 6701.63it/s]


Texts augmented.
Before (reda): 10000. Now: 152910
Quora/train_10k_aug_all.txt has been saved!
Quora/train_20k.txt has been saved!


100%|███████████████████████████████████| 20000/20000 [00:02<00:00, 6720.94it/s]


Texts augmented.
Before (reda): 20000. Now: 305368
Quora/train_20k_aug_all.txt has been saved!
Quora/train_30k.txt has been saved!


100%|███████████████████████████████████| 30000/30000 [00:04<00:00, 7340.64it/s]


Texts augmented.
Before (reda): 30000. Now: 458694
Quora/train_30k_aug_all.txt has been saved!
Quora/train_40k.txt has been saved!


100%|███████████████████████████████████| 40000/40000 [00:05<00:00, 7262.72it/s]


Texts augmented.
Before (reda): 40000. Now: 611165
Quora/train_40k_aug_all.txt has been saved!
Quora/train_full.txt has been saved!


100%|███████████████████████████████████| 50000/50000 [00:06<00:00, 7437.36it/s]


Texts augmented.
Before (reda): 50000. Now: 764242
Quora/train_full_aug_all.txt has been saved!


# MNLI

In [5]:
# Write the sampled and augmented training files for every configured size under MNLI/.
train_aug_all(sizes, size_names, 'MNLI/')

MNLI/train_1k.txt has been saved!


100%|█████████████████████████████████████| 1000/1000 [00:00<00:00, 5435.22it/s]


Texts augmented.
Before (reda): 1000. Now: 17152
MNLI/train_1k_aug_all.txt has been saved!
MNLI/train_5k.txt has been saved!


100%|█████████████████████████████████████| 5000/5000 [00:00<00:00, 5087.43it/s]


Texts augmented.
Before (reda): 5000. Now: 85675
MNLI/train_5k_aug_all.txt has been saved!
MNLI/train_10k.txt has been saved!


100%|███████████████████████████████████| 10000/10000 [00:01<00:00, 5089.85it/s]


Texts augmented.
Before (reda): 10000. Now: 170082
MNLI/train_10k_aug_all.txt has been saved!
MNLI/train_20k.txt has been saved!


100%|███████████████████████████████████| 20000/20000 [00:03<00:00, 5433.83it/s]


Texts augmented.
Before (reda): 20000. Now: 341460
MNLI/train_20k_aug_all.txt has been saved!
MNLI/train_30k.txt has been saved!


100%|███████████████████████████████████| 30000/30000 [00:05<00:00, 5552.18it/s]


Texts augmented.
Before (reda): 30000. Now: 512319
MNLI/train_30k_aug_all.txt has been saved!
MNLI/train_40k.txt has been saved!


100%|███████████████████████████████████| 40000/40000 [00:07<00:00, 5323.15it/s]


Texts augmented.
Before (reda): 40000. Now: 683623
MNLI/train_40k_aug_all.txt has been saved!
MNLI/train_full.txt has been saved!


100%|███████████████████████████████████| 50000/50000 [00:08<00:00, 5567.68it/s]


Texts augmented.
Before (reda): 50000. Now: 854255
MNLI/train_full_aug_all.txt has been saved!


# SNLI

In [6]:
# Write the sampled and augmented training files for every configured size under SNLI/.
train_aug_all(sizes, size_names, 'SNLI/')

SNLI/train_1k.txt has been saved!


100%|█████████████████████████████████████| 1000/1000 [00:00<00:00, 5482.34it/s]


Texts augmented.
Before (reda): 1000. Now: 17205
SNLI/train_1k_aug_all.txt has been saved!
SNLI/train_5k.txt has been saved!


100%|█████████████████████████████████████| 5000/5000 [00:00<00:00, 5417.49it/s]


Texts augmented.
Before (reda): 5000. Now: 85138
SNLI/train_5k_aug_all.txt has been saved!
SNLI/train_10k.txt has been saved!


100%|███████████████████████████████████| 10000/10000 [00:01<00:00, 5187.24it/s]


Texts augmented.
Before (reda): 10000. Now: 170277
SNLI/train_10k_aug_all.txt has been saved!
SNLI/train_20k.txt has been saved!


100%|███████████████████████████████████| 20000/20000 [00:03<00:00, 5332.91it/s]


Texts augmented.
Before (reda): 20000. Now: 341092
SNLI/train_20k_aug_all.txt has been saved!
SNLI/train_30k.txt has been saved!


100%|███████████████████████████████████| 30000/30000 [00:05<00:00, 5467.05it/s]


Texts augmented.
Before (reda): 30000. Now: 511891
SNLI/train_30k_aug_all.txt has been saved!
SNLI/train_40k.txt has been saved!


100%|███████████████████████████████████| 40000/40000 [00:07<00:00, 5518.94it/s]


Texts augmented.
Before (reda): 40000. Now: 682750
SNLI/train_40k_aug_all.txt has been saved!
SNLI/train_full.txt has been saved!


100%|███████████████████████████████████| 50000/50000 [00:08<00:00, 5592.18it/s]


Texts augmented.
Before (reda): 50000. Now: 852950
SNLI/train_full_aug_all.txt has been saved!
