# PyTorch pipeline with text augmentation

# Setup

In [1]:
# Install the notebook's runtime dependencies (needed once per fresh environment).
# sentencepiece: subword tokenizer used below for encoding and sampling-based augmentation.
!pip install sentencepiece
# genomic-benchmarks supplies the HumanEnhancersCohn dataset; -qq silences pip output.
# NOTE(review): tensorflow_addons is not imported anywhere in the visible notebook — confirm it is needed.
!pip install -qq tensorflow_addons genomic-benchmarks
# Installed straight from GitHub; presumably provides the `gpl` package imported
# in the next cell (the repository itself is named GLP) — TODO confirm.
!pip install git+https://github.com/katarinagresova/GLP



In [2]:
import sentencepiece as spm
from genomic_benchmarks.dataset_getters.pytorch_datasets import HumanEnhancersCohn
import torch
from torch.utils.data import DataLoader
from torch.nn import ConstantPad1d
from pathlib import Path
from gpl.model.cnn import CNN

  from tqdm.autonotebook import tqdm


# Load data


In [3]:
# Load the 'train' split of the HumanEnhancersCohn benchmark (dataset version 0).
# Later cells read each item as x[0] (the sequence text) and x[1] (its label).
train_dset = HumanEnhancersCohn('train', version=0)

# Train subword tokenizer

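We don't want to retrain the SentencePiece tokenizer if a trained model already exists, so training is skipped when the file `m.model` is found. Training takes about X minutes in Google Colab (TODO: fill in the measured time).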

In [4]:
# Fit a 512-piece SentencePiece model on the training sequences, but only when
# no previously trained 'm.model' file is present in the working directory.
if not Path('m.model').exists():
    spm.SentencePieceTrainer.train(
        sentence_iterator=iter([example[0] for example in train_dset]),
        model_prefix='m',
        vocab_size=512,
    )

# Load the tokenizer (freshly trained or reused) for encoding.
sp = spm.SentencePieceProcessor(model_file='m.model')

# Tokenize data

In [5]:
def check_seq_lengths(dataset, tokenizer):
    """Report the longest tokenized length in ``dataset`` and whether lengths differ.

    Prints the maximum length and, when at least one item has a different
    length, a notice that the sequences are not all of the same length.

    Args:
        dataset: Indexable collection supporting ``len()``; every ``dataset[i]``
            is handed to ``tokenizer`` unchanged.
        tokenizer: Callable returning a sized sequence of tokens for one item.

    Returns:
        int: Length of the longest tokenized item, or 0 for an empty dataset.
    """
    # Tokenize every item exactly once and reuse the lengths for both checks.
    lengths = [len(tokenizer(dataset[i])) for i in range(len(dataset))]

    # default=0 keeps an empty dataset from raising inside max().
    max_tok_len = max(lengths, default=0)
    print("max_tok_len ", max_tok_len)
    if any(length != max_tok_len for length in lengths):
        print("not all sequences are of the same length")

    return max_tok_len

In [6]:
def tokenize(sp, inputs, labels, augment_factor=1):
    """Tokenize texts, optionally augmenting each one via subword sampling.

    With ``augment_factor == 1`` every text is encoded once with a plain
    ``sp.encode(text)`` call. With a larger factor every text is encoded
    ``augment_factor`` times with sampling enabled (``enable_sampling=True,
    alpha=0.1, nbest_size=-1``), and its label is repeated the same number of
    times so that tokens and labels stay aligned.

    Args:
        sp: Trained tokenizer exposing ``encode`` (a SentencePieceProcessor
            in this notebook).
        inputs: Indexable collection of texts, aligned position-by-position
            with ``labels``.
        labels: Sequence of labels; it determines how many examples are read
            from ``inputs``.
        augment_factor (int): Number of encodings produced per text; values
            below 1 are rejected.

    Returns:
        tuple: ``(xs, ys)`` where ``xs`` is the list of token lists returned by
        the tokenizer and ``ys`` is the list of matching labels. Both have
        ``len(labels) * augment_factor`` entries.

    Raises:
        ValueError: If ``augment_factor`` is smaller than 1.
    """
    # Validate once, up front: a bad factor is rejected even when there are
    # no examples, and the check is not repeated for every example.
    if not augment_factor >= 1:
        raise ValueError('augment_factor have to be > 0.')

    xs, ys = [], []
    for i, label in enumerate(labels):
        text = inputs[i]

        if augment_factor == 1:
            ys.append(label)
            xs.append(list(sp.encode(text)))
        else:
            ys.extend([label for _ in range(augment_factor)])
            # Each call samples a segmentation, so the copies may differ.
            xs.extend(
                list(sp.encode(text, out_type=int, enable_sampling=True,
                               alpha=0.1, nbest_size=-1))
                for _ in range(augment_factor)
            )

    return xs, ys

In [30]:
def pad(tokenized_texts, max_len, pad_id=0):
    """Right-pad every token list to ``max_len`` entries.

    The input is left untouched; a new outer list holding new inner lists is
    returned. Lists that are already ``max_len`` or longer are copied as they
    are, because a non-positive padding count yields no padding. Nothing is
    ever truncated.

    Args:
        tokenized_texts: Iterable of token-id lists.
        max_len (int): Target length for every list.
        pad_id: Value appended as padding. Defaults to 0, the value that was
            previously hard-coded.
            NOTE(review): with SentencePiece's default settings id 0 is the
            unknown-token id, so padding shares that id — confirm this is
            intended for the trained model.

    Returns:
        list: The padded token lists, in the original order.
    """
    return [
        tokens + [pad_id] * (max_len - len(tokens))
        for tokens in tokenized_texts
    ]

In [8]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

def data_loader(inputs, labels, batch_size=50):
    """Build a randomly-sampled DataLoader from token ids and labels.

    Args:
        inputs: Nested sequence accepted by ``torch.tensor``; rows must share
            one length so that they form a rectangular tensor.
        labels: Sequence of labels with one entry per row of ``inputs``.
        batch_size (int): Number of examples per batch.

    Returns:
        DataLoader: Yields ``(inputs, labels)`` tensor batches, drawn through
        a ``RandomSampler`` over the whole dataset.
    """
    features = torch.tensor(inputs)
    targets = torch.tensor(labels)

    dataset = TensorDataset(features, targets)
    return DataLoader(
        dataset,
        sampler=RandomSampler(dataset),
        batch_size=batch_size,
    )

In [21]:
from sklearn.model_selection import train_test_split

# Walk the dataset a single time, collecting the sequence text (item[0]) and
# the label (item[1]) of every example into two parallel lists.
inputs, labels = map(
    list,
    zip(*((example[0], example[1]) for example in train_dset)),
)

# Hold out 20% of the examples for validation; the fixed random_state keeps
# the split identical across runs.
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    inputs, labels, test_size=0.2, random_state=42)

In [22]:
# Sanity check: inputs and labels must have matching sizes within each split.
print(
    len(train_inputs),
    len(train_labels),
    len(val_inputs),
    len(val_labels),
    sep='\n',
)

16674
16674
4169
4169


In [31]:
# Baseline data: a single encoding per example (no augmentation).
train_tokens, train_labels = tokenize(sp, train_inputs, train_labels, augment_factor=1)
val_tokens, val_labels = tokenize(sp, val_inputs, val_labels, augment_factor=1)

# Pad both baseline splits to the longest sequence found in either of them.
max_len = max(
    max(len(tokens) for tokens in train_tokens),
    max(len(tokens) for tokens in val_tokens),
)
train_tokens = pad(train_tokens, max_len)
val_tokens = pad(val_tokens, max_len)
print(len(val_tokens[0]))

# Augmented training data: 10 sampled encodings per training example.
train_tokens_augment, train_labels_augment = tokenize(sp, train_inputs, train_labels, augment_factor=10)

# The augmented length also covers the (already padded) validation sequences.
max_len_augment = max(
    max(len(tokens) for tokens in train_tokens_augment),
    max(len(tokens) for tokens in val_tokens),
)
train_tokens_augment = pad(train_tokens_augment, max_len_augment)
# NOTE(review): all three prints show len(val_tokens[0]); pad() returns new
# lists, so val_tokens itself keeps the baseline length throughout this cell.
print(len(val_tokens[0]))
val_tokens_augment = pad(val_tokens, max_len_augment)
print(len(val_tokens[0]))

140
140
140


In [25]:
# Example counts: baseline train, augmented train, then validation
# (tokens first, labels second for each).
print(
    len(train_tokens),
    len(train_labels),
    len(train_tokens_augment),
    len(train_labels_augment),
    len(val_tokens),
    len(val_labels),
    sep='\n',
)

16674
16674
166740
166740
4169
4169


In [26]:
# Padded length of the first sequence in each of the four token sets.
print(
    len(train_tokens[0]),
    len(train_tokens_augment[0]),
    len(val_tokens[0]),
    len(val_tokens_augment[0]),
    sep='\n',
)

140
286
286
286


In [36]:
# Wrap each token/label set in a PyTorch DataLoader (default batch size).
# The two validation loaders share val_labels and differ only in the tokens'
# padded length.
train_dataloader = data_loader(inputs=train_tokens, labels=train_labels)
train_dataloader_augment = data_loader(
    inputs=train_tokens_augment,
    labels=train_labels_augment,
)
val_dataloader = data_loader(inputs=val_tokens, labels=val_labels)
val_dataloader_augment = data_loader(inputs=val_tokens_augment, labels=val_labels)

# Original model

In [34]:
# Baseline classifier: its input length is the longest (padded) baseline
# training sequence, and its vocabulary size comes from the tokenizer.
max_len = max(len(tokens) for tokens in train_tokens)
model = CNN(
    number_of_classes=2,
    vocab_size=len(sp),
    embedding_dim=100,
    input_len=max_len,
)
model = model.to('cpu')

In [35]:
# Fit the baseline model on the non-augmented training batches for 5 epochs.
# NOTE(review): this `train` call runs a full training loop (see the per-epoch
# log below), unlike torch.nn.Module.train(mode) — presumably gpl's CNN
# overrides it; confirm against the library.
model.train(train_dataloader, epochs=5)

Epoch 0
Train metrics: 
 Accuracy: 68.3%, Avg loss: 0.637673 

Epoch 1
Train metrics: 
 Accuracy: 70.3%, Avg loss: 0.625635 

Epoch 2
Train metrics: 
 Accuracy: 74.9%, Avg loss: 0.610579 

Epoch 3
Train metrics: 
 Accuracy: 76.4%, Avg loss: 0.604910 

Epoch 4
Train metrics: 
 Accuracy: 77.0%, Avg loss: 0.599936 



In [37]:
# Evaluate the baseline model on the validation loader. The displayed tuple
# appears to be (accuracy, F1 score), matching the printed test metrics.
model.test(val_dataloader)

p  2076 ; tp  1006.3442997932434 ; fp  388.09259682893753
recall  0.484751589495782 ; precision  0.721685077489677
num_batches 84
correct 2715
size 4169
Test metrics: 
 Accuracy: 0.651235, F1 score: 0.579953, Avg loss: 0.656427 



(0.6512353082273926, 0.5799525130525963)

# Model with augmentation

In [38]:
# Classifier for the augmented data: same hyperparameters as the baseline,
# but sized for the longest (padded) augmented training sequence.
max_len_augment = max(len(tokens) for tokens in train_tokens_augment)
model_augment = CNN(
    number_of_classes=2,
    vocab_size=len(sp),
    embedding_dim=100,
    input_len=max_len_augment,
)
model_augment = model_augment.to('cpu')

In [39]:
# Fit the second model on the augmented training batches, again for 5 epochs
# so that the run matches the baseline's epoch count.
model_augment.train(train_dataloader_augment, epochs=5)

Epoch 0
Train metrics: 
 Accuracy: 68.9%, Avg loss: 0.642953 

Epoch 1
Train metrics: 
 Accuracy: 67.7%, Avg loss: 0.637315 

Epoch 2
Train metrics: 
 Accuracy: 70.7%, Avg loss: 0.632817 

Epoch 3
Train metrics: 
 Accuracy: 71.2%, Avg loss: 0.631393 

Epoch 4
Train metrics: 
 Accuracy: 69.9%, Avg loss: 0.628583 



In [40]:
# Evaluate the augmented model on the validation tokens padded to the
# augmented length. The displayed tuple appears to be (accuracy, F1 score),
# matching the printed test metrics.
model_augment.test(val_dataloader_augment)

p  2076 ; tp  1247.8806495666504 ; fp  501.35011422634125
recall  0.6010985787893306 ; precision  0.7133882363586922
num_batches 84
correct 2837
size 4169
Test metrics: 
 Accuracy: 0.680499, F1 score: 0.652447, Avg loss: 0.650656 



(0.6804989206044615, 0.652447251746604)