# PyTorch pipeline with text augmentation

# Setup

In [53]:
%load_ext autoreload
%autoreload 2

In [63]:
!pip install sentencepiece
!pip install -qq tensorflow_addons genomic-benchmarks
!pip install git+https://github.com/katarinagresova/GLP

You should consider upgrading via the '/home/katarina/git/GLP/env/bin/python -m pip install --upgrade pip' command.[0m
Collecting git+https://github.com/katarinagresova/GLP
  Cloning https://github.com/katarinagresova/GLP to /tmp/pip-req-build-f1d7jroe
  Running command git clone --filter=blob:none -q https://github.com/katarinagresova/GLP /tmp/pip-req-build-f1d7jroe
  Resolved https://github.com/katarinagresova/GLP to commit 5a390144f56d0db0f4187b6e6ddf7a491ec6a04b
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: glp
  Building wheel for glp (setup.py) ... [?25ldone
[?25h  Created wheel for glp: filename=glp-0.0.1-py3-none-any.whl size=10448 sha256=4a0f18ccb3534edf199b01c9d9f7cd8304c71c80423272af84d66a4cda353e02
  Stored in directory: /tmp/pip-ephem-wheel-cache-ik441a9b/wheels/29/b5/06/a4f295964f0f97a8449bd111e3a5e1a8923b3b31a02661ce85
Successfully built glp
Installing collected packages: glp
Successfully installed glp-0.0.1
You should consider

In [64]:
import sentencepiece as spm
from genomic_benchmarks.dataset_getters.pytorch_datasets import HumanEnhancersCohn
import torch
from torch.utils.data import DataLoader
from torch.nn import ConstantPad1d
from pathlib import Path
from glp.models.cnn import CNN
from sklearn.model_selection import train_test_split

# Load data


In [76]:
train_dset = HumanEnhancersCohn('train', version=0)
test_dset = HumanEnhancersCohn('test', version=0)

# Train subword tokenizer

We don't want to train new model if we already have one. This step takes about X minutes in Google Colab. 

In [38]:
model_preffix = 'm'
model_file = model_preffix + '.model'

if not Path(model_file).exists():
    spm.SentencePieceTrainer.train(sentence_iterator=iter([x[0] for x in train_dset]), model_prefix=model_preffix, vocab_size=512,)

sp = spm.SentencePieceProcessor(model_file=model_file)

# Tokenize data

In [39]:
def tokenize(sp, inputs, labels, augment_factor=1):
    """Tokenize texts and do augmentation if augment_factor is bigger then 1
    
    Args:
        tokenizer (SentencePieceProcessor): trained SentencePiece tokenizer
        dset (List[(str, int)]): List of examples
        augment_factor (int): 
    
    Returns:
        tokenized_texts (List[List[str]]): List of list of indices of tokens
    """

    xs, ys = [], []
    for i in range(len(labels)):
        
        if augment_factor == 1:
            ys.append(labels[i])
            xs.append([token for token in sp.encode(inputs[i])])
        
        elif augment_factor > 1:
            ys.extend([labels[i] for _ in range(augment_factor)])

            x = [0 for _ in range(augment_factor)]
            for j in range(augment_factor):
                # TODO: improve augmentation by https://colab.research.google.com/github/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb
                x[j] = [token for token in sp.encode(inputs[i], out_type=int, enable_sampling=True, alpha=0.1, nbest_size=-1)]
            xs.extend(x)

        else:
            raise(ValueError('augment_factor have to be > 0.'))

    return xs, ys

In [40]:
def pad(tokenized_texts, max_len):

  padded_texts = tokenized_texts[:]
  for i in range(len(padded_texts)):
      padded_texts[i] = padded_texts[i] + [0] * (max_len - len(padded_texts[i]))
  return padded_texts

In [41]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

def data_loader(inputs, labels, batch_size=50):
    """Convert data sets to torch.Tensors and load it to DataLoader.
    """
    # Convert data type to torch.Tensor
    inputs, labels = tuple(torch.tensor(data) for data in [inputs, labels])

    # Create DataLoader for data
    tensor_data = TensorDataset(inputs, labels)
    sampler = RandomSampler(tensor_data)
    dataloader = DataLoader(tensor_data, sampler=sampler, batch_size=batch_size)

    return dataloader

In [73]:
def preprocess_train_dataset(dset, sp, augment_factor=1, split_factor=0.2):
    
    inputs = [x[0] for x in dset]
    labels = [x[1] for x in dset]

    # Train Test Split
    train_inputs, val_inputs, train_labels, val_labels = train_test_split(
        inputs, labels, test_size=0.2, random_state=42)

    train_tokens, train_labels = tokenize(sp, train_inputs, train_labels, augment_factor=augment_factor)
    val_tokens, val_labels = tokenize(sp, val_inputs, val_labels, augment_factor=1)

    max_len = max([len(tokenized_text) for tokenized_text in train_tokens])
    max_len = max(max_len, max([len(tokenized_text) for tokenized_text in val_tokens]))
    train_tokens = pad(train_tokens, max_len)
    val_tokens = pad(val_tokens, max_len)

    # Load data to PyTorch DataLoader
    train_dataloader = data_loader(train_tokens, train_labels)  
    val_dataloader = data_loader(val_tokens, val_labels)

    return train_dataloader, val_dataloader, max_len

In [71]:
def preprocess_test_dataset(dset, sp, max_len):
    inputs = [x[0] for x in dset]
    labels = [x[1] for x in dset]

    train_tokens, train_labels = tokenize(sp, inputs, labels, augment_factor=1)
    train_tokens = pad(train_tokens, max_len)

    # Load data to PyTorch DataLoader
    return data_loader(train_tokens, train_labels) 

In [86]:
train_dataloader, val_dataloader, max_len = preprocess_train_dataset(train_dset, sp)
train_dataloader_augment, val_dataloader_augment, max_len_augment = preprocess_train_dataset(train_dset, sp, augment_factor=2)
test_dataloader = preprocess_test_dataset(test_dset, sp, max_len=max_len)
test_dataloader_augment = preprocess_test_dataset(test_dset, sp, max_len=max_len_augment)

In [117]:
import torch
from torch import dropout, nn

# A simple CNN model inspired by https://github.com/ML-Bioinfo-CEITEC/genomic_benchmarks/blob/main/src/genomic_benchmarks/models/torch.py
class CNN(nn.Module):
    def __init__(self, number_of_classes, vocab_size, embedding_dim, input_len, dropout):
        super(CNN, self).__init__()
        if number_of_classes == 2:
            number_of_output_neurons = 1
            loss = torch.nn.functional.binary_cross_entropy_with_logits
            output_activation = nn.Sigmoid()
        else:
            raise Exception("Not implemented for number_of_classes!=2")
            # number_of_output_neurons = number_of_classes
            # loss = torch.nn.CrossEntropyLoss()
            # output_activation = nn.Softmax(dim=)

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=16, kernel_size=8, bias=True)
        self.norm1 = nn.BatchNorm1d(16)
        self.relu = nn.ReLU()
        self.pool1 = nn.MaxPool1d(2)

        self.conv2 = nn.Conv1d(in_channels=16, out_channels=8, kernel_size=8, bias=True)
        self.norm2 = nn.BatchNorm1d(8)
        self.pool2 = nn.MaxPool1d(2)

        self.conv3 = nn.Conv1d(in_channels=8, out_channels=4, kernel_size=8, bias=True)
        self.norm3 = nn.BatchNorm1d(4)
        self.pool3 = nn.MaxPool1d(2)

        #         compute output shape of conv layers
        self.flatten = nn.Flatten()
        self.lin1 = nn.Linear(self.count_flatten_size(input_len), 512)
        self.dropout = nn.Dropout(dropout)
        self.lin2 = nn.Linear(512, number_of_output_neurons)
        self.output_activation = output_activation
        self.loss = loss

    def count_flatten_size(self, input_len):
        zeros = torch.zeros([1, input_len], dtype=torch.long)
        x = self.embeddings(zeros)
        x = x.transpose(1, 2)
        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu(x)
        x = self.pool1(x)

        x = self.conv2(x)
        x = self.norm2(x)
        x = self.relu(x)
        x = self.pool2(x)

        x = self.conv3(x)
        x = self.norm3(x)
        x = self.relu(x)
        x = self.pool3(x)

        x = self.flatten(x)
        return x.size()[1]

    def forward(self, x):
        x = self.embeddings(x)
        x = x.transpose(1, 2)
        x = self.conv1(x)
        x = self.norm1(x)
        x = self.relu(x)
        x = self.pool1(x)

        x = self.conv2(x)
        x = self.norm2(x)
        x = self.relu(x)
        x = self.pool2(x)

        x = self.conv3(x)
        x = self.norm3(x)
        x = self.relu(x)
        x = self.pool3(x)

        x = self.flatten(x)
        x = self.lin1(x)
        x = self.dropout(x)
        x = self.relu(x)
        x = self.lin2(x)
        x = self.dropout(x)
        x = self.output_activation(x)
        return x

    def train_loop(self, dataloader, optimizer, val_dataloader):
        for x, y in dataloader:
            optimizer.zero_grad()
            pred = self(x)
            if y.shape != pred.shape:
                y = y.unsqueeze(1)
                y = y.float()
            loss = self.loss(pred, y)
            loss.backward()
            optimizer.step()


        train_loss, train_correct = self._eval(dataloader=dataloader)
        if val_dataloader != None:
            val_loss, val_correct = self._eval(dataloader=val_dataloader)
            print(f"Train metrics: \n Accuracy: {(100*train_correct):>0.1f}%, Avg loss: {train_loss:>8f} Val accuracy: {(100*val_correct):>0.1f}%, Val avg loss: {val_loss:>8f} \n")
        else:
            print(f"Train metrics: \n Accuracy: {(100*train_correct):>0.1f}%, Avg loss: {train_loss:>8f} \n")


    def train(self, dataloader, epochs, val_datdaloader = None):
        optimizer = torch.optim.Adam(self.parameters())
        for t in range(epochs):
            print(f"Epoch {t}")
            self.train_loop(dataloader, optimizer, val_datdaloader)

    def _eval(self, dataloader):
        size = dataloader.dataset.__len__()
        num_batches = len(dataloader)
        loss, correct = 0, 0

        with torch.no_grad():
            for X, y in dataloader:
                pred = self(X)
                if y.shape != pred.shape:
                    y = y.unsqueeze(1)
                    y = y.float()
                loss += self.loss(pred, y).item()
                correct += (torch.round(pred) == y).sum().item()

        loss /= num_batches
        correct /= size

        return loss, correct

# TODO: update for multiclass classification datasets
    def test(self, dataloader, positive_label = 1):
        size = dataloader.dataset.__len__()
        num_batches = len(dataloader)
        test_loss, correct = 0, 0
        tp, p, fp = 0, 0, 0

        with torch.no_grad():
            for X, y in dataloader:
                pred = self(X)
                if y.shape != pred.shape:
                    y = y.unsqueeze(1)
                    y = y.float()
                test_loss += self.loss(pred, y).item()
                correct += (torch.round(pred) == y).sum().item()
                p += (y == positive_label).sum().item() 
                if(positive_label == 1):
                    tp += (y * pred).sum(dim=0).item()
                    fp += ((1 - y) * pred).sum(dim=0).item()
                else:
                    tp += ((1 - y) * (1 - pred)).sum(dim=0).item()
                    fp += (y * (1 - pred)).sum(dim=0).item()

        print("p ", p, "; tp ", tp, "; fp ", fp)
        recall = tp / p
        precision = tp / (tp + fp)
        print("recall ", recall, "; precision ", precision)
        f1_score = 2 * precision * recall / (precision + recall)
        
        print("num_batches", num_batches)
        print("correct", correct)
        print("size", size)

        test_loss /= num_batches
        accuracy = correct / size
        print(f"Test metrics: \n Accuracy: {accuracy:>6f}, F1 score: {f1_score:>6f}, Avg loss: {test_loss:>6f} \n")
        
        return accuracy, f1_score

# Original model

In [119]:
model = CNN(
    number_of_classes=2,
    vocab_size=len(sp),
    embedding_dim=100,
    input_len=max_len,
    dropout=0.3
).to('cpu')

In [120]:
model.train(train_dataloader, epochs=10, val_datdaloader=val_dataloader)

Epoch 0
Train metrics: 
 Accuracy: 62.2%, Avg loss: 0.664582 Val accuracy: 61.3%, Val avg loss: 0.673484 

Epoch 1
Train metrics: 
 Accuracy: 63.1%, Avg loss: 0.658658 Val accuracy: 61.6%, Val avg loss: 0.669294 

Epoch 2
Train metrics: 
 Accuracy: 64.6%, Avg loss: 0.654869 Val accuracy: 60.6%, Val avg loss: 0.673794 

Epoch 3
Train metrics: 
 Accuracy: 65.4%, Avg loss: 0.650748 Val accuracy: 61.4%, Val avg loss: 0.668497 

Epoch 4
Train metrics: 
 Accuracy: 67.4%, Avg loss: 0.645921 Val accuracy: 62.4%, Val avg loss: 0.672925 

Epoch 5
Train metrics: 
 Accuracy: 67.5%, Avg loss: 0.641617 Val accuracy: 62.4%, Val avg loss: 0.668769 

Epoch 6
Train metrics: 
 Accuracy: 68.5%, Avg loss: 0.641635 Val accuracy: 62.9%, Val avg loss: 0.669401 

Epoch 7
Train metrics: 
 Accuracy: 68.8%, Avg loss: 0.639918 Val accuracy: 61.8%, Val avg loss: 0.673261 

Epoch 8
Train metrics: 
 Accuracy: 69.3%, Avg loss: 0.636802 Val accuracy: 62.2%, Val avg loss: 0.669207 

Epoch 9
Train metrics: 
 Accuracy: 69

In [121]:
model.test(test_dataloader)

p  3474 ; tp  1910.278869152069 ; fp  1053.4717557430267
recall  0.5498787763822882 ; precision  0.6445477743997725
num_batches 139
correct 4320
size 6948
Test metrics: 
 Accuracy: 0.621762, F1 score: 0.593462, Avg loss: 0.672600 



(0.6217616580310881, 0.5934615925521959)

# Model with augmentation

In [122]:
model_augment = CNN(
    number_of_classes=2,
    vocab_size=len(sp),
    embedding_dim=100,
    input_len=max_len_augment,
    dropout=0.3
).to('cpu')

In [123]:
model_augment.train(train_dataloader_augment, epochs=10, val_datdaloader=val_dataloader_augment)

Epoch 0
Train metrics: 
 Accuracy: 60.1%, Avg loss: 0.671901 Val accuracy: 50.2%, Val avg loss: 0.701830 

Epoch 1
Train metrics: 
 Accuracy: 59.9%, Avg loss: 0.671125 Val accuracy: 50.2%, Val avg loss: 0.701667 

Epoch 2
Train metrics: 
 Accuracy: 62.3%, Avg loss: 0.666124 Val accuracy: 50.3%, Val avg loss: 0.699698 

Epoch 3
Train metrics: 
 Accuracy: 60.2%, Avg loss: 0.668663 Val accuracy: 50.2%, Val avg loss: 0.701759 

Epoch 4
Train metrics: 
 Accuracy: 63.5%, Avg loss: 0.662358 Val accuracy: 53.9%, Val avg loss: 0.691741 

Epoch 5
Train metrics: 
 Accuracy: 63.1%, Avg loss: 0.660313 Val accuracy: 53.0%, Val avg loss: 0.696190 

Epoch 6
Train metrics: 
 Accuracy: 63.2%, Avg loss: 0.658675 Val accuracy: 50.3%, Val avg loss: 0.698819 

Epoch 7
Train metrics: 
 Accuracy: 63.4%, Avg loss: 0.657307 Val accuracy: 50.9%, Val avg loss: 0.698131 

Epoch 8
Train metrics: 
 Accuracy: 65.4%, Avg loss: 0.654311 Val accuracy: 51.3%, Val avg loss: 0.696907 

Epoch 9
Train metrics: 
 Accuracy: 63

In [118]:
model_augment.test(test_dataloader_augment)

p  3474 ; tp  1579.5565600395203 ; fp  908.4003386497498
recall  0.45467949339076574 ; precision  0.6348809984898363
num_batches 139
correct 4192
size 6948
Test metrics: 
 Accuracy: 0.603339, F1 score: 0.529879, Avg loss: 0.677870 



(0.6033390903857225, 0.5298785572860427)

# Comparing with character model

In [8]:
from genomic_benchmarks.dataset_getters.pytorch_datasets import get_dataset
from glp.models import CNN
from glp.tokenizers import get_tokenizer
from glp.tokenizers.utils import build_vocab, coll_factory, check_config, check_seq_lengths

In [12]:
DATASET = 'human_nontata_cohn'
VOCAB_SIZE = 0
TOKENIZER = 'character'
KMER = 0

In [13]:
config = {
    "dataset": DATASET,
    "tokenizer": TOKENIZER,
    "dataset_version": 0,
    "epochs": 5,
    "batch_size": 32,
    "use_padding": True,
    "force_download": False,
    "run_on_gpu": True,
    "number_of_classes": 2,
    "embedding_dim": 100,
}
check_config(config)

In [15]:
train_size = int(0.8 * len(train_dset))
val_size = len(train_dset) - train_size
train_data, val_data = torch.utils.data.random_split(train_dset, [train_size, val_size], generator=torch.Generator().manual_seed(42))

In [16]:
tokenizer = get_tokenizer(config['tokenizer'])
tokenizer.train(train_dset=train_data, vocab_size=VOCAB_SIZE, kmer=KMER)
vocabulary = build_vocab(train_data, tokenizer, use_padding=config["use_padding"])


In [17]:
# Run on GPU or CPU
device = 'cuda' if config["run_on_gpu"] and torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

max_tok_len = check_seq_lengths(dataset=train_data, tokenizer=tokenizer)

# Data Loader
collate = coll_factory(vocabulary, tokenizer, device, pad_to_length = max_tok_len)

train_loader = DataLoader(train_data, batch_size=config["batch_size"], shuffle=True, collate_fn=collate)

Using cpu device
max_tok_len  502


In [27]:
model_char = CNN(
    number_of_classes=config["number_of_classes"],
    vocab_size=vocabulary.__len__(),
    embedding_dim=config["embedding_dim"],
    input_len=max_tok_len
).to(device)

In [28]:
model_char.train(train_loader, epochs=config["epochs"])

Epoch 0
Train metrics: 
 Accuracy: 63.7%, Avg loss: 0.652409 

Epoch 1
Train metrics: 
 Accuracy: 65.2%, Avg loss: 0.648000 

Epoch 2
Train metrics: 
 Accuracy: 66.6%, Avg loss: 0.644181 

Epoch 3
Train metrics: 
 Accuracy: 67.7%, Avg loss: 0.638699 

Epoch 4
Train metrics: 
 Accuracy: 70.2%, Avg loss: 0.642637 



In [29]:
test_loader = DataLoader(val_data, batch_size=config["batch_size"], shuffle=True, collate_fn=collate)

acc, f1 = model_char.test(test_loader)
acc, f1

p  2094 ; tp  1309.8123128414154 ; fp  544.2741401195526
recall  0.625507312722739 ; precision  0.7064461911954811
num_batches 131
correct 2866
size 4169
Test metrics: 
 Accuracy: 0.687455, F1 score: 0.663518, Avg loss: 0.649707 



(0.6874550251858959, 0.6635175437250562)