# PYTORCH CNN Classifier

To run this notebook on another benchmark, use

```
papermill utils/torch_cnn_character.ipynb torch_cnn_character_experiments/[DATASET NAME].ipynb -p DATASET [DATASET NAME]
```

In [1]:
# Run parameters. When the notebook is executed through papermill these
# defaults can be overridden from the command line (e.g. `-p DATASET <name>`).
DATASET = 'no_dataset'  # presumably a placeholder meant to be overridden -- TODO confirm
VERSION = 0  # stored in the config as "dataset_version"
BATCH_SIZE = 32  # batch size handed to the DataLoaders
EPOCHS = 5  # forwarded to model.train(..., epochs=...)

In [2]:
# Echo the effective parameters so the executed notebook records what was run.
print(*(DATASET, VERSION, BATCH_SIZE, EPOCHS), sep=" ")

demo_human_or_worm 0 32 1


## Config

In [3]:
import torch
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer

from genomic_benchmarks.dataset_getters.pytorch_datasets import get_dataset
from genomic_benchmarks.models.torch import CNN
from genomic_benchmarks.dataset_getters.utils import coll_factory, LetterTokenizer, build_vocab, check_seq_lengths, check_config, VARIABLE_LENGTH_DATASETS

In [7]:
# Padding is switched on only for datasets named in VARIABLE_LENGTH_DATASETS.
USE_PADDING = DATASET in VARIABLE_LENGTH_DATASETS

# Every later cell reads its settings from this one dictionary.
config = dict(
    dataset=DATASET,
    dataset_version=VERSION,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    use_padding=USE_PADDING,
    force_download=False,
    run_on_gpu=True,  # the device cell still checks torch.cuda.is_available()
    number_of_classes=2,
    embedding_dim=100,
)
# Hand the settings to the library's config checker before using them.
check_config(config)

## Choose the dataset

In [8]:
# Training split of the configured benchmark.
# NOTE(review): config["dataset_version"] and config["force_download"] are not
# forwarded here -- confirm whether get_dataset should receive them.
train_dset = get_dataset(
    config["dataset"],
    "train",
)

Downloading 1Vuc44bXRISqRDXNrxt5lGYLpLsJbrSg8 into /root/.genomic_benchmarks/demo_human_or_worm.zip... 



Done.
Unzipping...Done.


## Tokenizer and vocab

In [9]:
# Tokenizer built from LetterTokenizer, and the vocabulary derived from the
# training split with it.
tokenizer = get_tokenizer(LetterTokenizer())
vocabulary = build_vocab(
    train_dset,
    tokenizer,
    use_padding=config["use_padding"],
)

# Show the vocabulary size and the token -> index mapping for inspection.
print("vocab len:", len(vocabulary))
print(vocabulary.get_stoi())

vocab len: 9
{'<pad>': 8, '<eos>': 6, 'N': 7, '<bos>': 1, '<unk>': 0, 'C': 2, 'A': 3, 'T': 4, 'G': 5}


## Dataloader and batch preparation

In [10]:
# Pick the compute device: CUDA only when requested in the config AND available.
if config["run_on_gpu"] and torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using {device} device')

max_seq_len, nn_input_len = check_seq_lengths(dataset=train_dset, config=config)

# Collate function: pad to the network input length only when padding is enabled.
collate = coll_factory(
    vocabulary,
    tokenizer,
    device,
    pad_to_length=nn_input_len if config["use_padding"] else None,
)

# Shuffled loader over the training split.
train_loader = DataLoader(
    train_dset,
    batch_size=config["batch_size"],
    shuffle=True,
    collate_fn=collate,
)

Using cpu device
max_seq_len  200
not all sequences are of the same length


## Model

In [11]:
# Build the CNN from the config and vocabulary, then move it to the chosen device.
model = CNN(
    number_of_classes=config["number_of_classes"],
    vocab_size=len(vocabulary),
    embedding_dim=config["embedding_dim"],
    input_len=nn_input_len,
)
model = model.to(device)

## Training

In [12]:
# Fit the model on the training loader for the configured number of epochs.
model.train(
    train_loader,
    epochs=config["epochs"],
)

Epoch 0


  x = torch.tensor(pad(x), dtype=torch.long)


Train metrics: 
 Accuracy: 91.4%, Avg loss: 0.543535 



## Testing

In [14]:
def test(self, dataloader, positive_label = 1):
    size = dataloader.dataset.__len__()
    num_batches = len(dataloader)
    test_loss, correct = 0, 0
    tp, p, fp = 0, 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = self(X)
            test_loss += self.loss(pred, y).item()
            correct += (torch.round(pred) == y).sum().item()
            p += (y == positive_label).sum().item() 
            if(positive_label == 1):
                tp += (y * pred).sum(dim=0).item()
                fp += ((1 - y) * pred).sum(dim=0).item()
            else:
                tp += ((1 - y) * (1 - pred)).sum(dim=0).item()
                fp += (y * (1 - pred)).sum(dim=0).item()

    print("p ", p, "; tp ", tp, "; fp ", fp)
    recall = tp / p
    precision = tp / (tp + fp)
    print("recall ", recall, "; precision ", precision)
    f1_score = 2 * precision * recall / (precision + recall)
    
    print("num_batches", num_batches)
    print("correct", correct)
    print("size", size)

    test_loss /= num_batches
    accuracy = correct / size
    print(f"Test metrics: \n Accuracy: {accuracy:>6f}, F1 score: {f1_score:>6f}, Avg loss: {test_loss:>6f} \n")
    
    return accuracy, f1_score

In [15]:
# Held-out split of the configured benchmark.
test_dset = get_dataset(config["dataset"], 'test')
# Evaluation gains nothing from shuffling; a fixed order keeps runs repeatable.
test_loader = DataLoader(test_dset, batch_size=config["batch_size"], shuffle=False, collate_fn=collate)

# Compute the metrics and display them as the cell's output.
acc, f1 = test(model, test_loader)
acc, f1

  x = torch.tensor(pad(x), dtype=torch.long)


p  12500 ; tp  11173.35557794571 ; fp  813.8180167268797
recall  0.8938684462356568 ; precision  0.9321092657664888
num_batches 782
correct 22881
size 25000
Test metrics: 
 Accuracy: 0.915240, F1 score: 0.912588, Avg loss: 0.542609 



(0.91524, 0.9125884238740052)