# PyTorch CNN Classifier

In [1]:
# Placeholder defaults for the run parameters. The "# Parameters" cell that
# follows overrides DATASET, TOKENIZER and KMER; VOCAB_SIZE keeps this value.
DATASET = "none"
TOKENIZER = "none"
VOCAB_SIZE = 0
KMER = 0

In [2]:
# Parameters
# Concrete values for this run; they replace the placeholder defaults assigned
# in the previous cell.
# NOTE(review): looks like a tool-injected parameters cell — confirm before hand-editing.
KMER = 2
TOKENIZER = 'kmer'
DATASET = 'human_nontata_promoters'


In [3]:
# Echo the effective run parameters so they are recorded in the cell output.
print(f"{DATASET} {VOCAB_SIZE} {TOKENIZER} {KMER}")

human_nontata_promoters 0 kmer 2


## Config

In [4]:
import torch
from torch.utils.data import DataLoader

from genomic_benchmarks.dataset_getters.pytorch_datasets import get_dataset
from glp.models import CNN
from glp.tokenizers import get_tokenizer
from glp.tokenizers.utils import build_vocab, coll_factory, check_config, check_seq_lengths

In [5]:
# Central experiment configuration; validated by check_config right after it is built.
config = {
    "dataset": DATASET,
    "tokenizer": TOKENIZER,
    "dataset_version": 0,
    "epochs": 5,
    "batch_size": 32,
    "use_padding": True,
    "force_download": False,
    "run_on_gpu": True,
    "number_of_classes": 2,
    "embedding_dim": 100,
}
check_config(config)

# Seed torch's global RNG so that a Restart & Run All repeats the same
# DataLoader shuffling (and any torch-based weight initialisation).
# Kept outside `config` on purpose: check_config's accepted keys are not
# visible from this notebook, so the dict it validates is left unchanged.
# Bit-exact GPU results may additionally require deterministic algorithms.
SEED = 42
torch.manual_seed(SEED)

## Choose the dataset

In [6]:
# Load the 'train' split of the benchmark dataset named in config["dataset"].
# NOTE(review): config["dataset_version"] and config["force_download"] are
# defined in the config but not passed to get_dataset here — confirm whether
# relying on get_dataset's defaults is intended.
train_dset = get_dataset(config["dataset"], 'train')

## Tokenizer and vocab

In [7]:
# Instantiate the tokenizer named in the config and train it on the training split.
tokenizer = get_tokenizer(config["tokenizer"])
tokenizer.train(
    train_dset=train_dset,
    vocab_size=VOCAB_SIZE,
    kmer=KMER,
)

# Build the vocabulary from the training split using the trained tokenizer.
vocabulary = build_vocab(
    train_dset,
    tokenizer,
    use_padding=config["use_padding"],
)

# Report the vocabulary size and its token -> index mapping.
print("vocab len:", len(vocabulary))
print(vocabulary.get_stoi())

vocab len: 18
{'TT': 15, 'CC': 14, 'CG': 12, 'GT': 13, 'AT': 11, 'GC': 10, '<pad>': 17, 'CT': 8, 'TA': 7, 'TG': 9, 'GA': 5, 'TC': 4, 'AG': 16, 'AA': 3, 'GG': 2, 'CA': 1, 'AC': 6, '<unk>': 0}


## Dataloader and batch preparation

In [8]:
# Use CUDA only when the config requests it AND a CUDA device is available;
# otherwise fall back to the CPU.
if config["run_on_gpu"] and torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Using {} device'.format(device))

# Token length used for padding, measured on the training split
# (presumably the longest tokenized sequence — confirm in check_seq_lengths).
max_tok_len = check_seq_lengths(tokenizer=tokenizer, dataset=train_dset)

# Collate function that turns raw dataset items into model-ready batches,
# padded to max_tok_len.
collate = coll_factory(vocabulary, tokenizer, device, pad_to_length=max_tok_len)

# Training batches are reshuffled on every pass over the data.
train_loader = DataLoader(
    train_dset,
    collate_fn=collate,
    shuffle=True,
    batch_size=config["batch_size"],
)

Using cpu device


max_tok_len  125


In [9]:
# Sanity check: tokenize a single training example and display the tokens.
# train_dset[1][0] is element 0 of item 1 — presumably the raw sequence text
# (with the label at index 1); confirm against the dataset class.
tokenizer(train_dset[1][0])

['CA',
 'CT',
 'CT',
 'GT',
 'GT',
 'TC',
 'TT',
 'TT',
 'AA',
 'AA',
 'TC',
 'CC',
 'TC',
 'AG',
 'GT',
 'TT',
 'TA',
 'TG',
 'TT',
 'TT',
 'AT',
 'TG',
 'CT',
 'AC',
 'CA',
 'GA',
 'GT',
 'CT',
 'GC',
 'CT',
 'CC',
 'CT',
 'GA',
 'GG',
 'TT',
 'CT',
 'TG',
 'TA',
 'TA',
 'GA',
 'CT',
 'AG',
 'TT',
 'AT',
 'TT',
 'CC',
 'CT',
 'CT',
 'GT',
 'AA',
 'AG',
 'AA',
 'GC',
 'TG',
 'TT',
 'CT',
 'AT',
 'TC',
 'GT',
 'TC',
 'TC',
 'GC',
 'CT',
 'GG',
 'TT',
 'TG',
 'GA',
 'AC',
 'AA',
 'AC',
 'TG',
 'AA',
 'CA',
 'CT',
 'TC',
 'CA',
 'AA',
 'GG',
 'AG',
 'GC',
 'AG',
 'TC',
 'CT',
 'TG',
 'CA',
 'GC',
 'CT',
 'TG',
 'TC',
 'TC',
 'CT',
 'TC',
 'CA',
 'CT',
 'CC',
 'CC',
 'TC',
 'CT',
 'CC',
 'CC',
 'AC',
 'AG',
 'TC',
 'CT',
 'GG',
 'CT',
 'GG',
 'AG',
 'CA',
 'GC',
 'GA',
 'GT',
 'CT',
 'GT',
 'CG',
 'AT',
 'CC',
 'CA',
 'GG',
 'CC',
 'AG',
 'AG',
 'AC',
 'AA',
 'GG']

## Model

In [10]:
# Build the CNN classifier sized to the vocabulary and the padded token
# length, then move it to the selected device.
model = CNN(
    vocab_size=len(vocabulary),
    embedding_dim=config["embedding_dim"],
    input_len=max_tok_len,
    number_of_classes=config["number_of_classes"],
).to(device)

## Training

In [11]:
# Run the training loop for config["epochs"] epochs over the training batches;
# per-epoch accuracy and average loss are printed in the cell output.
# NOTE(review): here `train` performs training rather than just toggling
# training mode as torch.nn.Module.train(mode) does — presumably CNN
# overrides it; confirm in glp.models.
model.train(train_loader, epochs=config["epochs"])

Epoch 0


  x = torch.tensor(pad(x), dtype=torch.long)


Train metrics: 
 Accuracy: 80.9%, Avg loss: 0.617436 

Epoch 1


Train metrics: 
 Accuracy: 80.3%, Avg loss: 0.609074 

Epoch 2


Train metrics: 
 Accuracy: 82.3%, Avg loss: 0.603836 

Epoch 3


Train metrics: 
 Accuracy: 82.7%, Avg loss: 0.599312 

Epoch 4


Train metrics: 
 Accuracy: 83.8%, Avg loss: 0.594474 



## Testing

In [12]:
# Evaluate on the 'test' split, reusing the training collate function so that
# test batches are tokenized and padded the same way as training batches.
# NOTE(review): the padding length was measured on the training split only —
# confirm that test sequences cannot tokenize to more than max_tok_len tokens.
test_dset = get_dataset(config["dataset"], 'test')

# Evaluation gains nothing from shuffling; iterating in dataset order keeps
# the test pass deterministic from run to run.
test_loader = DataLoader(test_dset, batch_size=config["batch_size"], shuffle=False, collate_fn=collate)

# model.test returns (accuracy, F1); the bare tuple on the last line is
# displayed as the cell result.
acc, f1 = model.test(test_loader)
acc, f1

p  4119 ; tp  3002.943624019623 ; fp  630.4149087881381
recall  0.7290467647534894 ; precision  0.8264925128924807
num_batches 283
correct 7298
size 9034
Test metrics: 
 Accuracy: 0.807837, F1 score: 0.774717, Avg loss: 0.607943 



(0.8078370599955723, 0.7747174260094527)