# PyTorch CNN Classifier

In [1]:
# Default values for the notebook-level parameters.
# DATASET, TOKENIZER and KMER are reassigned by the "# Parameters" cell that
# follows; VOCAB_SIZE is not reassigned there and therefore keeps this default.
DATASET = 'none'
VOCAB_SIZE = 0
TOKENIZER = 'none'
KMER = 0

In [2]:
# Parameters
# Run-specific values that override the defaults assigned in the previous cell.
# NOTE(review): this looks like a cell injected by a parameterization tool
# (e.g. papermill) — confirm before editing by hand.
DATASET = "human_nontata_promoters"
TOKENIZER = "kmer"
KMER = 4


In [3]:
# Echo the effective parameter values so the run records which settings were used.
print(DATASET, VOCAB_SIZE, TOKENIZER, KMER)

human_nontata_promoters 0 kmer 4


## Config

In [4]:
import torch
from torch.utils.data import DataLoader

from genomic_benchmarks.dataset_getters.pytorch_datasets import get_dataset
from glp.models import CNN
from glp.tokenizers import get_tokenizer
from glp.tokenizers.utils import build_vocab, coll_factory, check_config, check_seq_lengths

In [5]:
# Seed PyTorch's global RNG before anything random happens, so that DataLoader
# shuffling and model weight initialisation are repeatable from one
# "Restart & Run All" to the next (up to backend-level nondeterminism).
SEED = 0
torch.manual_seed(SEED)

# Run configuration. The dataset and tokenizer names come from the notebook
# parameters; the remaining entries are fixed settings read by later cells.
config = {
    "dataset": DATASET,
    "tokenizer": TOKENIZER,
    "dataset_version": 0,
    "epochs": 5,
    "batch_size": 32,
    "use_padding": True,
    "force_download": False,
    "run_on_gpu": True,
    "number_of_classes": 2,
    "embedding_dim": 100,
}
# Hand the configuration to the project's check_config helper before it is used.
check_config(config)

## Choose the dataset

In [6]:
# Load the 'train' split of the dataset named in the configuration.
# NOTE(review): config["dataset_version"] and config["force_download"] are
# defined in the config cell but are not passed to get_dataset here — confirm
# whether they should be forwarded.
train_dset = get_dataset(config["dataset"], 'train')

## Tokenizer and vocab

In [7]:
# Obtain the tokenizer selected in the config and run its train() step on the
# training split, using the vocabulary-size and k-mer notebook parameters.
tokenizer = get_tokenizer(config['tokenizer'])
tokenizer.train(train_dset=train_dset, vocab_size=VOCAB_SIZE, kmer=KMER)

# Build the vocabulary from the training split with the project helper;
# use_padding is taken from the config.
vocabulary = build_vocab(train_dset, tokenizer, use_padding=config["use_padding"])

# Report the vocabulary size (via the built-in len() rather than calling the
# dunder method directly) and the token -> index mapping.
print("vocab len:", len(vocabulary))
print(vocabulary.get_stoi())

vocab len: 258
{'CGTT': 256, 'CCTA': 251, 'CGGA': 249, 'GCAT': 248, 'TAGT': 247, 'CGGT': 245, 'TACG': 244, 'CTCA': 243, 'CACA': 241, 'TACT': 238, 'GCGA': 237, '<pad>': 257, 'AGTA': 235, 'TGGT': 234, 'AACG': 232, 'ACAT': 230, 'TCAA': 229, 'ATGG': 228, 'TGCA': 227, 'AATT': 225, 'ATAC': 224, 'TAAT': 223, 'GTAT': 222, 'TATA': 231, 'CCTT': 221, 'TAGC': 219, 'ATCT': 217, 'ATTA': 216, 'ATCC': 215, 'TTAC': 213, 'CAAT': 252, 'ATCA': 212, 'ACCG': 218, 'AATA': 210, 'TTGC': 209, 'CGTC': 207, 'GTCA': 206, 'TGTG': 204, 'GAAG': 203, 'GACC': 202, 'TTTC': 201, 'ACGG': 200, 'GGAC': 208, 'ATTT': 199, 'TAAG': 198, 'TCAT': 197, 'AACC': 196, 'AGTC': 82, 'GTTA': 194, 'GGGT': 189, 'TCTA': 188, 'GGTC': 103, 'TTGG': 141, 'ACTG': 186, 'CGCT': 185, 'TGTC': 183, 'GTCG': 180, 'TTAG': 171, 'GTAG': 168, 'CAAC': 192, 'AGAG': 92, 'GTCC': 167, 'TTTA': 166, 'GATG': 110, 'GATT': 162, 'CTAC': 59, 'TTGA': 190, 'TGTT': 165, 'AATG': 161, 'TATT': 233, 'GATC': 159, 'CTTC': 15, 'CTGA': 157, 'TACC': 253, 'CAAG': 175, 'GACA': 193,

## Dataloader and batch preparation

In [8]:
# Choose the compute device: CUDA only when the config requests it and a CUDA
# device is actually available, otherwise fall back to the CPU.
device = 'cpu'
if config["run_on_gpu"] and torch.cuda.is_available():
    device = 'cuda'
print(f'Using {device} device')

# Value returned by the project's check_seq_lengths helper for the training
# split; it is used below as the padding length and later as the model input length.
max_tok_len = check_seq_lengths(dataset=train_dset, tokenizer=tokenizer)

# Collate function produced by the project factory from the vocabulary,
# tokenizer, device and padding length.
collate = coll_factory(vocabulary, tokenizer, device, pad_to_length=max_tok_len)

# Shuffled training batches of the configured size.
train_loader = DataLoader(
    train_dset,
    batch_size=config["batch_size"],
    shuffle=True,
    collate_fn=collate,
)

Using cpu device


max_tok_len  62


In [9]:
# Sanity check: tokenize element 0 of the training sample at index 1 and
# display the resulting token list as the cell output.
tokenizer(train_dset[1][0])

['CACT',
 'CTGT',
 'GTTC',
 'TTTT',
 'AAAA',
 'TCCC',
 'TCAG',
 'GTTT',
 'TATG',
 'TTTT',
 'ATTG',
 'CTAC',
 'CAGA',
 'GTCT',
 'GCCT',
 'CCCT',
 'GAGG',
 'TTCT',
 'TGTA',
 'TAGA',
 'CTAG',
 'TTAT',
 'TTCC',
 'CTCT',
 'GTAA',
 'AGAA',
 'GCTG',
 'TTCT',
 'ATTC',
 'GTTC',
 'TCGC',
 'CTGG',
 'TTTG',
 'GAAC',
 'AAAC',
 'TGAA',
 'CACT',
 'TCCA',
 'AAGG',
 'AGGC',
 'AGTC',
 'CTTG',
 'CAGC',
 'CTTG',
 'TCTC',
 'CTTC',
 'CACT',
 'CCCC',
 'TCCT',
 'CCCC',
 'ACAG',
 'TCCT',
 'GGCT',
 'GGAG',
 'CAGC',
 'GAGT',
 'CTGT',
 'CGAT',
 'CCCA',
 'GGCC',
 'AGAG',
 'ACAA']

## Model

In [10]:
# Instantiate the project's CNN with the class count and embedding size from
# the config, the vocabulary size (via the built-in len()), and the padded
# token length as the input length; then move it to the selected device.
model = CNN(
    number_of_classes=config["number_of_classes"],
    vocab_size=len(vocabulary),
    embedding_dim=config["embedding_dim"],
    input_len=max_tok_len,
).to(device)

## Training

In [11]:
# Train on the shuffled training loader for the number of epochs set in the
# config; the call reports accuracy and average loss after every epoch
# (see the cell output).
model.train(train_loader, epochs=config["epochs"])

Epoch 0


  x = torch.tensor(pad(x), dtype=torch.long)


Train metrics: 
 Accuracy: 79.4%, Avg loss: 0.623087 

Epoch 1


Train metrics: 
 Accuracy: 79.7%, Avg loss: 0.615126 

Epoch 2


Train metrics: 
 Accuracy: 81.9%, Avg loss: 0.608219 

Epoch 3


Train metrics: 
 Accuracy: 82.8%, Avg loss: 0.601781 

Epoch 4


Train metrics: 
 Accuracy: 84.2%, Avg loss: 0.599038 



## Testing

In [12]:
# Load the 'test' split and batch it in a fixed order: shuffling is only
# useful for training, and leaving it off keeps the evaluation pass repeatable.
# The collate function is the one built for training, so batches are padded to
# max_tok_len measured on the training split.
# NOTE(review): confirm that no test sequence tokenizes to more than max_tok_len tokens.
test_dset = get_dataset(config["dataset"], 'test')
test_loader = DataLoader(test_dset, batch_size=config["batch_size"], shuffle=False, collate_fn=collate)

# model.test returns the accuracy and F1 score; show them as the cell result.
acc, f1 = model.test(test_loader)
acc, f1

p  4119 ; tp  3220.605858564377 ; fp  975.5388323804613
recall  0.7818902302899676 ; precision  0.7675154447162782
num_batches 283
correct 7176
size 9034
Test metrics: 
 Accuracy: 0.794333, F1 score: 0.774636, Avg loss: 0.622656 



(0.7943325215851229, 0.7746361556574247)