# PyTorch CNN Classifier

In [1]:
# Placeholder defaults for the run parameters. The following "# Parameters" cell
# reassigns DATASET, TOKENIZER and KMER; VOCAB_SIZE keeps the value set here.
# NOTE(review): this looks like a tool-parsed parameters cell (e.g. papermill) —
# confirm before restructuring these assignments.
DATASET = 'none'
VOCAB_SIZE = 0
TOKENIZER = 'none'
KMER = 0

In [2]:
# Parameters
# Concrete values for this run; they override the placeholder defaults assigned
# in the first cell (VOCAB_SIZE is not overridden here).
DATASET = "human_enhancers_cohn"
TOKENIZER = "kmer"
KMER = 3


In [3]:
# Echo the effective run parameters so they are recorded in the notebook output.
print(DATASET, VOCAB_SIZE, TOKENIZER, KMER)

human_enhancers_cohn 0 kmer 3


## Config

In [4]:
import torch
from torch.utils.data import DataLoader

from genomic_benchmarks.dataset_getters.pytorch_datasets import get_dataset
from glp.models import CNN
from glp.tokenizers import get_tokenizer
from glp.tokenizers.utils import build_vocab, coll_factory, check_config, check_seq_lengths

In [5]:
# Seed PyTorch's global random number generator before any model or loader is
# created, so that a fresh "Restart & Run All" is repeatable. Kept outside
# `config` because the keys accepted by check_config() are not visible here.
SEED = 42
torch.manual_seed(SEED)

# Experiment settings read by the cells below.
# NOTE(review): "dataset_version" and "force_download" are defined here but are
# not read by any cell visible in this notebook — confirm whether they should be
# forwarded to get_dataset().
config = {
    "dataset": DATASET,            # dataset name passed to get_dataset()
    "tokenizer": TOKENIZER,        # tokenizer name passed to get_tokenizer()
    "dataset_version": 0,
    "epochs": 5,                   # passed to model.train()
    "batch_size": 32,              # used for both the train and test DataLoader
    "use_padding": True,           # passed to build_vocab()
    "force_download": False,
    "run_on_gpu": True,            # CUDA is used only if it is also available
    "number_of_classes": 2,        # passed to CNN()
    "embedding_dim": 100,          # passed to CNN()
}
# Project helper; presumably validates the settings above — confirm in glp.tokenizers.utils.
check_config(config)

## Choose the dataset

In [6]:
# Load the training split of the configured dataset.
# NOTE(review): config["dataset_version"] and config["force_download"] are not
# passed to get_dataset() here — confirm whether that is intentional.
train_dset = get_dataset(config["dataset"], 'train')

## Tokenizer and vocab

In [7]:
# Build the tokenizer named in the config and fit it on the training split.
# VOCAB_SIZE and KMER are both forwarded; which one is used presumably depends
# on the tokenizer type — confirm in glp.tokenizers.
tokenizer = get_tokenizer(config['tokenizer'])
tokenizer.train(train_dset=train_dset, vocab_size=VOCAB_SIZE, kmer=KMER)

# Vocabulary built from the training split with the fitted tokenizer.
vocabulary = build_vocab(train_dset, tokenizer, use_padding=config["use_padding"])

# Use the built-in len() rather than calling the dunder method directly.
print("vocab len:", len(vocabulary))
print(vocabulary.get_stoi())

vocab len: 66
{'GCG': 64, 'CGC': 62, 'CCG': 61, 'TCG': 60, 'ACT': 59, 'GGA': 56, 'AAC': 55, 'ATT': 53, 'ATC': 63, 'TAT': 52, 'AAT': 51, 'ACC': 49, 'TTC': 46, 'GTA': 50, 'ATA': 42, 'CGT': 45, 'ACG': 44, 'TGG': 8, 'TTA': 40, 'GGT': 39, 'CCA': 20, 'CAA': 36, 'TGA': 57, 'GAG': 6, 'TAG': 34, 'CAT': 27, 'AAG': 43, 'TAC': 33, 'CCT': 31, 'TTT': 30, 'TTG': 29, 'GTT': 32, 'CTA': 26, 'GCT': 24, 'GCA': 28, 'AAA': 3, 'ACA': 23, 'GAC': 11, 'CCC': 4, 'AGG': 21, 'GAT': 22, 'GGG': 15, '<pad>': 65, 'TCA': 25, 'CAG': 13, 'TCT': 9, 'TAA': 37, 'ATG': 2, 'GAA': 35, 'TGC': 41, 'AGA': 54, 'CAC': 48, '<unk>': 0, 'CGG': 58, 'CTT': 17, 'CGA': 47, 'GTC': 19, 'GCC': 10, 'AGT': 38, 'AGC': 12, 'GTG': 7, 'GGC': 5, 'CTG': 1, 'TCC': 16, 'TGT': 14, 'CTC': 18}


## Dataloader and batch preparation

In [8]:
# Select the compute device: CUDA only when it is both requested in the config
# and actually available on this machine; otherwise fall back to the CPU.
gpu_requested_and_available = config["run_on_gpu"] and torch.cuda.is_available()
device = 'cuda' if gpu_requested_and_available else 'cpu'
print('Using {} device'.format(device))

# Value returned by the project helper for the training split; it is reused as
# the padding length below and as the model's input length later on.
max_tok_len = check_seq_lengths(dataset=train_dset, tokenizer=tokenizer)

# Batch-collation callable built from the vocabulary, tokenizer and device.
collate = coll_factory(
    vocabulary,
    tokenizer,
    device,
    pad_to_length=max_tok_len,
)

# Shuffled mini-batches over the training split.
train_loader = DataLoader(
    train_dset,
    batch_size=config["batch_size"],
    shuffle=True,
    collate_fn=collate,
)

Using cpu device


max_tok_len  166


In [9]:
# Sanity check: run the fitted tokenizer on the first element of training item 1
# (presumably the raw sequence — confirm) and display the resulting tokens.
sample_input = train_dset[1][0]
tokenizer(sample_input)

['CCA',
 'TCT',
 'ATT',
 'TTT',
 'GAA',
 'TCC',
 'TTT',
 'TAA',
 'CAC',
 'TTT',
 'ATG',
 'AGA',
 'GAG',
 'AAA',
 'GAA',
 'CAG',
 'AAC',
 'AGG',
 'CAG',
 'GCA',
 'GGC',
 'TGT',
 'GTT',
 'TAA',
 'CCC',
 'CAC',
 'AAC',
 'ACC',
 'ACC',
 'ATA',
 'GAG',
 'CCT',
 'AAA',
 'AGC',
 'TTG',
 'GGA',
 'GTA',
 'AAT',
 'ACA',
 'ATT',
 'TCT',
 'TTG',
 'GCA',
 'CCT',
 'TGT',
 'AAT',
 'AAA',
 'ACA',
 'TGA',
 'ACA',
 'ATG',
 'AGT',
 'TTC',
 'TCA',
 'CCA',
 'TGT',
 'TAT',
 'TAG',
 'TCT',
 'CTG',
 'GAT',
 'TCC',
 'ACG',
 'CTC',
 'CGG',
 'CAT',
 'ACT',
 'TAC',
 'ACA',
 'TTT',
 'ACA',
 'GAA',
 'AAG',
 'AGA',
 'GAC',
 'AGA',
 'CAA',
 'AGA',
 'GAG',
 'TAC',
 'TGT',
 'GCT',
 'GAA',
 'AGG',
 'TTC',
 'AGA',
 'CAG',
 'AAG',
 'AAT',
 'TCG',
 'GGA',
 'CTC',
 'AAG',
 'CCA',
 'CAA',
 'CAA',
 'CAT',
 'TAA',
 'TGA',
 'GGA',
 'AAG',
 'GGA',
 'GGG',
 'TAG',
 'GGC',
 'AGC',
 'CCA',
 'GTG',
 'AAC',
 'AAT',
 'GAG',
 'CTG',
 'CTC',
 'GCA',
 'TCA',
 'GTT',
 'TTG',
 'AAA',
 'GAG',
 'ACA',
 'GTA',
 'GAG',
 'AAG',
 'ACA',
 'CAC',


## Model

In [10]:
# Instantiate the project CNN and move it to the selected device.
# The vocabulary size is taken with the built-in len() rather than a direct
# dunder call; input_len matches the padding length used by the collate function.
model = CNN(
    number_of_classes=config["number_of_classes"],
    vocab_size=len(vocabulary),
    embedding_dim=config["embedding_dim"],
    input_len=max_tok_len,
).to(device)

## Training

In [11]:
# Run the project's training loop over the training loader for the configured
# number of epochs; per-epoch metrics are printed in the output below.
# NOTE(review): if CNN subclasses torch.nn.Module, this `train(loader, epochs)`
# shadows nn.Module.train(mode) — confirm in glp.models.
model.train(train_loader, epochs=config["epochs"])

Epoch 0


  x = torch.tensor(pad(x), dtype=torch.long)


Train metrics: 
 Accuracy: 66.9%, Avg loss: 0.642733 

Epoch 1


Train metrics: 
 Accuracy: 69.9%, Avg loss: 0.635220 

Epoch 2


Train metrics: 
 Accuracy: 71.1%, Avg loss: 0.627893 

Epoch 3


Train metrics: 
 Accuracy: 71.1%, Avg loss: 0.626039 

Epoch 4


Train metrics: 
 Accuracy: 74.1%, Avg loss: 0.620532 



## Testing

In [12]:
# Load the held-out test split and evaluate the trained model on it.
test_dset = get_dataset(config["dataset"], 'test')

# Evaluation does not benefit from shuffling; a fixed order keeps the batches
# (and any per-batch averaged metric) identical from run to run.
test_loader = DataLoader(
    test_dset,
    batch_size=config["batch_size"],
    shuffle=False,
    collate_fn=collate,
)

# model.test() returns two values, unpacked here as accuracy and F1 score;
# the trailing expression displays them as the cell result.
acc, f1 = model.test(test_loader)
acc, f1

p  3474 ; tp  2065.9900700449944 ; fp  840.5472082160413
recall  0.594700653438398 ; precision  0.7108080414096957
num_batches 218
correct 4708
size 6948
Test metrics: 
 Accuracy: 0.677605, F1 score: 0.647591, Avg loss: 0.653015 



(0.6776050662061025, 0.647591254449363)