# PYTORCH CNN Classifier

In [1]:
DATASET = 'none'
VOCAB_SIZE = 0
TOKENIZER = 'none'
KMER = 0

In [2]:
# Parameters
DATASET = "human_enhancers_cohn"
TOKENIZER = "kmer"
KMER = 7


In [3]:
print(DATASET, VOCAB_SIZE, TOKENIZER, KMER)

human_enhancers_cohn 0 kmer 7


## Config

In [4]:
import torch
from torch.utils.data import DataLoader

from genomic_benchmarks.dataset_getters.pytorch_datasets import get_dataset
from glp.models import CNN
from glp.tokenizers import get_tokenizer
from glp.tokenizers.utils import build_vocab, coll_factory, check_config, check_seq_lengths

In [5]:
config = {
    "dataset": DATASET,
    "tokenizer": TOKENIZER,
    "dataset_version": 0,
    "epochs": 5,
    "batch_size": 32,
    "use_padding": True,
    "force_download": False,
    "run_on_gpu": True,
    "number_of_classes": 2,
    "embedding_dim": 100,
}
check_config(config)

## Choose the dataset

In [6]:
train_dset = get_dataset(config["dataset"], 'train')

## Tokenizer and vocab

In [7]:
tokenizer = get_tokenizer(config['tokenizer'])
tokenizer.train(train_dset=train_dset, vocab_size=VOCAB_SIZE, kmer=KMER)
vocabulary = build_vocab(train_dset, tokenizer, use_padding=config["use_padding"])

print("vocab len:" ,vocabulary.__len__())
print(vocabulary.get_stoi())

vocab len: 16363
{'<pad>': 16362, 'TCCGACG': 16361, 'TACGACG': 16357, 'CGCGTAC': 16356, 'ATAGTCG': 16355, 'ACCGACG': 16354, 'CGTAACG': 16352, 'GTTCGAC': 16351, 'CGACCTA': 16350, 'GTTCGCA': 16347, 'CGACTAG': 16345, 'ATGCGCG': 16342, 'ACGTCTA': 16340, 'ACGACTA': 16338, 'TTTTCGC': 16337, 'GTATACG': 16334, 'TCGACCG': 16333, 'CGTCGCA': 16332, 'TATCGAC': 16331, 'ATCGCTA': 16328, 'TCGCGAT': 16326, 'GTCGCGA': 16325, 'TACCGTT': 16324, 'CGGGTTG': 16320, 'CGTCACG': 16318, 'CCGATCG': 16317, 'TCGACGA': 16315, 'TACTCGA': 16314, 'GTCGACC': 16313, 'TAGCGGT': 16312, 'AGTACGA': 16311, 'GCGTAAC': 16310, 'CGTCGAA': 16309, 'CGCGACA': 16305, 'GAACGAC': 16303, 'ATATCGT': 16302, 'TTTCGCG': 16301, 'TACGAGT': 16295, 'CGCTCGA': 16292, 'GCCGCAA': 16291, 'AGACGAT': 16289, 'CGTCGTA': 16288, 'ATTGCGA': 16287, 'GTCGTTC': 16286, 'CGATGCA': 16283, 'ATCGCCG': 16282, 'GACGATA': 16279, 'CCGACGA': 16278, 'GCAACGA': 16277, 'TCGCTAA': 16275, 'TACCGAT': 16274, 'TTGAGCG': 16272, 'TATAACG': 16267, 'CTGACGA': 16266, 'CCGTCGT': 1

## Dataloader and batch preparation

In [8]:
# Run on GPU or CPU
device = 'cuda' if config["run_on_gpu"] and torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

max_tok_len = check_seq_lengths(dataset=train_dset, tokenizer=tokenizer)

# Data Loader
collate = coll_factory(vocabulary, tokenizer, device, pad_to_length = max_tok_len)

train_loader = DataLoader(train_dset, batch_size=config["batch_size"], shuffle=True, collate_fn=collate)

Using cpu device


max_tok_len  71


In [9]:
tokenizer(train_dset[1][0])

['CCATCTA',
 'TTTTTGA',
 'ATCCTTT',
 'TAACACT',
 'TTATGAG',
 'AGAGAAA',
 'GAACAGA',
 'ACAGGCA',
 'GGCAGGC',
 'TGTGTTT',
 'AACCCCA',
 'CAACACC',
 'ACCATAG',
 'AGCCTAA',
 'AAGCTTG',
 'GGAGTAA',
 'ATACAAT',
 'TTCTTTG',
 'GCACCTT',
 'GTAATAA',
 'AACATGA',
 'ACAATGA',
 'GTTTCTC',
 'ACCATGT',
 'TATTAGT',
 'CTCTGGA',
 'TTCCACG',
 'CTCCGGC',
 'ATACTTA',
 'CACATTT',
 'ACAGAAA',
 'AGAGAGA',
 'CAGACAA',
 'AGAGAGT',
 'ACTGTGC',
 'TGAAAGG',
 'TTCAGAC',
 'AGAAGAA',
 'TTCGGGA',
 'CTCAAGC',
 'CACAACA',
 'ACATTAA',
 'TGAGGAA',
 'AGGGAGG',
 'GTAGGGC',
 'AGCCCAG',
 'TGAACAA',
 'TGAGCTG',
 'CTCGCAT',
 'CAGTTTT',
 'GAAAGAG',
 'ACAGTAG',
 'AGAAGAC',
 'ACACAAA',
 'CATTCCT',
 'AATTTCT',
 'TGCCTTG',
 'GCTGGAT',
 'AACTCGG',
 'CTGCAAA',
 'AGGTAAG',
 'ATTCTTT',
 'GCTCTTC',
 'CGGTTTT',
 'CGGGATC',
 'AGAGGTA',
 'ACCCGGG',
 'GGCCTAG',
 'CACAGGT',
 'GGTCTCA',
 'CATCTGG']

## Model

In [10]:
model = CNN(
    number_of_classes=config["number_of_classes"],
    vocab_size=vocabulary.__len__(),
    embedding_dim=config["embedding_dim"],
    input_len=max_tok_len
).to(device)

## Training

In [11]:
model.train(train_loader, epochs=config["epochs"])

Epoch 0


  x = torch.tensor(pad(x), dtype=torch.long)


Train metrics: 
 Accuracy: 58.6%, Avg loss: 0.661313 

Epoch 1


Train metrics: 
 Accuracy: 70.5%, Avg loss: 0.624147 

Epoch 2


Train metrics: 
 Accuracy: 76.8%, Avg loss: 0.602046 

Epoch 3


Train metrics: 
 Accuracy: 79.1%, Avg loss: 0.592817 

Epoch 4


Train metrics: 
 Accuracy: 80.3%, Avg loss: 0.585114 



## Testing

In [12]:
test_dset = get_dataset(config["dataset"], 'test')
test_loader = DataLoader(test_dset, batch_size=config["batch_size"], shuffle=True, collate_fn=collate)

acc, f1 = model.test(test_loader)
acc, f1

p  3474 ; tp  1456.072885274887 ; fp  719.2670883350074
recall  0.419134394149363 ; precision  0.6693541712740143
num_batches 218
correct 4221
size 6948
Test metrics: 
 Accuracy: 0.607513, F1 score: 0.515484, Avg loss: 0.676561 



(0.6075129533678757, 0.5154842484526436)