# PYTORCH CNN Classifier

In [1]:
DATASET = 'none'
VOCAB_SIZE = 0
TOKENIZER = 'none'
KMER = 0

In [2]:
# Parameters
DATASET = "human_enhancers_cohn"
TOKENIZER = "kmer"
KMER = 4


In [3]:
print(DATASET, VOCAB_SIZE, TOKENIZER, KMER)

human_enhancers_cohn 0 kmer 4


## Config

In [4]:
import torch
from torch.utils.data import DataLoader

from genomic_benchmarks.dataset_getters.pytorch_datasets import get_dataset
from glp.models import CNN
from glp.tokenizers import get_tokenizer
from glp.tokenizers.utils import build_vocab, coll_factory, check_config, check_seq_lengths

In [5]:
config = {
    "dataset": DATASET,
    "tokenizer": TOKENIZER,
    "dataset_version": 0,
    "epochs": 5,
    "batch_size": 32,
    "use_padding": True,
    "force_download": False,
    "run_on_gpu": True,
    "number_of_classes": 2,
    "embedding_dim": 100,
}
check_config(config)

## Choose the dataset

In [6]:
train_dset = get_dataset(config["dataset"], 'train')

## Tokenizer and vocab

In [7]:
tokenizer = get_tokenizer(config['tokenizer'])
tokenizer.train(train_dset=train_dset, vocab_size=VOCAB_SIZE, kmer=KMER)
vocabulary = build_vocab(train_dset, tokenizer, use_padding=config["use_padding"])

print("vocab len:" ,vocabulary.__len__())
print(vocabulary.get_stoi())

vocab len: 258
{'TCGA': 255, 'CGCT': 254, 'CGTT': 253, 'AACG': 251, 'TCCG': 250, 'GACG': 249, 'GAAC': 248, 'ACGT': 246, 'GCCG': 245, 'CGGT': 242, 'TAGC': 240, 'GCGT': 239, 'CACG': 237, 'AGCG': 244, 'TCGT': 236, 'CGAC': 233, 'GCAT': 232, 'ATAG': 230, 'AGAT': 229, 'TACT': 228, '<pad>': 257, 'AGTA': 227, 'GCGA': 226, 'CTAA': 225, 'GTAT': 224, 'TCTA': 223, 'TCGG': 221, 'ATAT': 218, 'ATCC': 217, 'TCCT': 216, 'ATTC': 215, 'CTTA': 214, 'TCAT': 211, 'AATC': 210, 'TAGG': 209, 'TCAA': 205, 'TTCG': 204, 'ATCT': 202, 'ACTA': 200, 'CCTA': 199, 'CTCA': 198, 'CGCC': 195, 'TGAC': 193, 'GGGT': 185, 'CAGG': 184, 'CCGA': 243, 'GGTT': 183, 'GACC': 182, 'CGAA': 181, 'AACT': 172, 'CCCA': 189, 'TATC': 152, 'TTGT': 168, 'GTCG': 167, 'TGAT': 166, 'TGGA': 164, 'GTCA': 162, 'AGGG': 174, 'CCCG': 160, 'AACC': 159, 'GGAT': 161, 'CTTG': 156, 'CAGT': 177, 'TAAA': 49, 'GAAT': 94, 'GTTG': 154, 'CTAT': 92, 'ACTT': 151, 'AGCA': 69, 'ATGC': 148, 'CTAG': 146, 'TAAC': 144, 'CAAT': 113, 'ATCA': 143, 'CCGG': 142, 'TTGC': 141,

## Dataloader and batch preparation

In [8]:
# Run on GPU or CPU
device = 'cuda' if config["run_on_gpu"] and torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

max_tok_len = check_seq_lengths(dataset=train_dset, tokenizer=tokenizer)

# Data Loader
collate = coll_factory(vocabulary, tokenizer, device, pad_to_length = max_tok_len)

train_loader = DataLoader(train_dset, batch_size=config["batch_size"], shuffle=True, collate_fn=collate)

Using cpu device


max_tok_len  125


In [9]:
tokenizer(train_dset[1][0])

['CCAT',
 'CTAT',
 'TTTT',
 'GAAT',
 'CCTT',
 'TTAA',
 'CACT',
 'TTAT',
 'GAGA',
 'GAGA',
 'AAGA',
 'ACAG',
 'AACA',
 'GGCA',
 'GGCA',
 'GGCT',
 'GTGT',
 'TTAA',
 'CCCC',
 'ACAA',
 'CACC',
 'ACCA',
 'TAGA',
 'GCCT',
 'AAAA',
 'GCTT',
 'GGGA',
 'GTAA',
 'ATAC',
 'AATT',
 'TCTT',
 'TGGC',
 'ACCT',
 'TGTA',
 'ATAA',
 'AACA',
 'TGAA',
 'CAAT',
 'GAGT',
 'TTCT',
 'CACC',
 'ATGT',
 'TATT',
 'AGTC',
 'TCTG',
 'GATT',
 'CCAC',
 'GCTC',
 'CGGC',
 'ATAC',
 'TTAC',
 'ACAT',
 'TTAC',
 'AGAA',
 'AAGA',
 'GAGA',
 'CAGA',
 'CAAA',
 'GAGA',
 'GTAC',
 'TGTG',
 'CTGA',
 'AAGG',
 'TTCA',
 'GACA',
 'GAAG',
 'AATT',
 'CGGG',
 'ACTC',
 'AAGC',
 'CACA',
 'ACAA',
 'CATT',
 'AATG',
 'AGGA',
 'AAGG',
 'GAGG',
 'GTAG',
 'GGCA',
 'GCCC',
 'AGTG',
 'AACA',
 'ATGA',
 'GCTG',
 'CTCG',
 'CATC',
 'AGTT',
 'TTGA',
 'AAGA',
 'GACA',
 'GTAG',
 'AGAA',
 'GACA',
 'CACA',
 'AACA',
 'TTCC',
 'TAAT',
 'TTCT',
 'TGCC',
 'TTGG',
 'CTGG',
 'ATAA',
 'CTCG',
 'GCTG',
 'CAAA',
 'AGGT',
 'AAGA',
 'TTCT',
 'TTGC',
 'TCTT',
 'CCGG',
 

## Model

In [10]:
model = CNN(
    number_of_classes=config["number_of_classes"],
    vocab_size=vocabulary.__len__(),
    embedding_dim=config["embedding_dim"],
    input_len=max_tok_len
).to(device)

## Training

In [11]:
model.train(train_loader, epochs=config["epochs"])

Epoch 0


  x = torch.tensor(pad(x), dtype=torch.long)


Train metrics: 
 Accuracy: 66.5%, Avg loss: 0.640658 

Epoch 1


Train metrics: 
 Accuracy: 72.5%, Avg loss: 0.627891 

Epoch 2


Train metrics: 
 Accuracy: 73.8%, Avg loss: 0.616994 

Epoch 3


Train metrics: 
 Accuracy: 74.6%, Avg loss: 0.610680 

Epoch 4


Train metrics: 
 Accuracy: 76.3%, Avg loss: 0.610392 



## Testing

In [12]:
test_dset = get_dataset(config["dataset"], 'test')
test_loader = DataLoader(test_dset, batch_size=config["batch_size"], shuffle=True, collate_fn=collate)

acc, f1 = model.test(test_loader)
acc, f1

p  3474 ; tp  2045.6479489803314 ; fp  892.8125833951271
recall  0.5888451206045859 ; precision  0.6961631529304988
num_batches 218
correct 4631
size 6948
Test metrics: 
 Accuracy: 0.666523, F1 score: 0.638023, Avg loss: 0.658182 



(0.6665227403569373, 0.6380227803827224)