# PyTorch CNN Classifier

In [1]:
# Placeholder defaults for the notebook parameters. The "# Parameters" cell
# that follows overrides DATASET, TOKENIZER and KMER for a concrete run.
KMER = 0
VOCAB_SIZE = 0
TOKENIZER = "none"
DATASET = "none"

In [2]:
# Parameters
# Run-specific values: a 5-mer tokenizer on the human_enhancers_cohn dataset.
KMER = 5
TOKENIZER = 'kmer'
DATASET = 'human_enhancers_cohn'


In [3]:
# Echo the effective run parameters so they are recorded in the executed notebook.
print(f"{DATASET} {VOCAB_SIZE} {TOKENIZER} {KMER}")

human_enhancers_cohn 0 kmer 5


## Config

In [4]:
import torch
from torch.utils.data import DataLoader

from genomic_benchmarks.dataset_getters.pytorch_datasets import get_dataset
from glp.models import CNN
from glp.tokenizers import get_tokenizer
from glp.tokenizers.utils import build_vocab, coll_factory, check_config, check_seq_lengths

In [5]:
# Seed PyTorch's global RNG so that model weight initialisation and the
# shuffled batch order of the training DataLoader are repeatable across
# "Restart Kernel -> Run All" executions.
SEED = 42
torch.manual_seed(SEED)

# Experiment configuration shared by the cells below.
config = {
    "dataset": DATASET,          # dataset name handed to get_dataset
    "tokenizer": TOKENIZER,      # tokenizer name handed to get_tokenizer
    # NOTE(review): "dataset_version" and "force_download" are not read by any
    # cell visible in this notebook; presumably only validated by check_config
    # -- confirm whether they should be forwarded to get_dataset.
    "dataset_version": 0,
    "epochs": 5,                 # passed to model.train
    "batch_size": 32,            # used for both the train and the test DataLoader
    "use_padding": True,         # forwarded to build_vocab
    "force_download": False,
    "run_on_gpu": True,          # CUDA is used only if it is also available
    "number_of_classes": 2,      # passed to the CNN constructor
    "embedding_dim": 100,        # passed to the CNN constructor
}
# Project helper; presumably validates the keys/values above -- TODO confirm.
check_config(config)

## Choose the dataset

In [6]:
# Load the training split of the dataset selected in the config.
train_dset = get_dataset(
    config["dataset"],
    "train",
)

## Tokenizer and vocab

In [7]:
# Instantiate the configured tokenizer and fit it on the training split.
# VOCAB_SIZE and KMER are both forwarded; which one is used presumably
# depends on the tokenizer type -- TODO confirm.
tokenizer = get_tokenizer(config['tokenizer'])
tokenizer.train(train_dset=train_dset, vocab_size=VOCAB_SIZE, kmer=KMER)

# Build the vocabulary object from the tokenized training data.
vocabulary = build_vocab(train_dset, tokenizer, use_padding=config["use_padding"])

# Use the built-in len() rather than calling the dunder method directly.
print("vocab len:", len(vocabulary))
# Full token -> index mapping (large output for big vocabularies).
print(vocabulary.get_stoi())

vocab len: 1026
{'<pad>': 1025, 'CGATA': 1023, 'GTCGA': 1020, 'TACCG': 1019, 'CGATC': 1017, 'GTCGG': 1016, 'CGACG': 1014, 'CGTAT': 1010, 'AGCGA': 1009, 'CCGAT': 1008, 'CGTAA': 1005, 'GCGAA': 1004, 'CTCGC': 1002, 'ATCGA': 1001, 'CGGAT': 995, 'CGCAC': 994, 'CCGTA': 993, 'CGGAG': 991, 'CGCTG': 988, 'TTACG': 987, 'CACGT': 986, 'GCCGC': 984, 'ACGAC': 980, 'GTTCG': 977, 'TCCGC': 976, 'CGCGT': 975, 'GCGGT': 971, 'CGTTA': 970, 'CGGAC': 968, 'AATCG': 966, 'CACGA': 964, 'AACGC': 962, 'AGCCG': 961, 'TATTA': 959, 'GTATA': 957, 'GGGTA': 955, 'ACGTT': 954, 'CGGGA': 953, 'TCGTA': 950, 'GCGAT': 949, 'ATATG': 948, 'GTACC': 947, 'CCTCG': 942, 'CCCGA': 940, 'TCCGA': 939, 'CGGTG': 938, 'TCATC': 936, 'CGGGC': 934, 'ACGGC': 933, 'CAGTA': 935, 'GCAAC': 932, 'GACTT': 931, 'CCCGG': 929, 'TACGA': 997, 'GCGCG': 926, 'AGCGT': 925, 'CTGTA': 920, 'GGTAC': 918, 'GGGAT': 917, 'GTTGC': 916, 'TAATA': 915, 'CGTAG': 914, 'CATCG': 913, 'AGGCG': 912, 'GATGC': 909, 'GTTAT': 906, 'TTAGA': 905, 'CATTG': 904, 'ACCGT': 946, 'TA

## Dataloader and batch preparation

In [8]:
# Select the compute device: CUDA only when requested in the config AND available.
if config["run_on_gpu"] and torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using {device} device')

# Project helper; its result is used below as the padding length
# (presumably the longest tokenized sequence in the split -- TODO confirm).
max_tok_len = check_seq_lengths(dataset=train_dset, tokenizer=tokenizer)

# Collate function that turns raw samples into batches using the vocabulary,
# tokenizer and target device, padded to max_tok_len.
collate = coll_factory(vocabulary, tokenizer, device, pad_to_length=max_tok_len)

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    train_dset,
    batch_size=config["batch_size"],
    collate_fn=collate,
    shuffle=True,
)

Using cpu device


max_tok_len  100


In [9]:
# Sanity check: tokenize element 0 of the training item at index 1
# (presumably the raw sequence string -- TODO confirm) and display the tokens.
tokenizer(train_dset[1][0])

['CCATC',
 'TATTT',
 'TTGAA',
 'TCCTT',
 'TTAAC',
 'ACTTT',
 'ATGAG',
 'AGAGA',
 'AAGAA',
 'CAGAA',
 'CAGGC',
 'AGGCA',
 'GGCTG',
 'TGTTT',
 'AACCC',
 'CACAA',
 'CACCA',
 'CCATA',
 'GAGCC',
 'TAAAA',
 'GCTTG',
 'GGAGT',
 'AAATA',
 'CAATT',
 'TCTTT',
 'GGCAC',
 'CTTGT',
 'AATAA',
 'AACAT',
 'GAACA',
 'ATGAG',
 'TTTCT',
 'CACCA',
 'TGTTA',
 'TTAGT',
 'CTCTG',
 'GATTC',
 'CACGC',
 'TCCGG',
 'CATAC',
 'TTACA',
 'CATTT',
 'ACAGA',
 'AAAGA',
 'GAGAC',
 'AGACA',
 'AAGAG',
 'AGTAC',
 'TGTGC',
 'TGAAA',
 'GGTTC',
 'AGACA',
 'GAAGA',
 'ATTCG',
 'GGACT',
 'CAAGC',
 'CACAA',
 'CAACA',
 'TTAAT',
 'GAGGA',
 'AAGGG',
 'AGGGT',
 'AGGGC',
 'AGCCC',
 'AGTGA',
 'ACAAT',
 'GAGCT',
 'GCTCG',
 'CATCA',
 'GTTTT',
 'GAAAG',
 'AGACA',
 'GTAGA',
 'GAAGA',
 'CACAC',
 'AAACA',
 'TTCCT',
 'AATTT',
 'CTTGC',
 'CTTGG',
 'CTGGA',
 'TAACT',
 'CGGCT',
 'GCAAA',
 'AGGTA',
 'AGATT',
 'CTTTG',
 'CTCTT',
 'CCGGT',
 'TTTCG',
 'GGATC',
 'AGAGG',
 'TAACC',
 'CGGGG',
 'GCCTA',
 'GCACA',
 'GGTGG',
 'TCTCA',
 'CATCT',
 'GGGAA']

## Model

In [10]:
# Build the project's CNN classifier and move it to the selected device.
# The vocabulary size is taken with the built-in len() instead of calling
# the __len__ dunder directly; input_len is the padded token length.
model = CNN(
    number_of_classes=config["number_of_classes"],
    vocab_size=len(vocabulary),
    embedding_dim=config["embedding_dim"],
    input_len=max_tok_len
).to(device)

## Training

In [11]:
# Run the model's training routine over train_loader for the configured number of epochs.
# NOTE(review): this `train` takes a loader and an epoch count, unlike
# torch.nn.Module.train(mode); if CNN subclasses nn.Module it overrides that
# method -- confirm against glp.models.CNN.
model.train(train_loader, epochs=config["epochs"])

Epoch 0


  x = torch.tensor(pad(x), dtype=torch.long)


Train metrics: 
 Accuracy: 67.5%, Avg loss: 0.639483 

Epoch 1


Train metrics: 
 Accuracy: 72.3%, Avg loss: 0.622474 

Epoch 2


Train metrics: 
 Accuracy: 75.6%, Avg loss: 0.613997 

Epoch 3


Train metrics: 
 Accuracy: 77.3%, Avg loss: 0.600440 

Epoch 4


Train metrics: 
 Accuracy: 77.9%, Avg loss: 0.597837 



## Testing

In [12]:
# Load the held-out test split and wrap it with the same collate function
# that was used for training.
test_dset = get_dataset(config["dataset"], 'test')
# Evaluation does not need shuffling; a fixed order keeps batch composition
# (and therefore any per-batch aggregated metric) identical between runs.
test_loader = DataLoader(test_dset, batch_size=config["batch_size"], shuffle=False, collate_fn=collate)

# model.test returns the accuracy and F1 score; show them as the cell result.
acc, f1 = model.test(test_loader)
acc, f1

p  3474 ; tp  1736.425568819046 ; fp  705.4748484790325
recall  0.4998346484798636 ; precision  0.7110959793931202
num_batches 218
correct 4513
size 6948
Test metrics: 
 Accuracy: 0.649539, F1 score: 0.587037, Avg loss: 0.659458 



(0.6495394358088659, 0.5870367809916955)