# PYTORCH CNN Classifier

In [1]:
DATASET = 'none'
VOCAB_SIZE = 0
TOKENIZER = 'none'
KMER = 0

In [2]:
# Parameters
DATASET = "human_enhancers_cohn"
TOKENIZER = "kmer"
KMER = 6


In [3]:
print(DATASET, VOCAB_SIZE, TOKENIZER, KMER)

human_enhancers_cohn 0 kmer 6


## Config

In [4]:
import torch
from torch.utils.data import DataLoader

from genomic_benchmarks.dataset_getters.pytorch_datasets import get_dataset
from glp.models import CNN
from glp.tokenizers import get_tokenizer
from glp.tokenizers.utils import build_vocab, coll_factory, check_config, check_seq_lengths

In [5]:
config = {
    "dataset": DATASET,
    "tokenizer": TOKENIZER,
    "dataset_version": 0,
    "epochs": 5,
    "batch_size": 32,
    "use_padding": True,
    "force_download": False,
    "run_on_gpu": True,
    "number_of_classes": 2,
    "embedding_dim": 100,
}
check_config(config)

## Choose the dataset

In [6]:
train_dset = get_dataset(config["dataset"], 'train')

## Tokenizer and vocab

In [7]:
tokenizer = get_tokenizer(config['tokenizer'])
tokenizer.train(train_dset=train_dset, vocab_size=VOCAB_SIZE, kmer=KMER)
vocabulary = build_vocab(train_dset, tokenizer, use_padding=config["use_padding"])

print("vocab len:" ,vocabulary.__len__())
print(vocabulary.get_stoi())

vocab len: 4098
{'TACGCG': 4094, 'CGTACG': 4093, 'ACGACC': 4092, 'ACGCGT': 4091, 'TCGACG': 4089, 'GTCGAC': 4088, 'GCGGTA': 4086, 'CGACGA': 4081, 'CGACAA': 4080, 'ATAACG': 4079, 'TTAACG': 4076, 'CGACCA': 4073, 'CGATAA': 4071, 'CGGTCG': 4067, 'CGTCGA': 4066, 'TTCGCG': 4065, 'GTCGCG': 4064, 'CCGATA': 4063, 'CGGATT': 4061, 'CCCGTA': 4059, 'TCGCCG': 4056, 'CGACTA': 4055, 'CGTCTA': 4053, 'TCGTTC': 4052, 'CCGTAT': 4047, 'TACGGC': 4042, 'CCGTTT': 4041, 'CTCGTA': 4039, 'TCGTTG': 4038, 'CGTACC': 4034, 'TAGCGA': 4032, 'AGCACG': 4031, 'TCGATC': 4028, 'CGTTGG': 4027, 'ACGCGC': 4023, 'TAACGG': 4019, 'TTCGGG': 4018, 'CCGACA': 4015, 'ATCGTA': 4014, 'GTAGCG': 4011, 'CGCCGT': 4006, 'TAGCGG': 4005, 'GCACCG': 4000, 'AACGCG': 3999, 'CGTTTG': 3998, 'CGCGAC': 4072, 'AACCGT': 3996, 'TCGCGG': 3994, 'TACCGG': 3993, 'AGCTCG': 3990, 'TACGGA': 3989, 'CGAATA': 3988, 'CGGAAC': 3987, 'TCGTAA': 3986, 'CGACAT': 3984, 'CGCTAC': 3983, 'AGTCGC': 3982, 'TAACGA': 3981, 'CGTCGT': 3980, 'TACGAC': 3978, 'TATCCG': 4029, 'GTACGC

## Dataloader and batch preparation

In [8]:
# Run on GPU or CPU
device = 'cuda' if config["run_on_gpu"] and torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

max_tok_len = check_seq_lengths(dataset=train_dset, tokenizer=tokenizer)

# Data Loader
collate = coll_factory(vocabulary, tokenizer, device, pad_to_length = max_tok_len)

train_loader = DataLoader(train_dset, batch_size=config["batch_size"], shuffle=True, collate_fn=collate)

Using cpu device


max_tok_len  83


In [9]:
tokenizer(train_dset[1][0])

['CCATCT',
 'ATTTTT',
 'GAATCC',
 'TTTTAA',
 'CACTTT',
 'ATGAGA',
 'GAGAAA',
 'GAACAG',
 'AACAGG',
 'CAGGCA',
 'GGCTGT',
 'GTTTAA',
 'CCCCAC',
 'AACACC',
 'ACCATA',
 'GAGCCT',
 'AAAAGC',
 'TTGGGA',
 'GTAAAT',
 'ACAATT',
 'TCTTTG',
 'GCACCT',
 'TGTAAT',
 'AAAACA',
 'TGAACA',
 'ATGAGT',
 'TTCTCA',
 'CCATGT',
 'TATTAG',
 'TCTCTG',
 'GATTCC',
 'ACGCTC',
 'CGGCAT',
 'ACTTAC',
 'ACATTT',
 'ACAGAA',
 'AAGAGA',
 'GACAGA',
 'CAAAGA',
 'GAGTAC',
 'TGTGCT',
 'GAAAGG',
 'TTCAGA',
 'CAGAAG',
 'AATTCG',
 'GGACTC',
 'AAGCCA',
 'CAACAA',
 'CATTAA',
 'TGAGGA',
 'AAGGGA',
 'GGGTAG',
 'GGCAGC',
 'CCAGTG',
 'AACAAT',
 'GAGCTG',
 'CTCGCA',
 'TCAGTT',
 'TTGAAA',
 'GAGACA',
 'GTAGAG',
 'AAGACA',
 'CACAAA',
 'CATTCC',
 'TAATTT',
 'CTTGCC',
 'TTGGCT',
 'GGATAA',
 'CTCGGC',
 'TGCAAA',
 'AGGTAA',
 'GATTCT',
 'TTGCTC',
 'TTCCGG',
 'TTTTCG',
 'GGATCA',
 'GAGGTA',
 'ACCCGG',
 'GGGCCT',
 'AGCACA',
 'GGTGGT',
 'CTCACA',
 'TCTGGG']

## Model

In [10]:
model = CNN(
    number_of_classes=config["number_of_classes"],
    vocab_size=vocabulary.__len__(),
    embedding_dim=config["embedding_dim"],
    input_len=max_tok_len
).to(device)

## Training

In [11]:
model.train(train_loader, epochs=config["epochs"])

Epoch 0


  x = torch.tensor(pad(x), dtype=torch.long)


Train metrics: 
 Accuracy: 68.3%, Avg loss: 0.644644 

Epoch 1


Train metrics: 
 Accuracy: 71.9%, Avg loss: 0.624048 

Epoch 2


Train metrics: 
 Accuracy: 73.9%, Avg loss: 0.612115 

Epoch 3


Train metrics: 
 Accuracy: 76.2%, Avg loss: 0.605154 

Epoch 4


Train metrics: 
 Accuracy: 78.2%, Avg loss: 0.595036 



## Testing

In [12]:
test_dset = get_dataset(config["dataset"], 'test')
test_loader = DataLoader(test_dset, batch_size=config["batch_size"], shuffle=True, collate_fn=collate)

acc, f1 = model.test(test_loader)
acc, f1

p  3474 ; tp  1608.573250234127 ; fp  677.6251577127259
recall  0.46303202367130886 ; precision  0.7036017716759434
num_batches 218
correct 4405
size 6948
Test metrics: 
 Accuracy: 0.633995, F1 score: 0.558513, Avg loss: 0.663565 



(0.6339953943580886, 0.5585131401081309)