# PyTorch CNN Classifier

In [1]:
# Placeholder defaults for the notebook parameters. The next cell assigns the
# concrete values for a run; anything it does not set keeps the value below.
# String-valued parameters
DATASET = "none"
TOKENIZER = "none"
# Integer-valued parameters
VOCAB_SIZE = 0
KMER = 0

In [2]:
# Parameters
# Concrete values for this run; they replace the placeholder defaults from the
# previous cell. KMER is not assigned here, so it keeps its default.
VOCAB_SIZE = 128
TOKENIZER = "subword"
DATASET = "demo_human_or_worm"


In [3]:
# Echo the effective parameters so the executed notebook records what was run.
effective_params = (DATASET, VOCAB_SIZE, TOKENIZER, KMER)
print(*effective_params)

demo_human_or_worm 128 subword 0


## Config

In [4]:
import torch
from torch.utils.data import DataLoader

from genomic_benchmarks.dataset_getters.pytorch_datasets import get_dataset
from glp.models import CNN
from glp.tokenizers import get_tokenizer
from glp.tokenizers.utils import build_vocab, coll_factory, check_config, check_seq_lengths

In [5]:
# Seed torch's global RNG so that weight initialisation and DataLoader
# shuffling are repeatable across "Restart Kernel -> Run All".
# NOTE(review): some CUDA kernels can still be non-deterministic when running on GPU.
SEED = 42
torch.manual_seed(SEED)

# Experiment settings read by the cells below.
config = {
    "dataset": DATASET,          # dataset name, taken from the notebook parameters
    "tokenizer": TOKENIZER,      # tokenizer name, taken from the notebook parameters
    "dataset_version": 0,
    "epochs": 5,                 # passed to model.train
    "batch_size": 32,            # used by both the train and the test DataLoader
    "use_padding": True,         # forwarded to build_vocab
    "force_download": False,
    "run_on_gpu": True,          # CUDA is used only if it is also available
    "number_of_classes": 2,
    "embedding_dim": 100,
}
# check_config is imported from glp.tokenizers.utils; presumably it validates
# the dict above -- its exact checks are not visible in this notebook.
check_config(config)

## Choose the dataset

In [6]:
# Load the training split. The version and download policy come from `config`
# so that editing those entries actually changes what is loaded.
train_dset = get_dataset(
    config["dataset"],
    'train',
    force_download=config["force_download"],
    version=config["dataset_version"],
)

## Tokenizer and vocab

In [7]:
# Instantiate the tokenizer named in the config and fit it on the training split.
tokenizer = get_tokenizer(config['tokenizer'])
tokenizer.train(train_dset=train_dset, vocab_size=VOCAB_SIZE, kmer=KMER)

# Build the vocabulary from the training split and the fitted tokenizer.
vocabulary = build_vocab(train_dset, tokenizer, use_padding=config["use_padding"])

# Report the vocabulary size and the full token -> index mapping.
vocab_len = len(vocabulary)
print("vocab len:", vocab_len)
token_to_index = vocabulary.get_stoi()
print(token_to_index)

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=sample.csv --model_prefix=sample --vocab_size=128 --model_type=unigram
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: sample.csv
  input_format: 
  model_prefix: sample
  model_type: UNIGRAM
  vocab_size: 128
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
}


trainer_interface.cc(456) LOG(INFO) all chars count=15074802
trainer_interface.cc(467) LOG(INFO) Done: 100% characters are covered.
trainer_interface.cc(477) LOG(INFO) Alphabet size=6
trainer_interface.cc(478) LOG(INFO) Final character coverage=1


trainer_interface.cc(510) LOG(INFO) Done! preprocessed 75000 sentences.


unigram_model_trainer.cc(138) LOG(INFO) Making suffix array...


unigram_model_trainer.cc(142) LOG(INFO) Extracting frequent sub strings...


unigram_model_trainer.cc(193) LOG(INFO) Initialized 1000000 seed sentencepieces


trainer_interface.cc(516) LOG(INFO) Tokenizing input sentences with whitespace: 75000


trainer_interface.cc(526) LOG(INFO) Done! 73109
unigram_model_trainer.cc(488) LOG(INFO) Using 73109 sentences for EM training


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=615634 obj=263.975 num_tokens=1523674 num_tokens/piece=2.47497


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=432903 obj=264.111 num_tokens=1607807 num_tokens/piece=3.71401


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=280345 obj=263.272 num_tokens=1662269 num_tokens/piece=5.92937


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=255347 obj=260.477 num_tokens=1694304 num_tokens/piece=6.6353


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=188520 obj=261.101 num_tokens=1730663 num_tokens/piece=9.18026


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=183877 obj=259.479 num_tokens=1741958 num_tokens/piece=9.4735


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=137880 obj=260.582 num_tokens=1779032 num_tokens/piece=12.9028


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=137706 obj=259.09 num_tokens=1782251 num_tokens/piece=12.9424


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=103277 obj=261.112 num_tokens=1830635 num_tokens/piece=17.7255


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=103270 obj=259.064 num_tokens=1831554 num_tokens/piece=17.7356


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=77451 obj=261.748 num_tokens=1888857 num_tokens/piece=24.3878


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=77449 obj=259.449 num_tokens=1889327 num_tokens/piece=24.3945


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=58086 obj=262.394 num_tokens=1948317 num_tokens/piece=33.5419


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=58085 obj=260.069 num_tokens=1948788 num_tokens/piece=33.5506


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=43563 obj=263.026 num_tokens=2006108 num_tokens/piece=46.0507


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=43563 obj=260.849 num_tokens=2006704 num_tokens/piece=46.0644


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=32672 obj=263.668 num_tokens=2062112 num_tokens/piece=63.1156


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=32672 obj=261.603 num_tokens=2062878 num_tokens/piece=63.139


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=24504 obj=264.388 num_tokens=2120186 num_tokens/piece=86.5241


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=24504 obj=262.356 num_tokens=2120966 num_tokens/piece=86.5559


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=18378 obj=265.228 num_tokens=2182532 num_tokens/piece=118.758


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=18378 obj=263.091 num_tokens=2183241 num_tokens/piece=118.796


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=13783 obj=266.021 num_tokens=2247574 num_tokens/piece=163.069


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=13783 obj=263.682 num_tokens=2248379 num_tokens/piece=163.127


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=10337 obj=266.68 num_tokens=2317782 num_tokens/piece=224.222


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=10337 obj=264.216 num_tokens=2318537 num_tokens/piece=224.295


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=7752 obj=267.335 num_tokens=2392945 num_tokens/piece=308.687


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=7752 obj=264.662 num_tokens=2393924 num_tokens/piece=308.814


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=5814 obj=268.043 num_tokens=2472700 num_tokens/piece=425.301


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=5814 obj=265.154 num_tokens=2473779 num_tokens/piece=425.487


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=4360 obj=268.791 num_tokens=2557988 num_tokens/piece=586.694


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=4360 obj=265.678 num_tokens=2558917 num_tokens/piece=586.908


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=3270 obj=269.693 num_tokens=2650996 num_tokens/piece=810.702


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=3270 obj=266.096 num_tokens=2652193 num_tokens/piece=811.068


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=2452 obj=270.515 num_tokens=2751192 num_tokens/piece=1122.02


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=2452 obj=266.522 num_tokens=2752409 num_tokens/piece=1122.52


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=1839 obj=270.958 num_tokens=2854434 num_tokens/piece=1552.17


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=1839 obj=266.785 num_tokens=2855582 num_tokens/piece=1552.79


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=1379 obj=272.786 num_tokens=2964765 num_tokens/piece=2149.94


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=1379 obj=267.191 num_tokens=2965693 num_tokens/piece=2150.61


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=1034 obj=273.21 num_tokens=3085015 num_tokens/piece=2983.57


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=1034 obj=267.494 num_tokens=3086021 num_tokens/piece=2984.55


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=775 obj=274.61 num_tokens=3215223 num_tokens/piece=4148.67


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=775 obj=267.869 num_tokens=3216378 num_tokens/piece=4150.17


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=581 obj=274.127 num_tokens=3366989 num_tokens/piece=5795.16


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=581 obj=268.052 num_tokens=3368462 num_tokens/piece=5797.7


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=435 obj=274.708 num_tokens=3533024 num_tokens/piece=8121.89


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=435 obj=268.287 num_tokens=3535366 num_tokens/piece=8127.28


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=326 obj=274.986 num_tokens=3712883 num_tokens/piece=11389.2


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=326 obj=268.57 num_tokens=3716418 num_tokens/piece=11400.1


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=244 obj=275.049 num_tokens=3925406 num_tokens/piece=16087.7


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=244 obj=268.772 num_tokens=3929389 num_tokens/piece=16104.1


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=183 obj=275.837 num_tokens=4147514 num_tokens/piece=22664


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=183 obj=268.947 num_tokens=4152894 num_tokens/piece=22693.4


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=140 obj=276.025 num_tokens=4374239 num_tokens/piece=31244.6


unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=140 obj=269.044 num_tokens=4381748 num_tokens/piece=31298.2
trainer_interface.cc(604) LOG(INFO) Saving model: sample.model
trainer_interface.cc(615) LOG(INFO) Saving vocabs: sample.vocab


vocab len: 127
{'A': 122, 'GCG': 120, 'CCAC': 123, 'TTCC': 119, 'CTCC': 117, 'AGAG': 115, 'T': 112, 'GCAG': 114, 'TACA': 110, 'ACAG': 109, 'CCG': 107, '<pad>': 126, 'CCAG': 106, 'GAGA': 104, 'CAAT': 105, 'TGTC': 103, 'ACAC': 101, 'CAGT': 100, 'N': 124, 'AACT': 98, 'CGT': 97, 'GGT': 102, 'CTGG': 96, 'GGAG': 95, 'AAAG': 40, 'GAAA': 91, 'TCTG': 90, 'TGCA': 89, 'ACG': 99, 'AGTA': 88, 'TGTA': 87, 'GACA': 118, 'ATAT': 86, 'TTC': 85, 'AAAC': 19, 'AGTG': 83, 'GTG': 60, 'AAAAT': 82, 'CTGA': 81, 'ACAT': 80, 'AGG': 16, 'AGAA': 79, 'TCT': 55, 'TTTTT': 31, 'CGC': 76, 'GGC': 74, 'G': 70, 'TCC': 69, 'AATA': 94, 'TCG': 67, 'CAGC': 66, 'NNNNNNNNNNNNNNNN': 125, 'CTC': 64, 'AATT': 84, 'TGC': 68, 'GAAT': 121, 'AGCA': 62, 'ATTTT': 23, 'GCT': 61, 'TCAG': 56, 'TAA': 12, 'ACC': 59, 'TGTT': 54, 'AGTT': 93, 'TGAG': 50, 'AAT': 24, 'CAGA': 49, 'TGTG': 65, 'TTG': 47, 'TTCT': 10, 'AAATT': 30, 'CCT': 46, 'CAGG': 45, 'AAC': 42, '▁': 1, 'TTT': 38, 'AAAA': 72, 'TTA': 25, 'TGG': 36, 'ACAA': 39, 'TCAA': 34, 'AACA': 21, '

## Dataloader and batch preparation

In [8]:
# Select the compute device: CUDA only when it is both requested and available.
if config["run_on_gpu"] and torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Using {} device'.format(device))

# Length value reused below as the padding length and later as the model's input_len.
max_tok_len = check_seq_lengths(dataset=train_dset, tokenizer=tokenizer)

# The collate function is built from the vocabulary, tokenizer, device and
# padding length; the same function is reused for the test loader.
collate = coll_factory(vocabulary, tokenizer, device, pad_to_length = max_tok_len)

# Training batches are reshuffled on every pass over the data.
train_loader = DataLoader(
    train_dset,
    collate_fn=collate,
    shuffle=True,
    batch_size=config["batch_size"],
)

Using cpu device


max_tok_len  80


not all sequences are of the same length


In [9]:
# Sanity check: show the tokens produced for one item of the training split.
sample_sequence = train_dset[1][0]
tokenizer(sample_sequence)

['▁',
 'TTA',
 'AAC',
 'TTT',
 'AAA',
 'TTTT',
 'CCC',
 'AAA',
 'CTA',
 'CAGG',
 'AAC',
 'CCT',
 'TTG',
 'TTG',
 'AATTT',
 'TAA',
 'CAGA',
 'TTTT',
 'TGAG',
 'CAC',
 'GAA',
 'TAC',
 'TAC',
 'ATTTT',
 'AAG',
 'ACAA',
 'TTT',
 'CCT',
 'CAT',
 'TCA',
 'TGTT',
 'ACT',
 'TCT',
 'TATTT',
 'AAAG',
 'AAAT',
 'TTA',
 'TCAG',
 'AAATT',
 'TAT',
 'TTA',
 'TTG',
 'AGG',
 'ACT',
 'ATG',
 'TAAT',
 'AAAC',
 'ATTTT',
 'TTA',
 'ATC',
 'AAC',
 'ATC',
 'TTTT',
 'TAT',
 'GAA',
 'CAA',
 'TTG',
 'ACC',
 'ATA',
 'GTG']

## Model

In [10]:
# Build the classifier from the config, the vocabulary size and the padded
# token length, then move it to the selected device.
model = CNN(
    number_of_classes=config["number_of_classes"],
    vocab_size=len(vocabulary),
    embedding_dim=config["embedding_dim"],
    input_len=max_tok_len,
)
model = model.to(device)

## Training

In [11]:
# Train for the configured number of epochs.
# NOTE(review): this `train` is called with a data loader and an epoch count,
# so it appears to be the model's own training loop rather than a mode toggle.
n_epochs = config["epochs"]
model.train(train_loader, epochs=n_epochs)

Epoch 0


  x = torch.tensor(pad(x), dtype=torch.long)


Train metrics: 
 Accuracy: 87.8%, Avg loss: 0.558036 

Epoch 1


Train metrics: 
 Accuracy: 88.5%, Avg loss: 0.556494 

Epoch 2


Train metrics: 
 Accuracy: 89.6%, Avg loss: 0.552644 

Epoch 3


Train metrics: 
 Accuracy: 89.1%, Avg loss: 0.551689 

Epoch 4


Train metrics: 
 Accuracy: 89.9%, Avg loss: 0.547731 



## Testing

In [12]:
# Load the held-out split with the version / download policy from `config`.
test_dset = get_dataset(
    config["dataset"],
    'test',
    force_download=config["force_download"],
    version=config["dataset_version"],
)

# Evaluation does not benefit from shuffling; a fixed order keeps repeated
# runs comparable. The collate function built for training is reused.
test_loader = DataLoader(
    test_dset,
    batch_size=config["batch_size"],
    shuffle=False,
    collate_fn=collate,
)

# model.test returns accuracy and F1; the trailing expression displays them.
acc, f1 = model.test(test_loader)
acc, f1

p  12500 ; tp  10428.25254869461 ; fp  842.5873661469022
recall  0.8342602038955689 ; precision  0.9252418300221462
num_batches 782
correct 22082
size 25000
Test metrics: 
 Accuracy: 0.883280, F1 score: 0.877399, Avg loss: 0.554831 



(0.88328, 0.8773987445166924)