In [1]:
import sys
import os

# Put the directory two levels above the current working directory on the
# import path (once), so the `src.*` imports in the next cell can be resolved.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
already_importable = project_root in sys.path
if not already_importable:
    sys.path.append(project_root)

In [2]:
import torch
import numpy as np
from src.models.logistic_regression import LogisticRegression
from src.models.lstm import LstmClassifier
from src.models.transformer import TransformerEncoderClassifier
from src.models.cnn import CnnClassifier
from src.utils.factories import get_transformation_function
from src.data_models.models import DnaRepresentation

# Testing DNA representations

In [3]:
# Build a random DNA string to feed the encoders below.
# The generator is seeded so that Restart & Run All reproduces the same
# sequence; the length is named instead of being a bare magic number.
SEED = 42
SEQUENCE_LEN = 500
rng = np.random.default_rng(SEED)

nucleotides = np.array(['A', 'C', 'T', 'G'])
sequence_arr = rng.choice(nucleotides, size=SEQUENCE_LEN)
# .tolist() converts numpy string scalars to plain Python str before joining.
sequence = "".join(sequence_arr.tolist())

In [4]:
# Look up one encoding callable per DNA representation under test, in the
# order refined -> huffman -> grayscale.
refined_f, huffman_f, grayscale_f = map(
    get_transformation_function,
    (
        DnaRepresentation.refined,
        DnaRepresentation.huffman,
        DnaRepresentation.grayscale,
    ),
)

In [5]:
# Encode the random sequence with the refined representation and show its shape.
refined_encoded = refined_f(sequence)
refined_encoded.shape

(500, 2)

In [6]:
# Encode the random sequence with the huffman representation and show its shape.
huffman_encoded = huffman_f(sequence)
huffman_encoded.shape

(1000, 2)

In [7]:
# Encode the random sequence with the grayscale representation and show its shape.
grayscale_encoded = grayscale_f(sequence)
grayscale_encoded.shape

(500,)

# Testing model input shapes

In [8]:
# Dummy input batches used only to check that each model accepts its input
# shape. The trailing dimensions mirror the encoder output shapes shown above:
# (500, 2) for refined, (1000, 2) for huffman and (500,) for grayscale.
# Seeding torch makes the tensors identical on every Restart & Run All.
torch.manual_seed(0)
BATCH_SIZE = 64

xb_r = torch.rand(BATCH_SIZE, 500, 2)   # refined-shaped batch
xb_h = torch.rand(BATCH_SIZE, 1000, 2)  # huffman-shaped batch
xb_c = torch.rand(BATCH_SIZE, 500)      # grayscale-shaped batch

In [9]:
# Run the refined-shaped batch through a newly constructed logistic regression;
# sequence_len is taken from the batch's second dimension (500).
logreg_refined = LogisticRegression(sequence_len=xb_r.shape[1])
logreg_refined(xb_r)

tensor([0.4353, 0.5329, 0.5059, 0.5131, 0.4207, 0.4692, 0.5112, 0.5403, 0.4913,
        0.4790, 0.4585, 0.5345, 0.4971, 0.4790, 0.5020, 0.4614, 0.5344, 0.4073,
        0.4760, 0.4795, 0.4970, 0.4887, 0.5173, 0.5154, 0.4429, 0.5778, 0.5513,
        0.5193, 0.4888, 0.5444, 0.4990, 0.4308, 0.5366, 0.4808, 0.5886, 0.5105,
        0.5704, 0.5587, 0.5011, 0.4434, 0.5669, 0.5066, 0.4180, 0.5468, 0.5295,
        0.4330, 0.5343, 0.5414, 0.5761, 0.4590, 0.4590, 0.5309, 0.5383, 0.5124,
        0.5051, 0.5535, 0.5757, 0.4641, 0.4379, 0.5318, 0.4536, 0.4750, 0.5323,
        0.4782], grad_fn=<SqueezeBackward0>)

In [10]:
# Run the huffman-shaped batch through a newly constructed logistic regression;
# sequence_len is taken from the batch's second dimension (1000).
logreg_huffman = LogisticRegression(sequence_len=xb_h.shape[1])
logreg_huffman(xb_h)

tensor([0.5783, 0.5577, 0.5092, 0.5492, 0.5518, 0.6140, 0.5553, 0.5925, 0.5571,
        0.5508, 0.5878, 0.5333, 0.5099, 0.5505, 0.5514, 0.5069, 0.5962, 0.5727,
        0.5078, 0.4800, 0.5648, 0.5522, 0.5786, 0.5759, 0.4928, 0.5736, 0.5470,
        0.6162, 0.5943, 0.5590, 0.5390, 0.6299, 0.6003, 0.5332, 0.4734, 0.5351,
        0.5286, 0.5905, 0.5387, 0.5699, 0.5893, 0.5778, 0.5406, 0.5890, 0.5122,
        0.5490, 0.5951, 0.6149, 0.5696, 0.5117, 0.4920, 0.5236, 0.5672, 0.4994,
        0.5875, 0.5794, 0.6010, 0.5118, 0.5458, 0.5741, 0.5529, 0.5912, 0.6035,
        0.6160], grad_fn=<SqueezeBackward0>)

In [11]:
# Run the refined-shaped batch through a default-configured LSTM classifier.
lstm_refined = LstmClassifier()
lstm_refined(xb_r)

tensor([0.4454, 0.4520, 0.4474, 0.4503, 0.4482, 0.4465, 0.4465, 0.4433, 0.4467,
        0.4424, 0.4560, 0.4439, 0.4460, 0.4500, 0.4471, 0.4472, 0.4567, 0.4570,
        0.4524, 0.4400, 0.4544, 0.4547, 0.4523, 0.4586, 0.4500, 0.4530, 0.4554,
        0.4495, 0.4538, 0.4494, 0.4520, 0.4447, 0.4432, 0.4536, 0.4465, 0.4442,
        0.4490, 0.4565, 0.4505, 0.4448, 0.4518, 0.4497, 0.4499, 0.4515, 0.4482,
        0.4412, 0.4517, 0.4448, 0.4501, 0.4515, 0.4534, 0.4517, 0.4476, 0.4507,
        0.4490, 0.4423, 0.4471, 0.4491, 0.4448, 0.4539, 0.4430, 0.4430, 0.4442,
        0.4552], grad_fn=<SqueezeBackward0>)

In [12]:
# Run the huffman-shaped batch through a default-configured LSTM classifier.
lstm_huffman = LstmClassifier()
lstm_huffman(xb_h)

tensor([0.5814, 0.5816, 0.5778, 0.5809, 0.5774, 0.5819, 0.5788, 0.5780, 0.5751,
        0.5742, 0.5746, 0.5738, 0.5802, 0.5786, 0.5757, 0.5809, 0.5749, 0.5722,
        0.5813, 0.5795, 0.5760, 0.5777, 0.5816, 0.5743, 0.5760, 0.5722, 0.5809,
        0.5807, 0.5770, 0.5776, 0.5736, 0.5736, 0.5791, 0.5736, 0.5719, 0.5757,
        0.5785, 0.5785, 0.5776, 0.5787, 0.5747, 0.5747, 0.5819, 0.5809, 0.5817,
        0.5746, 0.5795, 0.5754, 0.5786, 0.5810, 0.5789, 0.5789, 0.5772, 0.5754,
        0.5780, 0.5813, 0.5790, 0.5786, 0.5776, 0.5803, 0.5751, 0.5752, 0.5827,
        0.5799], grad_fn=<SqueezeBackward0>)

In [13]:
# Run the refined-shaped batch through a CPU transformer-encoder classifier;
# max_seq_length is taken from the batch's second dimension (500).
transformer_refined = TransformerEncoderClassifier(
    device=torch.device("cpu"),
    max_seq_length=xb_r.shape[1],
)
transformer_refined(xb_r)

tensor([0.5219, 0.5224, 0.5221, 0.5211, 0.5209, 0.5215, 0.5208, 0.5211, 0.5210,
        0.5217, 0.5205, 0.5215, 0.5212, 0.5236, 0.5209, 0.5215, 0.5221, 0.5214,
        0.5211, 0.5220, 0.5221, 0.5212, 0.5216, 0.5228, 0.5224, 0.5221, 0.5216,
        0.5208, 0.5214, 0.5218, 0.5224, 0.5205, 0.5203, 0.5212, 0.5221, 0.5229,
        0.5208, 0.5223, 0.5210, 0.5213, 0.5211, 0.5215, 0.5199, 0.5215, 0.5226,
        0.5224, 0.5223, 0.5209, 0.5197, 0.5218, 0.5204, 0.5215, 0.5216, 0.5219,
        0.5230, 0.5210, 0.5211, 0.5216, 0.5204, 0.5209, 0.5221, 0.5216, 0.5208,
        0.5220], grad_fn=<SqueezeBackward0>)

In [14]:
# Run the huffman-shaped batch through a CPU transformer-encoder classifier;
# max_seq_length is taken from the batch's second dimension (1000).
transformer_huffman = TransformerEncoderClassifier(
    device=torch.device("cpu"),
    max_seq_length=xb_h.shape[1],
)
transformer_huffman(xb_h)

tensor([0.4860, 0.4855, 0.4852, 0.4842, 0.4862, 0.4868, 0.4848, 0.4864, 0.4856,
        0.4854, 0.4847, 0.4856, 0.4860, 0.4851, 0.4868, 0.4867, 0.4857, 0.4852,
        0.4848, 0.4857, 0.4859, 0.4864, 0.4857, 0.4849, 0.4857, 0.4855, 0.4878,
        0.4874, 0.4855, 0.4838, 0.4855, 0.4871, 0.4844, 0.4841, 0.4865, 0.4850,
        0.4849, 0.4843, 0.4859, 0.4854, 0.4853, 0.4843, 0.4863, 0.4861, 0.4848,
        0.4850, 0.4859, 0.4862, 0.4849, 0.4858, 0.4853, 0.4860, 0.4869, 0.4855,
        0.4842, 0.4866, 0.4856, 0.4863, 0.4861, 0.4856, 0.4845, 0.4855, 0.4854,
        0.4843], grad_fn=<SqueezeBackward0>)

In [15]:
# Run the grayscale-shaped batch through a default-configured CNN classifier.
cnn_grayscale = CnnClassifier()
cnn_grayscale(xb_c)

tensor([0.5111, 0.5111, 0.5112, 0.5112, 0.5112, 0.5112, 0.5111, 0.5113, 0.5113,
        0.5113, 0.5110, 0.5111, 0.5112, 0.5112, 0.5113, 0.5111, 0.5111, 0.5112,
        0.5113, 0.5112, 0.5112, 0.5111, 0.5113, 0.5111, 0.5112, 0.5111, 0.5111,
        0.5110, 0.5110, 0.5111, 0.5111, 0.5112, 0.5110, 0.5111, 0.5114, 0.5111,
        0.5111, 0.5111, 0.5112, 0.5111, 0.5111, 0.5113, 0.5112, 0.5111, 0.5113,
        0.5112, 0.5111, 0.5113, 0.5112, 0.5112, 0.5111, 0.5110, 0.5111, 0.5111,
        0.5112, 0.5112, 0.5111, 0.5111, 0.5112, 0.5112, 0.5111, 0.5112, 0.5113,
        0.5112], grad_fn=<SqueezeBackward0>)