In [10]:
import tensorflow as tf
import os
import tensorflow_addons as tfa
import numpy as np
import time
#import pyracular
from biobeaker.utils import get_angles, positional_encoding
from biobeaker import BEAKER
from tensorflow.keras.layers import Dense, Embedding, Flatten, Lambda, Subtract, Input, Concatenate, AveragePooling1D, LocallyConnected1D, Conv1D, GaussianNoise, BatchNormalization, Reshape, GlobalAveragePooling1D, Dropout
from tensorflow.keras.models import Model, Sequential
import pandas as pd
#import umap
import plotly.express as px
#import phate

In [11]:
# --- Hyper parameters --------------------------------------------------
# Input geometry
k = 21             # k-mer length
window_size = 32   # k-mers per window
max_positions = 512

# Layer widths
embedding_dims = 32
output_dims = 128  # Output dims are also internal dims!
intermediate_dims = 256

# Transformer depth / attention
num_layers = 8
num_heads = 8

# Regularisation and batching
dropout_rate = 0.15
batch_size = 128

In [12]:
# Build the BEAKER transformer from the notebook's hyper parameters. The same
# rate is used for regular and attention dropout; mish is the activation.
transformer = BEAKER(num_layers, embedding_dims, output_dims, num_heads, intermediate_dims, max_positions,
                          dropout=dropout_rate, attention_dropout=dropout_rate, activation=tfa.activations.mish)

# Magic embeddings
#
# Kmer -> DNA Embedding
# Where kmer1 (k1) and kmer2 (k2)
# manhattan_distance(k1, k2) =~ alignment_distance(k1, k2)

# Frozen (trainable=False), bias-free dense projection from an encoded k-mer
# to `embedding_dims` values.
magic = Dense(embedding_dims,
                activation=tf.nn.swish,
                name="Magic",
                use_bias=False,
                trainable=False,
                dtype=tf.float32)

# Input width is k*5 (five slots per base); the leading dimension is
# window_size+1 — presumably the window plus one extra token, TODO confirm.
magic.build((window_size+1,k*5))

#Load up the weights
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which
# can execute code. Only load this file from a trusted source.
weights = np.load("../../precomputed/weights_wide_singlelayer_k21_3Aug2020model_21_dims_32_epochs256.npy", allow_pickle=True)
# The kernel is taken from the first entry of the first element of the loaded
# object; the rest of that object is not used here.
magic.set_weights([weights[0][0]])

transformer.load_weights("../../precomputed/beaker_medium_tripleloss")

# 1 x (k*5) row of ones, the same width as an encoded k-mer. Derived from k
# (previously hard-coded as 105) so it cannot drift from magic.build above.
# Presumably the CLS token input — TODO confirm.
cls = np.asarray([[1] * (k * 5)])

2022-06-21 11:29:13.523362: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-06-21 11:29:13.526277: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2022-06-21 11:29:13.556105: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:42:00.0 name: NVIDIA GeForce GTX 1070 Ti computeCapability: 6.1
coreClock: 1.683GHz coreCount: 19 deviceMemorySize: 7.93GiB deviceMemoryBandwidth: 238.66GiB/s
2022-06-21 11:29:13.556231: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2022-06-21 11:29:13.566672: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2022-06-21 11:29:13.567069: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.s

In [13]:
# Create a k-mer window generator over the example FASTA file (presumably it
# yields windows of `window_size` encoded k-mers — TODO confirm against the
# pyracular docs) and keep the first item it yields; later cells index `first`.
# NOTE(review): `import pyracular` is commented out in the import cell, so on a
# fresh kernel this cell raises NameError (see the recorded output) and every
# later cell that uses `first` only works from stale kernel state.
# The three trailing booleans are positional flags of FastaKmersGenerator —
# TODO confirm their meaning.
kmerwindowsgen = pyracular.FastaKmersGenerator(k, "../../example/Arabidopsis_chr1.sfasta", window_size, False, False, False)
first = next(kmerwindowsgen)

NameError: name 'pyracular' is not defined

In [35]:
# Show the first entry of first[0] as an integer vector. The recorded output
# has 105 entries (= k * 5), which looks like a 5-slot one-hot per base.
np.asarray(first[0][0], dtype=int)

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [36]:
# With a few functions, we can convert back to string representation
def convert_all_kmers(kmers):
    """Decode each encoded k-mer in ``kmers`` into its string form.

    Every element is cut into ``k`` chunks with ``np.array_split`` (``k`` is
    the notebook-level hyper parameter), each chunk is decoded by
    ``convert_letter_to_string``, and the letters are concatenated.

    Returns a list of strings, one per element of ``kmers``.
    """
    return [
        "".join(convert_letter_to_string(chunk) for chunk in np.array_split(row, k))
        for row in kmers
    ]

def convert_letter_to_string(x):
    """Decode one encoded base into its letter.

    The position of the first non-zero entry of ``x`` selects the letter:
    0 -> "A", 1 -> "T", 2 -> "N", 3 -> "C", 4 -> "G".

    Args:
        x: 1-D array-like holding the slots for a single base.

    Returns:
        The one-character string for the first hot slot.

    Raises:
        IndexError: if ``x`` has no non-zero entry (same exception type the
            bare indexing raised before, now with an explicit message).
        ValueError: if the first hot slot is outside the 5-letter alphabet.
            Previously this case silently returned None, which only failed
            later inside ``"".join``.
    """
    alphabet = "ATNCG"  # slot order: 0=A, 1=T, 2=N, 3=C, 4=G
    hot = np.nonzero(x)[0]
    if len(hot) == 0:
        raise IndexError("cannot decode an all-zero vector: no hot slot found")
    slot = int(hot[0])
    if slot >= len(alphabet):
        raise ValueError(
            "hot slot %d is outside the %d-letter alphabet" % (slot, len(alphabet))
        )
    return alphabet[slot]

In [37]:
# Decode every encoded k-mer in first[0] back to its string form.
convert_all_kmers(first[0])

['CCCTAAACCCTAAACCCTAAA',
 'CCCTAAACCTCTGAATCCTTA',
 'ATCCCTAAATCCCTAAATCTT',
 'TAAATCCTACATCCATGAATC',
 'CCTAAATACCTAATTCCCTAA',
 'ACCCGAAACCGGTTTCTCTGG',
 'TTGAAAATCATTGTGTATATA',
 'ATGATAATTTTATCGTTTTTA',
 'TGTAATTGCTTATTGTTGTGT',
 'GTAGATTTTTTAAAAATATCA',
 'TTTGAGGTCAATACAAATCCT',
 'ATTTCTTGTGGTTTTCTTTCC',
 'TTCACTTAGCTATGGATGGTT',
 'TATCTTCATTTGTTATATTGG',
 'ATACAAGCTTTGCTACGATCT',
 'ACATTTGGGAATGTGAGTCTC',
 'TTATTGTAACCTTAGGGTTGG',
 'TTTATCTCAAGAATCTTATTA',
 'ATTGTTTGGACTGTTTATGTT',
 'TGGACATTTATTGTCATTCTT',
 'ACTCCTTTGTGGAAATGTTTG',
 'TTCTATCAATTTATCTTTTGT',
 'GGGAAAATTATTTAGTTGTAG',
 'GGATGAAGTCTTTCTTCGTTG',
 'TTGTTACGCTTGTCATCTCAT',
 'CTCTCAATGATATGGGATGGT',
 'CCTTTAGCATTTATTCTGAAG',
 'TTCTTCTGCTTGATGATTTTA',
 'TCCTTAGCCAAAAGGATTGGT',
 'GGTTTGAAGACACATCATATC',
 'AAAAAAGCTATCGCCTCGACG',
 'ATGCTCTATTTCTATCCTTGT']

In [39]:
# Materialise first[0] as an integer ndarray. The recorded shape is (32, 105),
# i.e. (window_size, k * 5) with the hyper parameters used in this notebook.
kmers = np.asarray(first[0], dtype=int)
kmers.shape

(32, 105)

In [55]:
# Work on a copy so `kmers` stays untouched for the comparison.
kmers_modified = kmers.copy()

# Move the hot slot of the first k-mer from index 13 to index 12. That is the
# third 5-slot group (indices 10-14): slot 3 -> slot 2, which is "C" -> "N"
# under convert_letter_to_string's mapping.
kmers_modified[0, 13] = 0
kmers_modified[0, 12] = 1

# Alternatives tried earlier (kept disabled):
#   kmers_modified[0, 1] = 1; kmers_modified[0, 3] = 0
#   kmers_modified[0, 9] = 1; kmers_modified[0, 8] = 0
#   magic(kmers)

In [56]:
# Display the untouched first k-mer for comparison with the modified copy.
kmers[0]

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [57]:
# Sum of the element-wise differences between the Magic embeddings of the
# original and the modified window.
# NOTE(review): this is a signed sum, so positive and negative differences can
# cancel. The comment on the Magic layer talks about Manhattan distance; if that
# is what this cell is meant to measure, the differences need np.abs before
# summing — confirm the intent.
np.sum(magic(kmers) - magic(kmers_modified))

0.75135255