In [None]:
import mdtraj as md
import numpy as np
np.set_printoptions(precision=4)
import pandas as pd
import tensorflow as tf
import math
from tqdm import tqdm
import concurrent.futures
tfk = tf.keras


In [None]:
def apply_to_dict(func):
    """Build a nested ``{model: {ion: {conc: value}}}`` dict from ``func``.

    ``func`` must accept ``(model, ion, conc)`` in that order. It is called once
    for every combination drawn from the module-level ``MODEL_N``, ``IONS`` and
    ``CONCS`` sequences, and its return value is stored at ``[model][ion][conc]``.
    """
    by_model = {}
    for model in MODEL_N:
        by_ion = {}
        for ion in IONS:
            by_conc = {}
            for conc in CONCS:
                by_conc[conc] = func(model, ion, conc)
            by_ion[ion] = by_conc
        by_model[model] = by_ion
    return by_model

In [None]:
from time import process_time

# Keys used by apply_to_dict to index the nested dicts; TIME_FRAMES lists the
# frame indices read from each loaded array.
MODEL_N = list(range(1, 2))
IONS = ["Na+", "MG"]
CONCS = list(range(10, 60, 10))
TIME_FRAMES = list(range(101))


def _load_distance_array(path):
    """Return the ``"arr_0"`` array stored in the ``.npz`` archive at ``path``.

    The archive is opened in a ``with`` block so its file handle is closed as
    soon as the array has been read, instead of lingering until garbage
    collection.
    """
    with np.load(path) as archive:
        return archive["arr_0"]


# One file path per (model, ion, concentration) combination.
datafiles = apply_to_dict(lambda m, i, c: f"../data/distance_npz/filtered_sasdfb9_m{m}_{c}_{i}.npz")
# NOTE: process_time reports CPU time of this process, not wall-clock time.
t1 = process_time()
npz_dict = apply_to_dict(lambda m, i, c: _load_distance_array(datafiles[m][i][c]))
t2 = process_time()
print(t2-t1)

0.3119973509999996


In [13]:
# Enumerate every (model, ion, concentration, frame) combination, then shuffle
# the list in place with a fixed seed so the order is reproducible.
spec_tuple = [
    (m, i, c, t)
    for m in MODEL_N
    for i in IONS
    for c in CONCS
    for t in TIME_FRAMES
]
np.random.seed(1)
np.random.shuffle(spec_tuple)

# Labels ("<model>_<conc>_<ion>_<frame>") and features follow the shuffled order.
label = np.array([f"{m}_{c}_{i}_{t}" for m, i, c, t in spec_tuple])
data = np.stack([npz_dict[m][i][c][t] for m, i, c, t in spec_tuple])

# The first 90% of the shuffled samples go to train, the remainder to test.
n_set = len(label)
split_idx = math.ceil(n_set * 0.9)
train_part = slice(None, split_idx)
test_part = slice(split_idx, None)

# Train dataset: (feature, label) pairs.
train_feature_dataset = tf.data.Dataset.from_tensor_slices(data[train_part])
train_label_dataset = tf.data.Dataset.from_tensor_slices(label[train_part])
train_dataset = tf.data.Dataset.zip((train_feature_dataset, train_label_dataset))

# Test dataset: (feature, label) pairs.
test_feature_dataset = tf.data.Dataset.from_tensor_slices(data[test_part])
test_label_dataset = tf.data.Dataset.from_tensor_slices(label[test_part])
test_dataset = tf.data.Dataset.zip((test_feature_dataset, test_label_dataset))

# Persist both datasets to disk.
train_dataset.save("../dataset/m1_ssRNA_train_distributed100atoms_dataset")
test_dataset.save("../dataset/m1_ssRNA_test_distributed100atoms_dataset")

In [33]:
# Preprocess every atom-index set and merge the per-set arrays along axis 0.
n_atom_idx_set = len(atom_idx_list)
per_set_arrays = []
for set_number in tqdm(range(n_atom_idx_set)):
    per_set_arrays.append(preprocess_atom_set(rna, atom_idx_list[set_number]))
dataset_array = np.concatenate(per_set_arrays)

# Shuffle along the first axis, in place, with a fixed seed.
np.random.seed(1)
np.random.shuffle(dataset_array)

# First half (rounded up) is the train split, the rest is the test split.
n_set = len(dataset_array)
split_idx = math.ceil(n_set / 2)
train_half, test_half = dataset_array[:split_idx], dataset_array[split_idx:]
train_dataset = tf.data.Dataset.from_tensor_slices(train_half)
test_dataset = tf.data.Dataset.from_tensor_slices(test_half)
train_dataset.save("VAE/dataset/ssRNA_train_dataset")
test_dataset.save("VAE/dataset/ssRNA_test_dataset")

  0%|          | 0/10 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# DEBUGGING SECTION
# Runs the pair-grid / distance / truncate-or-pad steps one at a time for the
# first atom-index set so each intermediate value can be inspected.
atom_idx = atom_idx_list[0]
pair_meshgrid = np.meshgrid(atom_idx, np.transpose(atom_idx))
grid_first, grid_second = pair_meshgrid[0], pair_meshgrid[1]
pair_grid = np.dstack((grid_first, grid_second))
pair_distance_array_frames = compute_and_pack_distance_in_array_frames(rna, pair_grid)
formatted_frames = []
for frame in pair_distance_array_frames:
    formatted_frames.append(trunc_or_pad_distance_array(frame, maxlen=100))
formatted_pair_distance_array_frames = np.array(formatted_frames)

In [None]:
# # LEGACY CODE, now the gro files are filtered in the MD_sim pipeline so no need to select residues
# t = md.load('VAE/md_0_1.gro')
# rna_idx = t.top.select('resi < 30')
# rna = t.atom_slice(rna_idx)
def compute_and_pack_distance_in_array_frames(rna, pair_grid):
    """Compute pair distances for every frame and pack them as single-channel grids.

    Parameters
    ----------
    rna
        Trajectory object, passed unchanged to ``md.compute_distances``.
    pair_grid
        Array iterated along its first axis; each item is handed to
        ``md.compute_distances`` as the list of atom pairs for one grid row.
        Presumably shaped (n_atom, n_atom, 2), as produced by ``np.dstack`` of
        a meshgrid -- confirm against callers.

    Returns
    -------
    numpy.ndarray
        Array with axes (number of frames, number of grid rows, number of pairs
        per row, 1).
    """
    # Each md.compute_distances call yields a (frames, pairs) array for one grid
    # row; stacking on axis 1 gives (frames, rows, pairs) directly, with no
    # separate transpose step.
    distances_frames = np.stack(
        [md.compute_distances(rna, column) for column in pair_grid], axis=1)
    # Append the trailing length-1 axis in one vectorised step instead of
    # splitting every row element by element in a Python-level loop.
    return distances_frames[..., np.newaxis]

def trunc_or_pad_distance_array(distance_array, maxlen=100):
    """Centre-pad or centre-crop a square distance grid to ``maxlen`` x ``maxlen``.

    Parameters
    ----------
    distance_array
        3-D array of shape (n_atom, n_atom, channels); the first two dimensions
        must be equal. The third dimension is never padded or truncated.
    maxlen
        Target size of the first two dimensions (integer).

    Returns
    -------
    numpy.ndarray
        Array of shape (maxlen, maxlen, channels). When ``n_atom < maxlen`` the
        grid is zero-padded on both sides; when ``n_atom > maxlen`` it is cropped
        on both sides. If the difference is odd, the extra row/column is added
        to (or removed from) the trailing side. When ``n_atom == maxlen`` the
        input is returned unchanged.

    Raises
    ------
    ValueError
        If the first and second dimensions differ.
    """
    n_atom = distance_array.shape[0]
    if n_atom != distance_array.shape[1]:
        raise ValueError("the first and second dimension of the given distance array does not match")

    if n_atom < maxlen:
        missing = maxlen - n_atom
        # Integer arithmetic only: np.pad rejects non-integral pad widths, so a
        # float such as 1.0 (even difference) must never reach it.
        pad_n_pre = missing // 2
        pad_n_post = missing - pad_n_pre
        distance_array = np.pad(
            distance_array,
            ((pad_n_pre, pad_n_post),
             (pad_n_pre, pad_n_post),
             (0, 0)))
    elif n_atom > maxlen:
        excess = n_atom - maxlen
        trunc_n_pre = excess // 2
        end_idx = n_atom - (excess - trunc_n_pre)
        # Crop both leading dimensions at once; copy so the result does not
        # alias the caller's array.
        distance_array = distance_array[trunc_n_pre:end_idx, trunc_n_pre:end_idx].copy()
    return distance_array

def preprocess_atom_set(rna, atom_idx, maxlen=100):
    """Turn one set of atom indices into fixed-size per-frame distance grids.

    Parameters
    ----------
    rna
        Trajectory object forwarded to
        ``compute_and_pack_distance_in_array_frames``.
    atom_idx
        Atom indices to pair with each other (presumably a 1-D integer array --
        confirm against callers).
    maxlen
        Target size of the first two dimensions of every frame, forwarded to
        ``trunc_or_pad_distance_array``. Defaults to 100.

    Returns
    -------
    numpy.ndarray
        One truncated-or-padded distance array per frame, stacked on axis 0.
    """
    # Build the grid of all (atom, atom) index pairs; the last axis holds the pair.
    grid_first, grid_second = np.meshgrid(atom_idx, np.transpose(atom_idx))
    pair_grid = np.dstack((grid_first, grid_second))
    # TODO can be optimized to only calculate distance once for each pair but the computation quick so I am skipping for now
    pair_distance_array_frames = compute_and_pack_distance_in_array_frames(rna, pair_grid)
    return np.array(
        [trunc_or_pad_distance_array(frame, maxlen=maxlen)
         for frame in pair_distance_array_frames])
# Debug run: preprocess at most the first two atom-index sets. Clamping to the
# list length avoids an IndexError when fewer than two sets are available.
# For a full run, use len(atom_idx_list) instead.
n_atom_idx_set = min(2, len(atom_idx_list))
dataset_array = np.concatenate(
    [preprocess_atom_set(rna, atom_idx_list[i]) for i in tqdm(range(n_atom_idx_set))])
# from time import process_time
# t1 = process_time()
# tt = np.load("../data/distance_npy/filtered_sasdfb9_m1_20_MG.npz")["arr_0"]
# t2 = process_time()
# ttt = np.load("../data/distance_npy/filtered_sasdfb9_m1_20_MG.npy")
# t3 = process_time()
# print(f"{t2-t1}; {t3-t2}")
# n_residues_dict = rna.top.n_residues
# batch_residue_idx = [(i, i+3) for i in range(0,n_residues,3)]
# atom_idx_list = []
# for batch_start, batch_end in batch_residue_idx:
#     atom_idx_list.append(rna.top.select(f'resi >= {batch_start} and resi < {batch_end}'))