In [31]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.auto import tqdm
import ray
import h5py

In [13]:
# trajectory_coordinates_prepared
def align_frame_to_ref(trajectory_coordinates, varframe, coord_ref):
    """
    Superimpose one trajectory frame onto a reference structure.

    The frame ``trajectory_coordinates[varframe]`` and ``coord_ref`` are each
    centred on their own centroid, the centred frame is rotated with the
    matrix returned by ``kabsch`` and the result is shifted onto the centroid
    of the reference.

    Returns the transformed coordinates of the selected frame.
    """
    ref_center = centroid(coord_ref)
    frame = trajectory_coordinates[varframe]
    frame_centered = frame - centroid(frame)
    rotation = kabsch(frame_centered, coord_ref - ref_center)
    # Rotate about the origin first, then move onto the reference centroid.
    return np.dot(frame_centered, rotation) + ref_center

def rmsd(A, B):
    """
    Root-mean-square deviation between two coordinate sets.

    Not used yet, but might be helpful for some applications.

    Parameters
    ----------
    A, B : array-like of shape (n_atoms, n_coords)
        Coordinate sets to compare; both must have the same shape.

    Returns
    -------
    numpy.float64
        sqrt( sum over all atoms and coordinates of (A - B)**2 / n_atoms ).

    Raises
    ------
    ValueError
        If ``A`` is not a non-empty 2-D array or the shapes of ``A`` and ``B``
        differ.
    """
    a = np.asarray(A, dtype=float)
    b = np.asarray(B, dtype=float)
    if a.ndim != 2 or a.shape[0] == 0:
        raise ValueError("A must be a non-empty 2-D array of shape (n_atoms, n_coords)")
    # An explicit check prevents silent broadcasting of mismatched inputs.
    if a.shape != b.shape:
        raise ValueError(f"shape mismatch: A has shape {a.shape}, B has shape {b.shape}")
    return np.sqrt(np.sum((a - b) ** 2) / a.shape[0])

def centroid(A):
    """
    Mean of ``A`` along its first axis (one value per remaining index).
    """
    return A.mean(axis=0)

def kabsch(coord_var, coord_ref):
    """
    Rotation matrix R that superimposes ``coord_var`` onto ``coord_ref``.

    R is built from the SVD of the covariance matrix
    ``np.dot(coord_var.T, coord_ref)``,
    see http://en.wikipedia.org/wiki/Kabsch_algorithm
    and, for proper/improper rotations, JCC 2004, 25, 1894.

    The result is meant to be applied as ``np.dot(coord_var, R)``; the caller
    in this file passes coordinates that are already centred.
    """
    covariance = np.dot(coord_var.T, coord_ref)
    left, _singular_values, right_t = np.linalg.svd(covariance)
    is_reflection = np.linalg.det(left) * np.linalg.det(right_t) < 0.0
    if is_reflection:
        # Flip the last left singular vector so that det(R) becomes positive.
        left[:, -1] *= -1.0
    return np.dot(left, right_t)

def adaptability(trajectory_coordinates, n_frames=None):
    """
    Per-atom distance of each frame to the first frame after superposition.

    Every frame is aligned onto frame 0 with ``align_frame_to_ref`` and the
    Euclidean distance between each atom and its position in frame 0 is
    stored.

    Parameters
    ----------
    trajectory_coordinates : array of shape (n_frames, n_atoms, n_coords)
        Trajectory; frame 0 is used as the reference.
    n_frames : int, optional
        Number of leading frames to process. Defaults to all frames of the
        trajectory instead of a fixed count, so shorter trajectories do not
        raise and longer ones are not truncated.

    Returns
    -------
    numpy.ndarray of shape (n_frames, n_atoms)
        Row ``i`` holds the per-atom distances of aligned frame ``i`` to the
        reference.
    """
    ref = trajectory_coordinates[0]
    if n_frames is None:
        n_frames = len(trajectory_coordinates)
    n_atoms = len(ref)
    dist_to_ref_mat = np.zeros((n_frames, n_atoms))
    for ind in range(n_frames):
        aligned = align_frame_to_ref(trajectory_coordinates, ind, ref)
        squared_dist = np.sum((ref - aligned) ** 2, axis=1)
        dist_to_ref_mat[ind, :] = np.sqrt(squared_dist)
    return dist_to_ref_mat


In [14]:
# Start (or reuse) the local Ray runtime used by the @ray.remote tasks below.
# ignore_reinit_error=True makes re-running this cell harmless: a second call
# only logs that ray.init() was already called.
# NOTE(review): ray is also imported in the first cell; this import is redundant there.
import ray
ray.init(ignore_reinit_error=True)

2023-06-01 03:12:43,212	INFO worker.py:1454 -- Calling ray.init() again after it has already been called.


0,1
Python version:,3.10.11
Ray version:,2.4.0


In [15]:
# Root folders of the input data; the "npz" subfolder of each one is scanned for archives.
data_directories = ["out_big", "out_big_5000", "out_big_10000"]

In [16]:
# Gather every "<4-character stem>.npz" file from the "npz" subfolder of each
# data directory and order the combined list by stem.
all_file_paths = sorted(
    (
        npz_path
        for data_dir in data_directories
        for npz_path in (Path(data_dir) / "npz").glob("*.npz")
        if npz_path.is_file() and len(npz_path.stem) == 4
    ),
    key=lambda npz_path: npz_path.stem,
)

In [17]:
# Number of input archives found across all data directories.
len(all_file_paths)

14453

In [18]:
# First input archive in stem order.
# NOTE(review): looks like a scratch variable - no later visible cell reads this global `path`.
path = all_file_paths[0]

In [25]:
@ray.remote
def get_adaptabilities(input_path, SAVEDIR):
    """
    Ray task: compute the adaptability matrix for one trajectory archive.

    Reads ``trajectory_coordinates_prepared`` from ``input_path`` (a 4-D array
    unpacked as frames x residues x backbone atoms x coordinates), flattens
    residues and backbone atoms into one atom axis, runs ``adaptability`` and
    stores the result, reshaped to (frames, residues, backbone atoms), under
    the key ``data`` in ``SAVEDIR / input_path.name``.

    Parameters
    ----------
    input_path : pathlib.Path
        Source ``.npz`` archive.
    SAVEDIR : pathlib.Path
        Existing output directory.

    Returns
    -------
    bool
        Always ``True``; an already existing output file is left untouched.
    """
    path = SAVEDIR / input_path.name
    if path.exists():
        return True
    # np.load on an .npz keeps the archive open until it is closed explicitly.
    with np.load(input_path) as data:
        trajectory_coordinates = data['trajectory_coordinates_prepared']
    nframes, n_residues, n_backbone_atoms, ncoords = trajectory_coordinates.shape
    trajectory_coordinates_reshaped = trajectory_coordinates.reshape(
        nframes, n_residues * n_backbone_atoms, ncoords
    )
    adaptability_values = adaptability(trajectory_coordinates_reshaped)
    # This reshape requires one row per frame from adaptability().
    adaptability_values = adaptability_values.reshape(nframes, n_residues, n_backbone_atoms)
    # Write to a temporary name and rename afterwards: an interrupted write must
    # not leave a partial file that the exists() guard above would skip forever.
    tmp_path = path.with_name(path.name + ".tmp")
    with open(tmp_path, "wb") as fh:
        np.savez_compressed(fh, data=adaptability_values)
    tmp_path.replace(path)
    return True

In [26]:
# Output folder for the per-file adaptability archives written by get_adaptabilities.
# NOTE(review): SAVEDIR is rebound to a different folder further down, so cells
# that read SAVEDIR depend on the order in which the notebook was executed.
SAVEDIR = Path("adaptabilities_npz")
SAVEDIR.mkdir(exist_ok=True)

In [27]:
# Submit the files to Ray in batches of `size` and wait for each batch before
# queueing the next one, so only `size` tasks are pending at any time.
size = 200
# tqdm takes the number of batches from len(range(...)); the former manual
# total (len // size + 1) was one too many whenever len is a multiple of size.
for i in tqdm(range(0, len(all_file_paths), size)):
    result_ids = [
        get_adaptabilities.remote(file_name, SAVEDIR)
        for file_name in all_file_paths[i:i + size]
    ]
    # Blocks until every task of the batch has finished.
    results = ray.get(result_ids)

  0%|          | 0/73 [00:00<?, ?it/s]

In [None]:
# data = np.random.randint(10, size=(2,3,5, 7))

In [None]:
# (data.reshape(2, 3*5, 7).reshape(2, 3, 5, 7) == data).all()

In [28]:
# Result archives in SAVEDIR whose stem has exactly 4 characters, ordered by stem.
adaptability_files = sorted(
    (
        npz_path
        for npz_path in SAVEDIR.glob("*.npz")
        if npz_path.is_file() and len(npz_path.stem) == 4
    ),
    key=lambda npz_path: npz_path.stem,
)

In [29]:
# Number of adaptability archives found; compare with len(all_file_paths) above.
len(adaptability_files)

14453

In [35]:
# Copy every adaptability array into one HDF5 file (one gzip-compressed dataset
# per pdbid) and record the array shapes for later consistency checks.
adaptability_shapes = []
# Open the container once instead of once per item.
with h5py.File("md_adaptabilities.hdf5", 'a') as f:
    for filename in tqdm(adaptability_files):
        pdbid = filename.stem
        # Close each .npz archive after its array has been read.
        with np.load(filename) as npz:
            adaptabilities = npz['data']
        adaptability_shapes.append({
            "pdbid": pdbid,
            "shape": adaptabilities.shape
        })
        # create_dataset raises when the name already exists; skipping existing
        # entries makes the cell safe to re-run against a partially filled file.
        if pdbid not in f:
            f.create_dataset(pdbid, data=adaptabilities, compression="gzip")

  0%|          | 0/14453 [00:00<?, ?it/s]

In [38]:
# Tabulate the recorded shapes (columns: pdbid, shape) and persist them,
# to check that everything is in order and all the shapes are equal.
adaptability_shapes = pd.DataFrame(adaptability_shapes)
# `index` is documented as a boolean; False omits the row index from the file.
adaptability_shapes.to_csv("adaptability_shapes.csv", index=False)

In [39]:
# Record the shape of every dataset stored in the collected-embeddings HDF5 file.
# NOTE(review): "npz_frame0_collected.hdf5" is not written by any visible cell
# (the export further down writes "esm_if_out_frame0.hdf5") - confirm the file name.
esm_if_outputs_info = []
with h5py.File("npz_frame0_collected.hdf5", "r") as f:
    for pdbid in tqdm(f.keys()):
        esm_if_outputs_info.append({
            "pdbid": pdbid,
            # Dataset.shape comes from the metadata; indexing with [()] would
            # load the whole array into memory just to read its shape.
            "shape": f[pdbid].shape
        })

  0%|          | 0/14453 [00:00<?, ?it/s]

In [40]:
# Tabulate the recorded shapes (columns: pdbid, shape) and persist them.
esm_if_outputs_info = pd.DataFrame(esm_if_outputs_info)
# `index` is documented as a boolean; False omits the row index from the file.
esm_if_outputs_info.to_csv("esm_if_outputs_shapes.csv", index=False)

In [45]:
# Join the two shape tables on pdbid; the embedding shape column is renamed
# first so both shapes sit side by side without a name clash.
merged_df = pd.merge(
    esm_if_outputs_info.rename(columns={"shape": "esm_out_shape"}),
    adaptability_shapes,
    on="pdbid",
)

In [46]:
# Folder that is scanned for the per-structure embedding archives (*.npz).
embeddings_dir = Path("npz_frame0")

In [51]:
# All embedding archives in embeddings_dir, in the order glob yields them
# (not sorted here; the name is rebound to a sorted list of the collected files later).
embedding_files = list(embeddings_dir.glob("*.npz"))

In [63]:
# Take the first embedding file (if there is one) and remember its stem as pdbid.
for filename in embedding_files[:1]:
    pdbid = filename.stem

In [74]:
@ray.remote
def get_collected_embedding(filename, SAVEDIR):
    """
    Ray task: merge the per-key arrays of one archive into a single array.

    The keys of the ``.npz`` archive are ordered by their integer value. Each
    array has its leading axis (which must have length 1) squeezed away and
    its first and last rows dropped, and the pieces are concatenated along
    axis 0. The result is stored under the key ``data`` in
    ``SAVEDIR / filename.name``.
    NOTE(review): the dropped first/last rows are presumably start/end tokens
    of the embedding model - confirm.

    Parameters
    ----------
    filename : pathlib.Path
        Source ``.npz`` archive whose keys are integer-like strings.
    SAVEDIR : pathlib.Path
        Existing output directory.

    Returns
    -------
    bool
        Always ``True``; an already existing output file is left untouched.
    """
    savepath = SAVEDIR / filename.name
    if savepath.exists():
        return True
    # np.load on an .npz keeps the archive open until it is closed explicitly.
    with np.load(filename) as archive:
        sorted_keys = sorted(archive.keys(), key=int)
        chunks = [archive[key].squeeze(0)[1:-1] for key in sorted_keys]
    collected = np.concatenate(chunks)
    # Write to a temporary name without the .npz suffix and rename afterwards,
    # so that neither the exists() guard nor a "*.npz" glob ever sees a partial file.
    tmp_path = savepath.with_name(savepath.name + ".tmp")
    with open(tmp_path, "wb") as fh:
        np.savez_compressed(fh, data=collected)
    tmp_path.replace(savepath)
    return True

In [64]:
# Show the recorded adaptability shape for the pdbid selected above.
adaptability_shapes.loc[adaptability_shapes["pdbid"] == pdbid]

Unnamed: 0,pdbid,shape
2871,2O4N,"(100, 198, 3)"


In [75]:
# Output folder for the merged embedding archives written by get_collected_embedding.
SAVEDIR = Path("npz_frame0_collected")
SAVEDIR.mkdir(exist_ok=True)
# Submit the files to Ray in batches of `size` and wait for each batch before
# queueing the next one, so only `size` tasks are pending at any time.
size = 200
# tqdm takes the number of batches from len(range(...)); the former manual
# total (len // size + 1) was one too many whenever len is a multiple of size.
for i in tqdm(range(0, len(embedding_files), size)):
    result_ids = [
        get_collected_embedding.remote(file_name, SAVEDIR)
        for file_name in embedding_files[i:i + size]
    ]
    # Blocks until every task of the batch has finished.
    results = ray.get(result_ids)

  0%|          | 0/73 [00:00<?, ?it/s]

In [78]:
# !ls npz_frame0_collected | wc

In [81]:
# Merged embedding archives from SAVEDIR, ordered by stem.
embedding_files = sorted(SAVEDIR.glob("*.npz"), key=lambda npz_path: npz_path.stem)

In [87]:
# Copy every merged embedding array into one HDF5 file (one gzip-compressed
# dataset per pdbid) and record the array shapes for later consistency checks.
embedding_shapes = []
# Open the container once instead of once per item.
with h5py.File("esm_if_out_frame0.hdf5", 'a') as f:
    for filename in tqdm(embedding_files):
        pdbid = filename.stem
        # Close each .npz archive after its array has been read.
        with np.load(filename) as npz:
            embeddings = npz['data']
        embedding_shapes.append({
            "pdbid": pdbid,
            "shape": embeddings.shape
        })
        # create_dataset raises when the name already exists; skipping existing
        # entries makes the cell safe to re-run against a partially filled file.
        if pdbid not in f:
            f.create_dataset(pdbid, data=embeddings, compression="gzip")

  0%|          | 0/14453 [00:00<?, ?it/s]

In [88]:
# Turn the list of {"pdbid", "shape"} records into a DataFrame.
embedding_shapes = pd.DataFrame(embedding_shapes)

In [90]:
# Pair adaptability shapes (shape_x) with embedding shapes (shape_y) per pdbid.
# validate="one_to_one" makes pandas raise if a pdbid is duplicated on either side.
merged_df = adaptability_shapes.merge(embedding_shapes, on="pdbid", validate="one_to_one")
# The default inner join silently drops ids present on only one side, which
# would let the shape comparison that follows pass without covering them.
assert len(merged_df) == len(adaptability_shapes) == len(embedding_shapes), (
    "adaptability and embedding tables do not cover the same pdbids"
)

In [94]:
# If this is True, the shapes are correct: for every pdbid, axis 1 of the
# adaptability array has the same length as axis 0 of the embedding array.
adaptability_axis1 = merged_df["shape_x"].map(lambda shape: shape[1]).to_numpy()
embedding_axis0 = merged_df["shape_y"].map(lambda shape: shape[0]).to_numpy()
np.all(adaptability_axis1 == embedding_axis0)

True