In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import torch
np.set_printoptions(suppress=True)

In [2]:
from importlib import reload
import protein_transformer
from protein_transformer.losses import mse_over_angles, mse_over_angles_numpy, inverse_trig_transform
from protein_transformer.losses import angles_to_coords

In [3]:
# Load the serialized dataset from a path relative to the notebook's working
# directory. Later cells index it as data["train"]["seq" | "ang" | "crd"].
# NOTE: torch.load unpickles the file, so only load files from a trusted source.
data = torch.load("../../data/proteinnet/casp12_191101_100.pt")

In [4]:
def add_noise(angs, stdev, truncate=True, keep_zeros=False):
    """Return a new array equal to `angs` plus zero-mean Gaussian noise.

    Args:
        angs: NumPy array of values to perturb. It is not modified.
        stdev: Standard deviation of the noise drawn from NumPy's global RNG.
        truncate: If True, clamp the noisy values to the range [-1, 1].
        keep_zeros: If True, entries that are exactly 0 in `angs` are reset
            to 0 in the result.

    Returns:
        The perturbed array, with the same shape as `angs`.
    """
    perturbed = angs + np.random.normal(0, stdev, angs.shape)
    if truncate:
        # Clamp in place; NaN entries are left as NaN.
        np.clip(perturbed, -1, 1, out=perturbed)
    if keep_zeros:
        perturbed[angs == 0] = 0
    return perturbed

def rmse(a, b):
    """Root-mean-square difference between `a` and `b`, taken over all elements."""
    residual = a - b
    return np.sqrt((residual * residual).mean())

In [5]:
# Take the first training example's angle array as an explicit copy. Slicing a
# NumPy array with [:] only yields a view, so zeroing the NaNs below would
# otherwise overwrite the loaded dataset in place.
a = np.array(data["train"]["ang"][0])
a[np.isnan(a)] = 0

# Re-seed before each call so the truncated and untruncated variants are built
# from the same noise draw and differ only by the clamping step.
np.random.seed(0)
a_noise = add_noise(a, 1, truncate=False)
np.random.seed(0)
a_noise_truncate = add_noise(a, 1, truncate=True)
print(rmse(a, a_noise))
print(rmse(a, a_noise_truncate))

# Cross-check the plain RMSE against the project's torch and NumPy angle losses
# (both are fed a leading batch dimension of size 1).
a_torch, a_noise_truncate_torch = torch.tensor(a), torch.tensor(a_noise_truncate)
print(torch.sqrt(mse_over_angles(a_torch.unsqueeze(dim=0), a_noise_truncate_torch.unsqueeze(dim=0))).item())
print(np.sqrt(mse_over_angles_numpy(np.expand_dims(a, 0), np.expand_dims(a_noise_truncate, 0))))

0.9884343496134423
0.6921556701212284
0.6921556701212276
0.6921556701212284


In [10]:
from protein_transformer.protein.structure_utils import get_backbone_from_full_coords

In [None]:
def angles_have_missing_res(angles):
    """Report whether any row of `angles` consists entirely of NaN values.

    Args:
        angles: NumPy array whose last axis holds the values for one entry
            (presumably one residue per row — confirm against the dataset).

    Returns:
        A NumPy boolean: True if at least one entry along the last axis is
        all-NaN, False otherwise.
    """
    # Use the argument, not the notebook-global `ang`, so the result depends
    # only on what the caller passed in.
    return np.isnan(angles).all(axis=-1).any()

In [11]:
def get_coordinates_from_numpy_data(seq, ang_sincos):
    """Build coordinates from a sequence and its sin/cos-encoded angles.

    Args:
        seq: Amino-acid sequence, converted to integer indices with
            `protein_transformer.dataset.VOCAB.aa_seq2indices`.
        ang_sincos: NumPy array of angles in sin/cos representation, without
            a batch dimension.

    Returns:
        The result of `get_backbone_from_full_coords` applied to the output of
        `angles_to_coords` (used downstream as a torch tensor).
    """
    # Add batch dimension
    ang_sincos = ang_sincos[np.newaxis, :]

    # Compute angles in radians from sin/cos representation, then select the
    # single batch entry again.
    ang_rad = inverse_trig_transform(torch.tensor(ang_sincos, dtype=torch.float))[0]

    # Replace NaNs with 0. After this assignment no NaN can remain, so no
    # follow-up NaN check is needed.
    ang_rad[torch.isnan(ang_rad)] = 0

    seq_as_ints = protein_transformer.dataset.VOCAB.aa_seq2indices(seq, add_sos_eos=False)
    seq_as_ints = torch.tensor(seq_as_ints, dtype=torch.long)

    coords = angles_to_coords(ang_rad, seq_as_ints, remove_batch_padding=False)
    return get_backbone_from_full_coords(coords)
    
    

In [12]:
# Pick the first training example at or after index `i` and show its fields.
i = 11
first_record = next(
    zip(data["train"]["seq"][i:], data["train"]["ang"][i:], data["train"]["crd"][i:]),
    None,
)
if first_record is not None:
    # `seq`, `ang` and `crd` stay bound as notebook globals for the cells below.
    seq, ang, crd = first_record
    print(seq)
    print(ang)
    print(crd)

TGRPEWIWLALGTALMGLGTLYFLVKGMGVSDPDAKPFYAITTLVPAIAFTMYLSMLLGYGLTMVPFGGEQNPIYWARYADWLFTTPLLLLDLALLVDADQGTILALVGADGIMIGTGLVGALTKVYSYRFVWWAISTAAMLYILYVLFFGFTSKAESMRPEVASTFKVLRNVTVVLWSAYPVVWLIGSEGAGIVPLNIETLLFMVLDVSAKVGFGLILLRSRAIFG
[[        nan         nan  0.95092025 ...         nan         nan
          nan]
 [ 0.55080746  0.83463234  0.85758262 ...         nan         nan
          nan]
 [ 0.54672625 -0.83731141 -0.77509018 ... -0.85431134  0.99999702
   0.0024416 ]
 ...
 [ 0.0557567  -0.99844439  0.99982571 ...         nan         nan
          nan]
 [-0.15643064 -0.98768895  0.3760204  ...         nan         nan
          nan]
 [ 0.99868198 -0.05132552         nan ...         nan         nan
          nan]]
[[23.775 71.397 13.279]
 [24.255 72.119 12.067]
 [23.344 71.81  10.88 ]
 ...
 [   nan    nan    nan]
 [   nan    nan    nan]
 [   nan    nan    nan]]


In [13]:
def add_noise_and_get_rmse_drmsd(seq, ang, crd, stdev):
    """Perturb `ang` with noise and compare the structures built from both.

    Args:
        seq: Sequence passed to `get_coordinates_from_numpy_data`.
        ang: NumPy array of angles in sin/cos representation.
        crd: NumPy array of reference coordinates. NOTE: modified in place
            (see below).
        stdev: Standard deviation of the noise passed to `add_noise`.

    Returns:
        Tuple `(rmse_val, drmsd_val, reconstruction_drmsd, coords,
        self_drmsd_1, self_drmsd_2)`: the angle RMSE between noisy and
        original angles, the DRMSD between their reconstructed coordinates,
        the DRMSD between the reconstruction and `crd`, the reconstructed
        coordinates, and the DRMSD of each coordinate set against itself.
    """
    losses = protein_transformer.losses

    noisy_ang = add_noise(ang, stdev, truncate=True, keep_zeros=False)

    # Rebuild coordinates from the clean and the noisy angles.
    coords = get_coordinates_from_numpy_data(seq, ang)
    noisy_coords = get_coordinates_from_numpy_data(seq, noisy_ang)

    angle_mse = mse_over_angles_numpy(np.expand_dims(noisy_ang, 0), np.expand_dims(ang, 0))
    rmse_val = np.sqrt(angle_mse)
    drmsd_val = losses.drmsd(coords, noisy_coords)

    # NOTE(review): both assignments below write into the caller's `crd`
    # array. NaNs are zeroed, then every row where the reconstruction is all
    # zeros is zeroed as well.
    crd[np.isnan(crd)] = 0
    all_zero_rows = (coords == 0).all(dim=1)
    crd[all_zero_rows] = 0

    reconstruction_drmsd = losses.drmsd(coords, torch.tensor(crd))
    # Comparing each coordinate set with itself serves as a sanity check.
    self_drmsd_1 = losses.drmsd(coords, coords)
    self_drmsd_2 = losses.drmsd(torch.tensor(crd), torch.tensor(crd))

    return rmse_val, drmsd_val, reconstruction_drmsd, coords, self_drmsd_1, self_drmsd_2
    
    

In [14]:
# Seed NumPy's global RNG first: add_noise_and_get_rmse_drmsd draws Gaussian
# noise through add_noise, so without a seed the printed metrics change on
# every run. This matches the seeding used by the other noise cells.
np.random.seed(0)
r, d, d_recon, c, sd1, sd2 = add_noise_and_get_rmse_drmsd(seq, ang, crd, 1)
print(f"RMSD:\t\t{r:.4f},\nDRMSD:\t\t{d:.2f},\nDRMSD-rec:\t{d_recon:.2f},\nSelf-d1:\t{sd1},\nSelf-d2:\t{sd2}")

  """
  


IndexError: The shape of the mask [13] at index 0does not match the shape of the indexed tensor [2951, 3] at index 1

In [None]:
c

In [None]:
from protein_transformer.protein.PDB_Creator import PDB_Creator
# Wrap both coordinate sets for PDB export: `c` holds the coordinates rebuilt
# from angles (converted from a torch tensor to NumPy), `crd` the coordinates
# taken from the dataset. Both use the same sequence.
creator1 = PDB_Creator(c.numpy(), seq=seq)
creator2 = PDB_Creator(crd, seq=seq)

In [None]:
# Export the angle-derived coordinates; the relative path resolves against the
# notebook's current working directory.
creator1.save_pdb("reconstructed.pdb")

In [None]:
# Export the dataset coordinates; the relative path resolves against the
# notebook's current working directory.
creator2.save_pdb("original.pdb")

In [None]:
# Pairwise internal distances for the angle-derived coordinates (`c`) and for
# the dataset coordinates (`crd`, converted to a tensor), inspected in the
# following cells.
pwd1 = protein_transformer.losses.pairwise_internal_dist(c)
pwd2 = protein_transformer.losses.pairwise_internal_dist(torch.tensor(crd))

In [None]:
pwd2

In [None]:
pwd1

In [None]:
c.numpy()[:50]

In [None]:
crd[:50]

In [None]:
torch.isnan(c).any(), np.isnan(crd).any() 

In [None]:
ang

In [None]:
# Seed NumPy's global RNG so the noise draw is reproducible, then perturb the
# angles with stdev 1, clamping to [-1, 1] and keeping exact zeros at zero.
np.random.seed(0)
ang_noisy = add_noise(ang, 1, truncate=True, keep_zeros=True)

In [None]:
# Call add_noise instead of binding the function object itself to `ang_noisy`,
# which would discard the array computed in the previous cell.
# NOTE(review): the arguments mirror that previous cell as the presumed intent;
# adjust them if a different noise configuration was meant here.
ang_noisy = add_noise(ang, 1, truncate=True, keep_zeros=True)

In [None]:
# Rebuild coordinates from the unperturbed angles; this rebinds the global `c`.
c = get_coordinates_from_numpy_data(seq, ang)

In [None]:
c.shape

In [None]:
ang.shape

In [None]:
327*13