In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import torch
np.set_printoptions(suppress=True)

In [2]:
from importlib import reload
import protein_transformer
from protein_transformer.losses import mse_over_angles, mse_over_angles_numpy, inverse_trig_transform
from protein_transformer.losses import angles_to_coords

In [3]:
# Load the preprocessed dataset. Later cells read data["train"]["seq"],
# data["train"]["ang"] and data["train"]["crd"] from it.
# NOTE(review): torch.load deserializes with pickle, which can run code embedded
# in the file -- only point this at a trusted dataset file.
data = torch.load("../../data/proteinnet/casp12_191101_100.pt")

In [4]:
def add_noise(angs, stdev, truncate=True, keep_zeros=False):
    """Return a noisy copy of `angs`; the input array is left untouched.

    Args:
        angs: numpy array of values to perturb.
        stdev: standard deviation of the zero-mean Gaussian noise, drawn from
            the global numpy random state via np.random.normal.
        truncate: when True, clamp the noisy values to the range [-1, 1].
        keep_zeros: when True, positions that are exactly 0 in `angs` are set
            back to 0 in the result.

    Returns:
        A new array with the same shape as `angs`.
    """
    perturbed = angs + np.random.normal(0, stdev, angs.shape)
    if truncate:
        # NaN entries pass through the clamp unchanged.
        perturbed = np.clip(perturbed, -1, 1)
    if keep_zeros:
        zero_mask = angs == 0
        perturbed[zero_mask] = 0
    return perturbed

def rmse(a, b):
    """Root-mean-square error between two arrays, averaged over all elements."""
    residual = a - b
    mean_squared = (residual * residual).mean()
    return np.sqrt(mean_squared)

In [5]:
# Take a real copy of the first training example's angles. Slicing a numpy
# array with `[:]` only yields a view, so zeroing NaNs through it would silently
# overwrite the entry stored in `data`.
a = np.array(data["train"]["ang"][0], copy=True)
a[np.isnan(a)] = 0

# Re-seed before each call so both variants draw the same noise sample and
# differ only in whether the result is clamped.
np.random.seed(0)
a_noise = add_noise(a, 1, truncate=False)
np.random.seed(0)
a_noise_truncate = add_noise(a, 1, truncate=True)
print(rmse(a, a_noise))
print(rmse(a, a_noise_truncate))

# Cross-check the plain RMSE against the project's torch and numpy angle losses;
# both expect a leading batch dimension, which is added here.
a_torch, a_noise_truncate_torch = torch.tensor(a), torch.tensor(a_noise_truncate)
print(torch.sqrt(mse_over_angles(a_torch.unsqueeze(dim=0), a_noise_truncate_torch.unsqueeze(dim=0))).item())
print(np.sqrt(mse_over_angles_numpy(np.expand_dims(a, 0), np.expand_dims(a_noise_truncate, 0))))

0.9884343496134423
0.6921556701212284
0.6921556701212276
0.6921556701212284


In [6]:
from protein_transformer.protein.structure_utils import get_backbone_from_full_coords

In [7]:
def angles_have_missing_res(angles):
    """Report whether any residue has no angle information at all.

    Args:
        angles: numpy array whose last axis holds the angle features of one
            residue (presumably shaped (n_residues, n_features) -- TODO confirm).

    Returns:
        True when at least one slice along the last axis is NaN in every
        position, otherwise False.
    """
    # Inspect the argument itself rather than a notebook-global array.
    return np.isnan(angles).all(axis=-1).any()

In [23]:
def get_coordinates_from_numpy_data(seq, ang_sincos):
    """Build coordinates for one protein from its sequence and sin/cos angles.

    Args:
        seq: amino-acid sequence, passed to VOCAB.aa_seq2indices.
        ang_sincos: numpy array of angles in sin/cos representation for a
            single protein (no batch dimension).

    Returns:
        Whatever angles_to_coords returns for the radian angles and the
        integer-encoded sequence (called with remove_batch_padding=False).
    """
    # inverse_trig_transform is fed a batch, so add a batch axis and strip it
    # again from the result.
    batched = torch.tensor(ang_sincos[np.newaxis, :], dtype=torch.float)
    ang_rad = inverse_trig_transform(batched)[0]

    # Missing angles arrive as NaN; replace them with 0 so that no NaN reaches
    # the coordinate computation.
    ang_rad[torch.isnan(ang_rad)] = 0

    seq_as_ints = protein_transformer.dataset.VOCAB.aa_seq2indices(seq, add_sos_eos=False)
    seq_as_ints = torch.tensor(seq_as_ints, dtype=torch.long)

    return angles_to_coords(ang_rad, seq_as_ints, remove_batch_padding=False)
    
    

In [9]:
# Pick the training example at position `i` and show its sequence, angles and
# coordinates. `seq`, `ang` and `crd` stay bound afterwards and are reused by
# later cells.
i = 11
train = data["train"]
for seq, ang, crd in zip(train["seq"][i:], train["ang"][i:], train["crd"][i:]):
    for field in (seq, ang, crd):
        print(field)
    break

TGRPEWIWLALGTALMGLGTLYFLVKGMGVSDPDAKPFYAITTLVPAIAFTMYLSMLLGYGLTMVPFGGEQNPIYWARYADWLFTTPLLLLDLALLVDADQGTILALVGADGIMIGTGLVGALTKVYSYRFVWWAISTAAMLYILYVLFFGFTSKAESMRPEVASTFKVLRNVTVVLWSAYPVVWLIGSEGAGIVPLNIETLLFMVLDVSAKVGFGLILLRSRAIFG
[[        nan         nan  0.95092025 ...         nan         nan
          nan]
 [ 0.55080746  0.83463234  0.85758262 ...         nan         nan
          nan]
 [ 0.54672625 -0.83731141 -0.77509018 ... -0.85431134  0.99999702
   0.0024416 ]
 ...
 [ 0.0557567  -0.99844439  0.99982571 ...         nan         nan
          nan]
 [-0.15643064 -0.98768895  0.3760204  ...         nan         nan
          nan]
 [ 0.99868198 -0.05132552         nan ...         nan         nan
          nan]]
[[23.775 71.397 13.279]
 [24.255 72.119 12.067]
 [23.344 71.81  10.88 ]
 ...
 [   nan    nan    nan]
 [   nan    nan    nan]
 [   nan    nan    nan]]


In [45]:
def add_noise_and_get_rmse_drmsd(seq, ang, crd, stdev):
    """Perturb one protein's angles and measure the effect on its structure.

    Coordinates are rebuilt from both the clean and the noisy angles, reduced
    with get_backbone_from_full_coords and compared with each other. The
    coordinates rebuilt from the clean angles are additionally compared with
    the stored coordinates `crd` ("reconstruction" error).

    Args:
        seq: amino-acid sequence of the protein.
        ang: numpy array of the protein's angles in sin/cos representation.
        crd: numpy array of the protein's stored coordinates; NaN marks missing
            entries. The array is not modified.
        stdev: standard deviation of the Gaussian noise added to `ang`.

    Returns:
        Tuple (rmse_val, rmsd_val, drmsd_val, reconstruction_drmsd,
        reconstruction_rmsd). Note the order: the two reconstruction values
        come last, DRMSD before RMSD.
    """
    noisy_ang = add_noise(ang, stdev, truncate=True, keep_zeros=False)

    coords = get_coordinates_from_numpy_data(seq, ang)
    noisy_coords = get_coordinates_from_numpy_data(seq, noisy_ang)
    coords = get_backbone_from_full_coords(coords)
    noisy_coords = get_backbone_from_full_coords(noisy_coords)

    # Clean vs. noisy: error in angle space and in coordinate space.
    rmse_val = np.sqrt(mse_over_angles_numpy(noisy_ang[np.newaxis, :], ang[np.newaxis, :]))
    drmsd_val = protein_transformer.losses.drmsd(coords, noisy_coords)
    rmsd_val = protein_transformer.losses.rmsd(coords.numpy(), noisy_coords.numpy())

    # Zero the missing coordinates on a copy so the caller's array (normally an
    # entry of the loaded dataset) keeps its NaN markers.
    crd_filled = crd.copy()
    crd_filled[np.isnan(crd_filled)] = 0
    true_coords = get_backbone_from_full_coords(crd_filled)

    # Rebuilt-from-clean-angles vs. stored coordinates.
    reconstruction_drmsd = protein_transformer.losses.drmsd(coords, torch.tensor(true_coords))
    reconstruction_rmsd = protein_transformer.losses.rmsd(coords.numpy(), true_coords)

    return rmse_val, rmsd_val, drmsd_val, reconstruction_drmsd, reconstruction_rmsd
    
    

In [46]:
# Single-protein check with noise stdev 1, using the `seq`, `ang` and `crd`
# left bound by the example-picking loop earlier in the notebook.
# The helper returns (rmse, rmsd, drmsd, drmsd_recon, rmsd_recon), hence the
# unpacking order below.
rm, rd, d, d_recon, rd_recon = add_noise_and_get_rmse_drmsd(seq, ang, crd, 1)
print(f"RMSE:\t\t{rm:.4f},\nRMSD:\t\t{rd:.4f},\nRMSD-recon:\t{rd_recon:.4f},\nDRMSD:\t\t{d:.2f},\nDRMSD-rec:\t{d_recon:.2f}")

  """
  


RMSE:		0.6967,
RMSD:		15.8145,
RMSD-recon:	0.4479,
DRMSD:		11.42,
DRMSD-rec:	0.54


In [40]:
import tqdm

In [49]:
def study_noise(stdev, n_repeats=10):
    """Average the noise metrics over repeated draws for every training protein.

    Args:
        stdev: standard deviation of the Gaussian noise added to the angles.
        n_repeats: number of independent noise draws averaged per protein.

    Returns:
        Dict with keys "RMSE", "RMSD", "RMSD-recon", "DRMSD" and "DRMSD-recon",
        each mapping to a list with one averaged value per training protein.

    Raises:
        ValueError: if `n_repeats` is smaller than 1.
    """
    if n_repeats < 1:
        raise ValueError("n_repeats must be at least 1")

    train = data["train"]
    # zip() has no length, so hand the progress bar the total explicitly.
    pbar = tqdm.tqdm(zip(train["seq"], train["ang"], train["crd"]), total=len(train["seq"]))
    stats = {"RMSE": [], "RMSD": [], "RMSD-recon": [], "DRMSD": [], "DRMSD-recon": []}
    for seq, ang, crd in pbar:
        rmse_total, rmsd_total, drmsd_total, drmsd_recon_total, rmsd_recon_total = 0, 0, 0, 0, 0
        for _ in range(n_repeats):
            # Helper order: rmse, rmsd, drmsd, drmsd_recon, rmsd_recon.
            rm, rd, d, d_recon, rd_recon = add_noise_and_get_rmse_drmsd(seq, ang, crd, stdev)
            rmse_total += rm
            rmsd_total += rd
            drmsd_total += d
            drmsd_recon_total += d_recon
            rmsd_recon_total += rd_recon

        stats["RMSE"].append(rmse_total / n_repeats)
        stats["RMSD"].append(rmsd_total / n_repeats)
        stats["RMSD-recon"].append(rmsd_recon_total / n_repeats)
        stats["DRMSD-recon"].append(drmsd_recon_total / n_repeats)
        stats["DRMSD"].append(drmsd_total / n_repeats)
    return stats
        

In [51]:
def work(seq_ang_crd_stdev):
    """Worker for Pool.map/imap: average the noise metrics over 10 draws.

    Args:
        seq_ang_crd_stdev: a single (seq, ang, crd, stdev) tuple, packed into
            one argument so it can be mapped over.

    Returns:
        Tuple of means in the order
        (rmse, rmsd, rmsd_recon, drmsd, drmsd_recon).
    """
    seq, ang, crd, stdev = seq_ang_crd_stdev
    n_trials = 10
    # Running sums, kept in the helper's return order:
    # rmse, rmsd, drmsd, drmsd_recon, rmsd_recon.
    sums = [0, 0, 0, 0, 0]
    for _ in range(n_trials):
        rm, rd, d, d_recon, rd_recon = add_noise_and_get_rmse_drmsd(seq, ang, crd, stdev)
        sums = [running + value for running, value in zip(sums, (rm, rd, d, d_recon, rd_recon))]
    rmse_sum, rmsd_sum, drmsd_sum, drmsd_recon_sum, rmsd_recon_sum = sums
    # The returned order differs from the helper's: rmsd_recon moves to third.
    return (
        rmse_sum / n_trials,
        rmsd_sum / n_trials,
        rmsd_recon_sum / n_trials,
        drmsd_sum / n_trials,
        drmsd_recon_sum / n_trials,
    )
    

In [52]:
from multiprocessing import Pool

In [53]:
# Run `work` over every training protein on 16 worker processes, with noise
# stdev 1 for each, showing progress as results arrive.
# NOTE(review): the pool is not closed here; `p` is reused by the following cell.
p = Pool(16)
train = data["train"]
n_proteins = len(train["seq"])
work_items = zip(train["seq"], train["ang"], train["crd"], [1] * n_proteins)
results = list(tqdm.tqdm(p.imap(work, work_items), total=n_proteins))



  """
  
  """
  """
  
  
  """
  """
  """
  
  """
  """
  
  
  
  """
  
  """
  
  
  """
  
  """
  
  """
  """
  
  """
  
  


KeyboardInterrupt: 

In [54]:
# Same computation through the blocking Pool.map, reusing the pool `p` created
# in an earlier cell; every protein gets noise stdev 1.
train = data["train"]
noise_stdevs = [1] * len(train["seq"])
results = p.map(work, zip(train["seq"], train["ang"], train["crd"], noise_stdevs))

KeyboardInterrupt: 

In [20]:
from protein_transformer.protein.PDB_Creator import PDB_Creator
# Build PDB writers for the reconstructed coordinates (`c`) and for the backbone
# of the stored coordinates (`crd`), three atoms per residue.
# NOTE(review): `c` is only assigned in a cell further down the notebook
# (c = get_coordinates_from_numpy_data(seq, ang)), so this cell fails on a fresh
# top-to-bottom run.
creator1 = PDB_Creator(c.numpy(), seq=seq, atoms_per_res=3)
creator2 = PDB_Creator(get_backbone_from_full_coords(crd), seq=seq, atoms_per_res=3)

In [21]:
creator1.save_pdb("reconstructed.pdb")

In [22]:
creator2.save_pdb("original.pdb")

In [None]:
# Pairwise internal distances of the reconstructed coordinates (`c`) and of the
# stored coordinates (`crd`).
# NOTE(review): `crd` is passed here without get_backbone_from_full_coords,
# unlike the comparison inside add_noise_and_get_rmse_drmsd -- confirm that the
# two inputs describe the same set of atoms.
pwd1 = protein_transformer.losses.pairwise_internal_dist(c)
pwd2 = protein_transformer.losses.pairwise_internal_dist(torch.tensor(crd))

In [None]:
pwd2

In [None]:
pwd1

In [None]:
c.numpy()[:50]

In [None]:
crd[:50]

In [None]:
torch.isnan(c).any(), np.isnan(crd).any() 

In [None]:
ang

In [None]:
# Seed the global numpy RNG so the noise draw is repeatable, then perturb `ang`
# with stdev 1, clamping to [-1, 1] and keeping exact zeros at zero.
np.random.seed(0)
ang_noisy = add_noise(ang, 1, truncate=True, keep_zeros=True)

In [None]:
# NOTE(review): this binds `ang_noisy` to the add_noise function object itself
# (no call), replacing the noisy array computed in the previous cell. It looks
# like an unfinished statement -- confirm the intent or delete the cell.
ang_noisy = add_noise

In [None]:
# Rebuild coordinates from the clean angles of the currently selected protein.
# Other cells (the PDB writers, the pairwise-distance check) read `c`.
c = get_coordinates_from_numpy_data(seq, ang)

In [None]:
c.shape

In [None]:
ang.shape

In [None]:
327*13