In [1]:
import sys
sys.path.append('../..')

In [2]:
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from Bio import PDB

In [3]:
from DomainPrediction.protein.base import BaseProtein
from DomainPrediction.protein.base import FoldedProtein
from DomainPrediction.utils import helper
from DomainPrediction.eval import metrics
from DomainPrediction.utils.hmmtools import HmmerTools
from DomainPrediction.utils.tmalign import TMalign
from DomainPrediction.utils.constants import *

In [4]:
root = '../../..'
data_path = '/data/users/kgeorge/workspace/Data'
pmpnn_path = os.path.join(data_path, 'pmpnn_experiments/6mfw_exp')
esm3_path = os.path.join(data_path, 'esm3_experiments/6mfw_exp')
evodiff_path = os.path.join(data_path, 'evodiff_experiments/6mfw_exp')
esm2_random_path = os.path.join(data_path, 'esm2_experiments/random/6mfw_exp')
esm2_entropy_path = os.path.join(data_path, 'esm2_experiments/entropy/6mfw_exp')

In [5]:
protein_6mfw = BaseProtein(file=os.path.join(data_path, '6mfw_conformations/hm_6mfy_ATC.pdb'))

#### Extract T domains and save

In [6]:
def extract_residues(pdb_file, chain_id, start_residue, end_residue, output_file):
    """
        This was from chatgpt - it works!
        I was lazy to create my own
    """
    # Create a parser and structure object
    parser = PDB.PDBParser()
    structure = parser.get_structure('protein', pdb_file)
    
    # Create a PDBIO object to write the output
    io = PDB.PDBIO()
    
    class ResidueSelector(PDB.Select):
        def accept_residue(self, residue):
            # Check if the residue is within the specified range
            return start_residue <= residue.id[1] <= end_residue
    
    # Select the chain and extract the residues
    chain = structure[0][chain_id]
    
    # Set the structure to be written (the selected residues)
    io.set_structure(chain)
    
    # Save the selected residues to a new PDB file
    io.save(output_file, ResidueSelector())

In [7]:
# chain_id = "A"
# start_residue = T_6mfw[0] + 1
# end_residue = T_6mfw[-1] + 1 

# paths = [pmpnn_path, esm3_path, evodiff_path, esm2_random_path, esm2_entropy_path]
# for _path in paths:
#     pdb_path = os.path.join(_path, '6mfw_pdbs')
#     for f in os.listdir(pdb_path):
#         file = os.path.join(pdb_path, f)
#         if f.endswith('.pdb'):
#             input_file = os.path.join(pdb_path, f)
#             output_file = os.path.join(pdb_path, f.replace('.pdb', '.T.pdb'))

#             extract_residues(input_file, chain_id, start_residue, end_residue, output_file)

#### Get TM Score against full proteins and T domains

We can skip random as they have poor sequences

In [8]:
paths = [pmpnn_path, esm3_path, evodiff_path, esm2_entropy_path]
for _path in paths:
    print(_path)
    pdbs_path = os.path.join(_path, '6mfw_pdbs')
    
    ## Sanity check
    print('Checking keys in metadata')
    for f in tqdm(os.listdir(pdbs_path)):
        if f.endswith('.meta.npz'):
            meta_file = os.path.join(pdbs_path, f)
            metadata = dict(np.load(meta_file))
            for key in ['predicted_aligned_error', 'ptm', 'esm2_650M_perplexity', 'max_TM_score', 'T_TM_score']:
                assert key in metadata

/data/users/kgeorge/workspace/Data/pmpnn_experiments/6mfw_exp
Checking keys in metadata


100%|██████████| 3000/3000 [04:20<00:00, 11.49it/s]


/data/users/kgeorge/workspace/Data/esm3_experiments/6mfw_exp
Checking keys in metadata


100%|██████████| 3000/3000 [03:54<00:00, 12.80it/s]


/data/users/kgeorge/workspace/Data/evodiff_experiments/6mfw_exp
Checking keys in metadata


100%|██████████| 3000/3000 [03:56<00:00, 12.68it/s]


/data/users/kgeorge/workspace/Data/esm2_experiments/entropy/6mfw_exp
Checking keys in metadata


100%|██████████| 2949/2949 [03:59<00:00, 12.33it/s]


In [9]:
tm_path = '/nethome/kgeorge/workspace/DomainPrediction/src/DomainPrediction/utils/TMalign'
ref_paths = [os.path.join(data_path, f) for f in ['6mfw_conformations/hm_6mfy_ATC.pdb', '6mfw_conformations/hm_6mfz_ATC.pdb', 
                                                  '6mfw_conformations/hm_6mg0_cA_ATC.pdb', '6mfw_conformations/hm_6mg0_cB_ATC.pdb']]

ref_path = os.path.join(data_path, '6mfw_conformations/hm_6mfy_T.pdb') 


tmalign = TMalign(tm_path)
paths = [pmpnn_path, esm3_path, evodiff_path, esm2_entropy_path]
for _path in paths:
    print(_path)
    pdbs_path = os.path.join(_path, '6mfw_pdbs')
    for f in os.listdir(pdbs_path):
        if f.endswith('T.pdb'):
            print(f)
            file = os.path.join(pdbs_path, f)
            res = tmalign.run(ref_path, file)

            tm_score = res['tm_score']
            meta_file = file.replace('.T.pdb', '.meta.npz') ## need to change this

            assert os.path.isfile(meta_file)
            # helper.update_metadata(meta_file, 'T_TM_score', tm_score, force=False)
    
    ## Sanity check
    # print('Checking keys in metadata')
    # for f in os.listdir(pdbs_path):
    #     if f.endswith('.meta.npz'):
    #         meta_file = os.path.join(pdbs_path, f)
    #         metadata = dict(np.load(meta_file))
    #         for key in ['predicted_aligned_error', 'ptm', 'esm2_650M_perplexity', 'max_TM_score']:
    #             assert key in metadata

/data/users/kgeorge/workspace/Data/pmpnn_experiments/6mfw_exp
6mfw-hm_6mfz_ATC-v_48_002-temp_0.3-gen-0.T.pdb
6mfw-hm_6mfz_ATC-v_48_002-temp_0.3-gen-1.T.pdb
6mfw-hm_6mfz_ATC-v_48_002-temp_0.3-gen-2.T.pdb
6mfw-hm_6mfz_ATC-v_48_002-temp_0.3-gen-3.T.pdb
6mfw-hm_6mfz_ATC-v_48_002-temp_0.3-gen-4.T.pdb
6mfw-hm_6mfz_ATC-v_48_002-temp_0.3-gen-5.T.pdb
6mfw-hm_6mfz_ATC-v_48_002-temp_0.3-gen-6.T.pdb
6mfw-hm_6mfz_ATC-v_48_002-temp_0.3-gen-7.T.pdb
6mfw-hm_6mfz_ATC-v_48_002-temp_0.3-gen-8.T.pdb
6mfw-hm_6mfz_ATC-v_48_002-temp_0.3-gen-9.T.pdb
6mfw-hm_6mfz_ATC-v_48_002-temp_0.3-gen-10.T.pdb
6mfw-hm_6mfz_ATC-v_48_002-temp_0.3-gen-11.T.pdb
6mfw-hm_6mfz_ATC-v_48_002-temp_0.3-gen-12.T.pdb
6mfw-hm_6mfz_ATC-v_48_002-temp_0.3-gen-13.T.pdb
6mfw-hm_6mfz_ATC-v_48_002-temp_0.3-gen-14.T.pdb
6mfw-hm_6mfz_ATC-v_48_002-temp_0.3-gen-15.T.pdb
6mfw-hm_6mfz_ATC-v_48_002-temp_0.3-gen-16.T.pdb
6mfw-hm_6mfz_ATC-v_48_002-temp_0.3-gen-17.T.pdb
6mfw-hm_6mfz_ATC-v_48_002-temp_0.3-gen-18.T.pdb
6mfw-hm_6mfz_ATC-v_48_002-temp_0.3-g