In [3]:
import gzip
import json
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.auto import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

In [43]:
import biotite
import biotite.database.rcsb as rcsb
import biotite.structure.io.pdb as pdb
import biotite.structure as struc

In [2]:
dataset_path = "/projects/robustmicrob/jlaw/inputs"

In [None]:
def load_thermomutdb():
    df = pd.read_csv("thermomutdb/pancotti_etal_2022/Data_s669_with_predictions.csv")

In [9]:
data_file = Path(dataset_path, "thermomutdb/pancotti_etal_2022/Data_s669_with_predictions.csv")
df = pd.read_csv(data_file, index_col=0)
df.head(2)

Unnamed: 0,Protein,PDB_Mut,Mut_seq,TEMP,pH,DDG_checked_dir,DOI,nmr_xray,resolution,MAESTRO_dir,...,SEC_STR_dir,SEC_STR_inv,ThermoNet_dir,ThermoNet_inv,ACDC-NN-Seq_inv,ACDC-NN_inv,PDB_wild,DDGun_inv,DDG_checked_inv,DDGun3D_inv
0,1A0FA,S11A,S11A,329.83,6.5,-1.8,10.1042/BJ20061707,EXPDTA X-RAY DIFFRACTION,REMARK 2 RESOLUTION. 2.10 ANGSTROMS.,-0.761365,...,T,T,0.0209,-0.1772,-0.041723,-0.319539,1A0F,-0.0,1.8,-0.5
1,1A7VA,A104H,A104H,298.15,6.5,-2.69,10.1016/j.jmb.2009.07.074,EXPDTA X-RAY DIFFRACTION,REMARK 2 RESOLUTION. 2.30 ANGSTROMS.,-0.376272,...,H,H,0.1795,-0.0441,0.495499,0.308649,1A7V,0.5,2.69,0.1


In [40]:
data_file.name

'Data_s669_with_predictions.csv'

In [49]:
# get the sequence from the pdb files
from glob import glob
pdb_structures = {}
for pdb_file in glob(f"{dataset_path}/thermomutdb/pdbs/*.pdb*"):
    pdb_id = Path(pdb_file).name.split('.')[0]
    # print(pdb_id, pdb_file)
    structure = pdb.PDBFile.read(pdb_file).get_structure()
    break

In [50]:
pdb_id

'1KJY'

In [52]:
structure.res_id

array([  30,   30,   30, ..., 4176, 4182, 4186])

In [55]:
atom_array = structure[0]

In [56]:
nucleotides = atom_array[struc.filter_nucleotides(atom_array)]

# Get the residue names and residue ids of the nucleotides
residue_ids, residue_names = struc.get_residues(nucleotides)

In [57]:
residue_ids

array([356, 355])

In [59]:
residue_names

array(['GDP', 'GDP'], dtype='<U3')

In [69]:
struc.filter_amino_acids(structure)

array([ True,  True,  True, ..., False, False, False])

In [63]:
struc.get_residues(structure)

(array([  30,   31,   32,   33,   34,   35,   36,   37,   38,   39,   40,
          41,   42,   43,   44,   45,   46,   47,   48,   49,   50,   51,
          52,   53,   54,   55,   56,   57,   58,   59,   60,   61,   62,
          63,   64,   65,   66,   67,   68,   69,   70,   71,   72,   73,
          74,   75,   76,   77,   78,   79,   80,   81,   82,   83,   84,
          85,   86,   87,   88,   89,   90,   91,   92,   93,   94,   95,
          96,   97,   98,   99,  100,  101,  102,  103,  104,  105,  106,
         107,  108,  109,  110,  111,  112,  113,  114,  115,  116,  117,
         118,  119,  120,  121,  122,  123,  124,  125,  126,  127,  128,
         129,  130,  131,  132,  133,  134,  135,  136,  137,  138,  139,
         140,  141,  142,  143,  144,  145,  146,  147,  148,  149,  150,
         151,  152,  153,  154,  155,  156,  157,  158,  159,  160,  161,
         162,  163,  164,  165,  166,  167,  168,  169,  170,  171,  172,
         173,  174,  175,  176,  177, 

In [62]:
len(struc.filter_amino_acids(structure))

5916

In [60]:
# Filter first peptide chain
protein_chain = structure[
    struc.filter_amino_acids(structure)
    & (structure.chain_id == structure.chain_id[0])
]
protein_chain

IndexError: boolean index did not match indexed array along dimension 0; dimension is 1 but corresponding boolean dimension is 5916

In [48]:
pdb_obj.get_structure()

stack([
	array([
	Atom(np.array([19.39 , 38.603,  2.589], dtype=float32), chain_id="A", res_id=30, ins_code="", res_name="GLY", hetero=False, atom_name="N", element="N"),
	Atom(np.array([19.94 , 38.661,  3.974], dtype=float32), chain_id="A", res_id=30, ins_code="", res_name="GLY", hetero=False, atom_name="CA", element="C"),
	Atom(np.array([20.007, 37.293,  4.631], dtype=float32), chain_id="A", res_id=30, ins_code="", res_name="GLY", hetero=False, atom_name="C", element="C"),
	Atom(np.array([19.307, 36.365,  4.217], dtype=float32), chain_id="A", res_id=30, ins_code="", res_name="GLY", hetero=False, atom_name="O", element="O"),
	Atom(np.array([20.849, 37.171,  5.657], dtype=float32), chain_id="A", res_id=31, ins_code="", res_name="ALA", hetero=False, atom_name="N", element="N"),
	Atom(np.array([21.018, 35.914,  6.384], dtype=float32), chain_id="A", res_id=31, ins_code="", res_name="ALA", hetero=False, atom_name="CA", element="C"),
	Atom(np.array([19.802, 35.608,  7.252], dtype=float32), 

In [23]:
df_mapping = pd.read_csv(Path(dataset_path, "thermomutdb/uniprot_pdb_mapping.csv"), index_col=0)
uniprot_to_pdb = dict(zip(df_mapping.uniprot_id, df_mapping.pdb_and_chain))
pdb_to_uniprot = dict(zip(df_mapping.pdb_and_chain, df_mapping.uniprot_id))
df_mapping.head(2)

Unnamed: 0,uniprot_id,PDB,pdb_and_chain
0,P0A877,1wq5,1WQ5A
2,P00720,2lzm,2LZMA


In [24]:
df['uniprot_id'] = df.Protein.apply(lambda x: pdb_to_uniprot.get(x, np.nan))

In [25]:
pdb_to_uniprot.get("1A0FA")

'P0A9D2'

In [26]:
df.head(2)

Unnamed: 0,Protein,PDB_Mut,Mut_seq,TEMP,pH,DDG_checked_dir,DOI,nmr_xray,resolution,MAESTRO_dir,...,SEC_STR_inv,ThermoNet_dir,ThermoNet_inv,ACDC-NN-Seq_inv,ACDC-NN_inv,PDB_wild,DDGun_inv,DDG_checked_inv,DDGun3D_inv,uniprot_id
0,1A0FA,S11A,S11A,329.83,6.5,-1.8,10.1042/BJ20061707,EXPDTA X-RAY DIFFRACTION,REMARK 2 RESOLUTION. 2.10 ANGSTROMS.,-0.761365,...,T,0.0209,-0.1772,-0.041723,-0.319539,1A0F,-0.0,1.8,-0.5,P0A9D2
1,1A7VA,A104H,A104H,298.15,6.5,-2.69,10.1016/j.jmb.2009.07.074,EXPDTA X-RAY DIFFRACTION,REMARK 2 RESOLUTION. 2.30 ANGSTROMS.,-0.376272,...,H,0.1795,-0.0441,0.495499,0.308649,1A7V,0.5,2.69,0.1,P00149


In [27]:
print(len(df), len(df.dropna(subset=['uniprot_id'])))

669 502


In [31]:
df[(pd.isnull(df.uniprot_id)) & (df.Protein == "2PTLA")]

Unnamed: 0,Protein,PDB_Mut,Mut_seq,TEMP,pH,DDG_checked_dir,DOI,nmr_xray,resolution,MAESTRO_dir,...,SEC_STR_inv,ThermoNet_dir,ThermoNet_inv,ACDC-NN-Seq_inv,ACDC-NN_inv,PDB_wild,DDGun_inv,DDG_checked_inv,DDGun3D_inv,uniprot_id
488,2PTLA,A22G,A22G,295.15,7.0,-2.43,10.1006/jmbi.2000.3701,EXPDTA SOLUTION NMR,,-2.934533,...,E,-0.4520,0.9985,1.872548,1.464314,2PTL,1.2,2.43,1.5,
489,2PTLA,A27P,A27P,295.15,7.0,0.10,10.1006/jmbi.2000.3701,EXPDTA SOLUTION NMR,,0.588152,...,T,0.6211,-0.8270,1.042050,0.652630,2PTL,0.9,-0.10,0.5,
490,2PTLA,A27V,A27V,295.15,7.0,-0.83,10.1006/jmbi.2000.3701,EXPDTA SOLUTION NMR,,0.181530,...,T,0.9588,-0.4221,-0.158822,0.082541,2PTL,0.0,0.83,-0.1,
491,2PTLA,A34G,A34G,295.15,7.0,-2.17,10.1006/jmbi.2000.3701,EXPDTA SOLUTION NMR,,-1.315164,...,E,-0.7048,0.5380,1.489788,1.289548,2PTL,1.2,2.17,0.9,
492,2PTLA,A34V,A34V,295.15,7.0,1.47,10.1006/jmbi.2000.3701,EXPDTA SOLUTION NMR,,-0.151049,...,E,0.6982,-0.6354,0.035003,0.446808,2PTL,-0.3,-1.47,-0.5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551,2PTLA,V65A,V65A,295.15,7.0,-1.14,10.1006/jmbi.2000.3701,EXPDTA SOLUTION NMR,,-0.233232,...,E,-0.5447,0.2855,1.393555,1.136307,2PTL,2.0,1.14,0.8,
552,2PTLA,Y48A,Y48A,295.15,7.0,-2.82,10.1006/jmbi.2000.3701,EXPDTA SOLUTION NMR,,-0.377922,...,H,-1.1636,1.0352,2.173247,1.526204,2PTL,2.3,2.82,1.6,
553,2PTLA,Y50A,Y50A,295.15,7.0,-2.46,10.1006/jmbi.2000.3701,EXPDTA SOLUTION NMR,,-0.532283,...,H,-0.8092,1.3655,2.640789,1.944200,2PTL,3.9,2.46,2.4,
554,2PTLA,Y70A,Y70A,295.15,7.0,-1.66,10.1006/jmbi.2000.3701,EXPDTA SOLUTION NMR,,-3.187967,...,T,-2.3793,1.5137,1.581269,1.578347,2PTL,2.0,1.66,1.4,


In [29]:
df[pd.isnull(df.uniprot_id)].Protein.value_counts()

2PTLA    68
1GUAB    49
3DV0I    31
2CLRB     4
1JLVA     3
5VP3A     3
2VY0A     2
3BCIA     2
3MONB     2
4BUQA     2
1R2YA     1
Name: Protein, dtype: int64

In [5]:
df.columns

Index(['Protein', 'PDB_Mut', 'Mut_seq', 'TEMP', 'pH', 'DDG_checked_dir', 'DOI',
       'nmr_xray', 'resolution', 'MAESTRO_dir', 'FoldX_dir', 'PremPS_dir',
       'Dynamut_dir', 'mCSM_dir', 'SDM_dir', 'DUET_dir', 'I-Mutant3.0_dir',
       'I-Mutant3.0-Seq_dir', 'MuPro_dir', 'SAAFEC-Seq_dir', 'DDGun3D_dir',
       'DDGun_dir', 'ACDC-NN-Seq_dir', 'ACDC-NN_dir', 'Mut_PDB', 'PremPS_inv',
       'MAESTRO_inv', 'FoldX_inv', 'Dynamut_inv', 'mCSM_inv', 'SDM_inv',
       'DUET_inv', 'I-Mutant3.0_inv', 'I-Mutant3.0-Seq_inv', 'MuPro_inv',
       'SAAFEC-Seq_inv', 'INPS-Seq_dir', 'INPS3D_dir', 'INPS-Seq_inv',
       'INPS3D_inv', 'PopMusic_dir', 'PopMusic_inv', 'SOL_ACC_dir',
       'SOL_ACC_inv', 'SEC_STR_dir', 'SEC_STR_inv', 'ThermoNet_dir',
       'ThermoNet_inv', 'ACDC-NN-Seq_inv', 'ACDC-NN_inv', 'PDB_wild',
       'DDGun_inv', 'DDG_checked_inv', 'DDGun3D_inv'],
      dtype='object')

In [4]:
data_file2 = Path(dataset_path, "thermomutdb/pancotti_etal_2022/Ssym+_experimental.csv")
df = pd.read_csv(data_file, index_col=0)
df.head(2)

Unnamed: 0,Protein,PDB_Mut,Mut_seq,TEMP,pH,DDG_checked_dir,DOI,nmr_xray,resolution,MAESTRO_dir,...,SEC_STR_dir,SEC_STR_inv,ThermoNet_dir,ThermoNet_inv,ACDC-NN-Seq_inv,ACDC-NN_inv,PDB_wild,DDGun_inv,DDG_checked_inv,DDGun3D_inv
0,1A0FA,S11A,S11A,329.83,6.5,-1.8,10.1042/BJ20061707,EXPDTA X-RAY DIFFRACTION,REMARK 2 RESOLUTION. 2.10 ANGSTROMS.,-0.761365,...,T,T,0.0209,-0.1772,-0.041723,-0.319539,1A0F,-0.0,1.8,-0.5
1,1A7VA,A104H,A104H,298.15,6.5,-2.69,10.1016/j.jmb.2009.07.074,EXPDTA X-RAY DIFFRACTION,REMARK 2 RESOLUTION. 2.30 ANGSTROMS.,-0.376272,...,H,H,0.1795,-0.0441,0.495499,0.308649,1A7V,0.5,2.69,0.1
