In [26]:
import pandas as pd
import simba2.methods as simba
from Bio.PDB import PDBList

In [27]:
# Import Ssym reverse dataset
ssym = pd.read_csv('../data/processed/ssym_tidy.csv')

# Residue numbers are handled as strings; 'Number' is one of the keys the
# prediction frames are merged on further down.
ssym['Number'] = ssym['Number'].astype('str')
# Vectorised upper-casing instead of a row-wise apply: same result for string
# ids, and it stays well-defined when the frame has no rows.
ssym['PDB'] = ssym['PDB'].str.upper()

# Lookup table keyed by the 'Code' column of aa_codes.csv. Its values are taken
# from the first column left once 'Name' is dropped and 'Code' becomes the index
# (presumably the one-letter residue symbol -- TODO confirm against the CSV).
aa_codes = pd.read_csv('../data/aa_codes.csv')
aa_dict = aa_codes.drop(columns='Name').set_index('Code').iloc[:, 0].to_dict()

In [28]:
# Translate the residue codes in 'Wild' and 'Mutated' through aa_dict.
# NOTE(review): the columns are overwritten in place, so re-running this cell
# presumably fails because translated values are no longer keys of aa_dict.
for column in ['Wild', 'Mutated']:
    # Series.map would silently turn unknown codes into NaN; raise the same
    # KeyError a direct dict lookup would, but list every offending code.
    unknown_codes = set(ssym[column].unique()) - set(aa_dict)
    if unknown_codes:
        raise KeyError(
            f"{column}: codes missing from aa_dict: {sorted(unknown_codes, key=str)}"
        )
    # Vectorised lookup instead of a Python-level apply over every row.
    ssym[column] = ssym[column].map(aa_dict)

In [29]:
# Number of rows in the loaded dataset.
ssym.shape[0]

684

In [30]:
# QC on dataset

## Any duplicates?
ssym_mutations = ssym[['Number', 'PDB', 'Wild', 'Mutated', 'exp_ddG']]

# Flag rows that repeat an earlier row on all five columns. The mask is built
# once and reused for both the row selection and the count.
duplicate_mask = ssym_mutations.duplicated()

# Kept under a name so the offending rows can be inspected when the count
# below is not zero (a bare expression in the middle of a cell is never shown).
duplicate_rows = ssym_mutations[duplicate_mask]

# Cell output: number of duplicated rows.
int(duplicate_mask.sum())

0

In [31]:
# Directory for the PDB structure files; it is handed to
# PDBList.retrieve_pdb_file (download) and simba.exists_pdb (lookup) below.
# NOTE(review): the saved output of the download cell lists files under
# '../data/external/PDB' -- confirm which directory is the current one.
pdb_dir = '../data/PDB'

In [32]:
# Distinct PDB ids of the dataset, in order of first appearance.
unique_pdbs = pd.unique(ssym['PDB'])

In [25]:
# Fetch one PDB-format structure file per dataset PDB id into pdb_dir.
pdbl = PDBList()

for name in unique_pdbs:
    pdbl.retrieve_pdb_file(pdb_code=name, pdir=pdb_dir, file_format="pdb")

Structure exists: '../data/external/PDB\pdb1amq.ent' 
Structure exists: '../data/external/PDB\pdb1qir.ent' 
Structure exists: '../data/external/PDB\pdb1qis.ent' 
Structure exists: '../data/external/PDB\pdb1qit.ent' 
Structure exists: '../data/external/PDB\pdb5eaa.ent' 
Structure exists: '../data/external/PDB\pdb1bni.ent' 
Structure exists: '../data/external/PDB\pdb1brg.ent' 
Structure exists: '../data/external/PDB\pdb1brh.ent' 
Structure exists: '../data/external/PDB\pdb1bns.ent' 
Structure exists: '../data/external/PDB\pdb1bsa.ent' 
Structure exists: '../data/external/PDB\pdb1bri.ent' 
Structure exists: '../data/external/PDB\pdb1bsb.ent' 
Structure exists: '../data/external/PDB\pdb1bao.ent' 
Structure exists: '../data/external/PDB\pdb1brj.ent' 
Structure exists: '../data/external/PDB\pdb1bsc.ent' 
Structure exists: '../data/external/PDB\pdb1bse.ent' 
Structure exists: '../data/external/PDB\pdb1ban.ent' 
Structure exists: '../data/external/PDB\pdb1brk.ent' 
Structure exists: '../data/e

In [33]:
# Calculate ddG with Simba2 for each PDB in the dataset, but
# keep only those entries that are present in the dataset.
#
# While looping, three diagnostics are printed per PDB:
#   * the PDB id when none of its dataset entries matched a prediction row,
#   * dataset (Number, Wild) pairs that have no counterpart in the predictions,
#   * dataset entries whose chain does not occur in the predictions.

df_list = []
for pdb in unique_pdbs:
    pdb_path, exists = simba.exists_pdb(pdb, pdb_dir)
    if not exists:
        # Say so explicitly: otherwise the entries of this PDB silently vanish
        # from the final table.
        print(f'{pdb}: no structure file found in {pdb_dir}, skipped')
        continue

    df = simba.simba2_predict(pdb, pdb_path)

    # Dataset rows for this PDB, selected once and reused by every step below.
    pdb_entries = ssym[ssym['PDB'] == pdb]

    df1 = pd.merge(df,
                   pdb_entries,
                   how = 'inner',
                   on = ['PDB', 'Chain', 'Number', 'Wild', 'Mutated'])

    # Nothing matched at all for this PDB.
    if len(df1) == 0:
        print(pdb)

    ## Does wild-type residue type and position match in PDB?
    # Merge from the dataset side so that unmatched rows keep their own PDB id
    # (taking 'PDB' from the prediction frame leaves it NaN for exactly the
    # rows that are reported).
    predicted_positions = df[['Number', 'Wild']].drop_duplicates()
    merged = pd.merge(pdb_entries[['PDB', 'Number', 'Wild']].drop_duplicates(),
                      predicted_positions,
                      how = 'left',
                      indicator = True,
                      on = ['Number', 'Wild'])
    not_matched = merged[merged['_merge'] == 'left_only']
    if len(not_matched) != 0:
        print(not_matched[['PDB', 'Number', 'Wild']])

    ## Is the chain letter present in the PDB?
    # Membership test instead of an outer merge on 'Chain' alone, which pairs
    # every prediction row with every dataset row of the same chain.
    not_matched = pdb_entries[~pdb_entries['Chain'].isin(df['Chain'])]
    if len(not_matched) != 0:
        print(not_matched[['PDB', 'Number', 'Chain']])

    df_list.append(df1)
        

In [34]:
# Stack the per-PDB results into one table. ignore_index gives the result a
# fresh 0..n-1 index; without it every per-PDB frame contributes its own merge
# index and the labels repeat across PDBs.
simba_df = pd.concat(df_list, ignore_index=True)

In [35]:
# Number of rows in the combined prediction table.
simba_df.shape[0]

684

In [25]:
# Order the rows by mutation index, PDB id and residue number, then write them
# out without the index column.
# NOTE(review): 'Number' was cast to str when the dataset was loaded, so this
# sort is presumably lexicographic on that key -- confirm that is acceptable.
simba_sorted = simba_df.sort_values(by=['Mut_index', 'PDB', 'Number'])
simba_sorted.to_csv("../data/ssym_2.csv", index=False)