In [None]:
import pandas as pd
from Bio import PDB 
from Bio.PDB import PDBParser
from simba2 import methods as simba

In [None]:
# Load the training and test datasets. The No column is cast to string
# right away for compatibility with the simba output.
b1131 = pd.read_csv('../data/processed/B1131_expddg.csv').astype({'No': 'str'})
b663 = pd.read_csv('../data/processed/B663_expddg.csv').astype({'No': 'str'})
s350 = pd.read_csv('../data/processed/S350_expddg.csv').astype({'No': 'str'})

In [None]:
# Directory handed to simba.exists_pdb when looking up each PDB
pdb_dir = '../data/external/PDB'

In [None]:
# Unique entries across the three datasets (the experimental ddG is left
# out so that only the remaining columns decide what counts as a duplicate)
dataset_residues = pd.concat([b1131, b663, s350]).drop('exp_ddG', axis=1).drop_duplicates()

# Unique PDB identifiers across the datasets
dataset_pdbs = pd.unique(dataset_residues['PDB'])

In [None]:
# For every PDB referenced by the datasets, run Simba2 to obtain RSA, Hdiff,
# Vdiff and predicted ddG (the predicted ddG is not used for creating the
# datasets), flag heterooligomers in a 'Hetero' column, and outer-merge the
# result with the dataset entries so that '_merge' records whether each row
# is present in Simba output, in the datasets, or in both.
merge_keys = ['PDB', 'Wild', 'Mutated']
df_list = []
for pdb in dataset_pdbs:
    pdb_path, exists = simba.exists_pdb(pdb, pdb_dir)
    print(pdb_path, exists)
    if not exists:
        # PDBs that cannot be found are skipped; the printed line above is
        # the only trace of them.
        continue

    df = simba.simba2_predict(pdb, pdb_path)
    multichain, homo = simba.check_chains(df)
    df.insert(loc=1, column='Hetero', value=multichain and not homo)

    pdb_entries = dataset_residues[dataset_residues['PDB'] == pdb]
    merged = df.merge(
        pdb_entries,
        how='outer',
        indicator=True,
        left_on=['PDB', 'Number'] + merge_keys[1:],
        right_on=['PDB', 'No'] + merge_keys[1:],
    )
    df_list.append(merged)

In [None]:
# Stack the per-PDB merge results into a single dataframe
simba_df = pd.concat(df_list)        

In [None]:
def len_unique(inputlist):
    """Return how many distinct values `inputlist` holds.

    `inputlist` must expose a `.unique()` method (e.g. a pandas Series);
    the length of what that method returns is the result.
    """
    distinct_values = inputlist.unique()
    return len(distinct_values)

In [None]:
# Check which dataset residues were not matched by any Simba output row
# (the outer merge flags those rows as 'right_only').
absent = simba_df[simba_df['_merge'] == 'right_only']
for label, dataset in (('B1131:', b1131), ('B663:', b663), ('S350:', s350)):
    print(label, pd.merge(dataset, absent, on = ['PDB', 'Wild', 'No', 'Mutated'], how = 'inner'))

Four data points in B663 do not exist in the simba output (the residue type is not present at that position in the PDBs).

In [None]:
# Remove outliers: keep only entries whose exp_ddG lies strictly
# between -8 and 8, i.e. whose absolute value is below 8.
b1131 = b1131[b1131['exp_ddG'].abs() < 8]
b663 = b663[b663['exp_ddG'].abs() < 8]
s350 = s350[s350['exp_ddG'].abs() < 8]

In [None]:
# Number of B1131 entries left after the outlier filter
len(b1131)

In [None]:
# Number of B663 entries left after the outlier filter
len(b663)

In [None]:
# Number of S350 entries left after the outlier filter
len(s350)

In [None]:
# Build a dataframe of the dataset entries for which it is not possible to
# know which chain they are located in.
#
# NOTE: this cell overwrites simba_df and drops its '_merge' column, so it
# cannot be re-run on its own; re-run the cell that concatenates df_list first.

## keep only mutations that are present in the data sets
simba_df = simba_df[simba_df['_merge'] == 'both'].drop(columns = ['_merge'])

## keep heterooligomers
hetero = simba_df[simba_df['Hetero']]

## for each unique mutation, count the distinct chains it appears in
no_chains = pd.DataFrame(hetero.groupby(['PDB', 'Number', 'Mutated'])['Chain'].agg(len_unique))

## make dataframe of mutations that appear in more than one chain; the
## 'Number' group key is renamed to 'No' to match the dataset column name
unsure_chain = no_chains[no_chains['Chain'] != 1]
unsure_chain = pd.DataFrame(list(unsure_chain.index), columns = ['PDB', 'No', 'Mutated'])

unsure_chain

In [None]:
# Inspect the Simba rows flagged as heterooligomers
hetero

In [None]:
# Inspect the distinct-chain count per (PDB, Number, Mutated) among heterooligomers
no_chains

In [None]:
def anti_join(dataset, unsure):
    """Return the rows of `dataset` that have no match in `unsure`.

    Rows are matched on the 'PDB', 'No' and 'Mutated' columns. Both frames
    are outer-merged with a merge indicator, only rows found exclusively in
    `dataset` are kept, and the indicator column is removed again.
    """
    keys = ['PDB', 'No', 'Mutated']
    flagged = dataset.merge(unsure, how='outer', on=keys, indicator=True)
    only_in_dataset = flagged['_merge'] == 'left_only'
    return flagged.loc[only_in_dataset].drop(columns='_merge')

In [None]:
# Drop every entry whose chain cannot be determined
b1131, b663, s350 = (
    anti_join(dataset, unsure_chain) for dataset in (b1131, b663, s350)
)

In [None]:
# Number of B1131 entries left after discarding unsure-chain entries
len(b1131)

In [None]:
# Number of B663 entries left after discarding unsure-chain entries
len(b663)

In [None]:
# Number of S350 entries left after discarding unsure-chain entries
len(s350)

In [None]:
# Inspect the Simba output restricted to mutations present in the datasets
simba_df

In [None]:
# Merge datasets (without unsure chains) with simba output and tidy up result 
def choose_var(row, variable):
    """Pick the value to use for `variable` from a single row.

    Returns `row[variable + '_mean']` unless that entry is missing
    (as judged by `pd.isna`), in which case `row[variable]` is returned.
    """
    mean_value = row[variable + '_mean']
    return row[variable] if pd.isna(mean_value) else mean_value

def finalize_dataset(dataset, simba_output):
    """Combine a dataset with the Simba output and tidy up the result.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Dataset entries; must provide the merge keys 'PDB', 'No', 'Wild'
        and 'Mutated' as well as 'exp_ddG'.
    simba_output : pandas.DataFrame
        Simba output; must provide the same merge keys plus 'Number',
        'RSA', 'RSA_mean', 'Hdiff' and 'Vdiff'.

    Returns
    -------
    pandas.DataFrame
        The inner merge of both frames, reduced to the columns
        PDB, Wild, Number, RSA, Mutated, Hdiff, Vdiff, exp_ddG (in that
        order) with duplicated rows removed. RSA holds 'RSA_mean' where it
        is available and falls back to the plain 'RSA' value otherwise.
    """
    merged = pd.merge(dataset, simba_output, on=['PDB', 'No', 'Wild', 'Mutated'], how='inner')

    # Vectorised coalesce: take RSA_mean and fall back to RSA where the mean
    # is missing. This replaces a row-by-row apply() with one column operation.
    merged['RSA'] = merged['RSA_mean'].fillna(merged['RSA'])

    final_columns = ['PDB', 'Wild', 'Number', 'RSA', 'Mutated', 'Hdiff', 'Vdiff', 'exp_ddG']
    return merged[final_columns].drop_duplicates()

In [None]:
# Build the final version of each dataset from the filtered entries and
# the Simba output.
b1131_simba2, b663_simba2, s350_simba2 = (
    finalize_dataset(dataset, simba_df) for dataset in (b1131, b663, s350)
)

In [None]:
# Row counts for B1131 before and after finalizing, plus the number of
# fully duplicated rows in the filtered dataset.
print("After removal of hetero:", b1131.shape[0])
print("Final:", b1131_simba2.shape[0])
print("Duplicates:", b1131.duplicated().sum())

There are two duplicated data points in the original B1131, which are removed above.

In [None]:
## The duplicated entries are (rows of b1131 that repeat an earlier row):
b1131[b1131.duplicated()]

In [None]:
# Row counts for B663 before and after finalizing, plus the number of
# fully duplicated rows in the filtered dataset.
print("After removal of hetero:", b663.shape[0])
print("Final:", b663_simba2.shape[0])
print("Duplicates:", b663.duplicated().sum())

Four data points were missing from the PDBs; see above.

In [None]:
# Row counts for S350 before and after finalizing, plus the number of
# fully duplicated rows in the filtered dataset.
print("After removal of hetero:", s350.shape[0])
print("Final:", s350_simba2.shape[0])
print("Duplicates:", s350.duplicated().sum())

One data point is missing because PDB entry 2A01 is obsolete.

In [None]:
# Tag each finalized dataset with its new name and write it to a CSV file
# named after it.
for frame, label, csv_path in (
    (b1131_simba2, "B1112", '../data/processed/B1112.csv'),
    (b663_simba2, "B655", '../data/processed/B655.csv'),
    (s350_simba2, "S344", '../data/processed/S344.csv'),
):
    frame['dataset'] = label
    frame.to_csv(csv_path, index = False)

In [None]:
from simba2 import methods as simba

In [None]:
# List the names exposed by the simba2 methods module. It is imported under
# the alias `simba`; the bare name `methods` is never bound in this notebook.
dir(simba)