# 1 For reproducing results in the PocketAnchor paper or processing customized datasets with predefined PDB IDs and chains

## 1.1 Prepare or load the list file

### 1.1.1 For reproducing the results in PocketAnchor paper, please just use the lists in ./lists/

In [52]:
# demo data: one entry per line of the list file
with open('lists/pdbid_chain_list_HOLO4k.txt') as f:
    # Ignore blank lines so that later `item[:4]` slicing never receives an empty entry.
    list_task = [entry for entry in (line.strip() for line in f) if entry]
# print(list_task)

For reproducing results on COACH420, please use lists/pdbid_chain_list_COACH420.txt

For reproducing results on HOLO4k, please use lists/pdbid_chain_list_HOLO4k.txt

For reproducing results on PDBbind, please use lists/pdbid_chain_center_list_PDBbind.txt


### 1.1.2 For customized datasets:

In [3]:
# example: the entries below are written, one per line, to lists/<name>.txt
name = 'customized_list'
customized_list = ['121p_A', '12as_AB', '13pk_ABCD', '16pk_A', '17gs_AB', '182l_A', '183l_A', '185l_A', '186l_A', '187l_A', '18gs_AB', '19gs_AB', '1a05_AB', '1a0f_AB', '1a0g_AB']
# write the list file
list_path = 'lists/{}.txt'.format(name)
with open(list_path, 'w') as f:
    f.writelines(item + '\n' for item in customized_list)

In [4]:
# define list_task (blank lines in the list file are ignored)
with open('lists/{}.txt'.format(name)) as f:
    list_task = [entry for entry in (line.strip() for line in f) if entry]
print(list_task)

['121p_A', '12as_AB', '13pk_ABCD', '16pk_A', '17gs_AB', '182l_A', '183l_A', '185l_A', '186l_A', '187l_A', '18gs_AB', '19gs_AB', '1a05_AB', '1a0f_AB', '1a0g_AB']


## 1.2 Download pdb files

In [79]:
import pymol
import os
from multiprocessing import Pool

In [130]:
def download_one(pdbid):
    """Fetch ``<pdbid>.pdb`` from RCSB into MasifOutput/download/ unless it already exists.

    Args:
        pdbid: PDB identifier; used for both the local file name and the URL.

    Returns:
        None. A failure to start ``wget`` is reported on stdout, not raised.
    """
    import subprocess  # local import keeps this notebook cell self-contained

    target_dir = 'MasifOutput/download/'
    if os.path.exists(os.path.join(target_dir, '{}.pdb'.format(pdbid))):
        return
    url = 'https://files.rcsb.org/download/{}.pdb'.format(pdbid)
    try:
        # Argument list instead of a shell string: the id is never parsed by a shell.
        subprocess.run(['wget', '-P', target_dir, url], check=False)
    except OSError as err:  # e.g. wget is not installed
        print('could not run wget for {}: {}'.format(pdbid, err))

###############################################
### please define this number according to 
### your computational resources
num_processes = 32
###############################################

# De-duplicate the ids (first-seen order is kept): if two entries share a PDB id,
# two workers would otherwise fetch the same file at the same time.
pdbid_list = list(dict.fromkeys(item[:4] for item in list_task))

with Pool(num_processes) as p:
    res = p.map(download_one, pdbid_list)

## 1.3 Add hydrogens

In [134]:
def process_pdb(pdbid):
    """Add hydrogens to a downloaded structure and save it to MasifOutput/00-raw_pdbs/.

    Does nothing when MasifOutput/download/<pdbid>.pdb does not exist.

    Args:
        pdbid: PDB identifier; also passed to ``h_add`` as the selection.

    Returns:
        None in every case.
    """
    src_path = 'MasifOutput/download/{}.pdb'.format(pdbid)
    if not os.path.exists(src_path):
        return
    # Make sure the output folder exists before PyMOL tries to write into it.
    os.makedirs('MasifOutput/00-raw_pdbs', exist_ok=True)
    pymol.cmd.reinitialize()
    pymol.cmd.load(src_path)
    pymol.cmd.h_add(pdbid)
    pymol.cmd.save('MasifOutput/00-raw_pdbs/{}.pdb'.format(pdbid), state=-1)
    return

In [135]:
# De-duplicate the ids (first-seen order is kept) so that two workers never
# write the same output file at the same time.
pdbid_list = list(dict.fromkeys(item[:4] for item in list_task))

with Pool(num_processes) as p:
    res = p.map(process_pdb, pdbid_list)

In [136]:
# check the processed files: number of entries currently in the hydrogen-added output folder
len(os.listdir('MasifOutput/00-raw_pdbs/'))

583

## 1.4 Calculate MaSIF meshes and features

In [86]:
from urllib.parse import quote_plus
import xmlrpc.client as rpc_client
import pandas as pd
from io import StringIO
import numpy as np

In [87]:
# Connection settings for the XML-RPC endpoint; user and password are URL-quoted
# before being embedded in the link.
rpc_settings = dict(
    user=quote_plus("anchor"),
    pwd=quote_plus("pocket"),
    host="192.168.1.233",  # change this IP to that of your masif server
    port="1213",
)
link_str = "http://{user}:{pwd}@{host}:{port}".format(**rpc_settings)
rpc = rpc_client.ServerProxy(link_str)

In [137]:
# Call rpc.query once per entry of list_task (per the section title, this is
# what requests the MaSIF meshes/features for each task).
for task in list_task:
    rpc.query(task)

In [138]:
# inspect the loop variable left over from the previous cell (the last entry of list_task)
task

'4kkg_A'

## 1.5 Check progress and fix the failed samples using PDBFixer

In [154]:
# Tally finished tasks and print one pdbfixer command for every failed one.
count_done = 0
list_task_fix = []
for task in set(list_task):
    # Ask the server once per task: repeated calls are wasted round-trips and
    # could return different answers while a job is changing state.
    status = rpc.check(task)
    if status == 'Done':
        count_done += 1
    elif status == 'Failed':
        print('pdbfixer ./MasifOutput/download/{}.pdb  --output ./MasifOutput/download/fixed_{}.pdb'\
              .format(task[:4], task[:4]))
        # The repaired structure is re-submitted later under a "fixed_" task name.
        list_task_fix.append('fixed_'+task)
    else:
        # Any other status is shown as-is.
        print(task, status)
count_done

pdbfixer ./MasifOutput/download/1hbq.pdb  --output ./MasifOutput/download/fixed_1hbq.pdb
pdbfixer ./MasifOutput/download/1bgt.pdb  --output ./MasifOutput/download/fixed_1bgt.pdb
pdbfixer ./MasifOutput/download/1epa.pdb  --output ./MasifOutput/download/fixed_1epa.pdb
pdbfixer ./MasifOutput/download/1gmc.pdb  --output ./MasifOutput/download/fixed_1gmc.pdb
pdbfixer ./MasifOutput/download/3zv2.pdb  --output ./MasifOutput/download/fixed_3zv2.pdb
pdbfixer ./MasifOutput/download/1oem.pdb  --output ./MasifOutput/download/fixed_1oem.pdb
pdbfixer ./MasifOutput/download/1pyp.pdb  --output ./MasifOutput/download/fixed_1pyp.pdb
pdbfixer ./MasifOutput/download/1kan.pdb  --output ./MasifOutput/download/fixed_1kan.pdb
pdbfixer ./MasifOutput/download/1h6j.pdb  --output ./MasifOutput/download/fixed_1h6j.pdb
pdbfixer ./MasifOutput/download/3t0h.pdb  --output ./MasifOutput/download/fixed_3t0h.pdb
pdbfixer ./MasifOutput/download/1uyl.pdb  --output ./MasifOutput/download/fixed_1uyl.pdb
pdbfixer ./MasifOutpu

470

Run the above commands in a terminal to fix the samples that were not successfully processed in the last step.

And then calculate MaSIF precomputed features for them:

In [161]:
def process_fixed_pdb(pdbid):
    """Add hydrogens to a PDBFixer-repaired structure and save it to MasifOutput/00-raw_pdbs/.

    Args:
        pdbid: PDB identifier; the input file is MasifOutput/download/fixed_<pdbid>.pdb.

    Returns:
        False when the repaired input file does not exist, True after saving.
    """
    src_path = 'MasifOutput/download/fixed_{}.pdb'.format(pdbid)
    if not os.path.exists(src_path):
        return False
    # Make sure the output folder exists before PyMOL tries to write into it.
    os.makedirs('MasifOutput/00-raw_pdbs', exist_ok=True)
    pymol.cmd.reinitialize()
    pymol.cmd.load(src_path)
    pymol.cmd.h_add()
    pymol.cmd.save('MasifOutput/00-raw_pdbs/fixed_{}.pdb'.format(pdbid), state=-1)
    return True

In [163]:
# add hydrogens to every repaired structure; the task name is printed first,
# then the True/False result of the processing call
prefix_len = len('fixed_')
for task in list_task_fix:
    original_id = task[prefix_len:prefix_len + 4]
    print(task, ' ', end="")
    succeeded = process_fixed_pdb(original_id)
    print(succeeded)

fixed_1hbq_A  True
fixed_1bgt_A  True
fixed_1epa_AB  True
fixed_1gmc_EFG  True
fixed_3zv2_A  True
fixed_1oem_X  True
fixed_1pyp_AB  True
fixed_1kan_AB  True
fixed_1h6j_AB  True
fixed_3t0h_A  True
fixed_1uyl_A  True
fixed_1gsb_AB  True
fixed_1qhm_AB  True
fixed_1dla_B  True
fixed_2pfk_CD  True
fixed_2tld_E  True
fixed_1gsc_AB  True
fixed_1ksv_A  True
fixed_1aat_AB  True
fixed_2gch_EFG  True
fixed_5cha_EFG  True


In [164]:
# run the Masif server: call rpc.query once for every "fixed_" task collected above
for task in list_task_fix:
    rpc.query(task)

In [167]:
# check progress: print the status string returned by rpc.check for each "fixed_" task
for task in list_task_fix:
    print(task, '\t',  rpc.check(task))

fixed_1hbq_A 	 Done
fixed_1bgt_A 	 Done
fixed_1epa_AB 	 Done
fixed_1gmc_EFG 	 Done
fixed_3zv2_A 	 Done
fixed_1oem_X 	 Done
fixed_1pyp_AB 	 Done
fixed_1kan_AB 	 Done
fixed_1h6j_AB 	 Done
fixed_3t0h_A 	 Done
fixed_1uyl_A 	 Done
fixed_1gsb_AB 	 Done
fixed_1qhm_AB 	 Doing
fixed_1dla_B 	 Done
fixed_2pfk_CD 	 Done
fixed_2tld_E 	 Done
fixed_1gsc_AB 	 Done
fixed_1ksv_A 	 Failed
fixed_1aat_AB 	 Done
fixed_2gch_EFG 	 Done
fixed_5cha_EFG 	 Done


# 2 For processing datasets without predefined chains (need ligand files; e.g., PDBbind)

## 2.1 Download pdb files

In [None]:
from multiprocessing import Pool
import os

def download_one(pdbid):
    """Fetch ``<pdbid>.pdb`` from RCSB into MasifOutput/00-raw_pdbs/ unless it already exists.

    Args:
        pdbid: PDB identifier; used for both the local file name and the URL.

    Returns:
        None. A failure to start ``wget`` is reported on stdout, not raised.
    """
    import subprocess  # local import keeps this notebook cell self-contained

    target_dir = 'MasifOutput/00-raw_pdbs/'
    if os.path.exists(os.path.join(target_dir, '{}.pdb'.format(pdbid))):
        return
    url = 'https://files.rcsb.org/download/{}.pdb'.format(pdbid)
    try:
        # Argument list instead of a shell string: the id is never parsed by a shell.
        subprocess.run(['wget', '-P', target_dir, url], check=False)
    except OSError as err:  # e.g. wget is not installed
        print('could not run wget for {}: {}'.format(pdbid, err))

###############################################
### please define this number according to 
### your computational resources
num_processes = 4 
###############################################

# De-duplicate the ids (first-seen order is kept): if two entries share a PDB id,
# two workers would otherwise fetch the same file at the same time.
pdbid_list = list(dict.fromkeys(item[:4] for item in list_task))

with Pool(num_processes) as p:
    res = p.map(download_one, pdbid_list)

## 2.2 Define chains using the following demo code

In [None]:
from pymol import cmd
import numpy as np
import pandas as pd
import os, time, sys, pickle
from collections import defaultdict
from sklearn.metrics import pairwise_distances

# NOTE: the functions below join these paths to file names by plain string
# concatenation (e.g. pdb_datapath + pdbid + '.pdb'), so each path should end
# with a path separator.
prefix = "../"
pdb_datapath = prefix + "your_local_path"  # for raw pdb files
pdbbind_datapath = prefix + "your_local_path"  # need the ligand files provided by PDBbind or other sources
pdbchains_path = prefix + "your_local_path"  # where the resulting chain files are saved

In [6]:
def get_biomolecule(pdb_filename):
    """Collect the chains listed for each biomolecule in the REMARK 350 header records.

    Reading stops at the first ATOM record. Blank lines and malformed REMARK 350
    lines are skipped instead of raising.

    Args:
        pdb_filename: path of the PDB file to read.

    Returns:
        dict mapping the biomolecule id (the 4th whitespace-separated token of a
        "BIOMOLECULE" line) to the list of chain ids gathered from the following
        "APPLY THE FOLLOWING TO CHAIN..." / "AND CHAIN..." lines.
    """
    dict_temp = {}
    biomol = None  # id of the biomolecule section currently being read
    with open(pdb_filename, 'r') as f:
        for line in f:
            temp = line.split()
            if not temp:
                # Blank line: nothing to inspect.
                continue
            if temp[0] == "ATOM":
                # Coordinates start here; the header is over.
                break
            if temp[0] != "REMARK" or len(temp) < 2 or temp[1] != "350":
                continue
            if "BIOMOLECULE" in line:
                if len(temp) > 3:
                    biomol = temp[3]
            elif "APPLY THE FOLLOWING TO CHAIN" in line or "AND CHAIN" in line:
                fields = line.strip().replace(" ", "").split(':')
                if biomol is None or len(fields) < 2:
                    # Chain line without a preceding BIOMOLECULE header, or without ':'.
                    continue
                # A trailing comma on continued lines yields an empty item; drop it.
                added = [item for item in fields[1].split(',') if item != ""]
                dict_temp.setdefault(biomol, []).extend(added)
    return dict_temp


def select_biomolecule(pdb_filename, chain_dict, ligand_filename=None):
    """Compute, for each biomolecule, the minimum atom distance to the ligand.

    Args:
        pdb_filename: either a PDB id (e.g. '1a1e') or a path whose file stem is
            the PDB id. An existing file is loaded directly; otherwise the
            structure is read from ``pdb_datapath + <id> + '.pdb'``.
        chain_dict: mapping biomolecule id -> list of chain ids.
        ligand_filename: ligand file to load; defaults to
            ``pdbbind_datapath + <id> + '/' + <id> + '_ligand.sdf'``.

    Returns:
        dict mapping biomolecule id -> smallest ligand/biomolecule atom distance,
        or ``np.nan`` when the biomolecule has no chains or selects no atoms.
    """
    # Derive the id from the argument instead of relying on a module-level `pdbid`.
    pdbid = os.path.splitext(os.path.basename(pdb_filename))[0]
    if os.path.isfile(pdb_filename):
        pdb_path = pdb_filename
    else:
        pdb_path = pdb_datapath + pdbid + '.pdb'
    if ligand_filename is None:
        ligand_filename = pdbbind_datapath + pdbid + '/' + pdbid + '_ligand.sdf'
    ligand_name = '{}_ligand'.format(pdbid)

    cmd.reinitialize()
    cmd.load(pdb_path)
    # Name the ligand object explicitly so the selection below does not depend
    # on the ligand file's own name.
    cmd.load(ligand_filename, ligand_name)
    ligand_coords = []
    cmd.iterate_state(-1, ligand_name, 'ligand_coords.append([x,y,z])',
                      space={'ligand_coords': ligand_coords})

    distance_dict = {}
    for biomol, chain_list in chain_dict.items():
        coords = []
        if chain_list:  # an empty list would produce an invalid 'chain ' selection
            cmd.select('biomol_{}'.format(biomol), 'chain {}'.format('+'.join(chain_list)))
            cmd.iterate_state(-1, 'biomol_{}'.format(biomol), 'coords.append([x,y,z])',
                              space={'coords': coords})
        if len(coords) > 0:
            dist = pairwise_distances(ligand_coords, coords)
            distance_dict[biomol] = dist.min()
        else:
            distance_dict[biomol] = np.nan
    return distance_dict

def get_all_chains(pdbid):
    """Return the set of chain ids left in <pdbid>.pdb after ``cmd.remove('het')``.

    The structure is loaded from ``pdb_datapath + '<pdbid>.pdb'`` into a freshly
    reinitialized PyMOL session, and chains are gathered over every loaded object.
    """
    structure_path = pdb_datapath + '{}.pdb'.format(pdbid)
    cmd.reinitialize()
    cmd.load(structure_path)
    cmd.remove('het')
    return {
        chain_id
        for obj_name in cmd.get_names()
        for chain_id in cmd.get_chains(obj_name)
    }

def get_remove_chains(pdbid):
    """Return the non-empty chain ids of atoms in the '<pdbid>_ligand expand 0.1' selection.

    The structure is loaded and stripped with ``cmd.remove('het')`` first, then
    the ligand SDF is loaded on top of it and the selection is evaluated.
    """
    structure_path = pdb_datapath + '{}.pdb'.format(pdbid)
    ligand_path = pdbbind_datapath + '{}/{}_ligand.sdf'.format(pdbid, pdbid)

    cmd.reinitialize()
    cmd.load(structure_path)
    cmd.remove('het')
    cmd.load(ligand_path)
    cmd.select('near_ligand', '{}_ligand expand 0.1'.format(pdbid))

    chains = set()
    cmd.iterate('near_ligand', 'chains.add(chain)', space={'chains': chains})
    # Atoms with no chain id report '' as their chain; that is not a real chain.
    chains.discard('')
    return chains


def get_chains_one_sample(pdbid, isprint=False, isread=True):
    """Determine the chains to keep for one PDB entry and cache them on disk.

    Among the biomolecules listed in the PDB header, the one closest to the
    ligand is chosen (all chains are used when the header lists none); chains
    returned by ``get_remove_chains`` are then dropped.

    Args:
        pdbid: PDB identifier of the entry.
        isprint: print the intermediate results when True.
        isread: when True and ``pdbchains_path + pdbid`` exists, return the
            first line of that file instead of recomputing.

    Returns:
        The selected chain ids, sorted and concatenated into one string, or ""
        when the ligand-distance computation fails.

    Raises:
        AssertionError: when no biomolecule has a usable distance to the ligand.
    """
    cache_path = pdbchains_path + pdbid
    if isread and os.path.exists(cache_path):
        with open(cache_path, 'r') as f:
            return f.readline().strip()

    # Build the real file paths: the helpers expect file names, not the bare id.
    pdb_path = pdb_datapath + pdbid + '.pdb'
    ligand_path = pdbbind_datapath + pdbid + '/' + pdbid + '_ligand.sdf'

    biomol_dict = get_biomolecule(pdb_path)
    if isprint:
        print('biomol_dict', biomol_dict)
    try:
        result_dict = select_biomolecule(pdb_path, biomol_dict, ligand_path)
    except Exception as E:
        # Best effort: report the entry that failed and give back an empty result.
        print(pdbid, E)
        return ""
    if isprint:
        print('result_dict', result_dict)

    if len(biomol_dict) == 0:
        select_chains = get_all_chains(pdbid)
    else:
        select_key, selected_value = '', 99999
        for key, value in result_dict.items():
            # `value != np.nan` is always True, so test for NaN explicitly.
            if not np.isnan(value) and value < selected_value:
                selected_value = value
                select_key = key
        if isprint:
            print('select_key', select_key)
        if select_key == '':
            # Raised explicitly so the check also runs under `python -O`.
            raise AssertionError(pdbid)
        select_chains = biomol_dict[select_key]
    if isprint:
        print('select_chains', select_chains)

    remove_chains = get_remove_chains(pdbid)
    if isprint:
        print('remove_chains', remove_chains)
    final_chains = list(set(select_chains) - remove_chains)
    if isprint:
        print('final_chains', final_chains)

    chains = "".join(sorted(final_chains))
    with open(cache_path, 'w') as f:
        f.write(chains)

    return chains

In [None]:
# demo: compute (or read back from the cache file) the chain string for one entry
pdbid = '1a1e'
get_chains_one_sample(pdbid)

## 2.3 Then prepare the lists of [PDBID_CHAINS], and run the steps in Section 1.