In [1]:
import os, time, pickle, sys, math
import numpy as np
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
from pymol import cmd
from sklearn.metrics import pairwise_distances
from sklearn.cluster import AgglomerativeClustering
import pymesh

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina
from scipy.cluster.hierarchy import fcluster, linkage, single
from scipy.spatial.distance import pdist

In [2]:
import pymol

### 0. Define dataset parameters

In [19]:
# Value substituted into the "da_{}" part of the AnchorOutput directory names
# and into the output directory / file names below.
# NOTE(review): presumably a distance threshold (output files use "thre") - confirm.
da = 6

# Directory that receives every pickled dictionary written by this notebook.
outdir = './AffinityPredictionData_PDBbind_da{}/'.format(da)
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create
# race of `if not os.path.exists(...): os.mkdir(...)`.
os.makedirs(outdir, exist_ok=True)

### 1. Load dataset files

In [4]:
# Tab-separated table; its 'pdbid' column selects the tasks processed below.
table_path = 'lists/casf2016_table_new_protein.tsv'
table = pd.read_csv(table_path, sep='\t')
table.shape

(19019, 5)

In [5]:
# One task per line of the list file, with surrounding whitespace stripped.
with open('lists/pdbid_chain_center_list_PDBbind.txt') as task_file:
    list_task = [line.strip() for line in task_file]
len(list_task)

19443

In [6]:
# PDB ids from the table: as an array in table order, and as a set used for
# membership tests in the loops below.
list_pdbid = table.loc[:, 'pdbid'].values
set_pdbid = {entry for entry in list_pdbid}
len(list_pdbid)

19019

### 2. Get protein features

In [7]:
# Batch-collect anchors for every task whose PDB id appears in the table.
# A task line is split on whitespace into "<pdbid>_<chains>" and "<center>".
anchor_template = 'AnchorOutput/{}_center_{}_da_{}/anchors.npy'

anchor_dict = {}
for task in list_task:
    complex_name, center_name = task.split()
    pdb_code, chain_ids = complex_name.split('_')
    if pdb_code in set_pdbid:
        # Only the first entry of the saved array is kept.
        anchor_dict[pdb_code] = np.load(anchor_template.format(complex_name, center_name, da))[0]

print(len(anchor_dict))

19019


In [8]:
# Persist the anchor dictionary inside the output directory.
anchor_out_path = outdir + 'casf2016_anchor_dict_thre' + str(da)
with open(anchor_out_path, 'wb') as handle:
    pickle.dump(anchor_dict, handle)

In [9]:
# For every task whose PDB id is in the table, load the pickled atom-feature
# object from its AnchorOutput directory and keep its first entry.
feature_dict = {}

for item in list_task:
    pdbid_chains, center = item.split()
    pdbid, chains = pdbid_chains.split('_')
    if pdbid not in set_pdbid:
        continue

    atom_feature_path = 'AnchorOutput/{}_center_{}_da_{}/atom_feature.pk'.format(pdbid_chains, center, da)
    # Context manager closes the handle; the previous pickle.load(open(...))
    # leaked one open file per task.
    # NOTE: pickle.load can execute arbitrary code - only load files produced
    # by a trusted pipeline.
    with open(atom_feature_path, 'rb') as f:
        atom_dict = pickle.load(f)
    feature_dict[pdbid] = atom_dict[0]
len(feature_dict)

19019

In [10]:
# Save the complete feature dictionary as a single pickle file.
feature_out_path = outdir + "casf2016_atom_feature_coord_nei_dict_thre" + str(da)
with open(feature_out_path, "wb") as handle:
    pickle.dump(feature_dict, handle)

In [11]:
# For every task whose PDB id is in the table, load the first entry of the
# pickled masif object. When loading fails, store an empty placeholder tuple:
# a (0, 5) array, a (0, 3) array and an empty 1-D array.
masif_feature_coord_nei_dict = {}

for item in list_task:
    pdbid_chains, center = item.split()
    pdbid, chains = pdbid_chains.split('_')
    if pdbid not in set_pdbid:
        continue
    masif_path = 'AnchorOutput/{}_center_{}_da_{}/masif_feature_coord_nei_dict.pk'.format(pdbid_chains, center, da)
    try:
        # Context manager closes the handle even when unpickling fails.
        # NOTE: pickle.load can execute arbitrary code - only load trusted files.
        with open(masif_path, 'rb') as f:
            masif_data = pickle.load(f)[0]
    except Exception:
        # `except Exception` instead of a bare `except:` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit stop the loop instead of being
        # silently recorded as a missing entry.
        # Message text is kept unchanged so existing logs stay comparable.
        print('no maisf', pdbid)
        masif_data = (np.array([]).reshape((0, 5)),
                      np.array([]).reshape((0, 3)),
                      np.array([]))
    masif_feature_coord_nei_dict[pdbid] = masif_data

len(masif_feature_coord_nei_dict)

no maisf 4h85
no maisf 3fq7
no maisf 4w53
no maisf 5kbi
no maisf 1lgw
no maisf 185l
no maisf 1l6s
no maisf 3twp
no maisf 4h81
no maisf 6cc9
no maisf 4ghi
no maisf 6csf
no maisf 1e02
no maisf 5kbg
no maisf 1dzj
no maisf 1dzk
no maisf 3tz4
no maisf 1e06
no maisf 6cse
no maisf 4gs9
no maisf 5kbe


19019

In [12]:
# Persist the masif dictionary inside the output directory.
masif_out_path = outdir + 'casf2016_masif_feature_coord_nei_dict'
with open(masif_out_path, 'wb') as handle:
    pickle.dump(masif_feature_coord_nei_dict, handle)

In [13]:
# For every task whose PDB id is in the table, load the first entry of
# am_list.npy. When loading fails, store an empty array with one row per
# anchor of that PDB id and zero columns.
am_dict = {}
for item in list_task:
    pdbid_chains, center = item.split()
    pdbid, chains = pdbid_chains.split('_')
    if pdbid not in set_pdbid:
        continue
    try:
        am = np.load('AnchorOutput/{}_center_{}_da_{}/am_list.npy'.format(pdbid_chains, center, da))[0]
    except Exception:
        # `except Exception` instead of a bare `except:` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit are not swallowed and turned
        # into placeholder entries.
        # Message text is kept unchanged so existing logs stay comparable.
        print('no maisf', pdbid)
        am = np.array([]).reshape((len(anchor_dict[pdbid]), 0))
    am_dict[pdbid] = am

len(am_dict)

no maisf 4h85
no maisf 3fq7
no maisf 4w53
no maisf 5kbi
no maisf 1lgw
no maisf 185l
no maisf 1l6s
no maisf 3twp
no maisf 4h81
no maisf 6cc9
no maisf 4ghi
no maisf 6csf
no maisf 1e02
no maisf 5kbg
no maisf 1dzj
no maisf 1dzk
no maisf 3tz4
no maisf 1e06
no maisf 6cse
no maisf 4gs9
no maisf 5kbe


19019

In [14]:
# Persist the am dictionary inside the output directory.
am_out_path = outdir + 'casf2016_am_dict'
with open(am_out_path, 'wb') as handle:
    pickle.dump(am_dict, handle)

In [15]:
# Protein part: collect the first entry of at_list.npy and aa_list.npy for
# every task whose PDB id appears in the table.
at_template = 'AnchorOutput/{}_center_{}_da_{}/at_list.npy'
aa_template = 'AnchorOutput/{}_center_{}_da_{}/aa_list.npy'

aa_dict, at_dict = {}, {}

for task in list_task:
    complex_name, center_name = task.split()
    pdb_code, chain_ids = complex_name.split('_')
    if pdb_code in set_pdbid:
        # Load both arrays before storing either, so a failed load leaves
        # neither dictionary updated for this PDB id.
        at_first = np.load(at_template.format(complex_name, center_name, da))[0]
        aa_first = np.load(aa_template.format(complex_name, center_name, da))[0]
        aa_dict[pdb_code] = aa_first
        at_dict[pdb_code] = at_first
print(len(aa_dict), len(at_dict))

19019 19019


In [16]:
# Persist both dictionaries; each goes to its own file in the output directory.
for out_name, payload in (('casf2016_aa_dict', aa_dict), ('casf2016_at_dict', at_dict)):
    with open(outdir + out_name, 'wb') as handle:
        pickle.dump(payload, handle)

### 3. Copy other files for ligand features and the table (will be provided at https://github.com/tiantz17/PocketAnchor)

In [20]:
# Print (not run) one scp command per extra file needed in the output directory.
# NOTE(review): user, host, port and remote path are hard-coded in the
# command template - confirm they should be published with the notebook.
extra_files = ['casf2016_atom_dict', 'casf2016_bond_dict', 'casf2016_cf_dict',
               'casf2016_frag_dict', 'casf2016_gg_dict', 'casf2016_table_new_protein.tsv']
scp_template = 'scp -P 12878 lishuya@192.168.1.233:/data/lishuya/silexon/doghouse2021/data11/anchor_casf_update0624/{} \
    {}\n'
for file_name in extra_files:
    print(scp_template.format(file_name, outdir))

scp -P 12878 lishuya@192.168.1.233:/data/lishuya/silexon/doghouse2021/data11/anchor_casf_update0624/casf2016_atom_dict     ./AffinityPredictionData_PDBbind_da6/

scp -P 12878 lishuya@192.168.1.233:/data/lishuya/silexon/doghouse2021/data11/anchor_casf_update0624/casf2016_bond_dict     ./AffinityPredictionData_PDBbind_da6/

scp -P 12878 lishuya@192.168.1.233:/data/lishuya/silexon/doghouse2021/data11/anchor_casf_update0624/casf2016_cf_dict     ./AffinityPredictionData_PDBbind_da6/

scp -P 12878 lishuya@192.168.1.233:/data/lishuya/silexon/doghouse2021/data11/anchor_casf_update0624/casf2016_frag_dict     ./AffinityPredictionData_PDBbind_da6/

scp -P 12878 lishuya@192.168.1.233:/data/lishuya/silexon/doghouse2021/data11/anchor_casf_update0624/casf2016_gg_dict     ./AffinityPredictionData_PDBbind_da6/

scp -P 12878 lishuya@192.168.1.233:/data/lishuya/silexon/doghouse2021/data11/anchor_casf_update0624/casf2016_table_new_protein.tsv     ./AffinityPredictionData_PDBbind_da6/

