In [1]:
import os, time, pickle, sys, math
import numpy as np
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
from pymol import cmd
from sklearn.metrics import pairwise_distances
from sklearn.cluster import AgglomerativeClustering
import pymesh

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina
from scipy.cluster.hierarchy import fcluster, linkage, single
from scipy.spatial.distance import pdist

In [2]:
import pymol

### 0. Define dataset parameters

In [11]:
# `da` is substituted into the `_da_{}` suffix of the AnchorOutput folder names
# read by the cells below and into the name of the output directory.
# NOTE(review): presumably the anchor distance threshold (output files are
# suffixed `thre{da}`) - confirm.
da = 4

# Every pickle produced by this notebook is written under `outdir`. The trailing
# slash is required: later cells build paths with plain `outdir + filename`.
outdir = './AffinityPredictionData_CPIplus_da{}/'.format(da)

# `exist_ok=True` removes the check-then-create race of
# `if not os.path.exists(...): os.mkdir(...)` and keeps the cell re-runnable.
os.makedirs(outdir, exist_ok=True)

### 1. Load dataset files

In [4]:
# Tab-separated sample table; the cells below read its `pdbid`, `chains`,
# `original_sample` and `center_in_query` columns.
tablename = '/data/lishuya/lab/PocketAnchor/Revise1_new_data/Clean_protein_PDBbind_table.tsv'
table = pd.read_csv(tablename, delimiter='\t')

# (n_rows, n_columns) of the loaded table.
table.shape

(2667, 5)

In [5]:
# Preview the first five rows to sanity-check the parsed columns.
table.head(n=5)

Unnamed: 0,pdbid,chains,original_sample,p_group,center_in_query
0,1xqz,A,4rpv,2496,74.268_33.653_2.081
1,7n6t,AB,3vf5,2452,6.134_15.673_-0.473
2,3htc,HIL,1tbz,2236,0.000_0.000_0.000
3,1c5m,DF,2q1j,2247,7.073_60.629_71.693
4,2bls,A,1pi5,1182,22.621_6.574_14.218


In [7]:
# Query-side ids (`pdbid`) and reference-side ids (`original_sample`), in row order.
list_pdbid_query = table['pdbid'].values
list_pdbid_ref = table['original_sample'].values

# Displayed as (n_ref, n_unique_ref, n_query, n_unique_query).
tuple(
    count
    for id_array in (list_pdbid_ref, list_pdbid_query)
    for count in (len(id_array), len(set(id_array)))
)

(2667, 2667, 2667, 58)

### 2. Get protein features

In [22]:
# Batch-load the anchors of every sample in `table`.
#
# For each row, `anchors.npy` is read from the sample's AnchorOutput folder and
# its first entry is stored under the reference id (`original_sample`). Rows
# whose file cannot be loaded are skipped; the later cells only process ids
# that made it into `anchor_dict`.
anchor_dict = {}
anchor_skipped = []  # reference ids whose anchors.npy could not be loaded

row_fields = ['pdbid', 'chains', 'original_sample', 'center_in_query']
for query_pdbid, chains, ref_pdbid, center in table[row_fields].itertuples(index=False, name=None):
    anchor_path = 'AnchorOutput/{}_{}_center_{}_da_{}/anchors.npy'\
                  .format(query_pdbid, chains, center, da)
    try:
        repeat_list = np.load(anchor_path)
    except Exception:
        # Still best-effort (missing or unreadable output => skip the sample),
        # but unlike a bare `except:` this no longer swallows
        # KeyboardInterrupt/SystemExit, so the loop can be interrupted.
        anchor_skipped.append(ref_pdbid)
        continue
    anchor_dict[ref_pdbid] = repeat_list[0]

# Make the skips visible instead of dropping samples silently.
print('skipped (anchors.npy not loadable):', len(anchor_skipped))
print(len(anchor_dict))

2606


In [14]:
# Persist the anchors; the file name carries `da` as a `thre` suffix.
anchor_pickle_path = '{}casf2016_anchor_dict_thre{}'.format(outdir, da)
with open(anchor_pickle_path, 'wb') as handle:
    pickle.dump(anchor_dict, handle)

In [16]:
# Load the per-sample atom features.
#
# For each row, `atom_feature.pk` is unpickled from the sample's AnchorOutput
# folder and its entry `[0]` is stored under the reference id
# (`original_sample`). Rows whose file cannot be loaded are skipped.
# NOTE: pickle executes code on load - only point this at trusted pipeline output.
feature_dict = {}
feature_skipped = []  # reference ids whose atom_feature.pk could not be loaded

row_fields = ['pdbid', 'chains', 'original_sample', 'center_in_query']
for query_pdbid, chains, ref_pdbid, center in table[row_fields].itertuples(index=False, name=None):
    feature_path = 'AnchorOutput/{}_{}_center_{}_da_{}/atom_feature.pk'\
                   .format(query_pdbid, chains, center, da)
    try:
        # `with` closes the handle; the previous `pickle.load(open(...))`
        # leaked one file descriptor per sample.
        with open(feature_path, 'rb') as f:
            atom_dict = pickle.load(f)
    except Exception:
        # Best-effort skip, without swallowing KeyboardInterrupt/SystemExit
        # the way a bare `except:` does.
        feature_skipped.append(ref_pdbid)
        continue
    feature_dict[ref_pdbid] = atom_dict[0]

# Make the skips visible instead of dropping samples silently.
print('skipped (atom_feature.pk not loadable):', len(feature_skipped))
len(feature_dict)

2606

In [17]:
# Save all samples' atom features together in a single pickle.
feature_pickle_path = '{}casf2016_atom_feature_coord_nei_dict_thre{}'.format(outdir, da)
with open(feature_pickle_path, 'wb') as handle:
    pickle.dump(feature_dict, handle)

In [25]:
# Load the masif data for every sample that has anchors.
#
# For each row whose reference id is in `anchor_dict`, entry `[0]` of
# `masif_feature_coord_nei_dict.pk` is stored under that id. If the file cannot
# be loaded, an empty placeholder triple with shapes (0, 5), (0, 3) and (0,) is
# stored instead, so the key set stays aligned with `anchor_dict`.
# NOTE: pickle executes code on load - only point this at trusted pipeline output.
masif_feature_coord_nei_dict = {}

row_fields = ['pdbid', 'chains', 'original_sample', 'center_in_query']
for query_pdbid, chains, ref_pdbid, center in table[row_fields].itertuples(index=False, name=None):
    if ref_pdbid not in anchor_dict:
        continue
    masif_path = 'AnchorOutput/{}_{}_center_{}_da_{}/masif_feature_coord_nei_dict.pk'\
                 .format(query_pdbid, chains, center, da)
    try:
        # `with` closes the handle; `pickle.load(open(...))` leaked it.
        with open(masif_path, 'rb') as f:
            masif_data = pickle.load(f)[0]
    except Exception:
        # Best-effort fallback, without swallowing KeyboardInterrupt/SystemExit
        # the way a bare `except:` does.
        print('no masif', query_pdbid)
        masif_data = (np.empty((0, 5)),
                      np.empty((0, 3)),
                      np.array([]))
    masif_feature_coord_nei_dict[ref_pdbid] = masif_data

len(masif_feature_coord_nei_dict)

no maisf 1c2p
no maisf 2rfj


2606

In [26]:
# Persist the masif data (this file name has no `da` suffix).
masif_pickle_path = '{}{}'.format(outdir, 'casf2016_masif_feature_coord_nei_dict')
with open(masif_pickle_path, 'wb') as handle:
    pickle.dump(masif_feature_coord_nei_dict, handle)

In [27]:
# Load the `am` array for every sample that has anchors.
#
# For each row whose reference id is in `anchor_dict`, entry `[0]` of
# `am_list.npy` is stored under that id. If the file cannot be loaded, an empty
# array of shape (number of anchors, 0) is stored instead, so the key set stays
# aligned with `anchor_dict`.
am_dict = {}

row_fields = ['pdbid', 'chains', 'original_sample', 'center_in_query']
for query_pdbid, chains, ref_pdbid, center in table[row_fields].itertuples(index=False, name=None):
    if ref_pdbid not in anchor_dict:
        continue
    am_path = 'AnchorOutput/{}_{}_center_{}_da_{}/am_list.npy'\
              .format(query_pdbid, chains, center, da)
    try:
        am = np.load(am_path)[0]
    except Exception:
        # Best-effort fallback, without swallowing KeyboardInterrupt/SystemExit
        # the way a bare `except:` does. The message now names the file that is
        # actually missing (it used to be a copy-pasted 'no maisf').
        print('no am_list', query_pdbid)
        am = np.empty((len(anchor_dict[ref_pdbid]), 0))
    am_dict[ref_pdbid] = am

len(am_dict)

no maisf 1c2p
no maisf 2rfj


2606

In [29]:
# Persist the `am` arrays.
am_pickle_path = '{}{}'.format(outdir, 'casf2016_am_dict')
with open(am_pickle_path, 'wb') as handle:
    pickle.dump(am_dict, handle)

In [30]:
# Protein part: load entry `[0]` of `at_list.npy` and `aa_list.npy` for every
# sample that has anchors. There is no fallback here - a missing file raises.
aa_dict, at_dict = {}, {}

folder_template = 'AnchorOutput/{}_{}_center_{}_da_{}/'
row_fields = ['pdbid', 'chains', 'original_sample', 'center_in_query']
for query_pdbid, chains, ref_pdbid, center in table[row_fields].itertuples(index=False, name=None):
    if ref_pdbid in anchor_dict:
        sample_folder = folder_template.format(query_pdbid, chains, center, da)
        # Load both arrays before storing either, so a failed load leaves the
        # two dictionaries with identical key sets.
        at = np.load(sample_folder + 'at_list.npy')[0]
        aa = np.load(sample_folder + 'aa_list.npy')[0]
        aa_dict[ref_pdbid], at_dict[ref_pdbid] = aa, at

print(len(aa_dict), len(at_dict))

2606 2606


In [31]:
# Persist the two protein-side dictionaries, one pickle each.
for pickle_name, payload in (('casf2016_aa_dict', aa_dict),
                             ('casf2016_at_dict', at_dict)):
    with open(outdir + pickle_name, 'wb') as handle:
        pickle.dump(payload, handle)

### 3. Copy other files

In [35]:
# Print (not execute) the shell commands that copy the remaining dataset files
# into `outdir`; they are meant to be run by hand.
files_to_copy = (
    'casf2016_atom_dict',
    'casf2016_bond_dict',
    'casf2016_cf_dict',
    'casf2016_frag_dict',
    'casf2016_gg_dict',
    'casf2016_table_new_protein.tsv',
)
# Five spaces separate source and destination, exactly as in the original
# backslash-continued string literal.
cp_template = 'cp /data/lishuya/silexon/doghouse2021/data11/anchor_casf_update0624/{}     {}\n'
for source_name in files_to_copy:
    print(cp_template.format(source_name, outdir))

cp /data/lishuya/silexon/doghouse2021/data11/anchor_casf_update0624/casf2016_atom_dict     ./AffinityPredictionData_CPIplus_da4/

cp /data/lishuya/silexon/doghouse2021/data11/anchor_casf_update0624/casf2016_bond_dict     ./AffinityPredictionData_CPIplus_da4/

cp /data/lishuya/silexon/doghouse2021/data11/anchor_casf_update0624/casf2016_cf_dict     ./AffinityPredictionData_CPIplus_da4/

cp /data/lishuya/silexon/doghouse2021/data11/anchor_casf_update0624/casf2016_frag_dict     ./AffinityPredictionData_CPIplus_da4/

cp /data/lishuya/silexon/doghouse2021/data11/anchor_casf_update0624/casf2016_gg_dict     ./AffinityPredictionData_CPIplus_da4/

cp /data/lishuya/silexon/doghouse2021/data11/anchor_casf_update0624/casf2016_table_new_protein.tsv     ./AffinityPredictionData_CPIplus_da4/

