In [1]:
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.Polypeptide import three_to_one
from Bio.PDB.Polypeptide import is_aa
from Bio import pairwise2
from multiprocessing import Pool, cpu_count
from functools import partial
import scipy.cluster.hierarchy
import numpy as np
import sys, argparse, bisect, re, os, fnmatch
import pickle, collections
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import AllChem
from rdkit.DataStructs import FingerprintSimilarity as fs
from rdkit.Chem.Fingerprints import FingerprintMols
import rdkit
import re
import pandas as pd
from Bio.pairwise2 import format_alignment
from collections import Counter


In [2]:
def getResidueStrings(structure):
    '''Return one one-letter sequence string per chain of ``structure``.

    Args:
        structure: an object that iterates over models, each exposing
            ``get_chains()`` (e.g. a Bio.PDB structure).

    Returns:
        list[str]: one sequence per chain, over all models, in iteration
        order. Standard residues are translated with ``three_to_one``;
        HIE/HID map to 'H', CYX/CYM map to 'C', anything else to 'X'.
    '''
    seqs = []
    for model in structure:
        for ch in model.get_chains():
            seq = ''
            # Walk the residues of *this chain* only. Iterating over
            # model.get_residues() here would give every chain the sequence
            # of the whole model.
            for residue in ch.get_residues():
                resname = residue.get_resname()
                if is_aa(resname, standard=True):
                    seq += three_to_one(resname)
                elif resname in {'HIE', 'HID'}:
                    seq += 'H'
                elif resname in {'CYX', 'CYM'}:
                    seq += 'C'
                else:
                    seq += 'X'
            seqs.append(seq)
    return seqs

In [3]:
def cUTDM2(targets, pair):
    '''Compute the distance between one pair of targets.

    Args:
        targets: indexable collection; ``targets[i]`` is an iterable of
            sequence strings belonging to target ``i``.
        pair: tuple ``(a, b)`` of indices into ``targets``.

    Returns:
        tuple ``(a, b, mindist)`` where ``mindist`` is the smallest
        ``(length - score) / length`` over all sequence pairs, with ``score``
        the ``globalxx`` alignment score and ``length`` the longer sequence
        length. Stays at 1.0 when no pair can be scored.
    '''
    (a, b) = pair
    mindist = 1.0
    for seq1 in targets[a]:
        for seq2 in targets[b]:
            # An empty sequence cannot be scored, and two empty sequences
            # would make the normalising length zero; skip such pairs and
            # leave the distance at its maximum.
            if not seq1 or not seq2:
                continue
            score = pairwise2.align.globalxx(seq1, seq2, score_only=True)
            length = max(len(seq1), len(seq2))
            distance = (length-score)/length
            if distance < mindist:
                mindist = distance
    return (a, b, mindist)

In [4]:
def calcDistanceMatrix(targets):
    '''Compute the full pairwise target distance matrix in parallel.

    Args:
        targets: sequence of targets as accepted by ``cUTDM2``.

    Returns:
        numpy.ndarray of shape ``(n, n)``, symmetric, with a zero diagonal.
    '''
    n = len(targets)
    pairs = [(r, c) for r in range(n) for c in range(r+1, n)] #upper triangle
    function = partial(cUTDM2, targets)
    # The context manager shuts the worker processes down once map() has
    # returned; a bare Pool() leaves them alive after every call.
    with Pool() as pool:
        distanceTuples = pool.map(function, pairs)
    distanceMatrix = np.zeros((n, n))
    for (a, b, distance) in distanceTuples:
        distanceMatrix[a][b] = distanceMatrix[b][a] = distance
    return distanceMatrix

In [5]:
def get_chain(pdb_id, base_dir="/pubhome/hzhu02/GPSF/dataset/pdbbind_v2020/general_refine/"):
    '''Read the SEQRES records of ``<base_dir>/<pdb_id>/<pdb_id>_protein.pdb``.

    Args:
        pdb_id: entry code; used as both directory and file-name prefix.
        base_dir: directory holding one sub-directory per entry. Defaults to
            the location the notebook originally hard-coded.

    Returns:
        list[str]: one one-letter sequence per chain, in order of first
        appearance in the file. Standard residues are translated with
        ``three_to_one``; HIE/HID map to 'H', CYX/CYM to 'C', others to 'X'.
    '''
    chain_dict = {}
    pdb_path = os.path.join(base_dir, pdb_id, pdb_id + "_protein.pdb")
    with open(pdb_path, "r") as f:
        for line in f:
            if not line.startswith("SEQRES"):
                continue
            # Whitespace split: fields are record name, serial number,
            # chain id, residue count, then the residue names.
            # NOTE(review): a blank chain id would shift these fields.
            fields = line.split()
            if len(fields) < 3:
                continue
            chain_id = fields[2]
            seq = chain_dict.get(chain_id, '')
            for aa in fields[4:]:
                if is_aa(aa, standard=True):
                    seq += three_to_one(aa)
                elif aa in {'HIE', 'HID'}:
                    seq += 'H'
                elif aa in {'CYX', 'CYM'}:
                    seq += 'C'
                else:
                    seq += 'X'
            chain_dict[chain_id] = seq
    # dicts preserve insertion order, so chains come out in file order.
    return list(chain_dict.values())
         

# aspartyl protease

In [70]:
chain_seq_1=get_chain("2v12")
chain_seq_2 = get_chain("1ec1")

In [71]:
chain_seq_1

['GNTTSSVILTNYMDTQYYGEIGIGTPPQTFKVVFDTGSSNVWVPSSKCSRLYTACVYHKLFDASDSSSYKHNGTELTLRYSTGTVSGFLSQDIITVGGITVTQMFGEVTEMPALPFMLAEFDGVVGMGFIEQAIGRVTPIFDNIISQGVLKEDVFSFYYNRDSESLGGQIVLGGSDPQHYEGNFHYINLIKTGVWQIQMKGVSVGSSTLLCEDGCLALVDTGASYISGSTSSIEKLMEALGAKKRLFDYVVKCNEGPTLPDISFHLGGKEYTLTSADYVFQESYSSKKLCTLAIHAMDIPPPTGPTWALGATFIRKFYTEFDRRNNRIGFALAR',
 'NTTSSVILTNYMDTQYYGEIGIGTPPQTFKVVFDTGSSNVWVPSSKCSRLYTACVYHKLFDASDSSSYKHNGTELTLRYSTGTVSGFLSQDIITVGGITVTQMFGEVTEMPALPFMLAEFDGVVGMGFIEQAIGRVTPIFDNIISQGVLKEDVFSFYYNRDSSLGGQIVLGGSDPQHYEGNFHYINLIKTGVWQIQMKGVSVGSSTLLCEDGCLALVDTGASYISGSTSSIEKLMEALGAKKRLFDYVVKCNEGPTLPDISFHLGGKEYTLTSADYVFQESYSSKKLCTLAIHAMDIPPPTGPTWALGATFIRKFYTEFDRRNNRIGFALAR']

In [72]:
# Align the first chain of each entry and show the first alignment in
# 100-column slices (sequence A, match line, sequence B), with a " " line
# between consecutive slices.
alignments = pairwise2.align.globalxx(chain_seq_1[0], chain_seq_2[0])
a = format_alignment(*alignments[0])
split = a.split("\n")
print(len(split[1]))
for start in range(0, 500, 100):
    if start > 0:
        print(" ")
    for row in (0, 1, 2):
        print(split[row][start:start + 100])

# GPCR

In [79]:
chain_seq_1=get_chain("5xjm")
chain_seq_2 = get_chain("6h7m")

In [80]:
chain_seq_1

['CSQKPSDKHLDAIPILYYIIFVIGFLVNIVVVTLFCCQKGPKKVSSIYIFNLAVADLLLLATLPLWATYYSYRYDWLFGPVMCKVFGSFLTLNMFASIFFITCMSVDRYQSVIYPFLSQRRNPWQASYIVPLVWCMACLSSLPTFYFRDVRTIEYLGVNACIMAFPPEKYAQWSAGIALMKNILGFIIPLIFIATCYFGIRKHLLKTNADLEDNWETLNDNLKVIEKADNAAQVKDALTKMRAAALDAQKATPDFRHGFDILVGQIDDALKLANEGKVKEAQAAAEQLKTTRNAYIQKYLKNRITRDQVLKMAAAVVLAFIICWLPFHVLTFLDALAWMGVINSCEVIAVIDLALPFAILLGFTNSCVNPFLYCF',
 'DVQLVESGGGLVQPGGSRKLSCAASGFTFSGFGMHWVRQAPEKGLEWVAYISSGSSLIYYADTVKGRFTISRDNPKNTLFLQMTSLRSEDTAMYFCATSLYYGTPWFAYWGQGTLVTVSAAKTTPPSVYPLAPGCGDTTGSSVTLGCLVKGYFPESVTVTWNSGSLSSSVHTFPALLQSGLYTMSSSVTVPSSTWPSQTVTCSVAHPASSTTVDKKLEPS',
 'DIVLTQSPAIMSASPGEKVTMTCSASSSVTYMYWYQQKPGSSPRLLIYDTSNLASGVPVRFSGSGSGTSYSLTISRMEAEDAATFYCQQWSSYPLTFGAGTKLELKRADAAPTVSIFPPSSEQLTSGGASVVCFLNNFYPKDINVKWKIDGSERQNGVLNSWTDQDSKDSTYSMSSTLTLTKDEYERHNSYTCEATHKTSTSPIVKSFNRNE']

In [81]:
len(chain_seq_1[0])

375

In [41]:
chain_seq_2

['KIIHLTDDSFDTDVLKADGAILVDFWAEWSGPSKMIAPILDEIADEYQGKLTVAKLNIDQNPGTAPKYGIRGIPTLLLFKNGEVAATKVGALSKGQLKEFLDANLAE',
 'AAKVMSLLMALVVLLIVAGNVLVIAAIGSTQRLQTLTNLFITSLACADLVVGLLVVPFGATLVVRGTWLWGSFLCELWTSLDVLCVTASIETLCVIAIDRYLAITSPFRYQSLMTRARAKVIICTVWAISALVSFLPIMMHWWRDEDPQALKCYQDPGCCDFVTNRAYAIASSIISFYIPLLIMIFVYLRVYREAKEQIRKIDRASKRKTSRVMAMKEHKALKTLGIIMGVFTLCWLPFFLVNIVNVFNRDLVPDWLFVAFNWLGYANSAMNPIIYCRSPDFRKAFKRLLA',
 'QVQLQESGGGLVQAGGSLRLSCAASGSIFALNIMGWYRQAPGKQRELVAAIHSGGTTNYANSVKGRFTISRDNAANTVYLQMNSLKPEDTAVYYCNVKDFGAIIYDYDYWGQGTQVTVSS']

In [78]:
len(chain_seq_2[1])

291

In [45]:
seq1=chain_seq_1[0]
seq2=chain_seq_2[1]
score = pairwise2.align.globalxx(seq1, seq2, score_only=True)
length = max(len(seq1), len(seq2))
distance = (length-score)/length

In [46]:
distance

0.6533333333333333

In [26]:
score

124.0

In [27]:
length

304

In [83]:
# Align chain 0 of the first entry with chain 1 of the second and show the
# first alignment in 100-column slices (sequence A, match line, sequence B),
# with a " " line between consecutive slices.
alignments = pairwise2.align.globalxx(chain_seq_1[0], chain_seq_2[1])
a = format_alignment(*alignments[0])
split = a.split("\n")
print(len(split[1]))
for start in range(0, 600, 100):
    if start > 0:
        print(" ")
    for row in (0, 1, 2):
        print(split[row][start:start + 100])

536
CSQ--KP--SDKHLD--AIPILYYIIFVIGFLVN--IVV---VTLFCCQKGPKKVSSIY--IF----------NL-----AV-ADLLL---LATL--PLW
     |   |   |   |   |     |    |   | |   | |         |  |   |           ||     |  ||  |   |  |  |  
---AAK-VMS---L-LMA---L-----V----V-LLI-VAGNV-L---------V--I-AAI-GSTQRLQTLTNLFITSLA-CAD--LVVGL--LVVP--
 
--ATYYSYRYDWLFGPVMCKVF-G-----SFL----T-LNMF------ASIFFI-T-CMSV---DRYQSV--IY--PFLS--Q----R-RNPWQASYI-V
  ||        |   |   |  |     |||    | |         ||   | | |  |   |||     |   ||    |    | |    |    |
FGAT--------L---V---V-RGTWLWGSFLCELWTSL---DVLCVTAS---IETLC--VIAIDRY---LAI-TSPF--RYQSLMTRAR----A---KV
 
PL----VWCMACL-S---S-LPTFYF------RDVRTIEY----L------G----V-NACIM-AFPPEKYAQWSAGIALMKN--ILGFI---IPL-IF-
      ||  |   |   | ||          ||    |     |      |    | |     |     |    | ||      |   |   ||| |  
--IICTVW--A--ISALVSFLP----IMMHWWRD----E-DPQALKCYQDPGCCDFVTN----RA-----Y----A-IA----SSI---ISFYIPLLI-M
 
IATCYFGIRKHLLKTNADLEDNWETLNDN--LK-VI--EKADNAA--QV--K-D-ALT-KMRAAALDAQKATPDF-RHGFD

# RNases

In [63]:
chain_seq_1=get_chain("2gmk")
chain_seq_2 = get_chain("2g8r")

In [64]:
chain_seq_1

['XDWLTFQKKHITNTRDVDCDNIMSTNLFHCKDKNTFIYSRPEPVKAICKGIIASKNVLTTSEFYLSDCNVTSRPCKYKLKKSTNKFCVNCANQAPVHFVGVGSC']

In [65]:
len(chain_seq_1[0])

104

In [66]:
chain_seq_2

['KETAAAKFERQHMDSSTSAASSSNYCNQMMKSRNLTKDRCKPVNTFVHESLADVQAVCSQKNVACKNGQTNCYQSYSTMSITDCRETGSSKYPNCAYKTTQANKHIIVACEGNPYVPVHFDASV']

In [67]:
len(chain_seq_2[0])

124

In [29]:
seq1=chain_seq_1[0]
seq2=chain_seq_2[0]
score = pairwise2.align.globalxx(seq1, seq2, score_only=True)
length = max(len(seq1), len(seq2))
distance = (length-score)/length

In [30]:
length

124

In [31]:
score

49.0

In [32]:
distance

0.6048387096774194

In [68]:
alignments = pairwise2.align.globalxx(chain_seq_1[0], chain_seq_2[0])
a = format_alignment(*alignments[0])

In [69]:
# Show the formatted alignment held in `a` in 100-column slices
# (sequence A, match line, sequence B), with a " " line between slices.
split = a.split("\n")
print(len(split[1]))
for start in range(0, 500, 100):
    if start > 0:
        print(" ")
    for row in (0, 1, 2):
        print(split[row][start:start + 100])

179
XDWL--T----F--QKKHI----T------NTRDVD-CDNI-M--ST-NLFHC-KD--K--NTFIYSRP--EP----VK-AI-CKGIIAS-KNVLT----
      |    |  |  |     |      |      | |  |  |  ||    ||  |  |||       |     |  |  |     | |||      
----KETAAAKFERQ--H-MDSSTSAASSSN-----YC-N-QMMKS-RNL---TKDRCKPVNTF-----VHE-SLADV-QA-VC-----SQKNV--ACKN
 
--T----SEFYLS-----DCNV--T-SR---P-CK-YKLKKST---NKFC---VNCA---NQAP-V--HFVGVG--SC-
  |    |  | |     ||    | |    | |  |   | |   ||     |  |   |  | |  ||      |  
GQTNCYQS--Y-STMSITDC--RETGS-SKYPNC-AY---K-TTQANK--HIIV--ACEGN--PYVPVHF----DAS-V
 



 



 





# hormone receptor

In [48]:
chain_seq_1=get_chain("6i65")
chain_seq_2 = get_chain("2xhs")

In [49]:
chain_seq_1

['YNKIVSHLLVAEPEKIYAMPDPTVPDSDIKALTTLCDLADRELVVIIGWAKHIPGFSTLSLADQMSLLQSAWMEILILGVVYRSLSFEDELVYADDYIMDEDQSKLAGLLDLNNAILQLVKKYKSMKLEKEEFVTLKAIALANSDSMHIEDVEAVQKLQDVLHEALQDYEAGQHMEDPRRAGKMLMTLPLLRQTSTKAVQHFYNIKLEGKVPMHKLFLEMLEAK',
 'YNKIVSHLLVAEPEKIYAMPDPTVPDSDIKALTTLCDLADRELVVIIGWAKHIPGFSTLSLADQMSLLQSAWMEILILGVVYRSLSFEDELVYADDYIMDEDQSKLAGLLDLNNAILQLVKKYKSMKLEKEEFVTLKAIALANSDSMHIEDVEAVQKLQDVLHEALQDYEAGQHMEDPRRAGKMLMTLPLLRQTSTKAVQHFYNIKLEGKVPMHKLFLEMLEAK']

In [50]:
chain_seq_2

['GSHXLEDPLRVSPXIREFVQSIDDREWQTQLFALLQKQTYNQVEVDLFELXCKVLDQNLFSQVDWARNTVFFKDLKVDDQXKLLQHSWSDXLVLDHLHHRIHNGLPDETQLNNGQVFNLXSLGLLGVPQLGDYFNELQNKLQDLKFDXGDYVCXKFLILLNPSVRGIVNRKTVSEGHDNVQAALLDYTLTCYPSVNDKFRGLVNILPEIHAXAVRGEDHLYTKHCAGSAPTQTLLXEXLHAKRK']

In [51]:
seq1=chain_seq_1[0]
seq2=chain_seq_2[0]
score = pairwise2.align.globalxx(seq1, seq2, score_only=True)
length = max(len(seq1), len(seq2))
distance = (length-score)/length

In [52]:
distance

0.639344262295082

# kinase

In [53]:
chain_seq_1=get_chain("5mo8")

In [54]:
len(chain_seq_1[0])

325

In [55]:
chain_seq_2 = get_chain("1xkk")

In [56]:
len(chain_seq_2[0])

289

In [57]:
alignments = pairwise2.align.globalxx(chain_seq_1[0], chain_seq_2[0])
a = format_alignment(*alignments[0])

In [62]:
# Show the formatted alignment held in `a` in 100-column slices
# (sequence A, match line, sequence B), with a " " line between slices.
split = a.split("\n")
print(len(split[1]))
for start in range(0, 500, 100):
    if start > 0:
        print(" ")
    for row in (0, 1, 2):
        print(split[row][start:start + 100])



499
PVPSRARVYTDVNTHRPSEYWDYESHVVEWGNQDDYQLVRKLGRG--KYSEVFEAINITNNE-KVVVKI--L---------K----PVKKKKIKRE---I
     |                               |   | |   |     |    |  | |   ||  |         |    ||   ||      |
-----A-------------------------------L---L-R-ILK-----E----T--EFK---KIKVLGSGAFGTVYKGLWIPV---KI---PVAI
 
KILENLRGGP---NI--ITLADI---VK----D-P-VS-RTPALVFEHVNN-----TDFK---QLYQ-T-LT-------DYDI-RFYMYEIL-KALD---
|  | ||      |   | | |    |     | | |  |   |            |      ||   | |        ||   |    |   |  |   
K--E-LR---EKAN-KEI-L-D-EAYV-MASVDNPHV-CR---L-------LGICLT---STVQL--ITQL-MPFGCLLDY--VR----E--HK--DNIG
 
--Y----CHSMG--I---M---------HRD-----V--K-P-HNVM-I-DHEHR----KLRLIDWGLAEFYHPGQEYNVRVASRYFKG-PE-----L--
  |    |      |   |         |||     |  | | | |  | |        || |   | ||      |          |  |      |  
SQYLLNWC----VQIAKGMNYLEDRRLVHRDLAARNVLVKTPQH-V-KITD----FGLAKL-L---G-AE------E----------K-VP-IKWMALES
 
-LVD---Y--QMYDYSLDM-WSL-GCM-----LASMI-FR--KEPFFHGHDNYDQLVR-I-AKV------L--GTED-LY-

In [None]:
## mol_fingerprint

In [None]:
## Check that the ligand SDF of every PDBbind v2020 entry is readable

In [4]:
pdbbind_v2020_code  = pd.read_csv("/pubhome/hzhu02/Redocked_pose/split_dataset/pfam/refine/pdbbind_v2020_refine.code", header=None)
pdbbind_v2020_code.columns=['pdb']
code_list = pdbbind_v2020_code['pdb'].tolist()

In [5]:
# Try to load and fingerprint the first molecule of every ligand SDF and
# print the codes for which any step fails.
for code in code_list:
    try:
        supplier_10gs = Chem.SDMolSupplier("/pubhome/hzhu02/GPSF/dataset/pdbbind_v2020/refine_2020/"+code+"/"+code+"_ligand.sdf", sanitize=False, removeHs=False)
        mol_10gs = supplier_10gs[0]
        fp_10gs = FingerprintMols.FingerprintMol(mol_10gs)
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still stop a long scan.
        print(code)


In [2]:
supplier_10gs = Chem.SDMolSupplier("/pubhome/hzhu02/GPSF/dataset/pdbbind_v2020/refine_2020/10gs/10gs_ligand.sdf", sanitize=False, removeHs=False)
mol_10gs = supplier_10gs[0]
fp_10gs = FingerprintMols.FingerprintMol(mol_10gs)

In [14]:
supplier_1a4k = Chem.SDMolSupplier("/pubhome/hzhu02/GPSF/dataset/data/result/1a4k/1a4k_ligand.fixed.sdf", sanitize=False, removeHs=False)
mol_1a4k = supplier_1a4k[0]
fp_1a4k = FingerprintMols.FingerprintMol(mol_1a4k)

In [16]:
fs(fp_10gs, fp_1a4k)

0.3580152671755725

In [67]:
file = pd.read_csv("/pubhome/hzhu02/GPSF/dataset/INDEX/native_pose_result.csv", header=None)
pdb_chain_mapping = pd.read_csv("/pubhome/hzhu02/GPSF/dataset/INDEX/split/pdb_pfam_mapping.csv", sep=",")
clans = pd.read_csv("/pubhome/hzhu02/GPSF/dataset/INDEX/split/clans.tsv", sep="\t")
clans = clans.rename(columns={'pfamA_acc':'PFAM_ACCESSION'})

In [270]:
def get_lig_chain(pdbid, pocket):
    '''Summarise, per chain, the ATOM records of a pocket PDB file.

    Each line is split on runs of spaces; field 0 is the record name, field 4
    the chain and field 5 the residue number. When field 4 is longer than one
    character, its first character is taken as the chain and the remainder as
    the residue number (chain and number written without a separating space).

    NOTE(review): relies on a module-level name ``z`` that is not defined in
    this function; a residue counts as "real" only if the last character of
    its number is in ``z`` — presumably the digit characters, TODO confirm.

    Args:
        pdbid: entry code, copied into every result row.
        pocket: path of the pocket PDB file.

    Returns:
        tuple ``(pocket_file, all_info, main_info)``:
        pocket_file is the parsed ATOM DataFrame; all_info is a list of
        ``[pdbid, chain, atom_num, res_num, res_list_uniq]`` rows, one per
        chain; main_info is the row with the most atoms, or ``None`` when no
        chain has any counted atom.
    '''
    pocket_file = pd.read_csv(pocket, sep="\t", header=None)
    pocket_file.columns = ['all']
    pocket_file['atom'] = pocket_file['all'].apply(lambda x: re.split(r"[ ]+", x)[0])
    # Work on an independent copy so the column assignments below do not
    # write into a slice of the unfiltered frame (SettingWithCopyWarning).
    pocket_file = pocket_file[pocket_file['atom'] == "ATOM"].copy()
    pocket_file['chain'] = pocket_file['all'].apply(lambda x: re.split(r"[ ]+", x)[4])
    pocket_file['res'] = pocket_file['all'].apply(lambda x: re.split(r"[ ]+", x)[5])
    pocket_file['new_chain'] = pocket_file['chain'].apply(lambda x: x[0])
    pocket_file['new_res'] = pocket_file.apply(lambda x: x.chain[1:] if len(x.chain) > 1 else x.res, axis=1)
    pocket_file['real_aa'] = pocket_file['new_res'].apply(lambda x: 0 if x[-1] not in z else 1)
    # Every remaining row is already an ATOM record, so no second filter.
    chain_type = list(set(pocket_file['new_chain'].tolist()))

    all_info = []
    # Defined up front so the return cannot hit an unbound name when no
    # chain contributes atoms.
    main_info = None
    max_atom_num = 0
    for chain in chain_type:
        res_list = pocket_file[(pocket_file['new_chain'] == chain) & (pocket_file['real_aa'] == 1)]['new_res'].astype(int).tolist()
        atom_num = len(res_list)
        res_list_uniq = list(set(res_list))
        res_num = len(res_list_uniq)
        item = [pdbid, chain, atom_num, res_num, res_list_uniq]
        all_info.append(item)
        if atom_num > max_atom_num:
            max_atom_num = atom_num
            main_info = [pdbid, chain, atom_num, res_num, res_list_uniq]

    return pocket_file, all_info, main_info

In [256]:
pocket="/pubhome/hzhu02/GPSF/dataset/refine_2020/1uto/1uto_pocket.pdb"

In [271]:
file_2, all, main = get_lig_chain("1uto",pocket)

In [259]:
# Top-level copy of the parsing steps at the start of get_lig_chain, run
# directly on `pocket` so the intermediate columns can be inspected.
# NOTE(review): `z` is not defined anywhere in the visible notebook; this
# cell only runs if it already exists in the kernel.
pocket_file = pd.read_csv(pocket, sep="\t", header=None)
pocket_file.columns=['all']
# Field 0 of the space-split line is the record name.
pocket_file['atom']=pocket_file['all'].apply(lambda x:re.split(r"[ ]+", x)[0])
pocket_file = pocket_file[pocket_file['atom']=="ATOM"]
# Field 4 is taken as the chain, field 5 as the residue number.
pocket_file['chain']=pocket_file['all'].apply(lambda x: re.split(r"[ ]+", x)[4]) 
pocket_file['res']=pocket_file['all'].apply(lambda x:re.split(r"[ ]+", x)[5])
# When field 4 is longer than one character, its first character is the
# chain and the rest is used as the residue number.
pocket_file['new_chain'] = pocket_file['chain'].apply(lambda x: x[0])
pocket_file['new_res']=pocket_file.apply(lambda x: x.chain[1:] if len(x.chain)>1 else x.res, axis=1)
# 1 when the last character of the residue number is in `z`, else 0.
pocket_file['real_aa']=pocket_file['new_res'].apply(lambda x: 0 if x[-1] not in z else 1)

In [241]:
def find_lig_domain(pdbid, chain, res, pfam_mapping):
    '''List the Pfam domains annotated on one chain and their pocket overlap.

    The previous body could never return: ``pfam_info`` was not initialised,
    the ``None`` result of ``list.append`` was assigned back to it, and it
    referenced an undefined ``info[i]``.

    Args:
        pdbid: value matched against the ``PDB`` column.
        chain: value matched against the ``CHAIN`` column.
        res: iterable of integer residue numbers (the pocket residues).
        pfam_mapping: DataFrame with columns ``PDB``, ``CHAIN``,
            ``PFAM_ACCESSION``, ``PFAM_NAME``, ``UNIPROT_ACCESSION``,
            ``AUTH_PDBRES_START`` and ``AUTH_PDBRES_END``.

    Returns:
        list of rows ``[pdbid, chain, pfam_accession, pfam_name,
        uniprot_accession, start, end, num]``, one per matching mapping row,
        where ``num`` counts the residues of ``res`` inside ``start..end``
        inclusive. Empty when the chain has no mapping row.

    Raises:
        ValueError: if a start/end value cannot be converted to ``int``.
    '''
    possibile_pfam = pfam_mapping[(pfam_mapping['PDB']==pdbid) & (pfam_mapping['CHAIN']==chain)]
    pocket_residues = set(res)
    pfam_info = []
    for _, row in possibile_pfam.iterrows():
        num_pdb_start = int(row['AUTH_PDBRES_START'])
        num_pdb_end = int(row['AUTH_PDBRES_END'])
        # Inclusive residue range covered by this domain.
        num = len(pocket_residues.intersection(range(num_pdb_start, num_pdb_end + 1)))
        pfam_info.append([
            pdbid,
            chain,
            row['PFAM_ACCESSION'],
            row['PFAM_NAME'],
            row['UNIPROT_ACCESSION'],
            num_pdb_start,
            num_pdb_end,
            num,
        ])
    return pfam_info

In [276]:
all_chain=pd.read_csv("/pubhome/hzhu02/Redocked_pose/split_dataset/pfam/refine/pdbbind_v2020_refine_all_pocket_chain_info.csv")

In [308]:
def change_str_to_list (res_str):
    '''Convert the comma-split pieces of a printed list into integers.

    Args:
        res_str: list of string tokens, e.g. ``"[1, 2, 3]".split(",")``,
            which is ``['[1', ' 2', ' 3]']``.

    Returns:
        list[int]: the parsed numbers; empty for ``['[]']`` or ``[]``.

    Raises:
        ValueError: if a token is not an integer once whitespace and square
            brackets are stripped.
    '''
    a = []
    for token in res_str:
        # Strip brackets from every token, not just the first and last: a
        # one-element list arrives as the single token '[5]', which carries
        # both brackets at once.
        digits = token.strip().strip('[]').strip()
        if digits:
            a.append(int(digits))
    return a

In [309]:
sort_chain = all_chain[all_chain['pdb']=="2i4x"].sort_values(by=['atom_num'], ascending=False)

In [319]:
main_chain = sort_chain['chain'].tolist()[0]
res_list = change_str_to_list(sort_chain['res_uniq_list'].tolist()[0].split(","))

In [323]:
clans = pd.read_csv("/pubhome/hzhu02/GPSF/dataset/INDEX/split/clans.tsv", sep="\t")

In [324]:
mapping = pd.read_csv("/pubhome/hzhu02/GPSF/dataset/INDEX/split/pdb_pfam_mapping.csv")

In [325]:
clans = clans.rename(columns={'pfamA_acc':'PFAM_ACCESSION'})

In [347]:
pfam_clan = pd.merge(mapping, clans, on=['PFAM_ACCESSION'])[['PDB','CHAIN','PFAM_ACCESSION','PFAM_NAME','AUTH_PDBRES_START','AUTH_PDBRES_END','UNIPROT_ACCESSION','clan_acc','clan_id']]

In [348]:
pfam_clan.to_csv("/pubhome/hzhu02/Redocked_pose/split_dataset/pfam/pdb_pfam_clan.csv", index=False)

In [335]:
nuport_pharos = pd.read_csv("/pubhome/hzhu02/Redocked_pose/split_dataset/pharos/pharos.csv")

In [338]:
nuport_pharos=nuport_pharos.rename(columns={'UniProt':'UNIPROT_ACCESSION'})

In [344]:
pfam_clan.shape

(710413, 7)

In [343]:
pd.merge(pfam_clan, nuport_pharos, on=['UNIPROT_ACCESSION'])

Unnamed: 0,PDB,CHAIN,PFAM_ACCESSION,PFAM_NAME,UNIPROT_ACCESSION,clan_acc,clan_id,Family,Name,Novelty,PDB IDs,Symbol,Target Development Level,id
0,1a00,A,PF00042,Globin,P69905,CL0090,Globin,Other,Hemoglobin subunit alpha,0.000941,"1A00,1A01,1A0U,1A0Z,1A3N,1A3O,1A9W,1ABW,1ABY,1...",HBA1,Tclin,6231
1,1a00,C,PF00042,Globin,P69905,CL0090,Globin,Other,Hemoglobin subunit alpha,0.000941,"1A00,1A01,1A0U,1A0Z,1A3N,1A3O,1A9W,1ABW,1ABY,1...",HBA1,Tclin,6231
2,1a01,A,PF00042,Globin,P69905,CL0090,Globin,Other,Hemoglobin subunit alpha,0.000941,"1A00,1A01,1A0U,1A0Z,1A3N,1A3O,1A9W,1ABW,1ABY,1...",HBA1,Tclin,6231
3,1a01,C,PF00042,Globin,P69905,CL0090,Globin,Other,Hemoglobin subunit alpha,0.000941,"1A00,1A01,1A0U,1A0Z,1A3N,1A3O,1A9W,1ABW,1ABY,1...",HBA1,Tclin,6231
4,1a0u,A,PF00042,Globin,P69905,CL0090,Globin,Other,Hemoglobin subunit alpha,0.000941,"1A00,1A01,1A0U,1A0Z,1A3N,1A3O,1A9W,1ABW,1ABY,1...",HBA1,Tclin,6231
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159663,7p5m,B,PF04906,Tweety,Q9BSA4,,,Ion Channel,Protein tweety homolog 2,0.197965,,TTYH2,Tbio,9743
159664,7p5c,A,PF04906,Tweety,Q9C0H2,,,Ion Channel,Protein tweety homolog 3,0.217472,,TTYH3,Tbio,9681
159665,7p5c,B,PF04906,Tweety,Q9C0H2,,,Ion Channel,Protein tweety homolog 3,0.217472,,TTYH3,Tbio,9681
159666,7p5j,A,PF04906,Tweety,Q9H313,,,Ion Channel,Protein tweety homolog 1,0.066988,,TTYH1,Tbio,9634


In [354]:
all_pfam_clan = pd.read_csv("/pubhome/hzhu02/Redocked_pose/split_dataset/pfam/refine/pdbbind_v2020_main_chain_pfam_info.csv", sep="\t")

In [355]:
all_pfam_clan

Unnamed: 0,pdb,main_chain,res_num,pfam,pfam_name,pdb_start,pdb_end,cov_num,clan_acc,clan_id,uniport
0,2r58,A,25,PF02820,MBT,209,277,0,CL0049,Tudor,Q9VHA0
1,3c2f,A,30,PF02749,QRPTase_N,30,117,0,CL0105,Hybrid,P43619
2,3c2f,A,30,PF01729,QRPTase_C,119,292,30,CL0036,TIM_barrel,P43619
3,3g2y,A,38,PF13354,Beta-lactamase2,51,263,35,CL0013,Beta-lactamase,Q9L5C8
4,3pce,M,25,PF00775,Dioxygenase_C,346,529,22,CL0287,Transthyretin,P00437
...,...,...,...,...,...,...,...,...,...,...,...
5776,5bry,A,33,PF00077,RVP,5,98,33,CL0129,Peptidase_AA,P03366
5777,1sl3,A,56,PF00089,Trypsin,16,238,56,CL0124,Peptidase_PA,P00734
5778,1ctu,A,41,PF00383,dCMP_cyt_deam_1,48,142,40,CL0109,CDA,P0ABF6
5779,1ctu,A,41,PF08211,dCMP_cyt_deam_2,157,277,0,CL0109,CDA,P0ABF6


In [356]:
pdb_list = list(set(all_pfam_clan['pdb'].tolist()))

In [365]:
# For every PDB entry keep the Pfam row with the highest cov_num.
# Rows are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies the growing frame on every iteration.
main_pf_rows = []
for pdb in pdb_list:
    select = all_pfam_clan[all_pfam_clan['pdb']==pdb].sort_values(by=['cov_num'], ascending=False)
    main_pf_rows.append(select.head(1))
# pd.concat rejects an empty list, so fall back to an empty frame.
main_pf = pd.concat(main_pf_rows) if main_pf_rows else pd.DataFrame()


In [361]:
select = all_pfam_clan[all_pfam_clan['pdb']=="10gs"].sort_values(by=['cov_num'], ascending=False)

In [372]:
main_pf.to_csv("/pubhome/hzhu02/Redocked_pose/split_dataset/pfam/refine/pdbbind_v2020_refine_pfame_clan.csv", index=False)

In [368]:
pocket_chain = pd.read_csv("/pubhome/hzhu02/Redocked_pose/split_dataset/pfam/refine/pdbbind_v2020_refine_all_pocket_chain_info.csv", sep=",")
pdb_list = list(set(pocket_chain['pdb'].tolist()))

In [369]:
len(pdb_list)

5316

In [370]:
# For every PDB entry keep the pocket chain with the highest atom_num.
# Rows are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies the growing frame on every iteration.
main_chain_rows = []
for pdb in pdb_list:
    select = pocket_chain[pocket_chain['pdb']==pdb].sort_values(by=['atom_num'], ascending=False)
    main_chain_rows.append(select.head(1))
# pd.concat rejects an empty list, so fall back to an empty frame.
main_chain = pd.concat(main_chain_rows) if main_chain_rows else pd.DataFrame()


In [373]:
main_chain.to_csv("/pubhome/hzhu02/Redocked_pose/split_dataset/pfam/refine/pdbbind_v2020_refine_pocket_main_chain.csv", index=False)

In [381]:
pdb_list[-1]

'1b58'

In [400]:
uniport_id = pd.read_csv("/pubhome/hzhu02/GPSF/dataset/INDEX/split/pdb_chain_cath_uniprot.csv")

In [402]:
len(pdb_list)

2651

In [None]:
# For each PDB entry, look up the SP_PRIMARY value recorded for its main
# chain; the string 'None' marks entries with no matching row.
uniport_id_all = []
for pdb in pdb_list:
    print(pdb)
    chain = main_chain.loc[main_chain['pdb'] == pdb, "chain"].tolist()[0]
    chain_hit = (uniport_id['PDB'] == pdb) & (uniport_id['CHAIN'] == chain)
    uni_id = uniport_id.loc[chain_hit, 'SP_PRIMARY'].tolist()
    uniport_id_all.append(uni_id[0] if len(uni_id) > 0 else 'None')
    

In [404]:
len(uniport_id_all)

2651

In [None]:
uniport_id_all

In [397]:
main_chain['uniport'] = uniport_id_all

In [408]:
# NOTE(review): this looks like a no-op. The boolean filter followed by
# ['uniport'] produces a new object, so replace(..., inplace=True) modifies
# that temporary rather than main_chain. Confirm the intent; an assignment
# through main_chain.loc[mask, 'uniport'] would be needed to change the frame.
main_chain[main_chain['uniport']=='None']['uniport'].replace(uniport_id_all, inplace=True)

In [410]:
sub = main_chain[main_chain['uniport']=='None']

In [415]:
sub2 = main_chain[main_chain['uniport']!= 'None']

In [None]:
sub['uniport_2']=uniport_id_all

In [423]:
sub = sub.rename(columns={'uniport_2':'uniport'})

In [426]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows of
# sub2 followed by those of sub in the same way.
main_chain = pd.concat([sub2, sub])

In [450]:
len(main_chain[main_chain['uniport']=='None']['pdb'].tolist())

186

In [429]:
file_3_mapping = pd.read_csv("/pubhome/hzhu02/GPSF/dataset/INDEX/split/pdb_pfam_mapping.csv")

In [432]:
main_chain.shape

(5316, 6)

In [None]:
# For each entry of pdb_list_3, look up the UNIPROT_ACCESSION recorded for
# its main chain in the Pfam mapping; 'None' marks entries with no match.
file_3_mapping_all = []
for pdb in pdb_list_3:
    print(pdb)
    chain = main_chain.loc[main_chain['pdb'] == pdb, "chain"].tolist()[0]
    chain_hit = (file_3_mapping['PDB'] == pdb) & (file_3_mapping['CHAIN'] == chain)
    uni_id = file_3_mapping.loc[chain_hit, 'UNIPROT_ACCESSION'].tolist()
    file_3_mapping_all.append(uni_id[0] if len(uni_id) > 0 else 'None')

In [437]:
sub3_1 = main_chain[main_chain['uniport']=='None']

In [438]:
sub3_2 = main_chain[main_chain['uniport']!='None']

In [439]:
sub3_1['uniport_2'] = file_3_mapping_all

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [448]:

# NOTE(review): if sub3_1 still carries its original 'uniport' column, this
# rename leaves two columns with that name; a later cell drops the old
# column before renaming — confirm which is intended here.
sub3_1 = sub3_1.rename(columns={'uniport_2':'uniport'})
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
main_chain = pd.concat([sub3_2, sub3_1])


In [None]:
main_chain[(main_chain['chain']!='A')&(main_chain['chain']!='B')&(main_chain['uniport']=='None')]

In [455]:
import urllib.request
import json

In [493]:
# Fetch the polymer-entity record (entity 1) of entry 5efc from RCSB.
# The context manager closes the HTTP response, and the timeout keeps the
# cell from hanging indefinitely on a stalled connection.
with urllib.request.urlopen("https://data.rcsb.org/rest/v1/core/polymer_entity/"+"5efc"+"/1", timeout=30) as response:
    a = response.read()

In [497]:
b = json.loads(a)
suffix = b['rcsb_polymer_entity_container_identifiers']


In [496]:
'5efc' in pdb_4_list

True

In [498]:
'uniprot_id' in suffix.keys()

False

In [None]:
import urllib.error

# Query RCSB for polymer entity 1 of every entry in pdb_4_list and record its
# first UniProt id. pdb_4_map gets exactly one value per entry ('None' when
# nothing is available) so it stays aligned with pdb_4_list.
pdb_4_map =[]
multi=[]       # [pdb, ids] for every entry that reported at least one id
not_found = [] # entries whose request was answered with an HTTP error
for pdb in pdb_4_list:
    print(pdb)
    try:
        with urllib.request.urlopen("https://data.rcsb.org/rest/v1/core/polymer_entity/"+pdb+"/1", timeout=30) as response:
            a = response.read()
    except urllib.error.HTTPError:
        # An HTTP error for one entry should not abort the whole scan or
        # shift later results; other network failures still propagate.
        not_found.append(pdb)
        pdb_4_map.append('None')
        continue
    b = json.loads(a)
    suffix = b['rcsb_polymer_entity_container_identifiers']
    # Check for ids before indexing: a missing or empty 'uniprot_ids' entry
    # must yield 'None' instead of an IndexError on ids[0].
    ids = suffix.get('uniprot_ids') or []
    if len(ids)>0:
        pdb_4_map.append(ids[0])
        multi.append([pdb, ids])
    else:
        pdb_4_map.append('None')
    

In [517]:
# Show the entries whose entity maps to more than one UniProt id.
for entry in multi:
    if len(entry[1]) > 1:
        print(entry)

['4nja', ['P01660', 'Q52L95']]


In [500]:
sub_1 = main_chain[main_chain['uniport']=='None']
sub_2 = main_chain[main_chain['uniport']!= 'None']


In [501]:
sub_1['uniport_2']=pdb_4_map

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
sub_1[sub_1['uniport_2']!='None']

In [504]:
sub_1=sub_1.drop(columns=['uniport'])

In [505]:
sub_1 = sub_1.rename(columns={'uniport_2':'uniport'})

In [506]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows of
# sub_2 followed by those of sub_1 in the same way.
main_chain = pd.concat([sub_2, sub_1])

In [507]:
main_chain[main_chain['uniport']=='None']

Unnamed: 0,pdb,chain,atom_num,res_num,res_uniq_list,uniport
3737,6idg,H,236,21,"[31, 32, 33, 34, 35, 47, 49, 50, 51, 52, 56, 5...",
3964,1qyg,H,232,22,"[27, 28, 29, 30, 31, 32, 33, 34, 35, 50, 51, 5...",
619,6cfc,A,430,44,"[386, 389, 390, 398, 399, 400, 401, 402, 403, ...",
1956,5ufc,A,424,43,"[132, 133, 134, 135, 136, 137, 155, 156, 157, ...",
6785,4jn2,B,262,28,"[33, 34, 35, 36, 37, 48, 50, 51, 52, 53, 54, 5...",
...,...,...,...,...,...,...
6656,4yho,H,317,30,"[28, 29, 30, 31, 32, 33, 34, 35, 47, 50, 51, 5...",
4320,4c1t,A,717,76,"[384, 385, 386, 387, 388, 383, 391, 394, 395, ...",
6520,4f3k,A,423,52,"[7, 8, 9, 11, 12, 146, 147, 148, 149, 150, 151...",
2122,1mfd,H,206,21,"[282, 283, 284, 285, 297, 299, 300, 301, 302, ...",


In [508]:
main_chain.to_csv("/pubhome/hzhu02/Redocked_pose/split_dataset/pharos/pdbbind_v2020_main_chain_uniprot.csv", index=False)

In [513]:
main_chain=main_chain.rename(columns={'uniport':"UNIPROT_ACCESSION"})

In [522]:
merge = pd.merge(main_chain, nuport_pharos, on=['UNIPROT_ACCESSION'])
merge[['pdb','chain','atom_num','res_num','res_uniq_list','UNIPROT_ACCESSION','Family','Name']].to_csv("/pubhome/hzhu02/Redocked_pose/split_dataset/pharos/pdbbind_v2020_matched_pharos.csv", index=False)

In [6]:
protein_dist = pd.read_csv("/pubhome/hzhu02/Redocked_pose/split_dataset/3_fold_ccv/protein_smi_only_main_chain_pdbbind_2020.csv")
pdb_list = protein_dist.columns.tolist()

In [8]:
protein_dist.drop(columns=['Unnamed: 0'])

Unnamed: 0,2p95,3t1m,4ceb,1x8r,3rtf,2v7a,4zzy,1b5h,2aac,6md6,...,6nw3,5efc,1cgl,2a5c,1hpo,2hb3,1v2n,3t0d,5o1d,1b58
0,0.000000,0.735043,0.713675,0.740047,0.647287,0.678967,0.709402,0.767892,0.696581,0.781457,...,0.716216,0.696581,0.696581,0.739316,0.782051,0.782051,0.508547,0.840394,0.679487,0.767892
1,0.735043,0.000000,0.644295,0.798595,0.751938,0.763838,0.794872,0.825919,0.668712,0.841060,...,0.794595,0.680982,0.690476,0.664234,0.708029,0.700730,0.730942,0.890640,0.720812,0.825919
2,0.713675,0.644295,0.000000,0.786885,0.748062,0.726937,0.772080,0.814313,0.662577,0.839404,...,0.783784,0.668712,0.660714,0.657718,0.697987,0.697987,0.721973,0.882759,0.690355,0.814313
3,0.740047,0.798595,0.786885,0.000000,0.723653,0.711944,0.674473,0.663443,0.782201,0.688742,...,0.662763,0.782201,0.784543,0.814988,0.833724,0.833724,0.756440,0.766502,0.761124,0.663443
4,0.647287,0.751938,0.748062,0.723653,0.000000,0.649446,0.686610,0.746615,0.713178,0.758278,...,0.700000,0.713178,0.720930,0.763566,0.786822,0.782946,0.658915,0.834483,0.713178,0.746615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5311,0.782051,0.700730,0.697987,0.833724,0.782946,0.800738,0.823362,0.864603,0.730061,0.869205,...,0.829730,0.723926,0.726190,0.682927,0.020202,0.000000,0.784753,0.913300,0.766497,0.864603
5312,0.508547,0.730942,0.721973,0.756440,0.658915,0.678967,0.715100,0.771760,0.721973,0.786424,...,0.727027,0.717489,0.704036,0.757848,0.789238,0.784753,0.000000,0.844335,0.686099,0.771760
5313,0.840394,0.890640,0.882759,0.766502,0.834483,0.829557,0.794089,0.743842,0.868966,0.712315,...,0.790148,0.881773,0.869951,0.899507,0.913300,0.913300,0.844335,0.000000,0.861084,0.743842
5314,0.679487,0.720812,0.690355,0.761124,0.713178,0.708487,0.743590,0.789168,0.695431,0.814570,...,0.745946,0.705584,0.670051,0.736041,0.766497,0.766497,0.686099,0.861084,0.000000,0.789168
