In [1]:
#Import Library
import pandas as pd
import numpy as np
import glob
from tqdm.auto import tqdm
import itertools
import joblib
import os

# compound package
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys
import pubchempy as pc
from padelpy import from_smiles, padeldescriptor

# protein package
from protlearn.preprocessing import remove_unnatural
from protlearn.features import aac
from protlearn.features import paac
from protlearn.features import aaindex1

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the BindingDB protein-compound interaction table.
inter_df = pd.read_csv(
    filepath_or_buffer="../../data/1-preparation/interaction/interaction_bindingdb_nodup.csv"
)
inter_df

Unnamed: 0,UniProt ID,FASTA Sequence,SMILES,Drug
0,O15379,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...,O=C(O)C=Cc1ccccc1,d1
1,O15379,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...,CC(=CC(C)C=CC(=O)NO)C(=O)c1ccc(N(C)C)cc1,d2
2,O15379,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...,O=C(CCCCCCC(=O)Nc1ccccc1)NO,d3
3,O15379,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...,CC=C1NC(=O)C2CSSCCC=CC(CC(=O)NC(C(C)C)C(=O)N2)...,d4
4,O15379,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...,Nc1ccccc1NC(=O)c1ccc(CNC(=O)OCc2cccnc2)cc1,d5
...,...,...,...,...
40669,Q7Z4H4,MARIPTAALGCISLLCLQLPGSLSRSLGGDPRPVKPREPPARSPSS...,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,d38714
40670,Q7Z4H4,MARIPTAALGCISLLCLQLPGSLSRSLGGDPRPVKPREPPARSPSS...,CNCc1ccccc1CN(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)Nc...,d38715
40671,Q7Z4H4,MARIPTAALGCISLLCLQLPGSLSRSLGGDPRPVKPREPPARSPSS...,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,d38716
40672,Q7Z4H4,MARIPTAALGCISLLCLQLPGSLSRSLGGDPRPVKPREPPARSPSS...,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,d38717


PaDEL fingerprints (via padelpy)

In [4]:
# Normalise the protein-ID column name; columns not named 'uniprot_id' are left untouched.
inter_df = inter_df.rename(columns={'uniprot_id': 'UniProt ID'})

In [5]:
# Protein table: drop the compound columns, then keep each remaining row once.
prot = (
    inter_df
    .drop(columns=['Drug', 'SMILES'])
    .drop_duplicates()
    .reset_index(drop=True)
)
prot

Unnamed: 0,UniProt ID,FASTA Sequence
0,O15379,MAKTVAYFYDPDVGNFHYGAGHPMKPHRLALTHSLVLHYGLYKKMI...
1,P36894,MPQLYIYIRLLGAYLFIISRVQGQNLDSMLHGTGMKSDSDQKKSEN...
2,P20393,MTTLDSNNNTGGVITYIGSSGSSPSRTSPESLYSDNSNGSFQSLTQ...
3,P06213,MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTR...
4,P35354,MLARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTR...
...,...,...
131,Q01469,MATVQQLEGRWRLVDSKGFDEYMKELGVGIALRKMGAMAKPDCIIT...
132,O43194,MASPSLPGSDCSQIIDHSHVPEFEVATWIKITLILVYLIIFVMGLL...
133,Q5NUL3,MSPECARAAGDAPLRSLEQANRTRFPFFSDVKGDHRLVLAAVETTV...
134,Q9H093,MESLVFARRSGPTPSAAELARPLAEGLIKSPKPLMKKQAVKRHHHK...


In [6]:
# Compound table: drop the protein columns, then keep each (SMILES, Drug) row once.
mol = (
    inter_df
    .drop(columns=['UniProt ID', 'FASTA Sequence'])
    .drop_duplicates()
    .reset_index(drop=True)
)
mol

Unnamed: 0,SMILES,Drug
0,O=C(O)C=Cc1ccccc1,d1
1,CC(=CC(C)C=CC(=O)NO)C(=O)c1ccc(N(C)C)cc1,d2
2,O=C(CCCCCCC(=O)Nc1ccccc1)NO,d3
3,CC=C1NC(=O)C2CSSCCC=CC(CC(=O)NC(C(C)C)C(=O)N2)...,d4
4,Nc1ccccc1NC(=O)c1ccc(CNC(=O)OCc2cccnc2)cc1,d5
...,...,...
38713,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,d38714
38714,CNCc1ccccc1CN(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)Nc...,d38715
38715,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,d38716
38716,CC(C)(C)C(=O)N(CC(=O)Nc1ccc2c(c1)CC1(C2)C(=O)N...,d38717


In [7]:
# Collect the fingerprint descriptor XML files from data_dir in sorted order.
data_dir = "../../../../data/fp"
xml_files = sorted(glob.glob(os.path.join(data_dir, "*.xml")))
xml_files

['../../../../data/fp/AtomPairs2DFingerprintCount.xml',
 '../../../../data/fp/AtomPairs2DFingerprinter.xml',
 '../../../../data/fp/EStateFingerprinter.xml',
 '../../../../data/fp/ExtendedFingerprinter.xml',
 '../../../../data/fp/Fingerprinter.xml',
 '../../../../data/fp/GraphOnlyFingerprinter.xml',
 '../../../../data/fp/KlekotaRothFingerprintCount.xml',
 '../../../../data/fp/KlekotaRothFingerprinter.xml',
 '../../../../data/fp/MACCSFingerprinter.xml',
 '../../../../data/fp/PubchemFingerprinter.xml',
 '../../../../data/fp/SubstructureFingerprintCount.xml',
 '../../../../data/fp/SubstructureFingerprinter.xml']

In [8]:
# Explicit fingerprint-name -> descriptor-XML mapping.
# Pairing a hand-ordered name list with the sorted glob result via zip()
# silently mislabels (or drops) fingerprints as soon as an XML file is added
# to, or missing from, data_dir; naming each file removes that coupling.
FP_XML_FILES = {
    'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
    'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
    'EState': 'EStateFingerprinter.xml',
    'CDKextended': 'ExtendedFingerprinter.xml',
    'CDK': 'Fingerprinter.xml',
    'CDKgraphonly': 'GraphOnlyFingerprinter.xml',
    'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml',
    'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
    'MACCS': 'MACCSFingerprinter.xml',
    'PubChem': 'PubchemFingerprinter.xml',
    'SubstructureCount': 'SubstructureFingerprintCount.xml',
    'Substructure': 'SubstructureFingerprinter.xml',
}

# Kept for backward compatibility: same names, same order as before.
FP_list = list(FP_XML_FILES)

fp = {
    fp_name: os.path.join(data_dir, xml_name)
    for fp_name, xml_name in FP_XML_FILES.items()
}

# Fail early with a clear message instead of letting PaDEL fail later.
missing_xml = [path for path in fp.values() if not os.path.isfile(path)]
if missing_xml:
    raise FileNotFoundError(f"Missing fingerprint descriptor XML files: {missing_xml}")

fp

{'AtomPairs2DCount': '../../../../data/fp/AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': '../../../../data/fp/AtomPairs2DFingerprinter.xml',
 'EState': '../../../../data/fp/EStateFingerprinter.xml',
 'CDKextended': '../../../../data/fp/ExtendedFingerprinter.xml',
 'CDK': '../../../../data/fp/Fingerprinter.xml',
 'CDKgraphonly': '../../../../data/fp/GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': '../../../../data/fp/KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': '../../../../data/fp/KlekotaRothFingerprinter.xml',
 'MACCS': '../../../../data/fp/MACCSFingerprinter.xml',
 'PubChem': '../../../../data/fp/PubchemFingerprinter.xml',
 'SubstructureCount': '../../../../data/fp/SubstructureFingerprintCount.xml',
 'Substructure': '../../../../data/fp/SubstructureFingerprinter.xml'}

In [9]:
# Plain Python lists taken from the de-duplicated compound and protein tables.
smiles_list = mol["SMILES"].to_list()
fasta_list = prot["FASTA Sequence"].to_list()

In [28]:
# Write one SMILES per line (no header, no index) as PaDEL's input structure file.
# The column is selected by name so a column reorder in `mol` cannot change the input.
mol["SMILES"].to_csv("../../../../data/smiles.smi", sep="\t", header=False, index=False)

def padel_fp(input_smi, fp_type, threads=32):
    """Compute one PaDEL fingerprint type for a SMILES file via padelpy.

    Args:
        input_smi: Path of the structure file, passed to PaDEL as ``mol_dir``.
        fp_type: Key of the module-level ``fp`` dict; selects the descriptor XML.
        threads: Forwarded to PaDEL's ``threads`` option (default 32, as before).

    Returns:
        None. The fingerprints are written to
        ``../../data/2-feature/mol/bindingdb_<fp_type>.csv``.

    Raises:
        KeyError: If ``fp_type`` is not a key of ``fp``.
    """
    if fp_type not in fp:
        raise KeyError(f"Unknown fingerprint type {fp_type!r}; expected one of {sorted(fp)}")

    fingerprint_output_file = f"../../data/2-feature/mol/bindingdb_{fp_type}.csv"

    padeldescriptor(mol_dir=input_smi,
                    d_file=fingerprint_output_file,
                    descriptortypes=fp[fp_type],
                    detectaromaticity=True,
                    standardizenitro=True,
                    standardizetautomers=True,
                    threads=threads,
                    removesalt=True,
                    # Later cells attach drug IDs to these rows by position, so the
                    # output must keep the input-file order even when multi-threaded.
                    retainorder=True,
                    log=True,
                    fingerprints=True)

padel_fp("../../../../data/smiles.smi", "PubChem")

In [29]:
# Read back the PubChem fingerprint table written by PaDEL.
feature_dff = pd.read_csv(
    filepath_or_buffer="../../data/2-feature/mol/bindingdb_PubChem.csv"
)
feature_dff

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,AUTOGEN_smiles_1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AUTOGEN_smiles_2,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AUTOGEN_smiles_3,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AUTOGEN_smiles_4,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AUTOGEN_smiles_5,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38713,AUTOGEN_smiles_38714,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38714,AUTOGEN_smiles_38715,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38715,AUTOGEN_smiles_38716,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38716,AUTOGEN_smiles_38717,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# PaDEL labels the unnamed molecules "AUTOGEN_smiles_<n>", where <n> is the
# 1-based line of the input file (see the Name column above). A multi-threaded
# PaDEL run is not guaranteed to write rows in input order, so drug IDs are
# attached by that parsed line number rather than by row position.
padel_line = feature_dff["Name"].str.extract(r"_(\d+)$", expand=False).astype(int)
pubchem_bits = feature_dff.drop(columns=["Name"])
pubchem_bits.index = padel_line.to_numpy() - 1  # 0-based row position in `mol`

assert pubchem_bits.index.is_unique, "PaDEL output contains duplicate molecule rows"
assert len(pubchem_bits) == len(mol), "PaDEL output and `mol` differ in length"

pub = mol[["Drug"]].join(pubchem_bits)
pub

Unnamed: 0,Drug,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,d1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,d2,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,d3,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,d4,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,d5,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38713,d38714,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38714,d38715,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38715,d38716,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38716,d38717,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Persist the PubChem fingerprint table (Drug ID + bits) without the row index.
pub.to_csv(
    path_or_buf='../../data/2-feature/mol/PubChem_bindingdb.csv',
    index=False,
)

In [19]:
# The SMILES input file and padel_fp() are both created by the PubChem cell
# above; re-declaring them here only duplicated code that could drift apart.
padel_fp("../../../../data/smiles.smi", "MACCS")

In [20]:
# Read back the MACCS fingerprint table written by PaDEL.
feature_df = pd.read_csv(
    filepath_or_buffer="../../data/2-feature/mol/bindingdb_MACCS.csv"
)
feature_df

Unnamed: 0,Name,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,...,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
0,AUTOGEN_smiles_1,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,1,1,0
1,AUTOGEN_smiles_2,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
2,AUTOGEN_smiles_3,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,1,1,1,1,0
3,AUTOGEN_smiles_4,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,0,0,1,1,0
4,AUTOGEN_smiles_5,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38713,AUTOGEN_smiles_38714,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
38714,AUTOGEN_smiles_38715,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
38715,AUTOGEN_smiles_38716,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
38716,AUTOGEN_smiles_38717,0,0,0,0,0,0,0,1,0,...,0,1,1,1,1,1,1,1,1,0


In [21]:
# PaDEL labels the unnamed molecules "AUTOGEN_smiles_<n>", where <n> is the
# 1-based line of the input file (see the Name column above). A multi-threaded
# PaDEL run is not guaranteed to write rows in input order, so drug IDs are
# attached by that parsed line number rather than by row position.
maccs_line = feature_df["Name"].str.extract(r"_(\d+)$", expand=False).astype(int)
maccs_bits = feature_df.drop(columns=["Name"])
maccs_bits.index = maccs_line.to_numpy() - 1  # 0-based row position in `mol`

assert maccs_bits.index.is_unique, "PaDEL output contains duplicate molecule rows"
assert len(maccs_bits) == len(mol), "PaDEL output and `mol` differ in length"

maccs = mol[["Drug"]].join(maccs_bits)
maccs

Unnamed: 0,Drug,MACCSFP1,MACCSFP2,MACCSFP3,MACCSFP4,MACCSFP5,MACCSFP6,MACCSFP7,MACCSFP8,MACCSFP9,...,MACCSFP157,MACCSFP158,MACCSFP159,MACCSFP160,MACCSFP161,MACCSFP162,MACCSFP163,MACCSFP164,MACCSFP165,MACCSFP166
0,d1,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,1,1,0
1,d2,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
2,d3,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,1,1,1,1,0
3,d4,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,0,0,1,1,0
4,d5,0,0,0,0,0,0,0,0,0,...,1,1,1,0,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38713,d38714,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
38714,d38715,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
38715,d38716,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
38716,d38717,0,0,0,0,0,0,0,1,0,...,0,1,1,1,1,1,1,1,1,0


In [22]:
# Persist the MACCS fingerprint table (Drug ID + bits) without the row index.
maccs.to_csv(
    path_or_buf='../../data/2-feature/mol/maccs_nrlmf_bindingdb.csv',
    index=False,
)

ECFP (RDKit Morgan bit vector, radius 2, 1024 bits, computed with useFeatures=True)

In [32]:
def calc_ecfp(list_of_smiles:list, radius:int=2, n_bits:int=1024)->list:
    """Compute a Morgan bit-vector fingerprint (useFeatures=True) per SMILES.

    Each SMILES is parsed, re-written as non-isomeric canonical SMILES,
    re-parsed, and fingerprinted with RDKit's GetMorganFingerprintAsBitVect.

    Args:
        list_of_smiles: Iterable of SMILES strings.
        radius: Morgan radius passed to RDKit (default 2, as before).
        n_bits: Length of the bit vector (default 1024, as before).

    Returns:
        A list with one entry per input, in input order: a list of ``n_bits``
        ints (0/1), or ``n_bits`` NaNs when the SMILES could not be processed.
    """
    ecfp = []

    for smiles in list_of_smiles:

        try:
            parsed = Chem.MolFromSmiles(smiles)
            if parsed is None:
                # RDKit signals an unparsable SMILES by returning None.
                raise ValueError("RDKit could not parse SMILES")

            canonical = Chem.MolToSmiles(parsed, isomericSmiles=False)
            fingerprint = AllChem.GetMorganFingerprintAsBitVect(
                Chem.MolFromSmiles(canonical), radius, nBits=n_bits, useFeatures=True
            )
            ecfp.append([int(bit) for bit in fingerprint.ToBitString()])

        except Exception:
            # `except Exception` (not a bare except) so Ctrl-C still stops the loop.
            # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
            ecfp.append([np.nan] * n_bits)
            print("error in smiles:", smiles)

    return ecfp
  
# Fingerprint every compound, then combine the per-molecule rows into one array.
ecfp_result = calc_ecfp(smiles_list)
res = np.array(list(map(np.array, ecfp_result)))



In [33]:
# Wrap the fingerprint array in a DataFrame with float-typed columns.
ecfp_df = pd.DataFrame(res).astype(float)
ecfp_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38713,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38714,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38715,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38716,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Prepend the drug IDs. This cell reassigns ecfp_df from itself, so any 'Drug'
# column left by a previous run is dropped first; re-running the cell then
# yields the same frame instead of stacking duplicate 'Drug' columns.
ecfp_df = pd.concat(
    [mol[['Drug']], ecfp_df.drop(columns=['Drug'], errors='ignore')],
    axis=1,
)
ecfp_df

Unnamed: 0,Drug,0,1,2,3,4,5,6,7,8,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,d1,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,d2,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,d3,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,d4,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,d5,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38713,d38714,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38714,d38715,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38715,d38716,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38716,d38717,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# Persist the ECFP table (Drug ID + bits) without the row index.
ecfp_df.to_csv(
    path_or_buf='../../data/2-feature/mol/ecfp_bindingdb.csv',
    index=False,
)

Morgan Fingerprint

In [23]:
def calc_morgan(list_of_smiles:list, radius:int=2, n_bits:int=1024)->np.ndarray:
    """Compute a Morgan bit-vector fingerprint per SMILES, with a progress bar.

    Each SMILES is parsed, re-written as non-isomeric canonical SMILES,
    re-parsed, and fingerprinted with RDKit's GetMorganFingerprintAsBitVect.

    Args:
        list_of_smiles: Iterable of SMILES strings.
        radius: Morgan radius passed to RDKit (default 2, as before).
        n_bits: Length of the bit vector (default 1024, as before).

    Returns:
        A NumPy array with one row per input, in input order. A row holds the
        ``n_bits`` fingerprint bits, or NaNs when the SMILES could not be
        processed. (The previous ``-> list`` annotation did not match the
        returned array.)
    """
    morgan = []

    for smiles in tqdm(list_of_smiles):

        try:
            parsed = Chem.MolFromSmiles(smiles)
            if parsed is None:
                # RDKit signals an unparsable SMILES by returning None.
                raise ValueError("RDKit could not parse SMILES")

            canonical = Chem.MolToSmiles(parsed, isomericSmiles=False)
            fingerprint = AllChem.GetMorganFingerprintAsBitVect(
                Chem.MolFromSmiles(canonical), radius, nBits=n_bits
            )
            morgan.append([int(bit) for bit in fingerprint.ToBitString()])

        except Exception:
            # `except Exception` (not a bare except) so Ctrl-C still stops the loop.
            # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
            morgan.append([np.nan] * n_bits)
            print("error in smiles:", smiles)

    res = np.array([np.array(row) for row in morgan])

    return res

In [None]:
# Fingerprint every compound SMILES.
res_morgan = calc_morgan(list_of_smiles=smiles_list)



In [25]:
# Wrap the Morgan fingerprint array in a DataFrame (one row per compound).
morgan_df = pd.DataFrame(data=res_morgan)
morgan_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38713,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
38714,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
38715,0,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,1,0,0,1,0
38716,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0


In [26]:
# Put the drug IDs in front of the Morgan bits, column-wise.
morgan = pd.concat(
    objs=[mol.loc[:, ['Drug']], morgan_df],
    axis="columns",
)
morgan

Unnamed: 0,Drug,0,1,2,3,4,5,6,7,8,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,d1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,d2,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,d3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,d4,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,d5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38713,d38714,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
38714,d38715,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
38715,d38716,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0
38716,d38717,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0


In [27]:
# Persist the Morgan fingerprint table (Drug ID + bits) without the row index.
morgan.to_csv(
    path_or_buf='../../data/2-feature/mol/morgan_bindingdb.csv',
    index=False,
)