### Adding Fingerprints

In [93]:
#Libraries
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.Draw import IPythonConsole
import pandas as pd
from rdkit.Chem import rdMolDescriptors as rdescriptors
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from rdkit import RDLogger 
RDLogger.DisableLog('rdApp.*') 

In [94]:
#Loading zinc database from csv 
suppl_csv = pd.read_csv('../data/ZINCFINAL.csv', delimiter = ' ')
suppl_csv

Unnamed: 0,Zinc_id,Smiles,MW,logP,Inchi,Inchi_s,Name
0,1589384,N=C(N)NCC[C@H](N)C(=O)O,160.096026,-1.72853,IFPQOXNWLSRZKX-VKHMYHEASA-N,IFPQOXNWLSRZKX,ZINC
1,83822513,CN(C)CC(=O)NCC(=O)O,160.084792,-1.25120,HQFNONZTUQSPJS-UHFFFAOYSA-N,HQFNONZTUQSPJS,ZINC
2,214763687,O=C(O)CN1CCCNS1(=O)=O,194.036128,-1.38890,KCIDTUHJDPZBTQ-UHFFFAOYSA-N,KCIDTUHJDPZBTQ,ZINC
3,306392345,O=C(O)[C@@H]1C[C@H]2O[C@H]1[C@H](O)[C@H]2O,174.052823,-1.41990,ZZTJOHOETCDWML-VFUOTHLCSA-N,ZZTJOHOETCDWML,ZINC
4,85343607,N[C@@H]1N=C2N=C(O)N=C2C(=O)N1,167.044324,-1.87440,OSXQHYVRCFCLQV-SCSAIBSYSA-N,OSXQHYVRCFCLQV,ZINC
...,...,...,...,...,...,...,...
379007,97986502,Cc1ccc(C(=O)N(c2ccc(N(C(=O)c3ccc(C)cc3)S(=O)(=...,636.051720,6.49784,KYPDAVIHIYTGQW-UHFFFAOYSA-N,KYPDAVIHIYTGQW,ZINC
379008,97948800,CCc1c(-c2ccc(OC)c(OC)c2)nn(-c2nc(-c3ccc(C)cc3)...,604.229740,7.58732,AWQWFNRTEBAZJO-UHFFFAOYSA-N,AWQWFNRTEBAZJO,ZINC
379009,54274624,Cc1cc(N2CCN(CCOc3cccc4ccccc34)CC2)n2nc(C)c(-c3...,511.213888,6.02074,HAHVCBFPDRUVIL-UHFFFAOYSA-N,HAHVCBFPDRUVIL,ZINC
379010,2093448,CCOc1cc(CNc2cc(Cl)ccc2C)cc(I)c1OCc1ccccc1Cl,541.007232,7.49622,IYFPBZIIJDRXMB-UHFFFAOYSA-N,IYFPBZIIJDRXMB,ZINC


### Calculating Fingerprints for ZINCFINAL.csv 

In [95]:
# List of ECFP of zinc products
ligandm_database = []
for ligand in suppl_csv["Smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))

In [96]:
db_ECFP = []
for substance in ligandm_database:
		db_ECFP.append(Chem.GetMorganFingerprintAsBitVect(substance, 2, nBits=2048))

In [97]:
suppl_csv['ECFP'] = db_ECFP

In [98]:
suppl_csv

Unnamed: 0,Zinc_id,Smiles,MW,logP,Inchi,Inchi_s,Name,ECFP
0,1589384,N=C(N)NCC[C@H](N)C(=O)O,160.096026,-1.72853,IFPQOXNWLSRZKX-VKHMYHEASA-N,IFPQOXNWLSRZKX,ZINC,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,83822513,CN(C)CC(=O)NCC(=O)O,160.084792,-1.25120,HQFNONZTUQSPJS-UHFFFAOYSA-N,HQFNONZTUQSPJS,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,214763687,O=C(O)CN1CCCNS1(=O)=O,194.036128,-1.38890,KCIDTUHJDPZBTQ-UHFFFAOYSA-N,KCIDTUHJDPZBTQ,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,306392345,O=C(O)[C@@H]1C[C@H]2O[C@H]1[C@H](O)[C@H]2O,174.052823,-1.41990,ZZTJOHOETCDWML-VFUOTHLCSA-N,ZZTJOHOETCDWML,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,85343607,N[C@@H]1N=C2N=C(O)N=C2C(=O)N1,167.044324,-1.87440,OSXQHYVRCFCLQV-SCSAIBSYSA-N,OSXQHYVRCFCLQV,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...
379007,97986502,Cc1ccc(C(=O)N(c2ccc(N(C(=O)c3ccc(C)cc3)S(=O)(=...,636.051720,6.49784,KYPDAVIHIYTGQW-UHFFFAOYSA-N,KYPDAVIHIYTGQW,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
379008,97948800,CCc1c(-c2ccc(OC)c(OC)c2)nn(-c2nc(-c3ccc(C)cc3)...,604.229740,7.58732,AWQWFNRTEBAZJO-UHFFFAOYSA-N,AWQWFNRTEBAZJO,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
379009,54274624,Cc1cc(N2CCN(CCOc3cccc4ccccc34)CC2)n2nc(C)c(-c3...,511.213888,6.02074,HAHVCBFPDRUVIL-UHFFFAOYSA-N,HAHVCBFPDRUVIL,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
379010,2093448,CCOc1cc(CNc2cc(Cl)ccc2C)cc(I)c1OCc1ccccc1Cl,541.007232,7.49622,IYFPBZIIJDRXMB-UHFFFAOYSA-N,IYFPBZIIJDRXMB,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [99]:
db_ECFP1 = []
for x in suppl_csv.Smiles:
    molecule = Chem.MolFromSmiles(x)
    fp = Chem.GetMorganFingerprintAsBitVect(molecule, 2, nBits=2048, bitInfo=bitinfo)
    db_ECFP1.append(list(fp.GetOnBits()))

In [100]:
suppl_csv['Bitset'] = db_ECFP1


In [101]:
suppl_csv

Unnamed: 0,Zinc_id,Smiles,MW,logP,Inchi,Inchi_s,Name,ECFP,Bitset
0,1589384,N=C(N)NCC[C@H](N)C(=O)O,160.096026,-1.72853,IFPQOXNWLSRZKX-VKHMYHEASA-N,IFPQOXNWLSRZKX,ZINC,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 80, 140, 197, 389, 623, 650, 667, 739, 786..."
1,83822513,CN(C)CC(=O)NCC(=O)O,160.084792,-1.25120,HQFNONZTUQSPJS-UHFFFAOYSA-N,HQFNONZTUQSPJS,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[41, 80, 197, 339, 389, 390, 650, 652, 807, 88..."
2,214763687,O=C(O)CN1CCCNS1(=O)=O,194.036128,-1.38890,KCIDTUHJDPZBTQ-UHFFFAOYSA-N,KCIDTUHJDPZBTQ,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[80, 187, 190, 207, 305, 307, 389, 469, 650, 8..."
3,306392345,O=C(O)[C@@H]1C[C@H]2O[C@H]1[C@H](O)[C@H]2O,174.052823,-1.41990,ZZTJOHOETCDWML-VFUOTHLCSA-N,ZZTJOHOETCDWML,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[209, 221, 285, 389, 650, 656, 807, 926, 1017,..."
4,85343607,N[C@@H]1N=C2N=C(O)N=C2C(=O)N1,167.044324,-1.87440,OSXQHYVRCFCLQV-SCSAIBSYSA-N,OSXQHYVRCFCLQV,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[15, 106, 301, 314, 378, 584, 650, 664, 724, 7..."
...,...,...,...,...,...,...,...,...,...
379007,97986502,Cc1ccc(C(=O)N(c2ccc(N(C(=O)c3ccc(C)cc3)S(=O)(=...,636.051720,6.49784,KYPDAVIHIYTGQW-UHFFFAOYSA-N,KYPDAVIHIYTGQW,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[14, 234, 235, 241, 350, 383, 517, 587, 650, 6..."
379008,97948800,CCc1c(-c2ccc(OC)c(OC)c2)nn(-c2nc(-c3ccc(C)cc3)...,604.229740,7.58732,AWQWFNRTEBAZJO-UHFFFAOYSA-N,AWQWFNRTEBAZJO,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[80, 94, 114, 121, 140, 208, 235, 249, 289, 29..."
379009,54274624,Cc1cc(N2CCN(CCOc3cccc4ccccc34)CC2)n2nc(C)c(-c3...,511.213888,6.02074,HAHVCBFPDRUVIL-UHFFFAOYSA-N,HAHVCBFPDRUVIL,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[13, 74, 80, 92, 140, 212, 216, 262, 296, 378,..."
379010,2093448,CCOc1cc(CNc2cc(Cl)ccc2C)cc(I)c1OCc1ccccc1Cl,541.007232,7.49622,IYFPBZIIJDRXMB-UHFFFAOYSA-N,IYFPBZIIJDRXMB,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[80, 99, 165, 216, 237, 255, 294, 295, 333, 36..."


In [102]:
suppl_csv.to_csv('../data/ZINCECFP.csv', sep=' ', index = False)

In [114]:
#Loading zinc database from csv 
suppl_csv = pd.read_csv('../data/ZINCECFP.csv', delimiter = ' ')
suppl_csv

Unnamed: 0,Zinc_id,Smiles,MW,logP,Inchi,Inchi_s,Name,ECFP,Bitset
0,1589384,N=C(N)NCC[C@H](N)C(=O)O,160.096026,-1.72853,IFPQOXNWLSRZKX-VKHMYHEASA-N,IFPQOXNWLSRZKX,ZINC,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[1, 80, 140, 197, 389, 623, 650, 667, 739, 786..."
1,83822513,CN(C)CC(=O)NCC(=O)O,160.084792,-1.25120,HQFNONZTUQSPJS-UHFFFAOYSA-N,HQFNONZTUQSPJS,ZINC,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[41, 80, 197, 339, 389, 390, 650, 652, 807, 88..."
2,214763687,O=C(O)CN1CCCNS1(=O)=O,194.036128,-1.38890,KCIDTUHJDPZBTQ-UHFFFAOYSA-N,KCIDTUHJDPZBTQ,ZINC,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[80, 187, 190, 207, 305, 307, 389, 469, 650, 8..."
3,306392345,O=C(O)[C@@H]1C[C@H]2O[C@H]1[C@H](O)[C@H]2O,174.052823,-1.41990,ZZTJOHOETCDWML-VFUOTHLCSA-N,ZZTJOHOETCDWML,ZINC,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[209, 221, 285, 389, 650, 656, 807, 926, 1017,..."
4,85343607,N[C@@H]1N=C2N=C(O)N=C2C(=O)N1,167.044324,-1.87440,OSXQHYVRCFCLQV-SCSAIBSYSA-N,OSXQHYVRCFCLQV,ZINC,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[15, 106, 301, 314, 378, 584, 650, 664, 724, 7..."
...,...,...,...,...,...,...,...,...,...
379007,97986502,Cc1ccc(C(=O)N(c2ccc(N(C(=O)c3ccc(C)cc3)S(=O)(=...,636.051720,6.49784,KYPDAVIHIYTGQW-UHFFFAOYSA-N,KYPDAVIHIYTGQW,ZINC,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[14, 234, 235, 241, 350, 383, 517, 587, 650, 6..."
379008,97948800,CCc1c(-c2ccc(OC)c(OC)c2)nn(-c2nc(-c3ccc(C)cc3)...,604.229740,7.58732,AWQWFNRTEBAZJO-UHFFFAOYSA-N,AWQWFNRTEBAZJO,ZINC,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[80, 94, 114, 121, 140, 208, 235, 249, 289, 29..."
379009,54274624,Cc1cc(N2CCN(CCOc3cccc4ccccc34)CC2)n2nc(C)c(-c3...,511.213888,6.02074,HAHVCBFPDRUVIL-UHFFFAOYSA-N,HAHVCBFPDRUVIL,ZINC,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[13, 74, 80, 92, 140, 212, 216, 262, 296, 378,..."
379010,2093448,CCOc1cc(CNc2cc(Cl)ccc2C)cc(I)c1OCc1ccccc1Cl,541.007232,7.49622,IYFPBZIIJDRXMB-UHFFFAOYSA-N,IYFPBZIIJDRXMB,ZINC,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[80, 99, 165, 216, 237, 255, 294, 295, 333, 36..."


#### Ading fingerprints for COCONUT data

In [104]:
#Loading zinc database from csv 
suppl_coco = pd.read_csv('../data/COCOFINAL.csv', delimiter = ' ')
suppl_coco

Unnamed: 0,Coconut_id,Smiles,MW,logP,Inchi,Inchi_s,Name
0,CNP0000002,CC=C(N=CS)C(=O)OC1C(COC(C)=O)OC(C2(O)CC(=O)C(N...,660.183639,-2.08210,FJEMIESGEMWDOB-UHFFFAOYSA-N,FJEMIESGEMWDOB,COCO
1,CNP0000003,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(OC)c5c4C...,598.183897,3.63422,KLWKJVYCDFWQMK-UHFFFAOYSA-N,KLWKJVYCDFWQMK,COCO
2,CNP0000004,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(OC)c5c4C...,554.157682,3.32262,PTEKHLCNKCAXPH-UHFFFAOYSA-N,PTEKHLCNKCAXPH,COCO
3,CNP0000005,CC1(C)CC2C(OC(=O)c3ccccc3)C(OC(=O)c3ccccc3)CC(...,534.298139,6.87940,ZVAVQCZAGOKAMX-UHFFFAOYSA-N,ZVAVQCZAGOKAMX,COCO
4,CNP0000006,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(O)c5c4C(...,540.142032,3.01962,UYIPOCQHTAYRMA-UHFFFAOYSA-N,UYIPOCQHTAYRMA,COCO
...,...,...,...,...,...,...,...
386292,CNP0436851,COc1cccc2ccc(C(C)=O)c(O)c12,216.078644,2.75660,XNWOWNYWQOTWIX-UHFFFAOYSA-N,XNWOWNYWQOTWIX,COCO
386293,CNP0436852,COc1cc(OC)c2c(c1)C(OC)(c1ccc3c(c1O)C(=O)c1c(O)...,570.188983,4.77902,XWGVAZYMLDVIDS-UHFFFAOYSA-N,XWGVAZYMLDVIDS,COCO
386294,CNP0436853,CC1(C)C=Cc2cc(C3COc4c5c(cc(O)c4C3=O)OC(C)(C)C=...,420.157288,4.82510,XZXMEYSQXQNHCX-UHFFFAOYSA-N,XZXMEYSQXQNHCX,COCO
386295,CNP0436854,CCCCCCCC1CC(=O)NC(CO)C(=O)NC(CO)C(=O)N2CCCC2C(...,1183.685261,-1.20930,YNLIJTVZSFUHKP-UHFFFAOYSA-N,YNLIJTVZSFUHKP,COCO


In [105]:
# List of ECFP of zinc products
ligandm_database = []
for ligand in suppl_coco["Smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))

In [106]:
db_ECFP = []
for substance in ligandm_database:
		db_ECFP.append(Chem.GetMorganFingerprintAsBitVect(substance, 2, nBits=2048))

In [107]:
suppl_coco['ECFP'] = db_ECFP

In [108]:
suppl_coco

Unnamed: 0,Coconut_id,Smiles,MW,logP,Inchi,Inchi_s,Name,ECFP
0,CNP0000002,CC=C(N=CS)C(=O)OC1C(COC(C)=O)OC(C2(O)CC(=O)C(N...,660.183639,-2.08210,FJEMIESGEMWDOB-UHFFFAOYSA-N,FJEMIESGEMWDOB,COCO,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,CNP0000003,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(OC)c5c4C...,598.183897,3.63422,KLWKJVYCDFWQMK-UHFFFAOYSA-N,KLWKJVYCDFWQMK,COCO,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,CNP0000004,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(OC)c5c4C...,554.157682,3.32262,PTEKHLCNKCAXPH-UHFFFAOYSA-N,PTEKHLCNKCAXPH,COCO,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,CNP0000005,CC1(C)CC2C(OC(=O)c3ccccc3)C(OC(=O)c3ccccc3)CC(...,534.298139,6.87940,ZVAVQCZAGOKAMX-UHFFFAOYSA-N,ZVAVQCZAGOKAMX,COCO,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
4,CNP0000006,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(O)c5c4C(...,540.142032,3.01962,UYIPOCQHTAYRMA-UHFFFAOYSA-N,UYIPOCQHTAYRMA,COCO,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...
386292,CNP0436851,COc1cccc2ccc(C(C)=O)c(O)c12,216.078644,2.75660,XNWOWNYWQOTWIX-UHFFFAOYSA-N,XNWOWNYWQOTWIX,COCO,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
386293,CNP0436852,COc1cc(OC)c2c(c1)C(OC)(c1ccc3c(c1O)C(=O)c1c(O)...,570.188983,4.77902,XWGVAZYMLDVIDS-UHFFFAOYSA-N,XWGVAZYMLDVIDS,COCO,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
386294,CNP0436853,CC1(C)C=Cc2cc(C3COc4c5c(cc(O)c4C3=O)OC(C)(C)C=...,420.157288,4.82510,XZXMEYSQXQNHCX-UHFFFAOYSA-N,XZXMEYSQXQNHCX,COCO,"[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
386295,CNP0436854,CCCCCCCC1CC(=O)NC(CO)C(=O)NC(CO)C(=O)N2CCCC2C(...,1183.685261,-1.20930,YNLIJTVZSFUHKP-UHFFFAOYSA-N,YNLIJTVZSFUHKP,COCO,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [109]:
db_ECFP1 = []
db_bitinfo = []
for x in suppl_coco.Smiles:
    molecule = Chem.MolFromSmiles(x)
    fp = Chem.GetMorganFingerprintAsBitVect(molecule, 2, nBits=2048, bitInfo=bitinfo)
    db_ECFP1.append(list(fp.GetOnBits()))

In [110]:
suppl_coco['Bitset'] = db_ECFP1

In [111]:
suppl_coco

Unnamed: 0,Coconut_id,Smiles,MW,logP,Inchi,Inchi_s,Name,ECFP,Bitset
0,CNP0000002,CC=C(N=CS)C(=O)OC1C(COC(C)=O)OC(C2(O)CC(=O)C(N...,660.183639,-2.08210,FJEMIESGEMWDOB-UHFFFAOYSA-N,FJEMIESGEMWDOB,COCO,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[39, 58, 80, 118, 140, 144, 147, 209, 229, 252..."
1,CNP0000003,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(OC)c5c4C...,598.183897,3.63422,KLWKJVYCDFWQMK-UHFFFAOYSA-N,KLWKJVYCDFWQMK,COCO,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[77, 94, 249, 314, 467, 495, 497, 569, 587, 64..."
2,CNP0000004,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(OC)c5c4C...,554.157682,3.32262,PTEKHLCNKCAXPH-UHFFFAOYSA-N,PTEKHLCNKCAXPH,COCO,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[77, 249, 298, 314, 352, 467, 497, 569, 587, 6..."
3,CNP0000005,CC1(C)CC2C(OC(=O)c3ccccc3)C(OC(=O)c3ccccc3)CC(...,534.298139,6.87940,ZVAVQCZAGOKAMX-UHFFFAOYSA-N,ZVAVQCZAGOKAMX,COCO,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[13, 70, 80, 114, 145, 147, 207, 270, 302, 341..."
4,CNP0000006,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(O)c5c4C(...,540.142032,3.01962,UYIPOCQHTAYRMA-UHFFFAOYSA-N,UYIPOCQHTAYRMA,COCO,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[77, 249, 298, 314, 352, 410, 457, 467, 497, 5..."
...,...,...,...,...,...,...,...,...,...
386292,CNP0436851,COc1cccc2ccc(C(C)=O)c(O)c12,216.078644,2.75660,XNWOWNYWQOTWIX-UHFFFAOYSA-N,XNWOWNYWQOTWIX,COCO,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[67, 202, 203, 230, 249, 650, 695, 725, 806, 8..."
386293,CNP0436852,COc1cc(OC)c2c(c1)C(OC)(c1ccc3c(c1O)C(=O)c1c(O)...,570.188983,4.77902,XWGVAZYMLDVIDS-UHFFFAOYSA-N,XWGVAZYMLDVIDS,COCO,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5, 67, 84, 142, 148, 202, 206, 231, 249, 253,..."
386294,CNP0436853,CC1(C)C=Cc2cc(C3COc4c5c(cc(O)c4C3=O)OC(C)(C)C=...,420.157288,4.82510,XZXMEYSQXQNHCX-UHFFFAOYSA-N,XZXMEYSQXQNHCX,COCO,"[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 6, 53, 56, 84, 212, 276, 294, 314, 398, 43..."
386295,CNP0436854,CCCCCCCC1CC(=O)NC(CO)C(=O)NC(CO)C(=O)N2CCCC2C(...,1183.685261,-1.20930,YNLIJTVZSFUHKP-UHFFFAOYSA-N,YNLIJTVZSFUHKP,COCO,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 5, 19, 42, 80, 109, 115, 130, 222, 231, 23..."


In [112]:
suppl_coco.to_csv('../data/COCOECFP.csv', sep=' ', index = False)

In [113]:
#Loading coco database from csv 
suppl_coco = pd.read_csv('../data/COCOECFP.csv', delimiter = ' ')
suppl_coco

Unnamed: 0,Coconut_id,Smiles,MW,logP,Inchi,Inchi_s,Name,ECFP,Bitset
0,CNP0000002,CC=C(N=CS)C(=O)OC1C(COC(C)=O)OC(C2(O)CC(=O)C(N...,660.183639,-2.08210,FJEMIESGEMWDOB-UHFFFAOYSA-N,FJEMIESGEMWDOB,COCO,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[39, 58, 80, 118, 140, 144, 147, 209, 229, 252..."
1,CNP0000003,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(OC)c5c4C...,598.183897,3.63422,KLWKJVYCDFWQMK-UHFFFAOYSA-N,KLWKJVYCDFWQMK,COCO,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[77, 94, 249, 314, 467, 495, 497, 569, 587, 64..."
2,CNP0000004,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(OC)c5c4C...,554.157682,3.32262,PTEKHLCNKCAXPH-UHFFFAOYSA-N,PTEKHLCNKCAXPH,COCO,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[77, 249, 298, 314, 352, 467, 497, 569, 587, 6..."
3,CNP0000005,CC1(C)CC2C(OC(=O)c3ccccc3)C(OC(=O)c3ccccc3)CC(...,534.298139,6.87940,ZVAVQCZAGOKAMX-UHFFFAOYSA-N,ZVAVQCZAGOKAMX,COCO,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[13, 70, 80, 114, 145, 147, 207, 270, 302, 341..."
4,CNP0000006,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(O)c5c4C(...,540.142032,3.01962,UYIPOCQHTAYRMA-UHFFFAOYSA-N,UYIPOCQHTAYRMA,COCO,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[77, 249, 298, 314, 352, 410, 457, 467, 497, 5..."
...,...,...,...,...,...,...,...,...,...
386292,CNP0436851,COc1cccc2ccc(C(C)=O)c(O)c12,216.078644,2.75660,XNWOWNYWQOTWIX-UHFFFAOYSA-N,XNWOWNYWQOTWIX,COCO,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[67, 202, 203, 230, 249, 650, 695, 725, 806, 8..."
386293,CNP0436852,COc1cc(OC)c2c(c1)C(OC)(c1ccc3c(c1O)C(=O)c1c(O)...,570.188983,4.77902,XWGVAZYMLDVIDS-UHFFFAOYSA-N,XWGVAZYMLDVIDS,COCO,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[5, 67, 84, 142, 148, 202, 206, 231, 249, 253,..."
386294,CNP0436853,CC1(C)C=Cc2cc(C3COc4c5c(cc(O)c4C3=O)OC(C)(C)C=...,420.157288,4.82510,XZXMEYSQXQNHCX-UHFFFAOYSA-N,XZXMEYSQXQNHCX,COCO,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[1, 6, 53, 56, 84, 212, 276, 294, 314, 398, 43..."
386295,CNP0436854,CCCCCCCC1CC(=O)NC(CO)C(=O)NC(CO)C(=O)N2CCCC2C(...,1183.685261,-1.20930,YNLIJTVZSFUHKP-UHFFFAOYSA-N,YNLIJTVZSFUHKP,COCO,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[1, 5, 19, 42, 80, 109, 115, 130, 222, 231, 23..."
