### COCO to CSV
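Creating a new CSV containing COCONUT data. Columns: 'Coconut_id': db_COCONUT_id, 'Smiles': db_smiles, 'MW': db_mw, 'logP': db_logP, 'Inchi': db_inchikey (the InChIKey), 'Inchi_s': db_inchikey_s (the part of the InChIKey before the first '-'), 'Name': the constant "COCO".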

In [1]:
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.Draw import IPythonConsole
import pandas as pd
from rdkit.Chem import rdMolDescriptors as rdescriptors
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sys    
import pubchempy as pcp
import warnings
# Install a global 'ignore' filter so Python warnings are not shown in the notebook output.
warnings.filterwarnings('ignore')
from rdkit import RDLogger 
# Disable the RDKit log channels matching 'rdApp.*' to keep cell output quiet.
RDLogger.DisableLog('rdApp.*')

In [2]:
# Open the COCONUT SDF export and keep only the truthy entries the supplier yields
# (falsy items, e.g. None, are filtered out).
coco = Chem.SDMolSupplier('COCONUT_DB.sdf')
coco_d = list(filter(None, coco))

In [3]:
#Number of natural products
len(coco_d)

405960

In [5]:
# Octanol-water partition coefficient (logP) of every molecule in coco_d,
# computed with Descriptors.MolLogP; order follows coco_d.
db_logP = [Descriptors.MolLogP(mol) for mol in coco_d]

In [6]:
# Molecular weight of every natural product in coco_d, in coco_d order.
# The value comes from rdMolDescriptors.CalcExactMolWt (the "exact" weight variant).
db_mw = list(map(rdescriptors.CalcExactMolWt, coco_d))

In [7]:
# SMILES string of every molecule in coco_d, produced by Chem.MolToSmiles; order follows coco_d.
db_smiles = [Chem.MolToSmiles(mol) for mol in coco_d]

In [17]:
# Full InChIKey of every molecule in coco_d, produced by Chem.MolToInchiKey; order follows coco_d.
db_inchikey = [Chem.MolToInchiKey(mol) for mol in coco_d]

In [18]:
# First block of each InChIKey (the text before the first '-'), in coco_d order.
# The full keys are already stored in db_inchikey, so the prefix is taken from those
# strings instead of running Chem.MolToInchiKey over every molecule a second time.
db_inchikey_s = [key.split('-')[0] for key in db_inchikey]

In [10]:
# COCONUT identifier of every molecule, read from the 'coconut_id' property of each entry;
# order follows coco_d.
db_COCONUT_id = [mol.GetProp('coconut_id') for mol in coco_d]

In [22]:
# Sanity check: print how many InChIKey prefixes were collected.
print(len(db_inchikey_s))

405960


In [21]:
# Combine the per-molecule lists into a single DataFrame: COCONUT id, SMILES, MW, logP,
# the full InChIKey (column 'Inchi') and its first block (column 'Inchi_s').
# A constant 'Name' column tags every row as "COCO".
df_COCO = pd.DataFrame(
    data={
        'Coconut_id': db_COCONUT_id,
        'Smiles': db_smiles,
        'MW': db_mw,
        'logP': db_logP,
        'Inchi': db_inchikey,
        'Inchi_s': db_inchikey_s,
    }
).assign(Name="COCO")

df_COCO

Unnamed: 0,Coconut_id,Smiles,MW,logP,Inchi,Inchi_s,Name
0,CNP0000002,CC=C(N=CS)C(=O)OC1C(COC(C)=O)OC(C2(O)CC(=O)C(N...,660.183639,-2.08210,FJEMIESGEMWDOB-UHFFFAOYSA-N,FJEMIESGEMWDOB,COCO
1,CNP0000003,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(OC)c5c4C...,598.183897,3.63422,KLWKJVYCDFWQMK-UHFFFAOYSA-N,KLWKJVYCDFWQMK,COCO
2,CNP0000004,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(OC)c5c4C...,554.157682,3.32262,PTEKHLCNKCAXPH-UHFFFAOYSA-N,PTEKHLCNKCAXPH,COCO
3,CNP0000005,CC1(C)CC2C(OC(=O)c3ccccc3)C(OC(=O)c3ccccc3)CC(...,534.298139,6.87940,ZVAVQCZAGOKAMX-UHFFFAOYSA-N,ZVAVQCZAGOKAMX,COCO
4,CNP0000006,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(O)c5c4C(...,540.142032,3.01962,UYIPOCQHTAYRMA-UHFFFAOYSA-N,UYIPOCQHTAYRMA,COCO
...,...,...,...,...,...,...,...
405955,CNP0436851,COc1cccc2ccc(C(C)=O)c(O)c12,216.078644,2.75660,XNWOWNYWQOTWIX-UHFFFAOYSA-N,XNWOWNYWQOTWIX,COCO
405956,CNP0436852,COc1cc(OC)c2c(c1)C(OC)(c1ccc3c(c1O)C(=O)c1c(O)...,570.188983,4.77902,XWGVAZYMLDVIDS-UHFFFAOYSA-N,XWGVAZYMLDVIDS,COCO
405957,CNP0436853,CC1(C)C=Cc2cc(C3COc4c5c(cc(O)c4C3=O)OC(C)(C)C=...,420.157288,4.82510,XZXMEYSQXQNHCX-UHFFFAOYSA-N,XZXMEYSQXQNHCX,COCO
405958,CNP0436854,CCCCCCCC1CC(=O)NC(CO)C(=O)NC(CO)C(=O)N2CCCC2C(...,1183.685261,-1.20930,YNLIJTVZSFUHKP-UHFFFAOYSA-N,YNLIJTVZSFUHKP,COCO


In [23]:
# Write the complete df_COCO frame to a space-separated CSV, leaving out the index column.
# NOTE(review): assumes the ZINC_csv1/ directory already exists - confirm.
df_COCO.to_csv(path_or_buf='ZINC_csv1/COCOALL.csv', index=False, sep=' ')

In [25]:
# Read the space-separated COCOALL.csv back into a DataFrame and display it.
suppl_csv = pd.read_csv('ZINC_csv1/COCOALL.csv', sep=' ')
suppl_csv

Unnamed: 0,Coconut_id,Smiles,MW,logP,Inchi,Inchi_s,Name
0,CNP0000002,CC=C(N=CS)C(=O)OC1C(COC(C)=O)OC(C2(O)CC(=O)C(N...,660.183639,-2.08210,FJEMIESGEMWDOB-UHFFFAOYSA-N,FJEMIESGEMWDOB,COCO
1,CNP0000003,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(OC)c5c4C...,598.183897,3.63422,KLWKJVYCDFWQMK-UHFFFAOYSA-N,KLWKJVYCDFWQMK,COCO
2,CNP0000004,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(OC)c5c4C...,554.157682,3.32262,PTEKHLCNKCAXPH-UHFFFAOYSA-N,PTEKHLCNKCAXPH,COCO
3,CNP0000005,CC1(C)CC2C(OC(=O)c3ccccc3)C(OC(=O)c3ccccc3)CC(...,534.298139,6.87940,ZVAVQCZAGOKAMX-UHFFFAOYSA-N,ZVAVQCZAGOKAMX,COCO
4,CNP0000006,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(O)c5c4C(...,540.142032,3.01962,UYIPOCQHTAYRMA-UHFFFAOYSA-N,UYIPOCQHTAYRMA,COCO
...,...,...,...,...,...,...,...
405955,CNP0436851,COc1cccc2ccc(C(C)=O)c(O)c12,216.078644,2.75660,XNWOWNYWQOTWIX-UHFFFAOYSA-N,XNWOWNYWQOTWIX,COCO
405956,CNP0436852,COc1cc(OC)c2c(c1)C(OC)(c1ccc3c(c1O)C(=O)c1c(O)...,570.188983,4.77902,XWGVAZYMLDVIDS-UHFFFAOYSA-N,XWGVAZYMLDVIDS,COCO
405957,CNP0436853,CC1(C)C=Cc2cc(C3COc4c5c(cc(O)c4C3=O)OC(C)(C)C=...,420.157288,4.82510,XZXMEYSQXQNHCX-UHFFFAOYSA-N,XZXMEYSQXQNHCX,COCO
405958,CNP0436854,CCCCCCCC1CC(=O)NC(CO)C(=O)NC(CO)C(=O)N2CCCC2C(...,1183.685261,-1.20930,YNLIJTVZSFUHKP-UHFFFAOYSA-N,YNLIJTVZSFUHKP,COCO


In [None]:
#Saving COCO pd dataframe to CSV
# NOTE(review): this cell has no execution count and sits before the duplicate-removal step,
# so running the notebook top to bottom writes the not-yet-de-duplicated df_COCO to
# COCOFINAL.csv here; the same path is written again after drop_duplicates further down.
# Confirm whether this cell is still needed.
df_COCO.to_csv('ZINC_csv1/COCOFINAL.csv', sep=' ', index = False)

In [26]:
# Show every row whose InChIKey first block ('Inchi_s') occurs more than once,
# sorted so that rows sharing a key sit next to each other.
ids = df_COCO['Inchi_s']
df_COCO[ids.duplicated(keep=False)].sort_values('Inchi_s')

Unnamed: 0,Coconut_id,Smiles,MW,logP,Inchi,Inchi_s,Name
342105,CNP0369807,CCc1cc(=O)oc2cc(OCC(=O)NCCCCCC(=O)O)ccc12,361.152537,2.49540,AAAPEXAGBQZUNL-UHFFFAOYSA-N,AAAPEXAGBQZUNL,COCO
305022,CNP0331440,CCc1cc(=O)oc2cc(OCC(=O)NCCCCCC(=O)[O-])ccc12,360.145261,1.16070,AAAPEXAGBQZUNL-UHFFFAOYSA-M,AAAPEXAGBQZUNL,COCO
80871,CNP0098629,COC1=C(C)C(=O)OC1=C1OC2=CCC3CCC(O)(CC4CCCC45CC...,524.337050,4.69540,AABJOSWGDPWIAI-UHFFFAOYSA-O,AABJOSWGDPWIAI,COCO
155052,CNP0175948,COC1=C(C)C(=O)OC1=C1OC2=CCC3CCC(O)(CC4CCCC45CC...,523.329774,6.11250,AABJOSWGDPWIAI-UHFFFAOYSA-N,AABJOSWGDPWIAI,COCO
295845,CNP0321925,CC(CO)=C1CCc2ccc(cc2)C2C=CC(c3cccc(Cc4ccccc4)c...,909.420839,9.12370,AABKOYXSWYMOPH-UHFFFAOYSA-O,AABKOYXSWYMOPH,COCO
...,...,...,...,...,...,...,...
40805,CNP0052001,CC12CC=CC3(C4=CCC5C6(C)CCC(OC7OC(C(=O)[O-])C(O...,670.372795,1.76170,ZZZCIBDNNQUEGX-UHFFFAOYSA-L,ZZZCIBDNNQUEGX,COCO
66277,CNP0083286,O=C1OC(CO)C([O-])=C1O,145.014247,-1.96600,ZZZCUOFIHGPKAK-UHFFFAOYSA-M,ZZZCUOFIHGPKAK,COCO
229469,CNP0253154,O=C1OC(CO)C(O)=C1O,146.021523,-0.76830,ZZZCUOFIHGPKAK-UHFFFAOYSA-N,ZZZCUOFIHGPKAK,COCO
380188,CNP0409188,CCCC(NC(=O)C(C)Oc1ccc2c(C)cc(=O)oc2c1)C(=O)[O-],346.129611,0.90352,ZZZOCTGZIOGBLT-UHFFFAOYSA-M,ZZZOCTGZIOGBLT,COCO


In [27]:
# Keep a single row per InChIKey first block; drop_duplicates retains the first occurrence.
# NOTE(review): the duplicate listing shows pairs that differ only in the last InChIKey
# block (e.g. ...-N vs ...-M), so which variant survives depends on row order - confirm
# that this is acceptable.
df_COCO = df_COCO.drop_duplicates(subset='Inchi_s')
# Repeat the duplicate check on the reduced frame; an empty result means no key is repeated.
ids = df_COCO['Inchi_s']
df_COCO[ids.duplicated(keep=False)].sort_values('Inchi_s')

Unnamed: 0,Coconut_id,Smiles,MW,logP,Inchi,Inchi_s,Name


In [28]:
# Display df_COCO as it stands after the drop_duplicates step.
df_COCO

Unnamed: 0,Coconut_id,Smiles,MW,logP,Inchi,Inchi_s,Name
0,CNP0000002,CC=C(N=CS)C(=O)OC1C(COC(C)=O)OC(C2(O)CC(=O)C(N...,660.183639,-2.08210,FJEMIESGEMWDOB-UHFFFAOYSA-N,FJEMIESGEMWDOB,COCO
1,CNP0000003,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(OC)c5c4C...,598.183897,3.63422,KLWKJVYCDFWQMK-UHFFFAOYSA-N,KLWKJVYCDFWQMK,COCO
2,CNP0000004,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(OC)c5c4C...,554.157682,3.32262,PTEKHLCNKCAXPH-UHFFFAOYSA-N,PTEKHLCNKCAXPH,COCO
3,CNP0000005,CC1(C)CC2C(OC(=O)c3ccccc3)C(OC(=O)c3ccccc3)CC(...,534.298139,6.87940,ZVAVQCZAGOKAMX-UHFFFAOYSA-N,ZVAVQCZAGOKAMX,COCO
4,CNP0000006,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(O)c5c4C(...,540.142032,3.01962,UYIPOCQHTAYRMA-UHFFFAOYSA-N,UYIPOCQHTAYRMA,COCO
...,...,...,...,...,...,...,...
405955,CNP0436851,COc1cccc2ccc(C(C)=O)c(O)c12,216.078644,2.75660,XNWOWNYWQOTWIX-UHFFFAOYSA-N,XNWOWNYWQOTWIX,COCO
405956,CNP0436852,COc1cc(OC)c2c(c1)C(OC)(c1ccc3c(c1O)C(=O)c1c(O)...,570.188983,4.77902,XWGVAZYMLDVIDS-UHFFFAOYSA-N,XWGVAZYMLDVIDS,COCO
405957,CNP0436853,CC1(C)C=Cc2cc(C3COc4c5c(cc(O)c4C3=O)OC(C)(C)C=...,420.157288,4.82510,XZXMEYSQXQNHCX-UHFFFAOYSA-N,XZXMEYSQXQNHCX,COCO
405958,CNP0436854,CCCCCCCC1CC(=O)NC(CO)C(=O)NC(CO)C(=O)N2CCCC2C(...,1183.685261,-1.20930,YNLIJTVZSFUHKP-UHFFFAOYSA-N,YNLIJTVZSFUHKP,COCO


In [29]:
# Write the current df_COCO frame to a space-separated CSV, leaving out the index column.
df_COCO.to_csv(path_or_buf='ZINC_csv1/COCOFINAL.csv', index=False, sep=' ')

In [30]:
# Read the space-separated COCOFINAL.csv (the COCONUT table, not ZINC) back into a
# DataFrame and display it.
suppl_csv = pd.read_csv('ZINC_csv1/COCOFINAL.csv', sep=' ')
suppl_csv

Unnamed: 0,Coconut_id,Smiles,MW,logP,Inchi,Inchi_s,Name
0,CNP0000002,CC=C(N=CS)C(=O)OC1C(COC(C)=O)OC(C2(O)CC(=O)C(N...,660.183639,-2.08210,FJEMIESGEMWDOB-UHFFFAOYSA-N,FJEMIESGEMWDOB,COCO
1,CNP0000003,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(OC)c5c4C...,598.183897,3.63422,KLWKJVYCDFWQMK-UHFFFAOYSA-N,KLWKJVYCDFWQMK,COCO
2,CNP0000004,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(OC)c5c4C...,554.157682,3.32262,PTEKHLCNKCAXPH-UHFFFAOYSA-N,PTEKHLCNKCAXPH,COCO
3,CNP0000005,CC1(C)CC2C(OC(=O)c3ccccc3)C(OC(=O)c3ccccc3)CC(...,534.298139,6.87940,ZVAVQCZAGOKAMX-UHFFFAOYSA-N,ZVAVQCZAGOKAMX,COCO
4,CNP0000006,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(O)c5c4C(...,540.142032,3.01962,UYIPOCQHTAYRMA-UHFFFAOYSA-N,UYIPOCQHTAYRMA,COCO
...,...,...,...,...,...,...,...
386292,CNP0436851,COc1cccc2ccc(C(C)=O)c(O)c12,216.078644,2.75660,XNWOWNYWQOTWIX-UHFFFAOYSA-N,XNWOWNYWQOTWIX,COCO
386293,CNP0436852,COc1cc(OC)c2c(c1)C(OC)(c1ccc3c(c1O)C(=O)c1c(O)...,570.188983,4.77902,XWGVAZYMLDVIDS-UHFFFAOYSA-N,XWGVAZYMLDVIDS,COCO
386294,CNP0436853,CC1(C)C=Cc2cc(C3COc4c5c(cc(O)c4C3=O)OC(C)(C)C=...,420.157288,4.82510,XZXMEYSQXQNHCX-UHFFFAOYSA-N,XZXMEYSQXQNHCX,COCO
386295,CNP0436854,CCCCCCCC1CC(=O)NC(CO)C(=O)NC(CO)C(=O)N2CCCC2C(...,1183.685261,-1.20930,YNLIJTVZSFUHKP-UHFFFAOYSA-N,YNLIJTVZSFUHKP,COCO
