## ZINC_analysis_07-04

Removes duplicates among the sampled ZINC substances: exactly 10 substances share the first part of their InChIKey with another entry. Writes a new CSV of ZINC substances containing zinc_id, SMILES, MW, logP, the full InChIKey (`Inchi`) and the first part of the InChIKey (`Inchi_s`).

In [17]:
#Libraries
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.Draw import IPythonConsole
import pandas as pd
from rdkit.Chem import rdMolDescriptors as rdescriptors
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from rdkit import RDLogger 
RDLogger.DisableLog('rdApp.*') 

In [21]:
# Read the space-delimited ZINC export into a DataFrame and display it
suppl_csv = pd.read_csv(
    'ZINC_csv1/ZINCALL.csv',
    sep=' ',
)
suppl_csv

Unnamed: 0,Smiles,zinc_id
0,N=C(N)NCC[C@H](N)C(=O)O,1589384
1,CN(C)CC(=O)NCC(=O)O,83822513
2,O=C(O)CN1CCCNS1(=O)=O,214763687
3,O=C(O)[C@@H]1C[C@H]2O[C@H]1[C@H](O)[C@H]2O,306392345
4,N[C@@H]1N=C2N=C(O)N=C2C(=O)N1,85343607
...,...,...
379017,Cc1ccc(C(=O)N(c2ccc(N(C(=O)c3ccc(C)cc3)S(=O)(=...,97986502
379018,CCc1c(-c2ccc(OC)c(OC)c2)nn(-c2nc(-c3ccc(C)cc3)...,97948800
379019,Cc1cc(N2CCN(CCOc3cccc4ccccc34)CC2)n2nc(C)c(-c3...,54274624
379020,CCOc1cc(CNc2cc(Cl)ccc2C)cc(I)c1OCc1ccccc1Cl,2093448


In [23]:
# How many ZINC substances were loaded (rows in suppl_csv)
suppl_csv.shape[0]

379022

In [24]:
# Parse every ZINC SMILES string into an RDKit Mol object.
# The list keeps the row order of suppl_csv so later per-molecule lists line up with it.
ligandm_database = [Chem.MolFromSmiles(ligand) for ligand in suppl_csv["Smiles"]]

# MolFromSmiles returns None for a SMILES it cannot parse. Stop here with the
# offending row positions instead of failing later inside a descriptor call.
unparsed_rows = [row for row, mol in enumerate(ligandm_database) if mol is None]
if unparsed_rows:
    raise ValueError(
        f"{len(unparsed_rows)} SMILES could not be parsed by RDKit; "
        f"first row positions: {unparsed_rows[:10]}"
    )

In [25]:
# Exact molecular weight (rdMolDescriptors.CalcExactMolWt) for every parsed molecule
db_MW_z = [rdescriptors.CalcExactMolWt(mol) for mol in ligandm_database]
print(len(db_MW_z))

# logP (Descriptors.MolLogP) for every parsed molecule
db_logP_z = [Descriptors.MolLogP(mol) for mol in ligandm_database]
print(len(db_logP_z))

379022
379022


In [26]:
# Full InChIKey string for every parsed molecule, in the same order as ligandm_database
db_Inchi = [Chem.MolToInchiKey(mol) for mol in ligandm_database]
print(len(db_Inchi))

379022


In [27]:
# First block of each InChIKey (the text before the first '-'), used later as the
# de-duplication key. It is taken from the keys already stored in db_Inchi rather
# than calling MolToInchiKey a second time for every molecule.
db_Inchi_z = [inchikey.split('-')[0] for inchikey in db_Inchi]
print(len(db_Inchi_z))

379022


In [29]:
# Assemble the per-substance table: ZINC id, SMILES, descriptors and InChIKeys,
# plus a constant 'Name' column labelling every row as ZINC
df_ZINC = pd.DataFrame(
    {
        'Zinc_id': suppl_csv['zinc_id'],
        'Smiles': suppl_csv['Smiles'],
        'MW': db_MW_z,
        'logP': db_logP_z,
        'Inchi': db_Inchi,
        'Inchi_s': db_Inchi_z,
    }
).assign(Name="ZINC")
df_ZINC

Unnamed: 0,Zinc_id,Smiles,MW,logP,Inchi,Inchi_s,Name
0,1589384,N=C(N)NCC[C@H](N)C(=O)O,160.096026,-1.72853,IFPQOXNWLSRZKX-VKHMYHEASA-N,IFPQOXNWLSRZKX,ZINC
1,83822513,CN(C)CC(=O)NCC(=O)O,160.084792,-1.25120,HQFNONZTUQSPJS-UHFFFAOYSA-N,HQFNONZTUQSPJS,ZINC
2,214763687,O=C(O)CN1CCCNS1(=O)=O,194.036128,-1.38890,KCIDTUHJDPZBTQ-UHFFFAOYSA-N,KCIDTUHJDPZBTQ,ZINC
3,306392345,O=C(O)[C@@H]1C[C@H]2O[C@H]1[C@H](O)[C@H]2O,174.052823,-1.41990,ZZTJOHOETCDWML-VFUOTHLCSA-N,ZZTJOHOETCDWML,ZINC
4,85343607,N[C@@H]1N=C2N=C(O)N=C2C(=O)N1,167.044324,-1.87440,OSXQHYVRCFCLQV-SCSAIBSYSA-N,OSXQHYVRCFCLQV,ZINC
...,...,...,...,...,...,...,...
379017,97986502,Cc1ccc(C(=O)N(c2ccc(N(C(=O)c3ccc(C)cc3)S(=O)(=...,636.051720,6.49784,KYPDAVIHIYTGQW-UHFFFAOYSA-N,KYPDAVIHIYTGQW,ZINC
379018,97948800,CCc1c(-c2ccc(OC)c(OC)c2)nn(-c2nc(-c3ccc(C)cc3)...,604.229740,7.58732,AWQWFNRTEBAZJO-UHFFFAOYSA-N,AWQWFNRTEBAZJO,ZINC
379019,54274624,Cc1cc(N2CCN(CCOc3cccc4ccccc34)CC2)n2nc(C)c(-c3...,511.213888,6.02074,HAHVCBFPDRUVIL-UHFFFAOYSA-N,HAHVCBFPDRUVIL,ZINC
379020,2093448,CCOc1cc(CNc2cc(Cl)ccc2C)cc(I)c1OCc1ccccc1Cl,541.007232,7.49622,IYFPBZIIJDRXMB-UHFFFAOYSA-N,IYFPBZIIJDRXMB,ZINC


In [30]:
# Show every row whose first InChIKey block occurs more than once, grouped by that block
ids = df_ZINC['Inchi_s']
shared_key_mask = ids.duplicated(keep=False)  # True for all members of a duplicated group
df_ZINC[shared_key_mask].sort_values('Inchi_s')

Unnamed: 0,Zinc_id,Smiles,MW,logP,Inchi,Inchi_s,Name
239279,239275070,CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H]...,660.348277,-0.7024,BDERCNWRQAOVNB-JBIWOSBCSA-N,BDERCNWRQAOVNB,ZINC
280218,150631027,CC[C@H](C)[C@H](NC(=O)[C@H](CC(C)C)NC(=O)[C@H]...,660.348277,-0.7024,BDERCNWRQAOVNB-GYTVCLMCSA-N,BDERCNWRQAOVNB,ZINC
239232,239296785,CC(=O)O[C@H](C(=O)NC[C@H]1CCCO1)[C@@H](OC(C)=O...,544.226825,-0.6964,DHZGKAWIFVVCSS-ZCPXKWAGSA-N,DHZGKAWIFVVCSS,ZINC
280077,97940073,CC(=O)O[C@H](C(=O)NC[C@@H]1CCCO1)[C@@H](OC(C)=...,544.226825,-0.6964,DHZGKAWIFVVCSS-BPIQYHPVSA-N,DHZGKAWIFVVCSS,ZINC
238519,239309554,C[C@H](O)[C@H](NC(=O)OC(C)(C)C)C(=O)N[C@H](CC(...,672.311892,-1.1104,HRTZNWOMIISGRQ-SQFLVMJDSA-N,HRTZNWOMIISGRQ,ZINC
278086,4091778,C[C@@H](O)[C@@H](NC(=O)OC(C)(C)C)C(=O)N[C@@H](...,672.311892,-1.1104,HRTZNWOMIISGRQ-KIHRHZKDSA-N,HRTZNWOMIISGRQ,ZINC
244796,239310113,CC[C@H](C)[C@H](NC(=O)[C@@H]1CCC(=O)N1C(=O)OCc...,571.300599,1.4622,ICWOBLRWDHMGJE-CPAPFFTJSA-N,ICWOBLRWDHMGJE,ZINC
288056,4177603,CC[C@H](C)[C@H](NC(=O)[C@@H]1CCC(=O)N1C(=O)OCc...,571.300599,1.4622,ICWOBLRWDHMGJE-OQCQCTNUSA-N,ICWOBLRWDHMGJE,ZINC
239130,238901046,CC(C)[C@H](C(=O)Nc1nc(O)c2ncn([C@@H]3O[C@H](CO...,512.165562,-0.5973,KQNLYZWFWSWXOV-HZEGDUTGSA-N,KQNLYZWFWSWXOV,ZINC
278774,11666771,CC(C)[C@@H](C(=O)Nc1nc2c(ncn2[C@@H]2O[C@@H](CO...,512.165562,-1.0096,KQNLYZWFWSWXOV-KZDMQCJOSA-N,KQNLYZWFWSWXOV,ZINC


In [31]:
# Keep only the first row for each first InChIKey block (drop_duplicates default: keep='first')
df_ZINC = df_ZINC.drop_duplicates(subset='Inchi_s', keep='first')

# Re-run the duplicate query; an empty frame means no first block is shared any more
ids = df_ZINC['Inchi_s']
df_ZINC[ids.duplicated(keep=False)].sort_values('Inchi_s')

Unnamed: 0,Zinc_id,Smiles,MW,logP,Inchi,Inchi_s,Name


In [32]:
# Number of substances left after de-duplication
print(df_ZINC.shape[0])

379012


In [33]:
# Display the de-duplicated ZINC table (original row labels are kept, so gaps are expected)
df_ZINC

Unnamed: 0,Zinc_id,Smiles,MW,logP,Inchi,Inchi_s,Name
0,1589384,N=C(N)NCC[C@H](N)C(=O)O,160.096026,-1.72853,IFPQOXNWLSRZKX-VKHMYHEASA-N,IFPQOXNWLSRZKX,ZINC
1,83822513,CN(C)CC(=O)NCC(=O)O,160.084792,-1.25120,HQFNONZTUQSPJS-UHFFFAOYSA-N,HQFNONZTUQSPJS,ZINC
2,214763687,O=C(O)CN1CCCNS1(=O)=O,194.036128,-1.38890,KCIDTUHJDPZBTQ-UHFFFAOYSA-N,KCIDTUHJDPZBTQ,ZINC
3,306392345,O=C(O)[C@@H]1C[C@H]2O[C@H]1[C@H](O)[C@H]2O,174.052823,-1.41990,ZZTJOHOETCDWML-VFUOTHLCSA-N,ZZTJOHOETCDWML,ZINC
4,85343607,N[C@@H]1N=C2N=C(O)N=C2C(=O)N1,167.044324,-1.87440,OSXQHYVRCFCLQV-SCSAIBSYSA-N,OSXQHYVRCFCLQV,ZINC
...,...,...,...,...,...,...,...
379017,97986502,Cc1ccc(C(=O)N(c2ccc(N(C(=O)c3ccc(C)cc3)S(=O)(=...,636.051720,6.49784,KYPDAVIHIYTGQW-UHFFFAOYSA-N,KYPDAVIHIYTGQW,ZINC
379018,97948800,CCc1c(-c2ccc(OC)c(OC)c2)nn(-c2nc(-c3ccc(C)cc3)...,604.229740,7.58732,AWQWFNRTEBAZJO-UHFFFAOYSA-N,AWQWFNRTEBAZJO,ZINC
379019,54274624,Cc1cc(N2CCN(CCOc3cccc4ccccc34)CC2)n2nc(C)c(-c3...,511.213888,6.02074,HAHVCBFPDRUVIL-UHFFFAOYSA-N,HAHVCBFPDRUVIL,ZINC
379020,2093448,CCOc1cc(CNc2cc(Cl)ccc2C)cc(I)c1OCc1ccccc1Cl,541.007232,7.49622,IYFPBZIIJDRXMB-UHFFFAOYSA-N,IYFPBZIIJDRXMB,ZINC


In [34]:
# Write the de-duplicated table as a space-separated file, without the index column
df_ZINC.to_csv(
    'ZINC_csv1/ZINCFINAL.csv',
    sep=' ',
    index=False,
)

In [35]:
# Read the de-duplicated file back to verify the export.
# Note: this rebinds suppl_csv, which previously held the raw ZINCALL table.
suppl_csv = pd.read_csv(
    'ZINC_csv1/ZINCFINAL.csv',
    sep=' ',
)
suppl_csv

Unnamed: 0,Zinc_id,Smiles,MW,logP,Inchi,Inchi_s,Name
0,1589384,N=C(N)NCC[C@H](N)C(=O)O,160.096026,-1.72853,IFPQOXNWLSRZKX-VKHMYHEASA-N,IFPQOXNWLSRZKX,ZINC
1,83822513,CN(C)CC(=O)NCC(=O)O,160.084792,-1.25120,HQFNONZTUQSPJS-UHFFFAOYSA-N,HQFNONZTUQSPJS,ZINC
2,214763687,O=C(O)CN1CCCNS1(=O)=O,194.036128,-1.38890,KCIDTUHJDPZBTQ-UHFFFAOYSA-N,KCIDTUHJDPZBTQ,ZINC
3,306392345,O=C(O)[C@@H]1C[C@H]2O[C@H]1[C@H](O)[C@H]2O,174.052823,-1.41990,ZZTJOHOETCDWML-VFUOTHLCSA-N,ZZTJOHOETCDWML,ZINC
4,85343607,N[C@@H]1N=C2N=C(O)N=C2C(=O)N1,167.044324,-1.87440,OSXQHYVRCFCLQV-SCSAIBSYSA-N,OSXQHYVRCFCLQV,ZINC
...,...,...,...,...,...,...,...
379007,97986502,Cc1ccc(C(=O)N(c2ccc(N(C(=O)c3ccc(C)cc3)S(=O)(=...,636.051720,6.49784,KYPDAVIHIYTGQW-UHFFFAOYSA-N,KYPDAVIHIYTGQW,ZINC
379008,97948800,CCc1c(-c2ccc(OC)c(OC)c2)nn(-c2nc(-c3ccc(C)cc3)...,604.229740,7.58732,AWQWFNRTEBAZJO-UHFFFAOYSA-N,AWQWFNRTEBAZJO,ZINC
379009,54274624,Cc1cc(N2CCN(CCOc3cccc4ccccc34)CC2)n2nc(C)c(-c3...,511.213888,6.02074,HAHVCBFPDRUVIL-UHFFFAOYSA-N,HAHVCBFPDRUVIL,ZINC
379010,2093448,CCOc1cc(CNc2cc(Cl)ccc2C)cc(I)c1OCc1ccccc1Cl,541.007232,7.49622,IYFPBZIIJDRXMB-UHFFFAOYSA-N,IYFPBZIIJDRXMB,ZINC
