### ZINC sampling

In [None]:
#Libraries
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.Draw import IPythonConsole
import pandas as pd
from rdkit.Chem import rdMolDescriptors as rdescriptors
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from rdkit import RDLogger 
RDLogger.DisableLog('rdApp.*') 

In [2]:
#Loading coconut database from sdf format
coco = Chem.SDMolSupplier('COCONUT_DB.sdf')
coco_d = [m for m in coco if m]

In [91]:
#Number of natural products
len(coco_d)

405960

In [4]:
# List of molecular octanol-water partition coefficient
db_logP = []
for substance in coco_d:
		db_logP.append(Descriptors.MolLogP(substance))

In [5]:
# List of molecular weights of natural products
db_mw = []
for substance in coco_d:
		db_mw.append(rdescriptors.CalcExactMolWt(substance))

In [6]:
#List of smiles
db_smiles = []
for substance in coco_d:
		db_smiles.append(Chem.MolToSmiles(substance))

In [7]:
db_inchikey = []
for substance in coco_d:
		db_inchikey.append(Chem.MolToInchiKey(substance).split('-')[0])


In [8]:
#List of COCONUT_ID
db_COCONUT_id = []
for substance in coco_d:
		db_COCONUT_id.append(substance.GetProp('coconut_id'))

In [9]:
print(len(db_COCONUT_id))

405960


In [11]:
#Creating dataframe of logP, MW and smiles value of COCONUT products
df_COCO = pd.DataFrame(data={'MW': db_mw, 'logP':db_logP, 'Smiles': db_smiles, 'Inchi': db_inchikey, 'coconut_id': db_COCONUT_id})

df_COCO['name'] = "COCO"
df_COCO

Unnamed: 0,MW,logP,Smiles,Inchi,coconut_id,name
0,660.183639,-2.08210,CC=C(N=CS)C(=O)OC1C(COC(C)=O)OC(C2(O)CC(=O)C(N...,FJEMIESGEMWDOB,CNP0000002,COCO
1,598.183897,3.63422,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(OC)c5c4C...,KLWKJVYCDFWQMK,CNP0000003,COCO
2,554.157682,3.32262,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(OC)c5c4C...,PTEKHLCNKCAXPH,CNP0000004,COCO
3,534.298139,6.87940,CC1(C)CC2C(OC(=O)c3ccccc3)C(OC(=O)c3ccccc3)CC(...,ZVAVQCZAGOKAMX,CNP0000005,COCO
4,540.142032,3.01962,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(O)c5c4C(...,UYIPOCQHTAYRMA,CNP0000006,COCO
...,...,...,...,...,...,...
405955,216.078644,2.75660,COc1cccc2ccc(C(C)=O)c(O)c12,XNWOWNYWQOTWIX,CNP0436851,COCO
405956,570.188983,4.77902,COc1cc(OC)c2c(c1)C(OC)(c1ccc3c(c1O)C(=O)c1c(O)...,XWGVAZYMLDVIDS,CNP0436852,COCO
405957,420.157288,4.82510,CC1(C)C=Cc2cc(C3COc4c5c(cc(O)c4C3=O)OC(C)(C)C=...,XZXMEYSQXQNHCX,CNP0436853,COCO
405958,1183.685261,-1.20930,CCCCCCCC1CC(=O)NC(CO)C(=O)NC(CO)C(=O)N2CCCC2C(...,YNLIJTVZSFUHKP,CNP0436854,COCO


In [12]:
#Let's split the dataframe into intervals
intervals = (0,200),(200,250),(250,300),(300,325),(325,350),(350,375),(375,400),(400,425),(425,450),(450,500),(500, 3500)
numbers = []
for count, value in enumerate(intervals):
    numbers.append(len(df_COCO.loc[(df_COCO['MW'] > intervals[count][0]) & (df_COCO['MW']<= intervals[count][1])]))
numbers

[20733, 23172, 34127, 24327, 27387, 28633, 27732, 28910, 26016, 40637, 124286]

In [13]:
#intervals_MW = (0,200),(200,250),(250,300),(300,325),(325,350),(350,375),(375,400),(400,425),(425,450),(450,500),(500, 3500)
#intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
names = ['MW_0-200','MW_200-250','MW_250-300','MW_300-325','MW_325-350','MW_350-375','MW_375-400','MW_400-425','MW_425-450','MW_450-500','MW_500-3500']


def MW(intervals):
    return df_COCO.loc[(df_COCO['MW'] > intervals[0]) & (df_COCO['MW'] <= intervals[1])]  

df_MW_0_200 = MW([0,200])
df_MW_200_250 = MW([200,250])
df_MW_250_300 = MW([250,300])
df_MW_300_325 = MW([300,325])
df_MW_325_350 = MW([325,350])
df_MW_350_375 = MW([350,375])
df_MW_375_400 = MW([375,400])
df_MW_400_425 = MW([400,425])
df_MW_425_450 = MW([425,450])
df_MW_450_500 = MW([450,500])
df_MW_500_3500 = MW([500,3500])

In [14]:
#intervals_MW = (0,200),(200,250),(250,300),(300,325),(325,350),(350,375),(375,400),(400,425),(425,450),(450,500),(500, 3500)
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def logP(df):
    num = []
    for count, value in enumerate(intervals_logP):
        num.append(len(df.loc[(df['logP'] > intervals_logP[count][0]) & (df['logP']<= intervals_logP[count][1])]))
    return num
    
df_logP_1 = logP(df_MW_0_200)
df_logP_2 = logP(df_MW_200_250)
df_logP_3 = logP(df_MW_250_300)
df_logP_4 = logP(df_MW_300_325) 
df_logP_5 = logP(df_MW_325_350)
df_logP_6 = logP(df_MW_350_375) 
df_logP_7 = logP(df_MW_375_400)
df_logP_8 = logP(df_MW_400_425) 
df_logP_9 = logP(df_MW_425_450)
df_logP_10 = logP(df_MW_450_500) 
df_logP_11 = logP(df_MW_500_3500)
numbers

[20733, 23172, 34127, 24327, 27387, 28633, 27732, 28910, 26016, 40637, 124286]

In [15]:
my_array =np.stack([df_logP_1, df_logP_2, df_logP_3,df_logP_4,df_logP_5,df_logP_6,df_logP_7,df_logP_8,df_logP_9,df_logP_10, df_logP_11], axis=1)

In [16]:
df_array = pd.DataFrame(my_array, columns = ['200','250','300','325','350','375','400','425', '450','500','>500'],
                        index=['-1', '0','1','2','2.5','3','3.5','4','4.5','5','>5'])
df_array

Unnamed: 0,200,250,300,325,350,375,400,425,450,500,>500
-1,1588,983,1276,794,1091,943,819,891,832,1487,17135
0,2226,1195,1333,816,948,842,948,922,925,1599,6104
1,5031,2454,3055,1601,1747,1770,1747,1605,1523,2330,7440
2,5848,4776,6575,3454,3636,4154,3537,2995,2183,3182,8867
2.5,2443,3086,4561,2414,2937,3038,2608,2307,1694,2217,4958
3,1843,3360,4678,3001,3753,3607,3138,2750,2069,2736,5642
3.5,1038,2592,4180,3146,3717,3613,3256,2980,2470,3366,6114
4,429,2196,2897,2749,3126,3214,2967,2932,2619,3578,6254
4.5,183,1357,1958,2626,2491,2682,2644,2702,2599,3880,6613
5,62,812,1347,1782,1660,1847,1973,2508,2306,3741,6427


In [17]:
numbers

[20733, 23172, 34127, 24327, 27387, 28633, 27732, 28910, 26016, 40637, 124286]

In [92]:
print(len(df_COCO))

405960


### ZINC 200

In [18]:
suppl_csv1 = pd.read_csv('Data_ZINC/full200.csv', delimiter = ' ')
suppl_csv1 = suppl_csv1[suppl_csv1.zinc_id != 'zinc_id']
suppl_csv1.drop(suppl_csv1.tail(1).index,inplace=True)
print(len(suppl_csv1))

107631


In [23]:
#load smiles
ligandm_database = []
for ligand in suppl_csv1["smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))
#List of MW from csv
db_MW_z = []
for substance in ligandm_database:
		db_MW_z.append(rdescriptors.CalcExactMolWt(substance))
print(len(db_MW_z))
#List of logP from csv
db_logP_z = []
for substance in ligandm_database:
		db_logP_z.append(Descriptors.MolLogP(substance))
print(len(db_logP_z))

#List of INCHIKEY from csv
db_inchikey_z = []
for substance in ligandm_database:
		db_inchikey_z.append(Chem.MolToInchiKey(substance).split('-')[0])

107631
107631


In [41]:
#Creating dataframe of logP and MW value of zinc products
df_ZINC1 = pd.DataFrame(data={'MW': db_MW_z, 'logP':db_logP_z,'Smiles':suppl_csv1["smiles"], 'Inchi': db_inchikey_z, 'zinc_id' : suppl_csv1['zinc_id'] })

df_ZINC1['name'] = "ZINC"
df_ZINC1

Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
0,164.068473,-1.9282,CO[C@H]1OC[C@@H](O)[C@H](O)[C@H]1O,ZBDGHWFPLXXWRD,4371221,ZINC
1,121.019749,-1.1498,C[S@@](=O)CC(N)=O,TTXPTDCYRCXFHQ,34310585,ZINC
2,158.043990,-2.1798,NC(=O)N[C@@H]1NC(=O)NC1=O,POJWUDADGALRAB,1843030,ZINC
3,157.121512,-1.1052,NC(=O)CN1CCC(N)CC1,JWRVCVWALQWYIJ,9256947,ZINC
4,141.065060,-1.2535,CNC(=O)c1n[nH]c(N)n1,AFLLHPQFSIEJFI,19844301,ZINC
...,...,...,...,...,...,...
107664,196.219101,5.4834,C=C(CCCCCC)CCCCCC,QDOYJBSJTHIWKH,2528344,ZINC
107665,184.219101,5.0291,CCCCCCC[C@H](C)C(C)(C)C,BWEUYKNMLNSHIJ,2510819,ZINC
107666,196.219101,5.4834,C=C(CCCC)CCCCCCCC,RMTSLZJISCPGBT,2528345,ZINC
107667,184.219101,5.1732,CCCCCCCCCCC(C)C,HGEMCUOAMCILCP,2528314,ZINC


In [42]:
#Check duplicates in ZINC dataframe
print('Celkový počet před upravou',len(df_ZINC1))
ids = df_ZINC1['Inchi']
df_ZINC1[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')
#Drop duplicates
df_ZINC1 = df_ZINC1.drop_duplicates(subset=['Inchi'])
print('Celkový Počet po odstanění duplikátu v ramci datasetu ZINC',len(df_ZINC1))
#Check duplicates
ids = df_ZINC1['Inchi']
df_ZINC1[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Join ZINC and COCONUT data
df_merge = pd.concat([df_COCO,df_ZINC1], axis=0)
df_merge

#Check dplicate in both dataframes by Inchi_split
ids = df_merge['Inchi']
df_merge = df_merge[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Leave out COCO columns
df_merge = df_merge[df_merge.name != 'COCO']
df_merge = df_merge.drop('coconut_id', axis = 1)
print('Počet duplikátů',len(df_merge))
#get inchi_split
list_ids = []
for x in df_merge['Inchi']:
    list_ids.append(x)
 
#Get rid of duplicates in df_ZINC
df_ZINC1 = df_ZINC1[~df_ZINC1['Inchi'].isin(list_ids)]
print('Počet po odstanění všech duplikátů',len(df_ZINC1))
#Look at the numbers in dataframe
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC1.loc[(df_ZINC1['logP'] > intervals_logP[count][0]) & (df_ZINC1['logP']<= intervals_logP[count][1])]))
numbers


# shuffle the DataFrame rows
df_ZINC1 = df_ZINC1.sample(frac = 1)
df_ZINC1

Celkový počet před upravou 107631
Celkový Počet po odstanění duplikátu v ramci datasetu ZINC 69784
Počet duplikátů 2003
Počet po odstanění všech duplikátů 67781


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
16456,156.033506,0.50110,Nc1cc(C(=O)O)c(F)cn1,SWXMQSZBMMOIBW,95762146,ZINC
46594,176.064886,1.57990,CC1(C)CCC(=O)C(F)(F)C1=O,ALHOWJZSABQPEZ,39326164,ZINC
102617,195.108171,3.53180,CC(C)CS[C@H](C)c1cccnc1,ADKSWHXHRFLPQT,308129794,ZINC
101383,197.214350,3.58930,CC(C)CC(CC(C)C)NC1CCC1,HHTHEBVAQWEPPA,52207523,ZINC
69852,183.008706,2.40210,O=C1Nc2cc(Cl)ccc2CO1,IPSHCFHBJHUIPH,214358793,ZINC
...,...,...,...,...,...,...
11369,183.075625,-0.86310,COC1(OC)C=NN=C2NC=NN21,QQEXXXFTSCGWTD,16952827,ZINC
53280,191.105862,1.29610,CCCCn1ccn2nccc2c1=O,WWZPIIIACYIUBG,1889091770,ZINC
64160,189.126597,2.01462,CNc1cc(C)c(C2=NCCC2)cn1,LKYBFJZAUKZKGS,72219021,ZINC
66147,185.177964,2.02340,C[C@H](O)CN1CCCCCCCC1,CKIDKQQLMZVMSH,1586818,ZINC


In [43]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def lgP(intervals):
    return df_ZINC1.loc[(df_ZINC1['logP'] > intervals[0]) & (df_ZINC1['logP'] <= intervals[1])]  

df_lgP_1 = lgP(intervals_logP[0])[:1588]
df_lgP_2 = lgP(intervals_logP[1])[:2226]
df_lgP_3 = lgP(intervals_logP[2])[:5031]
df_lgP_4 = lgP(intervals_logP[3])[:5848]
df_lgP_5 = lgP(intervals_logP[4])[:2443]
df_lgP_6 = lgP(intervals_logP[5])[:1843]
df_lgP_7 = lgP(intervals_logP[6])[:1038]
df_lgP_8 = lgP(intervals_logP[7])[:429]
df_lgP_9 = lgP(intervals_logP[8])[:183]
df_lgP_10 = lgP(intervals_logP[9])[:62]
df_lgP_11 = lgP(intervals_logP[10])[:42]

#Merge all dataframes
frames = [df_lgP_1, df_lgP_2, df_lgP_3,df_lgP_4,df_lgP_5,df_lgP_6,df_lgP_7,df_lgP_8,df_lgP_9,df_lgP_10,df_lgP_11]
df_ZINC1 = pd.concat(frames)
print(len(df_ZINC1))

20689


In [44]:
#Check duplicates in ZINC dataframe
ids = df_ZINC1['zinc_id']
df_ZINC1[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')

Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name


In [45]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC1.loc[(df_ZINC1['logP'] > intervals_logP[count][0]) & (df_ZINC1['logP']<= intervals_logP[count][1])]))
numbers

[1588, 2226, 5031, 5848, 2443, 1843, 1038, 429, 183, 50, 10]

In [46]:
print(len(df_ZINC1))
df_ZINC1

20689


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
2238,160.096026,-1.72853,N=C(N)NCC[C@H](N)C(=O)O,IFPQOXNWLSRZKX,1589384,ZINC
926,160.084792,-1.25120,CN(C)CC(=O)NCC(=O)O,HQFNONZTUQSPJS,83822513,ZINC
1851,194.036128,-1.38890,O=C(O)CN1CCCNS1(=O)=O,KCIDTUHJDPZBTQ,214763687,ZINC
455,174.052823,-1.41990,O=C(O)[C@@H]1C[C@H]2O[C@H]1[C@H](O)[C@H]2O,ZZTJOHOETCDWML,306392345,ZINC
1456,167.044324,-1.87440,N[C@@H]1N=C2N=C(O)N=C2C(=O)N1,OSXQHYVRCFCLQV,85343607,ZINC
...,...,...,...,...,...,...
107643,182.203451,5.07130,C1CCCCCCCCCCCC1,UEVXKGPJXXDGCX,90755240,ZINC
107651,182.203451,5.09330,C=C(C)CCCCCCCCCC,PWRBDKMPAZFCSV,2528313,ZINC
107661,184.219101,5.02910,CCCCCCC[C@@H](C)C(C)(C)C,BWEUYKNMLNSHIJ,100014035,ZINC
107666,196.219101,5.48340,C=C(CCCC)CCCCCCCC,RMTSLZJISCPGBT,2528345,ZINC


In [47]:
df_ZINC1 = df_ZINC1.drop('MW', axis = 1).drop('name', axis = 1).drop('Inchi', axis = 1).drop('logP', axis = 1)
df_ZINC1.to_csv('ZINC_csv1/ZINC200-1.csv', sep=' ', index = False)

### ZINC 250

In [48]:
suppl_csv2 = pd.read_csv('Data_ZINC/full250.csv', delimiter = ' ')
suppl_csv2 = suppl_csv2[suppl_csv2.zinc_id != 'zinc_id']
suppl_csv2.drop(suppl_csv2.tail(1).index,inplace=True)
print(len(suppl_csv2))

166659


In [49]:
ligandm_database = []
for ligand in suppl_csv2["smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))
#List of MW from csv
db_MW_z = []
for substance in ligandm_database:
		db_MW_z.append(rdescriptors.CalcExactMolWt(substance))
print(len(db_MW_z))
#List of logP from csv
db_logP_z = []
for substance in ligandm_database:
		db_logP_z.append(Descriptors.MolLogP(substance))
print(len(db_logP_z))
#List of INCHI from csv
db_Inchi_z = []
for substance in ligandm_database:
		db_Inchi_z.append(Chem.MolToInchiKey(substance).split('-')[0])
print(len(db_Inchi_z))

166659
166659
166659


In [56]:
#Creating dataframe of logP and MW value of COCONUT products
df_ZINC2 = pd.DataFrame(data={'MW': db_MW_z,'logP': db_logP_z,'Smiles':suppl_csv2["smiles"], 'Inchi': db_Inchi_z, 'zinc_id' : suppl_csv2['zinc_id'] })

df_ZINC2['name'] = "ZINC"
df_ZINC2

Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
0,239.101839,-1.32500,COCCn1cc(C[C@H]2NC(=O)NC2=O)nn1,APLRPUZVZJFSGN,98210274,ZINC
1,232.015392,-1.06020,Cn1cc(S(N)(=O)=O)cc1C(=O)C(=O)O,ADEONOUVTRMBOD,238833219,ZINC
2,224.090940,-1.19590,C[C@@H](O)Cn1cnc2c1c(=O)[nH]c(=O)n2C,DUPRHONHXHRYAA,518708,ZINC
3,220.094688,-1.42310,CC1(C)O[C@@H]2O[C@@H]([C@H](O)CO)[C@@H](O)[C@@...,BGGCXQKYCBBHAH,519804,ZINC
4,242.101505,-2.15880,N[C@H](Cc1cnc[nH]1)C(=O)N[C@H](CO)C(=O)O,KRBMQYPTDYSENE,4533527,ZINC
...,...,...,...,...,...,...
166678,238.229666,5.44260,CCCCCC/C=C\CCCCCCCC=O,QFPVVMKZTVQDTL,59725570,ZINC
166679,239.224915,5.29160,ON=C1CCCCCCCCCCCCCC1,MIMLVMHHSQXPAY,5225592,ZINC
166680,246.211252,5.53000,CCCCCCCCCCCCP(C)(C)=O,SIDULKZCBGMXJL,2040264,ZINC
166681,241.186421,5.01838,CCC[C@H](CC)SCCCCC(C)(C)C#N,OMGJNHQEYQWNPS,477035098,ZINC


In [57]:
#Check duplicates in ZINC dataframe
print('Počet před úpravou:', len(df_ZINC2))
ids = df_ZINC2['Inchi']
df_ZINC2[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')
#Drop duplicates
df_ZINC2= df_ZINC2.drop_duplicates(subset=['Inchi'])
df_ZINC2
print('Počet po odstraněný duplikátu v zinku:', len(df_ZINC2))
#Check duplicates
ids = df_ZINC2['Inchi']
df_ZINC2[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Join ZINC and COCONUT data
df_merge = pd.concat([df_COCO,df_ZINC2], axis=0)
df_merge

#Check dplicate in both dataframes by Inchi
ids = df_merge['Inchi']
df_merge = df_merge[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Leave out COCO columns
df_merge = df_merge[df_merge.name != 'COCO']
df_merge = df_merge.drop('coconut_id', 1)
print('Počet duplikátu v rámci coco:', len(df_merge))
#get inchi
list_ids = []
for x in df_merge['Inchi']:
    list_ids.append(x)
 
#Get rid of duplicates in df_ZINC
df_ZINC2 = df_ZINC2[~df_ZINC2['Inchi'].isin(list_ids)]
print('Konečný počet:', len(df_ZINC2))


#Look at the numbers in dataframe
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC2.loc[(df_ZINC2['logP'] > intervals_logP[count][0]) & (df_ZINC2['logP']<= intervals_logP[count][1])]))
numbers


# shuffle the DataFrame rows
df_ZINC2 = df_ZINC2.sample(frac = 1)
df_ZINC2

Počet před úpravou: 166659
Počet po odstraněný duplikátu v zinku: 118370
Počet duplikátu v rámci coco: 2411
Konečný počet: 115959


  df_merge = df_merge.drop('coconut_id', 1)


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
6289,212.077262,-0.3467,O=C(NCC(F)(F)F)[C@@H]1COCCN1,XASFPGNIQRKQQS,83872159,ZINC
94109,248.073182,2.3387,CCCC(=O)Nc1nnc(-c2cccnc2)s1,VXUUSNIUMWDGKE,377554,ZINC
101793,216.126263,2.0978,O=C(c1ccccn1)N1CCC2(CC1)CC2,STIQTLJTFQWXDQ,96051080,ZINC
136137,242.985384,3.1018,O=C(O)Cc1c[nH]c2cc(Cl)c(Cl)cc12,ICRDKUKLLAPFGP,2558049,ZINC
67481,241.131408,1.8590,CC(C)(C)OC(=O)N1[C@@H]2CC[C@@H]1[C@H](C(=O)O)C2,XRDRXGVDCVQVPV,100076508,ZINC
...,...,...,...,...,...,...
22395,246.075290,0.4266,Nc1nccnc1C(=O)OCC(=O)c1ccc[nH]1,IRISBUYFCKGJPK,7984934,ZINC
42385,210.007468,1.6011,Cn1ccnc1SC(=O)C(F)(F)F,RBVUSIJZQLGDFP,5387407,ZINC
158687,241.146664,4.3586,CC(C)(C)c1ccc(Oc2ccccc2N)cc1,YGZXSSOPRWYUMO,14629169,ZINC
29847,231.137162,0.4444,O=C(Cc1ccccn1)N1C[C@H]2CCN[C@H]2C1,ORWZQWJEAJSVKN,96054064,ZINC


In [58]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def lgP(intervals):
    return df_ZINC2.loc[(df_ZINC2['logP'] > intervals[0]) & (df_ZINC2['logP'] <= intervals[1])]  

df_lgP_1 = lgP(intervals_logP[0])[:983]
df_lgP_2 = lgP(intervals_logP[1])[:1195]
df_lgP_3 = lgP(intervals_logP[2])[:2454]
df_lgP_4 = lgP(intervals_logP[3])[:4776]
df_lgP_5 = lgP(intervals_logP[4])[:3086]
df_lgP_6 = lgP(intervals_logP[5])[:3360]
df_lgP_7 = lgP(intervals_logP[6])[:2592]
df_lgP_8 = lgP(intervals_logP[7])[:2196]
df_lgP_9 = lgP(intervals_logP[8])[:1357]
df_lgP_10 = lgP(intervals_logP[9])[:812]
df_lgP_11 = lgP(intervals_logP[10])[:361]

#Merge all dataframes
frames = [df_lgP_1, df_lgP_2, df_lgP_3,df_lgP_4,df_lgP_5,df_lgP_6,df_lgP_7,df_lgP_8,df_lgP_9,df_lgP_10,df_lgP_11]
df_ZINC2 = pd.concat(frames)

In [60]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC2.loc[(df_ZINC2['logP'] > intervals_logP[count][0]) & (df_ZINC2['logP']<= intervals_logP[count][1])]))
numbers

[983, 1195, 2454, 4776, 3086, 3360, 2592, 2196, 1357, 812, 361]

In [61]:
print(len(df_ZINC2))

23172


In [62]:
df_ZINC2 = df_ZINC2.drop('MW',axis = 1).drop('name', axis =1).drop('Inchi', axis = 1).drop('logP', axis = 1)
df_ZINC2.to_csv('ZINC_csv1/ZINC250-1.csv', sep=' ', index = False)

### ZINC 300

In [65]:
suppl_csv3 = pd.read_csv('Data_ZINC/full300.csv', delimiter = ' ')
suppl_csv3 = suppl_csv3[suppl_csv3.zinc_id != 'zinc_id']
suppl_csv3.drop(suppl_csv3.tail(1).index,inplace=True)
print(len(suppl_csv3))

305952


In [66]:
ligandm_database = []
for ligand in suppl_csv3["smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))
#List of MW from csv
db_MW_z = []
for substance in ligandm_database:
		db_MW_z.append(rdescriptors.CalcExactMolWt(substance))
print(len(db_MW_z))
#List of logP from csv
db_logP_z = []
for substance in ligandm_database:
		db_logP_z.append(Descriptors.MolLogP(substance))
print(len(db_logP_z))
#List of INCHI from csv
db_Inchi_z = []
for substance in ligandm_database:
		db_Inchi_z.append(Chem.MolToInchiKey(substance).split('-')[0])
print(len(db_Inchi_z))

305952
305952
305952


In [69]:
#Creating dataframe of logP and MW value of COCONUT products
df_ZINC3 = pd.DataFrame(data={'MW': db_MW_z,'logP': db_logP_z,'Smiles':suppl_csv3["smiles"], 'Inchi': db_Inchi_z, 'zinc_id' : suppl_csv3['zinc_id'] })

df_ZINC3['name'] = "ZINC"
df_ZINC3

Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
0,275.148121,-1.12310,N[C@H](CCCCNC(=O)CC[C@@H](N)C(=O)O)C(=O)O,JPKNLFVGUZRHOB,4545889,ZINC
1,254.137890,-1.08460,COCCN1CNc2c(c(=O)n(C)c(=O)n2C)C1,DAWRSBSRETXTPS,55227771,ZINC
2,254.126657,-1.67580,O=C1CC[C@H](C(=O)N2C[C@@H]3[C@@H](C2)C3(CO)CO)N1,ILYHWMBVZYUODH,1772813580,ZINC
3,272.057926,-1.26660,COC(=O)CNC(=O)CSc1nc(N)cc(=O)[nH]1,JTAYZHXRQYAJSU,8671294,ZINC
4,286.080101,-2.28110,CC(=O)OC[C@@H]1O[C@H](n2ccc(=O)[nH]c2=O)[C@H](...,KTMVKCZHYODLLY,100935907,ZINC
...,...,...,...,...,...,...
305959,284.271530,6.18840,CC[C@@H](C)CCCCCCCCCCCCCC(=O)O,MAFSBQRWNXDTRK,4556918,ZINC
305960,292.276616,6.34680,CC[C@](C)(O)CC/C=C(\C)CC/C=C(/C)CCC=C(C)C,SAHYANTVORDRQI,38655889,ZINC
305961,296.271530,5.45280,CCCC(=O)O[C@@H](C)CC[C@H]1[C@@H](C(C)C)C[C@@H]...,LICMODCSGGQGMQ,4783332,ZINC
305962,297.151750,5.73412,Cc1cccc2c(-c3ccccc3)c(Cc3ccccc3)[nH]c12,IYIWBHFXMFTRFH,1675789,ZINC


In [70]:
#Check duplicates in ZINC dataframe
print('Počet před úpravou', len(df_ZINC3))
ids = df_ZINC3['Inchi']
df_ZINC3[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')
#Drop duplicates
df_ZINC3 = df_ZINC3.drop_duplicates(subset=['Inchi'])
print('Počet  po odstranění v rámci ZINC', len(df_ZINC3))
#Check duplicates
ids = df_ZINC3['Inchi']
df_ZINC3[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')
#Join ZINC and COCONUT data
df_merge = pd.concat([df_COCO,df_ZINC3], axis=0)
df_merge

#Check dplicate in both dataframes by Inchi
ids = df_merge['Inchi']
df_merge = df_merge[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Leave out COCO columns
df_merge = df_merge[df_merge.name != 'COCO']
df_merge = df_merge.drop('coconut_id', 1)
print('Počet duplikátu coco', len(df_merge))
#get zinc_ids
list_ids = []
for x in df_merge['Inchi']:
    list_ids.append(x)
 
#Get rid of duplicates in df_ZINC
df_ZINC3 = df_ZINC3[~df_ZINC3['Inchi'].isin(list_ids)]
print('Konečný počet', len(df_ZINC3))
#Look at the numbers in dataframe
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC3.loc[(df_ZINC3['logP'] > intervals_logP[count][0]) & (df_ZINC3['logP']<= intervals_logP[count][1])]))
numbers


# shuffle the DataFrame rows
df_ZINC3 = df_ZINC3.sample(frac = 1)
df_ZINC3

Počet před úpravou 305952
Počet  po odstranění v rámci ZINC 209269
Počet duplikátu coco 4025
Konečný počet 205244


  df_merge = df_merge.drop('coconut_id', 1)


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
25948,297.895252,0.99630,CN1CC(Br)(Br)C(=O)N(C)C1=O,RDMZPNFPXFFOLZ,1642648,ZINC
144481,270.221975,2.17202,CCN(CC)CCCNCc1c(C)nn(CC)c1F,WCXBUIJUVRRGDT,256054404,ZINC
43116,259.062473,1.90340,NCc1nnc2ccc(-c3ccc(Cl)cc3)nn12,YHFXHNXUQFCBHN,58021811,ZINC
120926,262.148141,2.22860,O=C(NC1C[C@@H]2CCC[C@H](C1)N2)c1cccc(F)c1,BEHZGMKOOPKACL,19293599,ZINC
201281,281.063426,2.50190,CCc1nnc(SCC(=O)Nc2ccccc2F)o1,FEZDPEJGSIDPEI,33794611,ZINC
...,...,...,...,...,...,...
220249,253.146664,3.16234,Cc1ccc(CNC(=O)Cc2ccccc2)cc1C,SAKPKNIGTZTTQQ,1739159,ZINC
284951,299.098000,4.08180,CC(=O)c1c(NC(=O)c2ccccc2)sc2c1CCCC2,MSAAABOWXJLFRE,247086,ZINC
198003,290.094233,2.99370,C[n+]1ccccc1C1COC(c2ccccc2Cl)OC1,PVDXAFBHVNHPRC,5378009,ZINC
94741,295.135448,1.32430,CN(C)[C@@H]1CN(C(=O)c2cscn2)C[C@@H]2CCCO[C@@H]21,LXXFYMQXXARRPD,97953429,ZINC


In [71]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def lgP(intervals):
    return df_ZINC3.loc[(df_ZINC3['logP'] > intervals[0]) & (df_ZINC3['logP'] <= intervals[1])]  

df_lgP_1 = lgP(intervals_logP[0])[:1276]
df_lgP_2 = lgP(intervals_logP[1])[:1333]
df_lgP_3 = lgP(intervals_logP[2])[:3055]
df_lgP_4 = lgP(intervals_logP[3])[:6575]
df_lgP_5 = lgP(intervals_logP[4])[:4561]
df_lgP_6 = lgP(intervals_logP[5])[:4678]
df_lgP_7 = lgP(intervals_logP[6])[:4180]
df_lgP_8 = lgP(intervals_logP[7])[:2897]
df_lgP_9 = lgP(intervals_logP[8])[:1958]
df_lgP_10 = lgP(intervals_logP[9])[:1347]
df_lgP_11 = lgP(intervals_logP[10])[:2267]

#Merge all dataframes
frames = [df_lgP_1, df_lgP_2, df_lgP_3,df_lgP_4,df_lgP_5,df_lgP_6,df_lgP_7,df_lgP_8,df_lgP_9,df_lgP_10,df_lgP_11]
df_ZINC3 = pd.concat(frames)



In [72]:
df_ZINC3['MW'].describe()

count    34127.000000
mean       278.210133
std         14.392604
min        248.876308
25%        266.203451
50%        280.082348
75%        290.199428
max        449.029641
Name: MW, dtype: float64

In [143]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC3.loc[(df_ZINC3['logP'] > intervals_logP[count][0]) & (df_ZINC3['logP']<= intervals_logP[count][1])]))
numbers

[1276, 1333, 3055, 6575, 4561, 4678, 4180, 2897, 1958, 1347, 2267]

In [73]:
print(len(df_ZINC3))

34127


In [74]:
df_ZINC3 = df_ZINC3.drop('MW', axis = 1).drop('name', axis = 1).drop('Inchi', axis = 1).drop('logP', axis= 1)
df_ZINC3.to_csv('ZINC_csv1/ZINC300-1.csv', sep=' ', index = False)

### ZINC 325

In [76]:
suppl_csv4 = pd.read_csv('Data_ZINC/full325.csv', delimiter = ' ')
suppl_csv4 = suppl_csv4[suppl_csv4.zinc_id != 'zinc_id']
suppl_csv4.drop(suppl_csv4.tail(1).index,inplace=True)
print(len(suppl_csv4))

89501


In [77]:
ligandm_database = []
for ligand in suppl_csv4["smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))
#List of MW from csv
db_MW_z = []
for substance in ligandm_database:
		db_MW_z.append(rdescriptors.CalcExactMolWt(substance))
print(len(db_MW_z))
#List of logP from csv
db_logP_z = []
for substance in ligandm_database:
		db_logP_z.append(Descriptors.MolLogP(substance))
print(len(db_logP_z))
#List of INCHI from csv
db_Inchi_z = []
for substance in ligandm_database:
		db_Inchi_z.append(Chem.MolToInchiKey(substance).split('-')[0])
print(len(db_Inchi_z))

89501
89501
89501


In [78]:
#Creating dataframe of logP and MW value of COCONUT products
df_ZINC4 = pd.DataFrame(data={'MW': db_MW_z,'logP': db_logP_z,'Smiles':suppl_csv4["smiles"], 'Inchi': db_Inchi_z, 'zinc_id' : suppl_csv4['zinc_id'] })

df_ZINC4['name'] = "ZINC"
df_ZINC4

Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
0,307.156577,-1.04810,CC[C@H](CO)N1CCN(S(=O)(=O)N2CCOCC2)CC1,REKAICIWKCRQKG,89943537,ZINC
1,318.099791,-1.87530,CS(=O)(=O)N1CCC[C@@H](NC(=O)C[C@@H]2NC(=O)NC2=...,WDTDLAYRPOBJDZ,108483283,ZINC
2,302.104876,-1.13598,Cc1c(S(=O)(=O)NC[C@H]2CN(C)C(=O)CO2)cnn1C,VUGYNXDPYAHKAG,885939216,ZINC
3,317.152161,-1.01790,CCn1ncnc1CN1C[C@@H](O)[C@H](CS(=O)(=O)N(C)C)C1,QOFKOXQZJWLMNX,1529195917,ZINC
4,323.122969,-1.86960,O=C(NCCCN1CCCC1=O)C(=O)Nc1c[nH]c(=O)[nH]c1=O,ZYIKGZKAUWLTAY,12389828,ZINC
...,...,...,...,...,...,...
89506,317.141579,5.04700,C=C(C)COc1cccc(C(=O)Nc2cccc3ccccc23)c1,HQMXZXJIZCDZMJ,8570619,ZINC
89507,323.201592,6.58950,CCCCCCCCCCCCC(=O)Nc1ccc(Cl)cc1,VKMYCVVXOMDWGS,100286102,ZINC
89508,319.157229,5.38780,CCCOc1ccc2ccccc2c1/C=N/c1ccc(OC)cc1,XFTWTMGNVRITLQ,6942611,ZINC
89509,315.102606,5.05410,C=CCOc1c(Cl)cc(/C=N/c2ccccc2)cc1OCC,QFRMLDXKYFBTQJ,6959801,ZINC


In [81]:
#Check duplicates in ZINC dataframe
print('Před upravou', len(df_ZINC4))
ids = df_ZINC4['Inchi']
df_ZINC4[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')
#Drop duplicates
df_ZINC4 = df_ZINC4.drop_duplicates(subset=['Inchi'])
df_ZINC4
print('duplikaty odstr.', len(df_ZINC4))
#Check duplicates
ids = df_ZINC4['Inchi']
df_ZINC4[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Join ZINC and COCONUT data
df_merge = pd.concat([df_COCO,df_ZINC4], axis=0)
df_merge

#Check dplicate in both dataframes by Inchi
ids = df_merge['Inchi']
df_merge = df_merge[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Leave out COCO columns
df_merge = df_merge[df_merge.name != 'COCO']
df_merge = df_merge.drop('coconut_id',axis =  1)

#get zinc_ids
list_ids = []
for x in df_merge['Inchi']:
    list_ids.append(x)
print('coco', len(df_merge))
#Get rid of duplicates in df_ZINC
df_ZINC4 = df_ZINC4[~df_ZINC4['Inchi'].isin(list_ids)]
print('celkovy', len(df_ZINC4))
#Look at the numbers in dataframe
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC4.loc[(df_ZINC4['logP'] > intervals_logP[count][0]) & (df_ZINC4['logP']<= intervals_logP[count][1])]))
numbers


# shuffle the DataFrame rows
df_ZINC4 = df_ZINC4.sample(frac = 1)
df_ZINC4

Před upravou 89501
duplikaty odstr. 70164
coco 1270
celkovy 68894


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
35585,317.093104,3.60370,O=C(OCc1ccccc1)N1CCC[C@H]1c1nnccc1Cl,LKFBRCGBWCEOGM,256051981,ZINC
15183,300.077993,1.84790,CC/C=C/CNS(=O)(=O)c1ccc([N+](=O)[O-])cc1OC,LBLFXGSCHURZDX,295458080,ZINC
21779,323.130363,2.89566,Cc1ccc(NS(=O)(=O)c2c(C)nn(C(C)C)c2C)cc1O,YCQWEQMDTXVWOQ,96427082,ZINC
20332,302.124212,2.23790,CCCC(=O)NCC(=O)NCc1cccc(C(F)(F)F)c1,CFHFFYDUANKVEW,32928457,ZINC
18211,313.272927,2.32460,CC(C)C(CNC(=O)NCC1(N(C)C)CCOCC1)C(C)C,XAOVMJWMKMAOQJ,194841629,ZINC
...,...,...,...,...,...,...
26901,308.152478,3.45222,CCC1=NN(C(=O)c2ccccc2C)[C@](O)(c2ccccc2)C1,HOGFOIJHCYHHNQ,301860,ZINC
21387,315.104148,2.58122,Cc1cnc(NC(=O)C(=O)N(Cc2ccccc2)C2CC2)s1,CIMODXTVTLJJCK,123402715,ZINC
42445,312.138639,3.68830,CCCCC(=O)Nc1ccc2nn(-c3ccc(F)cc3)nc2c1,KMZBARCEVMPVMW,20610365,ZINC
5237,318.104956,-0.07140,COc1cccc(F)c1CN1C[C@@H](O)[C@H](NS(C)(=O)=O)C1,RXLWSWXNFJTTTJ,257301218,ZINC


In [82]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def lgP(intervals):
    return df_ZINC4.loc[(df_ZINC4['logP'] > intervals[0]) & (df_ZINC4['logP'] <= intervals[1])]  

df_lgP_1 = lgP(intervals_logP[0])[:794]
df_lgP_2 = lgP(intervals_logP[1])[:816]
df_lgP_3 = lgP(intervals_logP[2])[:1601]
df_lgP_4 = lgP(intervals_logP[3])[:3454]
df_lgP_5 = lgP(intervals_logP[4])[:2414]
df_lgP_6 = lgP(intervals_logP[5])[:3001]
df_lgP_7 = lgP(intervals_logP[6])[:3146]
df_lgP_8 = lgP(intervals_logP[7])[:2749]
df_lgP_9 = lgP(intervals_logP[8])[:2626]
df_lgP_10 = lgP(intervals_logP[9])[:1782]
df_lgP_11 = lgP(intervals_logP[10])[:1944]

#Merge all dataframes
frames = [df_lgP_1, df_lgP_2, df_lgP_3,df_lgP_4,df_lgP_5,df_lgP_6,df_lgP_7,df_lgP_8,df_lgP_9,df_lgP_10,df_lgP_11]
df_ZINC4 = pd.concat(frames)

In [83]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC4.loc[(df_ZINC4['logP'] > intervals_logP[count][0]) & (df_ZINC4['logP']<= intervals_logP[count][1])]))
numbers

[794, 816, 1601, 3454, 2414, 3001, 3146, 2749, 2626, 1782, 1850]

In [84]:
print(len(df_ZINC4))

24233


In [85]:
df_ZINC4 = df_ZINC4.drop('MW', axis = 1).drop('name', axis = 1).drop('Inchi',axis= 1).drop('logP',axis = 1)
df_ZINC4.to_csv('ZINC_csv1/ZINC325-1.csv', sep=' ', index = False)

### ZINC 350

In [99]:
suppl_csv5 = pd.read_csv('Data_ZINC/full350.csv', delimiter = ' ')
suppl_csv5 = suppl_csv5[suppl_csv5.zinc_id != 'zinc_id']
suppl_csv5.drop(suppl_csv5.tail(1).index,inplace=True)
print(len(suppl_csv5))

72244


In [100]:
ligandm_database = []
for ligand in suppl_csv5["smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))
#List of MW from csv
db_MW_z = []
for substance in ligandm_database:
		db_MW_z.append(rdescriptors.CalcExactMolWt(substance))
print(len(db_MW_z))
#List of logP from csv
db_logP_z = []
for substance in ligandm_database:
		db_logP_z.append(Descriptors.MolLogP(substance))
print(len(db_logP_z))
#List of INCHI from csv
db_Inchi_z = []
for substance in ligandm_database:
		db_Inchi_z.append(Chem.MolToInchiKey(substance).split('-')[0])
print(len(db_Inchi_z))

72244
72244
72244


In [102]:
#Creating dataframe of logP and MW value of COCONUT products
df_ZINC5 = pd.DataFrame(data={'MW': db_MW_z,'logP': db_logP_z,'Smiles':suppl_csv5["smiles"], 'Inchi': db_Inchi_z, 'zinc_id' : suppl_csv5['zinc_id'] })

df_ZINC5['name'] = "ZINC"
df_ZINC5

Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
0,342.172562,-1.22260,C#CCN1CCN(C(=O)CN2CC[C@@H](S(=O)(=O)NCC)C2)CC1,AUGSQPWCIIMHHO,92843693,ZINC
1,331.175673,-1.08522,CN1CCC[C@@H]1C(=O)N1CCN(C(=O)Cn2cnc(C#N)n2)CC1,CQSGKMZIVDZBIQ,330210951,ZINC
2,335.159354,-0.63208,C=CCNC(=O)CN1CCN(C(=O)C(=O)Nc2cc(C)on2)CC1,QGMHSVRTDVMVFY,340572537,ZINC
3,340.131760,-1.24532,C[C@H](NS(=O)(=O)c1cnn(C)c1)C(=O)N1CCN(CC#N)CC1,VRVJREFHQOWTBG,446314287,ZINC
4,349.102233,-1.36120,C=CCN1C(=O)C(=O)N(CC(=O)N[C@H](C(=O)O)c2ccnn2C...,LQWYNIBQAWRZFG,647772990,ZINC
...,...,...,...,...,...,...
72249,348.220164,5.09718,CC[C@@H](C#N)Oc1cccc(CN[C@@H](C)c2ccc3c(c2)CCC...,FTONXEDSLZAIPS,772982152,ZINC
72250,343.004547,5.02780,O=C1/C(=C\c2ccc(Cl)s2)CCc2ncc(C(F)(F)F)cc21,BAUSFPGFJUYQHK,777469307,ZINC
72251,338.088912,5.06868,N#CCOc1cccc(NCc2ccc(-c3cccc(F)c3)s2)c1,NQUHOWKTPKKUEH,779586094,ZINC
72252,347.160932,5.18618,C[C@@H](N[C@H](CCCC#N)c1ccccc1)c1ccc(C(F)(F)F)cn1,WRWJGBGBYVMBFI,903001658,ZINC


In [103]:
#Check duplicates in ZINC dataframe
print(len(df_ZINC5))

ids = df_ZINC5['Inchi']
df_ZINC5[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')
#Drop duplicates
df_ZINC5 = df_ZINC5.drop_duplicates(subset=['Inchi'])
print(len(df_ZINC5))
#Check duplicates
ids = df_ZINC5['Inchi']
df_ZINC5[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Join ZINC and COCONUT data
df_merge = pd.concat([df_COCO,df_ZINC5], axis=0)
df_merge

#Check dplicate in both dataframes by Inchi
ids = df_merge['Inchi']
df_merge = df_merge[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Leave out COCO columns
df_merge = df_merge[df_merge.name != 'COCO']
df_merge = df_merge.drop('coconut_id', 1)
print(len(df_merge))

#get zinc_ids
list_ids = []
for x in df_merge['Inchi']:
    list_ids.append(x)
 
#Get rid of duplicates in df_ZINC
df_ZINC5 = df_ZINC5[~df_ZINC5['Inchi'].isin(list_ids)]
print(len(df_ZINC5))

#Look at the numbers in dataframe
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC5.loc[(df_ZINC5['logP'] > intervals_logP[count][0]) & (df_ZINC5['logP']<= intervals_logP[count][1])]))
numbers


# shuffle
# shuffle the DataFrame rows
df_ZINC5 = df_ZINC5.sample(frac = 1)
df_ZINC5

72244
59667
4
59663


  df_merge = df_merge.drop('coconut_id', 1)


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
39799,327.137162,4.50050,CCn1c2ccccc2c2cc(/C=C/C(=O)c3cnccn3)ccc21,DIDAAJQBCUUBIZ,734750947,ZINC
20971,340.178693,2.95180,C=C(C)CN1C[C@@H](NC(=O)c2cc3ccccc3oc2=O)CC[C@@...,RCCZKVRKLINFQS,1072045451,ZINC
27761,336.140469,3.17280,CC(C)(C)C#CC(=O)N1CCN(Cc2ccc(Cl)c(F)c2)CC1,SPQSCMLMYFZYJJ,1112833329,ZINC
16293,343.164440,2.34898,CCOc1cc(C(=O)N[C@H](C)C[C@@H](C)Nc2ncccc2C#N)on1,STEPCUMNUNUNTE,1089435522,ZINC
28178,338.293328,3.47960,C=C(C)C(C)(C)C(=O)N1CC[C@@H](C)[C@@H](CNCCOCCC...,ZRYCEKXJTZRRNE,1133032451,ZINC
...,...,...,...,...,...,...
23341,331.145140,2.76000,C#CCCCC(=O)N[C@H]1C[C@H]2CC[C@@H]1N2Cc1ccnc(Cl)c1,MBQRDRGYOOYVNU,1109211898,ZINC
7003,332.196074,0.17250,C=CCn1c([C@H]2CCC(=O)NC2)nnc1N1CCNC(=O)[C@H]1CC,VPSWXXHLNRVSKT,1121677070,ZINC
42972,345.187483,4.85078,CCC[C@@H]1CCc2nc(NC(=O)CC3(C#N)CCCCC3)sc2C1,ALKMWLPGOZOBMH,1529565831,ZINC
58084,335.071306,5.00370,O=C(/C=C/c1cccc(-c2cccnc2)c1)c1ccc(O)c(Cl)c1,DOVHMKXWUGZVIF,768754222,ZINC


In [104]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def lgP(intervals):
    return df_ZINC5.loc[(df_ZINC5['logP'] > intervals[0]) & (df_ZINC5['logP'] <= intervals[1])]  

df_lgP_1 = lgP(intervals_logP[0])[:1091]
df_lgP_2 = lgP(intervals_logP[1])[:948]
df_lgP_3 = lgP(intervals_logP[2])[:1747]
df_lgP_4 = lgP(intervals_logP[3])[:3636]
df_lgP_5 = lgP(intervals_logP[4])[:2937]
df_lgP_6 = lgP(intervals_logP[5])[:3753]
df_lgP_7 = lgP(intervals_logP[6])[:3717]
df_lgP_8 = lgP(intervals_logP[7])[:3126]
df_lgP_9 = lgP(intervals_logP[8])[:2491]
df_lgP_10 = lgP(intervals_logP[9])[:1660]
df_lgP_11 = lgP(intervals_logP[10])[:2281]

#Merge all dataframes
frames = [df_lgP_1, df_lgP_2, df_lgP_3,df_lgP_4,df_lgP_5,df_lgP_6,df_lgP_7,df_lgP_8,df_lgP_9,df_lgP_10,df_lgP_11]
df_ZINC5 = pd.concat(frames)

In [105]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC5.loc[(df_ZINC5['logP'] > intervals_logP[count][0]) & (df_ZINC5['logP']<= intervals_logP[count][1])]))
numbers

[1091, 948, 1747, 3636, 2937, 3753, 3717, 3126, 2491, 1660, 2281]

In [106]:
print(len(df_ZINC5))

27387


In [107]:
df_ZINC5 = df_ZINC5.drop('MW', axis = 1).drop('name', axis = 1).drop('Inchi', axis = 1).drop('logP',axis = 1)
df_ZINC5.to_csv('ZINC_csv1/ZINC350-1.csv', sep=' ', index = False)

### ZINC 375

In [110]:
suppl_csv6 = pd.read_csv('Data_ZINC/full375.csv', delimiter = ' ')
suppl_csv6 = suppl_csv6[suppl_csv6.zinc_id != 'zinc_id']
suppl_csv6.drop(suppl_csv6.tail(1).index,inplace=True)
print(len(suppl_csv6))

102442


In [111]:
ligandm_database = []
for ligand in suppl_csv6["smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))
#List of MW from csv
db_MW_z = []
for substance in ligandm_database:
		db_MW_z.append(rdescriptors.CalcExactMolWt(substance))
print(len(db_MW_z))
#List of logP from csv
db_logP_z = []
for substance in ligandm_database:
		db_logP_z.append(Descriptors.MolLogP(substance))
print(len(db_logP_z))
#List of INCHI from csv
db_Inchi_z = []
for substance in ligandm_database:
		db_Inchi_z.append(Chem.MolToInchiKey(substance).split('-')[0])
print(len(db_Inchi_z))

102442
102442
102442


In [112]:
#Creating dataframe of logP and MW value of COCONUT products
df_ZINC6 = pd.DataFrame(data={'MW': db_MW_z,'logP': db_logP_z,'Smiles':suppl_csv6["smiles"], 'Inchi': db_Inchi_z, 'zinc_id' : suppl_csv6['zinc_id'] })

df_ZINC6['name'] = "ZINC"
df_ZINC6

Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
0,355.131425,-1.28956,Cc1nn(C)c(C)c1S(=O)(=O)N1CCN2C(=O)CN(C)C(=O)[C...,OSBDQWUVGNKRNG,65423162,ZINC
1,354.190320,-1.67260,O=C(CN1C(=O)CNC1=O)N1C[C@@H](CO)C[C@H](CN2CCOC...,VCUJNQNZQRHTBD,91985397,ZINC
2,354.142701,-1.03250,CC(=O)Nc1ccc(O[C@@H]2O[C@H](CO)[C@H](O)[C@H](O...,XVIFZZGCCJWHHU,239433518,ZINC
3,354.142701,-1.03250,CC(=O)Nc1ccccc1O[C@@H]1O[C@H](CO)[C@H](O)[C@@H...,IWCWSRODNPRNLK,245377209,ZINC
4,370.074719,-1.13340,O=C(CNS(=O)(=O)c1cccc(F)c1)N1CC(N2C(=O)CNC2=O)C1,AMTRTSGFOMXNHC,253471034,ZINC
...,...,...,...,...,...,...
102449,357.045397,5.13460,O=C(Nc1cccc(Cl)c1)C1[C@H]2CC[C@H]3[C@@H](CC[C@...,RRSLLQLMRWNJBR,18205557,ZINC
102450,367.004175,5.01592,Cc1ccc(S[C@@H](C)C(=O)Nc2ccc(Br)cc2F)cc1,BAMVYVJPRMGBOA,21601520,ZINC
102451,361.204179,5.76070,CCc1cccc(CC)c1NC(=O)[C@@H](CC)Oc1cccc2ccccc12,CGNQLASVRGXLCL,44947951,ZINC
102452,368.209993,5.12270,CC(C)(C)OC(=O)N1CCCC[C@@H]1c1cccnc1OCc1ccccc1,JRMUXTAIXITUFU,72212107,ZINC


In [113]:
#Check duplicates in ZINC dataframe
ids = df_ZINC6['Inchi']
df_ZINC6[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')
print(len(df_ZINC6))
#Drop duplicates
df_ZINC6 = df_ZINC6.drop_duplicates(subset=['Inchi'])
print(len(df_ZINC6))
#Check duplicates
ids = df_ZINC6['Inchi']
df_ZINC6[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Join ZINC and COCONUT data
df_merge = pd.concat([df_COCO,df_ZINC6], axis=0)
df_merge

#Check dplicate in both dataframes by Inchi
ids = df_merge['Inchi']
df_merge = df_merge[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Leave out COCO columns
df_merge = df_merge[df_merge.name != 'COCO']
df_merge = df_merge.drop('coconut_id', 1)
print(len(df_merge))
#get zinc_ids
list_ids = []
for x in df_merge['Inchi']:
    list_ids.append(x)
 
#Get rid of duplicates in df_ZINC
df_ZINC6 = df_ZINC6[~df_ZINC6['Inchi'].isin(list_ids)]
print(len(df_ZINC6))
#Look at the numbers in dataframe
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC6.loc[(df_ZINC6['logP'] > intervals_logP[count][0]) & (df_ZINC6['logP']<= intervals_logP[count][1])]))
numbers


# shuffle
# shuffle the DataFrame rows
df_ZINC6 = df_ZINC6.sample(frac = 1)
df_ZINC6

102442
76668
2226
74442


  df_merge = df_merge.drop('coconut_id', 1)


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
54094,354.142701,1.08564,Cc1oc(C)c(C(=O)NCCCC(=O)O)c1C(=O)NCCCC(=O)O,JQAHRCDRKYIUHO,3627246,ZINC
36503,363.125277,1.87704,Cc1ccc(OCC(=O)Nc2cc(C)nn2[C@@H]2CCS(=O)(=O)C2)cc1,YWIDYLAESLJOPA,17042954,ZINC
48121,355.144453,1.84752,Cc1nnnn1-c1ccc(CC(=O)NCCOc2ccc(F)cc2)cc1,UYTFSAZURCVGIQ,19288452,ZINC
80973,366.057169,3.06420,CCC[C@@H](C)NC(=O)CN(c1cccc(Cl)c1Cl)S(C)(=O)=O,WICWQNPOKAHVEF,6945217,ZINC
92711,363.104148,4.82820,COc1cccc(-n2ccnc2SCc2ncc(-c3ccccc3)o2)c1,MWGMPJGQXSMOLS,22886297,ZINC
...,...,...,...,...,...,...
98093,371.085911,5.08070,CCn1c(-c2ccc(Cl)cc2)cs/c1=N\c1ccc(NC(C)=O)cc1,QFUUJYQGUWNUIK,13563210,ZINC
62363,364.247441,1.72970,CCC1=N[C@@]2(CC[C@@H]3CN(C(=O)NCC(C)C)C[C@@H]3...,QYSNJRGGLFFTJQ,96261678,ZINC
7979,350.116109,-0.23020,CCS(=O)(=O)N1CCN(C(=O)c2ccc(-n3cnnn3)cc2)CC1,LAWDZRSFWYEYGW,53646019,ZINC
39954,371.232125,1.03890,CCN1C[C@@H](C(=O)N2CCc3nc(C4CCN(C)CC4)ncc3C2)C...,OVIXYOCFVFIUBD,19526971,ZINC


In [114]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def lgP(intervals):
    return df_ZINC6.loc[(df_ZINC6['logP'] > intervals[0]) & (df_ZINC6['logP'] <= intervals[1])]  

df_lgP_1 = lgP(intervals_logP[0])[:943]
df_lgP_2 = lgP(intervals_logP[1])[:842]
df_lgP_3 = lgP(intervals_logP[2])[:1770]
df_lgP_4 = lgP(intervals_logP[3])[:4154]
df_lgP_5 = lgP(intervals_logP[4])[:3038]
df_lgP_6 = lgP(intervals_logP[5])[:3607]
df_lgP_7 = lgP(intervals_logP[6])[:3613]
df_lgP_8 = lgP(intervals_logP[7])[:3214]
df_lgP_9 = lgP(intervals_logP[8])[:2682]
df_lgP_10 = lgP(intervals_logP[9])[:1847]
df_lgP_11 = lgP(intervals_logP[10])[:2923]

#Merge all dataframes
frames = [df_lgP_1, df_lgP_2, df_lgP_3,df_lgP_4,df_lgP_5,df_lgP_6,df_lgP_7,df_lgP_8,df_lgP_9,df_lgP_10,df_lgP_11]
df_ZINC6 = pd.concat(frames)

In [115]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC6.loc[(df_ZINC6['logP'] > intervals_logP[count][0]) & (df_ZINC6['logP']<= intervals_logP[count][1])]))
numbers

[943, 842, 1770, 4154, 3038, 3607, 3613, 3214, 2682, 1847, 2923]

In [116]:
print(len(df_ZINC6))

28633


In [117]:
df_ZINC6 = df_ZINC6.drop('MW', axis = 1).drop('name', axis = 1).drop('Inchi', axis = 1).drop('logP', axis = 1)
df_ZINC6.to_csv('ZINC_csv1/ZINC375-1.csv', sep=' ', index = False)

### ZINC 400

In [146]:
suppl_csv7 = pd.read_csv('Data_ZINC/full400.csv', delimiter = ' ')
suppl_csv7 = suppl_csv7[suppl_csv7.zinc_id != 'zinc_id']
suppl_csv7.drop(suppl_csv7.tail(1).index,inplace=True)
print(len(suppl_csv7))

325515


In [147]:
ligandm_database = []
for ligand in suppl_csv7["smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))
#List of MW from csv
db_MW_z = []
for substance in ligandm_database:
		db_MW_z.append(rdescriptors.CalcExactMolWt(substance))
print(len(db_MW_z))
#List of logP from csv
db_logP_z = []
for substance in ligandm_database:
		db_logP_z.append(Descriptors.MolLogP(substance))
print(len(db_logP_z))
#List of INCHI from csv
db_Inchi_z = []
for substance in ligandm_database:
		db_Inchi_z.append(Chem.MolToInchiKey(substance).split('-')[0])
print(len(db_Inchi_z))

325515
325515
325515


In [182]:
#Creating dataframe of logP and MW value of COCONUT products
df_ZINC7 = pd.DataFrame(data={'MW': db_MW_z,'logP': db_logP_z,'Smiles':suppl_csv7["smiles"], 'Inchi': db_Inchi_z, 'zinc_id' : suppl_csv7['zinc_id'] })

df_ZINC7['name'] = "ZINC"
df_ZINC7

ValueError: array length 241698 does not match index length 325515

In [149]:
#Check duplicates in ZINC dataframe
ids = df_ZINC7['Inchi']
df_ZINC7[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')
#Drop duplicates
df_ZINC7 = df_ZINC7.drop_duplicates(subset=['Inchi'])
df_ZINC7
#Check duplicates
ids = df_ZINC7['Inchi']
df_ZINC7[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Join ZINC and COCONUT data
df_merge = pd.concat([df_COCO,df_ZINC7], axis=0)
df_merge

#Check dplicate in both dataframes by Inchi
ids = df_merge['Inchi']
df_merge = df_merge[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Leave out COCO columns
df_merge = df_merge[df_merge.name != 'COCO']
df_merge = df_merge.drop('coconut_id', 1)

#get zinc_ids
list_ids = []
for x in df_merge['Inchi']:
    list_ids.append(x)
 
#Get rid of duplicates in df_ZINC
df_ZINC7 = df_ZINC7[~df_ZINC7['Inchi'].isin(list_ids)]

#Look at the numbers in dataframe
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC7.loc[(df_ZINC7['logP'] > intervals_logP[count][0]) & (df_ZINC7['logP']<= intervals_logP[count][1])]))
numbers

# shuffle
# shuffle the DataFrame rows
df_ZINC7 = df_ZINC7.sample(frac = 1)
df_ZINC7

  df_merge = df_merge.drop('coconut_id', 1)


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
231323,386.174276,4.34260,CN(Cc1nnc(-c2ccccc2)o1)Cc1ccccc1OCc1ccccn1,BLBPKCPTMBEIBE,19586539,ZINC
196201,396.241293,3.61380,CCCCNC(=O)[C@@H](C)N(CCc1ccccc1)C(=O)Cc1ccc(OC...,HFYNHONOWWKFOG,223551521,ZINC
282658,383.140055,4.63480,CCOc1ccc(C(=O)N2CCC(c3nc4cc(Cl)ccc4[nH]3)CC2)cc1,WRUBAAYLVANZLM,1327269,ZINC
180124,396.241293,3.87122,COc1ccc(CN2Cc3cc(C)ccc3OC3(CCN(C)CC3)C2)c(OC)c1,XSABEJYYMLJOHM,72394183,ZINC
299413,377.199094,5.19162,CC[C@H](NC(=O)[C@@H](C)Oc1cccc2ccccc12)c1ccc(O...,MWIOVMDKLHRARS,225704873,ZINC
...,...,...,...,...,...,...
203656,396.168522,3.68860,CCN(CC(=O)Nc1ccc(OC)cc1)C(=O)c1oc2ccccc2c1COC,USGSWRAEORCRQT,29234676,ZINC
221798,383.119129,4.25672,COCCN1C(=O)S/C(=C/c2ccccc2OCc2cccc(C)c2)C1=O,TVTDNTXVTHXQOB,409213264,ZINC
56330,382.076947,2.05802,CC(=O)Nc1cccc(NC(=O)c2sc(N(C)S(C)(=O)=O)nc2C)c1,RUAVQQIIGFKDSU,225883820,ZINC
220276,388.182064,4.10358,CC[C@@H](C(=O)Nc1cc(C)ccc1C)N(c1ccc(C)c(C)c1)S...,ACZKHGGQPMOFHP,13716097,ZINC


In [150]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def lgP(intervals):
    return df_ZINC7.loc[(df_ZINC7['logP'] > intervals[0]) & (df_ZINC7['logP'] <= intervals[1])]  

df_lgP_1 = lgP(intervals_logP[0])[:819]
df_lgP_2 = lgP(intervals_logP[1])[:948]
df_lgP_3 = lgP(intervals_logP[2])[:1747]
df_lgP_4 = lgP(intervals_logP[3])[:3537]
df_lgP_5 = lgP(intervals_logP[4])[:2608]
df_lgP_6 = lgP(intervals_logP[5])[:3138]
df_lgP_7 = lgP(intervals_logP[6])[:3256]
df_lgP_8 = lgP(intervals_logP[7])[:2967]
df_lgP_9 = lgP(intervals_logP[8])[:2644]
df_lgP_10 = lgP(intervals_logP[9])[:1973]
df_lgP_11 = lgP(intervals_logP[10])[:4095]

#Merge all dataframes
frames = [df_lgP_1, df_lgP_2, df_lgP_3,df_lgP_4,df_lgP_5,df_lgP_6,df_lgP_7,df_lgP_8,df_lgP_9,df_lgP_10,df_lgP_11]
df_ZINC7 = pd.concat(frames)

In [151]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC7.loc[(df_ZINC7['logP'] > intervals_logP[count][0]) & (df_ZINC7['logP']<= intervals_logP[count][1])]))
numbers

[744, 948, 1747, 3537, 2608, 3138, 3256, 2967, 2644, 1973, 4095]

In [152]:
df_ZINC7 = df_ZINC7.drop('MW', axis = 1).drop('name', axis = 1).drop('Inchi',axis =  1).drop('logP', axis = 1)
df_ZINC7.to_csv('ZINC_csv1/ZINC400-1.csv', sep=' ', index = False)

### ZINC  425

In [153]:
suppl_csv8 = pd.read_csv('Data_ZINC/full425.csv', delimiter = ' ')
suppl_csv8 = suppl_csv8[suppl_csv8.zinc_id != 'zinc_id']
suppl_csv8.drop(suppl_csv8.tail(1).index,inplace=True)
print(len(suppl_csv8))

67022


In [154]:
ligandm_database = []
for ligand in suppl_csv8["smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))
#List of MW from csv
db_MW_z = []
for substance in ligandm_database:
		db_MW_z.append(rdescriptors.CalcExactMolWt(substance))
print(len(db_MW_z))
#List of logP from csv
db_logP_z = []
for substance in ligandm_database:
		db_logP_z.append(Descriptors.MolLogP(substance))
print(len(db_logP_z))
#List of INCHI from csv
db_Inchi_z = []
for substance in ligandm_database:
		db_Inchi_z.append(Chem.MolToInchiKey(substance).split('-')[0])
print(len(db_Inchi_z))

67022
67022
67022


In [159]:
#Creating dataframe of logP and MW value of COCONUT products
df_ZINC8 = pd.DataFrame(data={'MW': db_MW_z,'logP': db_logP_z,'Smiles':suppl_csv8["smiles"], 'Inchi': db_Inchi_z, 'zinc_id' : suppl_csv8['zinc_id'] })

df_ZINC8['name'] = "ZINC"
df_ZINC8

Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
0,424.136947,-1.36950,C[C@@H]1C(=O)O[C@H]2[C@H](O)[C@]34[C@H]5C[C@@H...,SQOJOAFXDQDRGF,256002083,ZINC
1,400.197137,-1.59320,Cn1c(=O)c2c(ncn2C[C@H](O)CN2CCN(c3ncccn3)CC2)n...,MPQZUWAFVWPPIR,19938680,ZINC
2,417.163496,-3.27740,OC[C@@H]1O[C@H](Nc2ccccc2)[C@H](O)[C@H](O)[C@@...,OBCAMZYNEBOEPP,1857525952,ZINC
3,412.145027,-1.64250,C[C@H](NS(C)(=O)=O)C(=O)NC1CC(N(C)C(=O)C(C)(C)...,PXEZZCPMSXJNNC,1875322338,ZINC
4,414.116212,-2.29600,CC(=O)OCC1=C[C@@H]2OC(=O)C3=CO[C@@H](O[C@@H]4O...,IBIPGYWNOBGEMH,4098332,ZINC
...,...,...,...,...,...,...
67029,415.074199,6.58180,CC(C)(Oc1ccc(Cl)cc1)C(=O)Nc1cc(Cl)ccc1Oc1ccccc1,RSQAFQUTSKITSZ,6221170,ZINC
67030,404.130697,5.09606,Cc1cc(C)c2[nH]c(-c3ccccc3)c(/C=C(\Sc3nnc(C)[nH...,HAGLUBSFIGBBGV,6456543,ZINC
67031,406.078742,5.22120,O=C(OCc1nnc(-c2ccccc2)o1)/C(=C/c1ccc(F)cc1)c1c...,SCWVGDIVIVCGBW,7705564,ZINC
67032,413.199094,5.43690,CCc1cccc(CC)c1N1C[C@@H](C(=O)Oc2ccc(-c3ccccc3)...,LSCUXSNVEMFRTL,7929346,ZINC


In [160]:
#Check duplicates in ZINC dataframe
ids = df_ZINC8['Inchi']
df_ZINC8[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')
#Drop duplicates
df_ZINC8 = df_ZINC8.drop_duplicates(subset=['Inchi'])
df_ZINC8
#Check duplicates
ids = df_ZINC8['Inchi']
df_ZINC8[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Join ZINC and COCONUT data
df_merge = pd.concat([df_COCO,df_ZINC8], axis=0)
df_merge

#Check dplicate in both dataframes by Inchi
ids = df_merge['Inchi']
df_merge = df_merge[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Leave out COCO columns
df_merge = df_merge[df_merge.name != 'COCO']
df_merge = df_merge.drop('coconut_id', 1)

#get zinc_ids
list_ids = []
for x in df_merge['Inchi']:
    list_ids.append(x)
 
#Get rid of duplicates in df_ZINC
df_ZINC8 = df_ZINC8[~df_ZINC8['Inchi'].isin(list_ids)]

#Look at the numbers in dataframe
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC8.loc[(df_ZINC8['logP'] > intervals_logP[count][0]) & (df_ZINC8['logP']<= intervals_logP[count][1])]))
numbers

# shuffle
# shuffle the DataFrame rows
df_ZINC8 = df_ZINC8.sample(frac = 1)
df_ZINC8

  df_merge = df_merge.drop('coconut_id', 1)


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
64148,420.074007,6.29480,O=C(OCc1cccc(C(F)(F)F)c1)c1ccccc1OCc1ccccc1Cl,RSJHGWUHMAXCLA,3330825,ZINC
55534,414.224261,4.05930,CC(C)[C@@H]1C[C@H](C(F)(F)F)n2nc([C@H]3CCCN3C(...,MMXVKFBSGKPJOT,12181271,ZINC
32783,424.123897,2.16404,Cc1noc(C)c1CO[C@@H]1CCN(C(=O)C(=O)NCCCSc2nccs2)C1,DRVRCNOXMPDOKR,806279090,ZINC
47974,410.188880,3.62462,Cc1csc(Nc2cccc([C@H]3CN(Cc4cccnc4N(C)C)CCO3)n2)n1,ZPFXQMXNZZBDTM,257342561,ZINC
60417,410.256943,4.64590,CCOc1ccc(CNC(=O)c2ccc(CN3CCC(C)CC3)cc2)cc1OCC,PGMDAXIHDNAWMG,20492492,ZINC
...,...,...,...,...,...,...
15925,421.083138,1.59730,COC(=O)CN(c1ccc2c(c1)OCCO2)S(=O)(=O)c1ccc2c(c1...,QEUHUUXHBUMBOJ,224837404,ZINC
54846,400.155370,4.28572,COC(=O)c1ccc(Cl)c(NC(=O)C2CCN(Cc3cccc(C)c3)CC2)c1,FMURACZIOAZIRM,9063069,ZINC
32104,404.183539,2.05180,O=C(NCc1ccc(N2CCOCC2)c(F)c1)N1CCN(CC(F)(F)F)CC1,CIGDIRHAJNHSAO,918223004,ZINC
44852,424.122356,3.27792,Cc1cc(S(=O)(=O)NCC(C)C)ccc1OCC(=O)NCc1ccc(Cl)cc1,SXVOAENONDCXAZ,6233233,ZINC


In [161]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def lgP(intervals):
    return df_ZINC8.loc[(df_ZINC8['logP'] > intervals[0]) & (df_ZINC8['logP'] <= intervals[1])]  

df_lgP_1 = lgP(intervals_logP[0])[:891]
df_lgP_2 = lgP(intervals_logP[1])[:922]
df_lgP_3 = lgP(intervals_logP[2])[:1605]
df_lgP_4 = lgP(intervals_logP[3])[:2995]
df_lgP_5 = lgP(intervals_logP[4])[:2307]
df_lgP_6 = lgP(intervals_logP[5])[:2750]
df_lgP_7 = lgP(intervals_logP[6])[:2980]
df_lgP_8 = lgP(intervals_logP[7])[:2932]
df_lgP_9 = lgP(intervals_logP[8])[:2702]
df_lgP_10 = lgP(intervals_logP[9])[:2508]
df_lgP_11 = lgP(intervals_logP[10])[:6318]

#Merge all dataframes
frames = [df_lgP_1, df_lgP_2, df_lgP_3,df_lgP_4,df_lgP_5,df_lgP_6,df_lgP_7,df_lgP_8,df_lgP_9,df_lgP_10,df_lgP_11]
df_ZINC8 = pd.concat(frames)

In [162]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC8.loc[(df_ZINC8['logP'] > intervals_logP[count][0]) & (df_ZINC8['logP']<= intervals_logP[count][1])]))
numbers

[455, 922, 1605, 2995, 2307, 2750, 2980, 2932, 2702, 2508, 4751]

In [166]:
print(len(df_ZINC8))

26907


In [163]:
df_ZINC8 = df_ZINC8.drop('MW', axis =1).drop('name',axis = 1).drop('Inchi',axis = 1).drop('logP', axis =1)
df_ZINC8.to_csv('ZINC_csv1/ZINC425-1.csv', sep=' ', index = False)

### ZINC 450

In [179]:
suppl_csv9 = pd.read_csv('Data_ZINC/full450.csv', delimiter = ' ')
suppl_csv9 = suppl_csv9[suppl_csv9.zinc_id != 'zinc_id']
suppl_csv9.drop(suppl_csv9.tail(1).index,inplace=True)

In [180]:
ligandm_database = []
for ligand in suppl_csv9["smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))
#List of MW from csv
db_MW_z = []
for substance in ligandm_database:
		db_MW_z.append(rdescriptors.CalcExactMolWt(substance))
print(len(db_MW_z))
#List of logP from csv
db_logP_z = []
for substance in ligandm_database:
		db_logP_z.append(Descriptors.MolLogP(substance))
print(len(db_logP_z))
#List of INCHI from csv
db_Inchi_z = []
for substance in ligandm_database:
		db_Inchi_z.append(Chem.MolToInchiKey(substance).split('-')[0])
print(len(db_Inchi_z))

241698
241698
241698


In [183]:
#Creating dataframe of logP and MW value of COCONUT products
df_ZINC9 = pd.DataFrame(data={'MW': db_MW_z,'logP': db_logP_z,'Smiles':suppl_csv9["smiles"], 'Inchi': db_Inchi_z, 'zinc_id' : suppl_csv9['zinc_id'] })

df_ZINC9['name'] = "ZINC"
df_ZINC9

Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
0,444.153266,-1.46800,CN(C)[C@@H]1C(=O)[C@@H](C(N)=O)C(=O)[C@@]2(O)C...,MUHBBHLZPHKTTR,169686734,ZINC
1,447.269299,-3.85150,CN[C@H]1[C@@H](O)[C@H](O[C@@H]2[C@@H](N)C[C@@H...,URWAJWIAIPFPJE,38139452,ZINC
2,426.204924,-1.26008,Cc1ncn(C)c1C(=O)N1C[C@H]2CN(C(=O)CNS(C)(=O)=O)...,PAUSVXOBTWOSKW,1772705120,ZINC
3,446.142426,-2.68750,COC(=O)C1=CO[C@@H](O[C@@H]2O[C@H](CO)[C@@H](O)...,YSOBQIMODQOGKQ,31159654,ZINC
4,427.029415,-1.74600,Nc1ncnc2c1ncn2[C@@H]1O[C@H](CO[P@@](=O)(O)OP(=...,XTWYTFMLZFPYCI,12360703,ZINC
...,...,...,...,...,...,...
241706,446.074223,5.56594,Cc1ccc(-n2nc(C(=O)Nc3cccc(C)c3)nc2-c2ccc(Br)cc...,REFBPWKILVVAKD,408699055,ZINC
241707,431.983525,6.00410,O=C1N/C(=C/c2ccc(-c3cc(Cl)ccc3Cl)o2)C(=O)N1c1c...,YVNYVYINDVDOCP,409074243,ZINC
241708,434.214033,5.37830,CCc1ccc(-n2c(SCC(=O)N3[C@@H](C)CCC[C@@H]3C)nnc...,AOWDRLKLCZKCOC,409093827,ZINC
241709,438.123584,6.44794,CCCc1sc2nc(SCc3ccc(C)cc3)n(-c3ccc(F)cc3)c(=O)c...,UYAXDGGPVSNDOU,409120073,ZINC


In [184]:
#Check duplicates in ZINC dataframe
ids = df_ZINC9['Inchi']
df_ZINC9[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')
#Drop duplicates
df_ZINC9 = df_ZINC9.drop_duplicates(subset=['Inchi'])
df_ZINC9
#Check duplicates
ids = df_ZINC9['Inchi']
df_ZINC9[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Join ZINC and COCONUT data
df_merge = pd.concat([df_COCO,df_ZINC9], axis=0)
df_merge

#Check dplicate in both dataframes by Inchi
ids = df_merge['Inchi']
df_merge = df_merge[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Leave out COCO columns
df_merge = df_merge[df_merge.name != 'COCO']
df_merge = df_merge.drop('coconut_id', 1)

#get zinc_ids
list_ids = []
for x in df_merge['Inchi']:
    list_ids.append(x)
 
#Get rid of duplicates in df_ZINC
df_ZINC9 = df_ZINC9[~df_ZINC9['Inchi'].isin(list_ids)]

#Look at the numbers in dataframe
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC9.loc[(df_ZINC9['logP'] > intervals_logP[count][0]) & (df_ZINC9['logP']<= intervals_logP[count][1])]))
numbers

# shuffle
# shuffle the DataFrame rows
df_ZINC9= df_ZINC9.sample(frac = 1)
df_ZINC9

  df_merge = df_merge.drop('coconut_id', 1)


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
172273,436.082599,4.69952,Cc1cc(C(=O)O)ccc1-c1ccc(/C=C2/NC(=O)N(Cc3ccccc...,UYEYPATYXUVPQG,9260767,ZINC
18806,433.167142,1.62450,CNC(=O)[C@H](C)N(Cc1ccccc1)C(=O)CN(c1cccc(OC)c...,WISKMQMKBSKARS,225193381,ZINC
148972,430.248110,4.96218,CCCCn1nc(C)c2c(C(=O)Nc3c(C)nn(Cc4ccccc4)c3C)cc...,UIPFLBHEVOBWRM,17097074,ZINC
34137,436.166808,2.00022,COc1ccc(CCNC(=O)CN(C)S(=O)(=O)c2cc(C)ccc2OC)cc1OC,QAOOHFQVCFKWDH,16291320,ZINC
3505,431.244501,0.98780,O=C(CCn1nnnc1CN1CCOCC1)NC[C@H](c1ccc(F)cc1)N1C...,LHDSWWYRZJDYPD,19221328,ZINC
...,...,...,...,...,...,...
79794,432.179755,3.33440,CC(=O)c1ccc(NC(=O)C[C@@H]2C(=O)N(C[C@@H]3CCCO3...,XDHMSHJOGLEKRW,39964012,ZINC
137861,445.066717,4.30400,O=C(NC(=S)NCc1ccc(-c2nc3ccccc3o2)cc1)c1ccc2nsn...,UKXXLFGANVDZFT,59294578,ZINC
59124,439.167811,2.83840,CC(C)NC(=O)[C@]1(Cc2ccc(-c3cccs3)cc2)CCN(C(=O)...,LHXBRKUGIFODAA,19449334,ZINC
225338,440.026537,5.61110,O=C(CSc1nnc(-c2cccnc2)n1-c1ccc(Cl)cc1)c1ccccc1Cl,ICKJZEKUULLUGK,16789450,ZINC


In [185]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def lgP(intervals):
    return df_ZINC9.loc[(df_ZINC9['logP'] > intervals[0]) & (df_ZINC9['logP'] <= intervals[1])]  

df_lgP_1 = lgP(intervals_logP[0])[:832]
df_lgP_2 = lgP(intervals_logP[1])[:925]
df_lgP_3 = lgP(intervals_logP[2])[:1523]
df_lgP_4 = lgP(intervals_logP[3])[:2183]
df_lgP_5 = lgP(intervals_logP[4])[:1694]
df_lgP_6 = lgP(intervals_logP[5])[:2069]
df_lgP_7 = lgP(intervals_logP[6])[:2470]
df_lgP_8 = lgP(intervals_logP[7])[:2619]
df_lgP_9 = lgP(intervals_logP[8])[:2599]
df_lgP_10 = lgP(intervals_logP[9])[:2306]
df_lgP_11 = lgP(intervals_logP[10])[:6796]

#Merge all dataframes
frames = [df_lgP_1, df_lgP_2, df_lgP_3,df_lgP_4,df_lgP_5,df_lgP_6,df_lgP_7,df_lgP_8,df_lgP_9,df_lgP_10,df_lgP_11]
df_ZINC9 = pd.concat(frames)

In [186]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC9.loc[(df_ZINC9['logP'] > intervals_logP[count][0]) & (df_ZINC9['logP']<= intervals_logP[count][1])]))
numbers

[324, 925, 1523, 2183, 1694, 2069, 2470, 2619, 2599, 2306, 6796]

In [187]:
df_ZINC9 = df_ZINC9.drop('MW', axis = 1).drop('name', axis = 1).drop('Inchi', axis =1).drop('logP', axis = 1)
df_ZINC9.to_csv('ZINC_csv1/ZINC450-1.csv', sep=' ', index = False)

### ZINC 500

In [189]:
suppl_csv10 = pd.read_csv('Data_ZINC/full500.csv', delimiter = ' ')
suppl_csv10 = suppl_csv10[suppl_csv10.zinc_id != 'zinc_id']
suppl_csv10.drop(suppl_csv10.tail(1).index,inplace=True)
print(len(suppl_csv10))

412279


In [190]:
ligandm_database = []
for ligand in suppl_csv10["smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))
#List of MW from csv
db_MW_z = []
for substance in ligandm_database:
		db_MW_z.append(rdescriptors.CalcExactMolWt(substance))
print(len(db_MW_z))
#List of logP from csv
db_logP_z = []
for substance in ligandm_database:
		db_logP_z.append(Descriptors.MolLogP(substance))
print(len(db_logP_z))
#List of INCHI from csv
db_Inchi_z = []
for substance in ligandm_database:
		db_Inchi_z.append(Chem.MolToInchiKey(substance).split('-')[0])
print(len(db_Inchi_z))

412279
412279
412279


In [193]:
#Creating dataframe of logP and MW value of COCONUT products
df_ZINC10 = pd.DataFrame(data={'MW': db_MW_z,'logP': db_logP_z,'Smiles':suppl_csv10["smiles"], 'Inchi': db_Inchi_z, 'zinc_id' : suppl_csv10['zinc_id'] })

df_ZINC10['name'] = "ZINC"
df_ZINC10

Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
0,462.222683,-2.67840,O=C1NCC(=O)N2CCC[C@@H]2C(=O)NCC(=O)N2CCC[C@H]2...,WAOABPNIJIPSKM,4536545,ZINC
1,450.120920,-1.02268,Cc1ccoc1CC(=O)N1CC2(C1)[C@@H](NC(=O)Cn1ccc(O)n...,TYOLYYQSZIXKTM,1875346121,ZINC
2,475.181546,-1.23433,N=C1N=C(O)[C@@H]2[C@H](NC[C@H](CNc3ccc(C(=O)N[...,BAGAZEJCJKQNAA,38300505,ZINC
3,453.176067,-1.04820,O=C(CN1C(=O)CNC1=O)NC1(c2ccccc2)CN(C(=O)[C@H]2...,ACELHVIIKWSTIB,1875358550,ZINC
4,483.968528,-2.50090,O=c1ccn([C@@H]2O[C@H](CO[P@](=O)(O)O[P@](=O)(O...,PGAVKCOVUIYSFO,12959005,ZINC
...,...,...,...,...,...,...
412290,472.118985,5.92074,Cc1cc(C)c(C(=O)Nc2cc(Cl)ccc2Oc2ccc(C(=O)c3cccc...,NLAUYNAXFMYBRC,9256798,ZINC
412291,491.011129,6.50760,CC(=O)N(c1nc(CSc2nnc(Nc3cccc(Cl)c3)s2)cs1)c1cc...,MBTGAFAXXPWKDF,9056470,ZINC
412292,492.108996,5.96400,CC(=O)c1cccc(NC(=O)[C@@H](Sc2nnc(NCc3ccc(F)cc3...,HHVTYKDJHJJEGS,9476743,ZINC
412293,489.038389,5.47462,Cc1nc2ccc(NC(=O)c3cc(S(=O)(=O)N(C)c4ccc(F)cc4)...,BHAFJJLZFPLCPD,9632772,ZINC


In [194]:
#Check duplicates in ZINC dataframe
ids = df_ZINC10['Inchi']
df_ZINC10[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')
#Drop duplicates
df_ZINC10 = df_ZINC10.drop_duplicates(subset=['Inchi'])
df_ZINC10
#Check duplicates
ids = df_ZINC10['Inchi']
df_ZINC10[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Join ZINC and COCONUT data
df_merge = pd.concat([df_COCO,df_ZINC10], axis=0)
df_merge

#Check dplicate in both dataframes by Inchi
ids = df_merge['Inchi']
df_merge = df_merge[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Leave out COCO columns
df_merge = df_merge[df_merge.name != 'COCO']
df_merge = df_merge.drop('coconut_id', 1)

#get zinc_ids
list_ids = []
for x in df_merge['Inchi']:
    list_ids.append(x)
 
#Get rid of duplicates in df_ZINC
df_ZINC10 = df_ZINC10[~df_ZINC10['Inchi'].isin(list_ids)]

#Look at the numbers in dataframe
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC10.loc[(df_ZINC10['logP'] > intervals_logP[count][0]) & (df_ZINC10['logP']<= intervals_logP[count][1])]))
numbers

# shuffle
# shuffle the DataFrame rows
df_ZINC10 = df_ZINC10.sample(frac = 1)
df_ZINC10

  df_merge = df_merge.drop('coconut_id', 1)


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
39912,455.216869,1.87510,CCCCn1c(N)c(N(CC(C)C)C(=O)c2ccc(N3C(=O)CCC3=O)...,BCPHFFKFNRPDSZ,15323901,ZINC
175204,455.161184,3.15300,O=C(CN(Cc1ccc(Cl)cc1)C(=O)CN1C(=O)COc2ccccc21)...,DNIDNMLUUBVNEO,15935872,ZINC
256588,450.079034,3.61320,COc1cc(CNc2ccc(N3CCOCC3)c(C(=O)O)c2)cc(Br)c1OC,KTFYUTUAYSLPCY,9014925,ZINC
191036,469.237685,3.09474,Cc1cc(C)n(CCOc2cccc(CN3CC[C@H](O)[C@@](O)(COc4...,LKVKRMHFSNNMFB,585281581,ZINC
185341,487.250478,3.44394,CC[C@@H](C(=O)NC(C)C)N(CCc1ccccc1)C(=O)CN(c1cc...,OJWHDYRBMYNSNP,222919955,ZINC
...,...,...,...,...,...,...
201949,451.167811,3.85582,CCc1ccc(NC(=O)CSc2nc3c(C)nn(CC)c3c(=O)n2Cc2ccc...,DHKHLYDODRZNGI,64817461,ZINC
22394,479.109724,1.95000,O=C(CCNS(=O)(=O)c1ccc(F)cc1)N1CCN(Cc2nc(-c3ccs...,OXYKONJPZGLPSY,22901607,ZINC
185118,458.131171,3.35460,COc1ccc(S(=O)(=O)N(CC(=O)NCc2ccc(F)cc2)c2ccccc...,KYSISNDBUZTPRZ,113935746,ZINC
185026,461.153990,3.39792,Cc1ccc(S(=O)(=O)N2CCC(C(=O)N3CCN(c4cccc(Cl)c4)...,JHLGHDBZALAXDY,14413693,ZINC


In [195]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def lgP(intervals):
    return df_ZINC10.loc[(df_ZINC10['logP'] > intervals[0]) & (df_ZINC10['logP'] <= intervals[1])]  

df_lgP_1 = lgP(intervals_logP[0])[:1478]
df_lgP_2 = lgP(intervals_logP[1])[:1599]
df_lgP_3 = lgP(intervals_logP[2])[:2330]
df_lgP_4 = lgP(intervals_logP[3])[:3128]
df_lgP_5 = lgP(intervals_logP[4])[:2217]
df_lgP_6 = lgP(intervals_logP[5])[:2736]
df_lgP_7 = lgP(intervals_logP[6])[:3366]
df_lgP_8 = lgP(intervals_logP[7])[:3578]
df_lgP_9 = lgP(intervals_logP[8])[:3880]
df_lgP_10 = lgP(intervals_logP[9])[:3741]
df_lgP_11 = lgP(intervals_logP[10])[:12521]

#Merge all dataframes
frames = [df_lgP_1, df_lgP_2, df_lgP_3,df_lgP_4,df_lgP_5,df_lgP_6,df_lgP_7,df_lgP_8,df_lgP_9,df_lgP_10,df_lgP_11]
df_ZINC10 = pd.concat(frames)

In [196]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC10.loc[(df_ZINC10['logP'] > intervals_logP[count][0]) & (df_ZINC10['logP']<= intervals_logP[count][1])]))
numbers

[466, 1150, 2330, 3128, 2217, 2736, 3366, 3578, 3880, 3741, 12521]

In [197]:
print(len(df_ZINC10))

39113


In [198]:
df_ZINC10 = df_ZINC10.drop('MW', axis = 1).drop('name', axis= 1).drop('Inchi',axis = 1).drop('logP', axis = 1)
df_ZINC10.to_csv('ZINC_csv1/ZINC500-1.csv', sep=' ', index = False)

### ZINC 500 up

In [200]:
suppl_csv11 = pd.read_csv('Data_ZINC/full500up.csv', delimiter = ' ')
suppl_csv11 = suppl_csv11[suppl_csv11.zinc_id != 'zinc_id']
suppl_csv11.drop(suppl_csv11.tail(1).index,inplace=True)
print(len(suppl_csv11))

372726


In [201]:
ligandm_database = []
for ligand in suppl_csv11["smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))
#List of MW from csv
db_MW_z = []
for substance in ligandm_database:
		db_MW_z.append(rdescriptors.CalcExactMolWt(substance))
print(len(db_MW_z))
#List of logP from csv
db_logP_z = []
for substance in ligandm_database:
		db_logP_z.append(Descriptors.MolLogP(substance))
print(len(db_logP_z))
#List of INCHI from csv
db_Inchi_z = []
for substance in ligandm_database:
		db_Inchi_z.append(Chem.MolToInchiKey(substance).split('-')[0])
print(len(db_Inchi_z))

372726
372726
372726


In [202]:
#Creating dataframe of logP and MW value of COCONUT products
df_ZINC11 = pd.DataFrame(data={'MW': db_MW_z,'logP': db_logP_z,'Smiles':suppl_csv11["smiles"], 'Inchi': db_Inchi_z, 'zinc_id' : suppl_csv11['zinc_id'] })

df_ZINC11['name'] = "ZINC"
df_ZINC11

Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
0,524.174120,-5.8624,OCC1=C[C@@H](O)[C@]2(O[C@@H]3O[C@H](CO)[C@@H](...,LZKBAGSBRBMVBE,28536449,ZINC
1,752.258638,-9.6188,C[C@@H]1O[C@H](O[C@@H]2CO[C@@H](O)[C@H](O)[C@@...,WXJZWZFCEXKSSG,1857525859,ZINC
2,558.122085,-1.8040,COC(=O)C1=C(C(=O)OC)[C@H](C(=O)OC)[C@@H](C(=O)...,WXVSEVMFXFCFKU,5813216,ZINC
3,610.189770,-1.1566,COc1ccc([C@@H]2CC(=O)c3c(O)cc(O[C@@H]4O[C@H](C...,QUQPHWDTPGMPEX,8382286,ZINC
4,920.478974,-2.3974,CSCC[C@H](NC(=O)[C@H](CC(C)C)N1CC[C@H](NC(=O)[...,HVUNRXRFMQDMBO,169289394,ZINC
...,...,...,...,...,...,...
372845,693.129690,8.9729,CC(C)c1ccc(NC(=O)CSc2ccc(NC(=O)/C(=C/c3ccc(-c4...,NTFKDLHRWHPZJS,150483738,ZINC
372846,523.208219,9.2781,C(=C1\CCC[C@@H]2C1=NN(c1nc(-c3ccc(-c4ccccc4)cc...,VWPHKMNLUNIDAJ,4257754,ZINC
372847,538.251481,6.6504,C=CCn1c(CNc2cccc3ccccc23)nnc1SCC(=O)NN=C/C(C)=...,LYCUZTDIHJOGIP,8397298,ZINC
372848,567.178062,6.4608,O=c1/c(=C\c2cn(Cc3ccccc3F)c3ccccc23)sc2n1[C@@H...,SJZZWPWUHATCRF,8407324,ZINC


In [203]:
#Check duplicates in ZINC dataframe
ids = df_ZINC11['Inchi']
df_ZINC11[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')
#Drop duplicates
df_ZINC11 = df_ZINC11.drop_duplicates(subset=['Inchi'])
df_ZINC11
#Check duplicates
ids = df_ZINC11['Inchi']
df_ZINC11[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Join ZINC and COCONUT data
df_merge = pd.concat([df_COCO,df_ZINC11], axis=0)
df_merge

#Check dplicate in both dataframes by Inchi
ids = df_merge['Inchi']
df_merge = df_merge[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Leave out COCO columns
df_merge = df_merge[df_merge.name != 'COCO']
df_merge = df_merge.drop('coconut_id', 1)

#get zinc_ids
list_ids = []
for x in df_merge['Inchi']:
    list_ids.append(x)
 
#Get rid of duplicates in df_ZINC
df_ZINC11 = df_ZINC11[~df_ZINC11['Inchi'].isin(list_ids)]

#Look at the numbers in dataframe
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC11.loc[(df_ZINC11['logP'] > intervals_logP[count][0]) & (df_ZINC11['logP']<= intervals_logP[count][1])]))
numbers

# shuffle
# shuffle the DataFrame rows
df_ZINC11 = df_ZINC11.sample(frac = 1)
df_ZINC11

  df_merge = df_merge.drop('coconut_id', 1)


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
320946,538.294391,5.63574,Cc1cc(C(=O)N2CCCC[C@H]2CCOc2cccc(CN(C)Cc3cccc4...,ZCPWPQGOGFMQLB,101957618,ZINC
28114,542.175658,1.44820,O=S(=O)(c1ccccc1)N1CCOCCOCCN(S(=O)(=O)c2ccccc2...,KXGHSVMGFLOSOF,616581066,ZINC
47579,601.253648,2.04440,COc1cc2ccc1Oc1cccc(c1)CO[C@H]1CN(C(=O)c3ccc(=O...,GZKRNVKKTQJZHV,952857039,ZINC
355005,537.965515,5.81200,O=C(Nc1cccc2c1OCO2)c1nn2c(C(F)(F)F)cc(-c3ccc(B...,XFALOEKXZLHKHX,845276,ZINC
50428,513.168205,2.28740,CCn1c(SCC(=O)Nc2ccc(C(=O)OC)cc2)nnc1[C@H](CO)N...,HBTORHMLMTUULB,408595323,ZINC
...,...,...,...,...,...,...
306854,541.007232,7.49622,CCOc1cc(CNc2cc(Cl)ccc2C)cc(I)c1OCc1ccccc1Cl,IYFPBZIIJDRXMB,2093448,ZINC
220753,533.211505,4.52522,CC[C@@H](C(=O)NC1CCCCC1)N(Cc1cccc(C)c1)C(=O)CN...,SIQYHORNQFEGDV,225093718,ZINC
119658,667.114158,3.69780,O=C(CSc1nnc(CNC(=O)c2ccc(S(=O)(=O)N3CCOCC3)cc2...,ALLHSTXVTCYUQV,97985096,ZINC
159099,526.962620,4.22510,CCS(=O)(=O)n1nc(NCc2cccc(Br)c2)nc1NCc1cccc(Br)c1,DNHJUIIYSWRAHQ,97995283,ZINC


In [206]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def lgP(intervals):
    return df_ZINC11.loc[(df_ZINC11['logP'] > intervals[0]) & (df_ZINC11['logP'] <= intervals[1])]  

df_lgP_1 = lgP(intervals_logP[0])[:17135]
df_lgP_2 = lgP(intervals_logP[1])[:6104]
df_lgP_3 = lgP(intervals_logP[2])[:7440]
df_lgP_4 = lgP(intervals_logP[3])[:8867]
df_lgP_5 = lgP(intervals_logP[4])[:4958]
df_lgP_6 = lgP(intervals_logP[5])[:5642]
df_lgP_7 = lgP(intervals_logP[6])[:6114]
df_lgP_8 = lgP(intervals_logP[7])[:6254]
df_lgP_9 = lgP(intervals_logP[8])[:6613]
df_lgP_10 = lgP(intervals_logP[9])[:6427]
df_lgP_11 = lgP(intervals_logP[10])[:48732]

#Merge all dataframes
frames = [df_lgP_1, df_lgP_2, df_lgP_3,df_lgP_4,df_lgP_5,df_lgP_6,df_lgP_7,df_lgP_8,df_lgP_9,df_lgP_10,df_lgP_11]
df_ZINC11 = pd.concat(frames)

In [207]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC11.loc[(df_ZINC11['logP'] > intervals_logP[count][0]) & (df_ZINC11['logP']<= intervals_logP[count][1])]))
numbers

[1454, 1826, 4811, 8867, 4958, 5642, 6114, 6254, 6613, 6427, 48630]

In [208]:
print(len(df_ZINC11))

101596


In [209]:
df_ZINC11 = df_ZINC11.drop('MW', axis =1).drop('name', axis =1).drop('Inchi', axis = 1).drop('logP', axis = 1)
df_ZINC11.to_csv('ZINC_csv1/ZINC500up-1.csv', sep=' ', index = False)

### ALL

In [210]:
print(len(df_ZINC1)) 
print(len(df_ZINC2))
print(len(df_ZINC3))
print(len(df_ZINC4))
print(len(df_ZINC5))
print(len(df_ZINC6))
print(len(df_ZINC7))
print(len(df_ZINC8))
print(len(df_ZINC9))
print(len(df_ZINC10))
print(len(df_ZINC11))

20689
23172
34127
24233
27387
28633
27657
26907
25508
39113
101596


In [211]:
df_all = pd.concat([df_ZINC1,df_ZINC2,df_ZINC3,df_ZINC4,df_ZINC5,df_ZINC6,df_ZINC7,df_ZINC8,df_ZINC9,df_ZINC10,df_ZINC11], axis=0)
df_all

Unnamed: 0,Smiles,zinc_id
2238,N=C(N)NCC[C@H](N)C(=O)O,1589384
926,CN(C)CC(=O)NCC(=O)O,83822513
1851,O=C(O)CN1CCCNS1(=O)=O,214763687
455,O=C(O)[C@@H]1C[C@H]2O[C@H]1[C@H](O)[C@H]2O,306392345
1456,N[C@@H]1N=C2N=C(O)N=C2C(=O)N1,85343607
...,...,...
351372,Cc1ccc(C(=O)N(c2ccc(N(C(=O)c3ccc(C)cc3)S(=O)(=...,97986502
347499,CCc1c(-c2ccc(OC)c(OC)c2)nn(-c2nc(-c3ccc(C)cc3)...,97948800
331656,Cc1cc(N2CCN(CCOc3cccc4ccccc34)CC2)n2nc(C)c(-c3...,54274624
306854,CCOc1cc(CNc2cc(Cl)ccc2C)cc(I)c1OCc1ccccc1Cl,2093448


In [213]:
print(len(df_all))

379022


In [215]:
df_all.to_csv('ZINC_csv1/ZINCALL.csv', sep=' ', index = False)