### ZINC sampling

In [1]:
#Libraries
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.Draw import IPythonConsole
import pandas as pd
from rdkit.Chem import rdMolDescriptors as rdescriptors
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from rdkit import RDLogger 
RDLogger.DisableLog('rdApp.*') 

In [2]:
#Loading coconut database from sdf format
coco = Chem.SDMolSupplier('COCONUT_DB.sdf')
coco_d = [m for m in coco if m]

In [3]:
#Number of natural products
len(coco_d)

405960

In [4]:
# List of molecular weights of natural products
db_mw= []
for substance in coco_d:
		db_mw.append(rdescriptors.CalcExactMolWt(substance))

In [5]:
# List of molecular octanol-water partition coefficient
db_logP = []
for substance in coco_d:
		db_logP.append(Descriptors.MolLogP(substance))

In [6]:
#List of smiles
db_smiles = []
for substance in coco_d:
		db_smiles.append(Chem.MolToSmiles(substance))

In [7]:
#List of Inchi
db_Inchi = []
for substance in coco_d:
		db_Inchi.append(substance.GetProp('inchi'))

In [8]:
#List of COCONUT_ID
db_COCONUT_id = []
for substance in coco_d:
		db_COCONUT_id.append(substance.GetProp('coconut_id'))

In [9]:
print(len(db_COCONUT_id))

405960


In [10]:
#Creating dataframe of logP, MW and smiles value of COCONUT products
df_COCO = pd.DataFrame(data={'MW':  db_mw, 'logP': db_logP,'Smiles': db_smiles, 'Inchi': db_Inchi, 'coconut_id': db_COCONUT_id})

df_COCO['name'] = "COCO"
df_COCO

Unnamed: 0,MW,logP,Smiles,Inchi,coconut_id,name
0,660.183639,-2.08210,CC=C(N=CS)C(=O)OC1C(COC(C)=O)OC(C2(O)CC(=O)C(N...,InChI=1S/C27H36N2O15S/c1-5-12(29-9-45)26(37)44...,CNP0000002,COCO
1,598.183897,3.63422,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(OC)c5c4C...,InChI=1S/C34H30O10/c1-14-10-16-24(21(11-14)42-...,CNP0000003,COCO
2,554.157682,3.32262,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(OC)c5c4C...,InChI=1S/C32H26O9/c1-13-9-16-24(20(10-13)39-2)...,CNP0000004,COCO
3,534.298139,6.87940,CC1(C)CC2C(OC(=O)c3ccccc3)C(OC(=O)c3ccccc3)CC(...,"InChI=1S/C33H42O6/c1-31(2,3)30(36)37-18-17-33(...",CNP0000005,COCO
4,540.142032,3.01962,COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(O)c5c4C(...,InChI=1S/C31H24O9/c1-12-8-15-23(20(9-12)39-2)3...,CNP0000006,COCO
...,...,...,...,...,...,...
405955,216.078644,2.75660,COc1cccc2ccc(C(C)=O)c(O)c12,InChI=1S/C13H12O3/c1-8(14)10-7-6-9-4-3-5-11(16...,CNP0436851,COCO
405956,570.188983,4.77902,COc1cc(OC)c2c(c1)C(OC)(c1ccc3c(c1O)C(=O)c1c(O)...,InChI=1S/C33H30O9/c1-14-9-19-26(23(34)10-14)32...,CNP0436852,COCO
405957,420.157288,4.82510,CC1(C)C=Cc2cc(C3COc4c5c(cc(O)c4C3=O)OC(C)(C)C=...,InChI=1S/C25H24O6/c1-24(2)7-5-13-9-15(17(26)10...,CNP0436853,COCO
405958,1183.685261,-1.20930,CCCCCCCC1CC(=O)NC(CO)C(=O)NC(CO)C(=O)N2CCCC2C(...,InChI=1S/C58H93N11O15/c1-8-11-12-13-14-17-37-2...,CNP0436854,COCO


In [11]:
#Let's split the dataframe into intervals
intervals = (0,200),(200,250),(250,300),(300,325),(325,350),(350,375),(375,400),(400,425),(425,450),(450,500),(500, 3500)
numbers = []
for count, value in enumerate(intervals):
    numbers.append(len(df_COCO.loc[(df_COCO['MW'] > intervals[count][0]) & (df_COCO['MW']<= intervals[count][1])]))
numbers

[20733, 23172, 34127, 24327, 27387, 28633, 27732, 28910, 26016, 40637, 124286]

In [12]:
#intervals_MW = (0,200),(200,250),(250,300),(300,325),(325,350),(350,375),(375,400),(400,425),(425,450),(450,500),(500, 3500)
#intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
names = ['MW_0-200','MW_200-250','MW_250-300','MW_300-325','MW_325-350','MW_350-375','MW_375-400','MW_400-425','MW_425-450','MW_450-500','MW_500-3500']


def MW(intervals):
    return df_COCO.loc[(df_COCO['MW'] > intervals[0]) & (df_COCO['MW'] <= intervals[1])]  

df_MW_0_200 = MW([0,200])
df_MW_200_250 = MW([200,250])
df_MW_250_300 = MW([250,300])
df_MW_300_325 = MW([300,325])
df_MW_325_350 = MW([325,350])
df_MW_350_375 = MW([350,375])
df_MW_375_400 = MW([375,400])
df_MW_400_425 = MW([400,425])
df_MW_425_450 = MW([425,450])
df_MW_450_500 = MW([450,500])
df_MW_500_3500 = MW([500,3500])

In [13]:
#intervals_MW = (0,200),(200,250),(250,300),(300,325),(325,350),(350,375),(375,400),(400,425),(425,450),(450,500),(500, 3500)
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def logP(df):
    num = []
    for count, value in enumerate(intervals_logP):
        num.append(len(df.loc[(df['logP'] > intervals_logP[count][0]) & (df['logP']<= intervals_logP[count][1])]))
    return num
    
df_logP_1 = logP(df_MW_0_200)
df_logP_2 = logP(df_MW_200_250)
df_logP_3 = logP(df_MW_250_300)
df_logP_4 = logP(df_MW_300_325) 
df_logP_5 = logP(df_MW_325_350)
df_logP_6 = logP(df_MW_350_375) 
df_logP_7 = logP(df_MW_375_400)
df_logP_8 = logP(df_MW_400_425) 
df_logP_9 = logP(df_MW_425_450)
df_logP_10 = logP(df_MW_450_500) 
df_logP_11 = logP(df_MW_500_3500)
numbers

[20733, 23172, 34127, 24327, 27387, 28633, 27732, 28910, 26016, 40637, 124286]

In [14]:
my_array =np.stack([df_logP_1, df_logP_2, df_logP_3,df_logP_4,df_logP_5,df_logP_6,df_logP_7,df_logP_8,df_logP_9,df_logP_10, df_logP_11], axis=1)

In [15]:
df_array = pd.DataFrame(my_array, columns = ['200','250','300','325','350','375','400','425', '450','500','>500'],
                        index=['-1', '0','1','2','2.5','3','3.5','4','4.5','5','>5'])
df_array

Unnamed: 0,200,250,300,325,350,375,400,425,450,500,>500
-1,1588,983,1276,794,1091,943,819,891,832,1487,17135
0,2226,1195,1333,816,948,842,948,922,925,1599,6104
1,5031,2454,3055,1601,1747,1770,1747,1605,1523,2330,7440
2,5848,4776,6575,3454,3636,4154,3537,2995,2183,3182,8867
2.5,2443,3086,4561,2414,2937,3038,2608,2307,1694,2217,4958
3,1843,3360,4678,3001,3753,3607,3138,2750,2069,2736,5642
3.5,1038,2592,4180,3146,3717,3613,3256,2980,2470,3366,6114
4,429,2196,2897,2749,3126,3214,2967,2932,2619,3578,6254
4.5,183,1357,1958,2626,2491,2682,2644,2702,2599,3880,6613
5,62,812,1347,1782,1660,1847,1973,2508,2306,3741,6427


In [16]:
numbers

[20733, 23172, 34127, 24327, 27387, 28633, 27732, 28910, 26016, 40637, 124286]

### ZINC 200

In [106]:
suppl_csv = pd.read_csv('Data_ZINC/full200.csv', delimiter = ' ')
suppl_csv1 = suppl_csv[suppl_csv.zinc_id != 'zinc_id']
suppl_csv1.drop(suppl_csv.tail(1).index,inplace=True)
print(len(suppl_csv))

107670


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [107]:
#load smiles
ligandm_database = []
for ligand in suppl_csv1["smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))
#List of MW from csv
db_MW_z = []
for substance in ligandm_database:
		db_MW_z.append(rdescriptors.CalcExactMolWt(substance))
print(len(db_MW_z))
#List of logP from csv
db_logP_z = []
for substance in ligandm_database:
		db_logP_z.append(Descriptors.MolLogP(substance))
print(len(db_logP_z))
#List of INCHI from csv
db_Inchi_z = []
for substance in ligandm_database:
		db_Inchi_z.append(Chem.MolToInchi((substance)))
print(len(db_Inchi_z))

107631
107631
107631


In [108]:
#Creating dataframe of logP and MW value of zinc products
df_ZINC = pd.DataFrame(data={'MW': db_MW_z,'logP': db_logP_z,'Smiles':suppl_csv1["smiles"], 'Inchi': db_Inchi_z, 'zinc_id' : suppl_csv1['zinc_id'] })

df_ZINC['name'] = "ZINC"
df_ZINC

Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
0,164.068473,-1.9282,CO[C@H]1OC[C@@H](O)[C@H](O)[C@H]1O,InChI=1S/C6H12O5/c1-10-6-5(9)4(8)3(7)2-11-6/h3...,4371221,ZINC
1,121.019749,-1.1498,C[S@@](=O)CC(N)=O,"InChI=1S/C3H7NO2S/c1-7(6)2-3(4)5/h2H2,1H3,(H2,...",34310585,ZINC
2,158.043990,-2.1798,NC(=O)N[C@@H]1NC(=O)NC1=O,InChI=1S/C4H6N4O3/c5-3(10)6-1-2(9)8-4(11)7-1/h...,1843030,ZINC
3,157.121512,-1.1052,NC(=O)CN1CCC(N)CC1,InChI=1S/C7H15N3O/c8-6-1-3-10(4-2-6)5-7(9)11/h...,9256947,ZINC
4,141.065060,-1.2535,CNC(=O)c1n[nH]c(N)n1,"InChI=1S/C4H7N5O/c1-6-3(10)2-7-4(5)9-8-2/h1H3,...",19844301,ZINC
...,...,...,...,...,...,...
107664,196.219101,5.4834,C=C(CCCCCC)CCCCCC,InChI=1S/C14H28/c1-4-6-8-10-12-14(3)13-11-9-7-...,2528344,ZINC
107665,184.219101,5.0291,CCCCCCC[C@H](C)C(C)(C)C,"InChI=1S/C13H28/c1-6-7-8-9-10-11-12(2)13(3,4)5...",2510819,ZINC
107666,196.219101,5.4834,C=C(CCCC)CCCCCCCC,InChI=1S/C14H28/c1-4-6-8-9-10-11-13-14(3)12-7-...,2528345,ZINC
107667,184.219101,5.1732,CCCCCCCCCCC(C)C,InChI=1S/C13H28/c1-4-5-6-7-8-9-10-11-12-13(2)3...,2528314,ZINC


In [109]:
#Check duplicates in ZINC dataframe
ids = df_ZINC['zinc_id']
df_ZINC[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')
#Drop duplicates
df_ZINC = df_ZINC.drop_duplicates(subset=['zinc_id'])
df_ZINC
#Check duplicates
ids = df_ZINC['zinc_id']
df_ZINC[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')

#Join ZINC and COCONUT data
df_merge = pd.concat([df_COCO,df_ZINC], axis=0)
df_merge

#Check dplicate in both dataframes by Inchi
ids = df_merge['Inchi']
df_merge = df_merge[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Leave out COCO columns
df_merge = df_merge[df_merge.name != 'COCO']
df_merge = df_merge.drop('coconut_id', 1)

#get zinc_ids
list_ids = []
for x in df_merge['zinc_id']:
    list_ids.append(x)
 
#Get rid of duplicates in df_ZINC
df_ZINC = df_ZINC[~df_ZINC['zinc_id'].isin(list_ids)]

#Look at the numbers in dataframe
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC.loc[(df_ZINC['logP'] > intervals_logP[count][0]) & (df_ZINC['logP']<= intervals_logP[count][1])]))
numbers


# shuffle the DataFrame rows
df_ZINC = df_ZINC.sample(frac = 1)
df_ZINC

  df_merge = df_merge.drop('coconut_id', 1)


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
24402,144.115030,0.77580,CC(C)[C@@H](O)C1(CO)CC1,"InChI=1S/C8H16O2/c1-6(2)7(10)8(5-9)3-4-8/h6-7,...",32227402,ZINC
53703,164.094963,1.46160,C[C@@H]1CNc2cccc(N)c2O1,InChI=1S/C9H12N2O/c1-6-5-11-8-4-2-3-7(10)9(8)1...,84766876,ZINC
66967,195.162314,2.29100,CC(C)C(C)(C)C(=O)N1CC2(CC2)C1,"InChI=1S/C12H21NO/c1-9(2)11(3,4)10(14)13-7-12(...",334872079,ZINC
580,118.037842,-2.06400,O=C1N[C@@H](O)[C@@H](O)N1,"InChI=1S/C3H6N2O3/c6-1-2(7)5-3(8)4-1/h1-2,6-7H...",18258078,ZINC
84215,191.167400,3.30960,CCN[C@@H](CC)c1ccc(CC)cc1,InChI=1S/C13H21N/c1-4-11-7-9-12(10-8-11)13(5-2...,37223532,ZINC
...,...,...,...,...,...,...
26385,186.100442,0.63610,CCN(CC)C(=O)N/C=C/C(=O)O,InChI=1S/C8H14N2O3/c1-3-10(4-2)8(13)9-6-5-7(11...,222698382,ZINC
87133,192.162649,3.23652,CC[C@@H](C)[C@H](C)Nc1ccc(C)nc1,InChI=1S/C12H20N2/c1-5-9(2)11(4)14-12-7-6-10(3...,96033362,ZINC
97239,199.022248,3.37960,Clc1cccnc1SC1CCC1,InChI=1S/C9H10ClNS/c10-8-5-2-6-11-9(8)12-7-3-1...,306154241,ZINC
18610,157.110279,0.84920,CCN[C@@H]1CC[C@H](C(=O)O)C1,InChI=1S/C8H15NO2/c1-2-9-7-4-3-6(5-7)8(10)11/h...,217903241,ZINC


In [110]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def lgP(intervals):
    return df_ZINC.loc[(df_ZINC['logP'] > intervals[0]) & (df_ZINC['logP'] <= intervals[1])]  

df_lgP_1 = lgP(intervals_logP[0])[:1588]
df_lgP_2 = lgP(intervals_logP[1])[:2226]
df_lgP_3 = lgP(intervals_logP[2])[:5031]
df_lgP_4 = lgP(intervals_logP[3])[:5848]
df_lgP_5 = lgP(intervals_logP[4])[:2443]
df_lgP_6 = lgP(intervals_logP[5])[:1843]
df_lgP_7 = lgP(intervals_logP[6])[:1038]
df_lgP_8 = lgP(intervals_logP[7])[:429]
df_lgP_9 = lgP(intervals_logP[8])[:183]
df_lgP_10 = lgP(intervals_logP[9])[:62]
df_lgP_11 = lgP(intervals_logP[10])[:42]

#Merge all dataframes
frames = [df_lgP_1, df_lgP_2, df_lgP_3,df_lgP_4,df_lgP_5,df_lgP_6,df_lgP_7,df_lgP_8,df_lgP_9,df_lgP_10,df_lgP_11]
df_ZINC1 = pd.concat(frames)

In [111]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC1.loc[(df_ZINC1['logP'] > intervals_logP[count][0]) & (df_ZINC1['logP']<= intervals_logP[count][1])]))
numbers

[1588, 2226, 5031, 5848, 2443, 1843, 1038, 429, 183, 62, 18]

In [112]:
df_ZINC1 = df_ZINC1.drop('MW', 1).drop('name', 1).drop('Inchi', 1).drop('logP', 1)
df_ZINC1.to_csv('ZINC_csv/ZINC200-0.csv', sep=' ', index = False)

  df_ZINC1 = df_ZINC1.drop('MW', 1).drop('name', 1).drop('Inchi', 1).drop('logP', 1)


In [113]:
df_ZINC['MW'].describe()

count    106689.000000
mean        175.564236
std          22.079689
min          52.006148
25%         164.094963
50%         182.141913
75%         193.183050
max         199.230000
Name: MW, dtype: float64

### ZINC 250

In [61]:
suppl_csv = pd.read_csv('full250.csv', delimiter = ' ')
suppl_csv1 = suppl_csv[suppl_csv.zinc_id != 'zinc_id']
suppl_csv1.drop(suppl_csv.tail(1).index,inplace=True)
print(len(suppl_csv))

442145


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [45]:
ligandm_database = []
for ligand in suppl_csv1["smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))
#List of MW from csv
db_MW_z = []
for substance in ligandm_database:
		db_MW_z.append(rdescriptors.CalcExactMolWt(substance))
print(len(db_MW_z))
#List of logP from csv
db_logP_z = []
for substance in ligandm_database:
		db_logP_z.append(Descriptors.MolLogP(substance))
print(len(db_logP_z))
#List of INCHI from csv
db_Inchi_z = []
for substance in ligandm_database:
		db_Inchi_z.append(Chem.MolToInchi((substance)))
print(len(db_Inchi_z))

442099
442099
442099


In [50]:
#Creating dataframe of logP and MW value of COCONUT products
df_ZINC = pd.DataFrame(data={'MW': db_MW_z,'logP': db_logP_z,'Smiles':suppl_csv1["smiles"], 'Inchi': db_Inchi_z, 'zinc_id' : suppl_csv1['zinc_id'] })

df_ZINC['name'] = "ZINC"
df_ZINC

Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
0,225.078327,-1.25590,CN(C)S(=O)(=O)CCNCC[N+](=O)[O-],"InChI=1S/C6H15N3O4S/c1-8(2)14(12,13)6-4-7-3-5-...",905261650,ZINC
1,203.126991,-1.08070,O=[N+]([O-])CCNCCN1CC[C@@H](O)C1,InChI=1S/C8H17N3O3/c12-8-1-4-10(7-8)5-2-9-3-6-...,905397937,ZINC
2,203.126991,-1.12700,O=[N+]([O-])CCN1CCN(CCO)CC1,InChI=1S/C8H17N3O3/c12-8-7-10-3-1-9(2-4-10)5-6...,1351065444,ZINC
3,228.049469,-1.00790,O=C(O)C(=O)NCCn1cc([N+](=O)[O-])cn1,InChI=1S/C7H8N4O5/c12-6(7(13)14)8-1-2-10-4-5(3...,274154699,ZINC
4,243.060368,-1.27070,CN(CC(=O)O)C(=O)Cn1cnc([N+](=O)[O-])n1,InChI=1S/C7H9N5O5/c1-10(3-6(14)15)5(13)2-11-4-...,737982890,ZINC
...,...,...,...,...,...,...
442139,238.229666,5.44260,CCCCCC/C=C\CCCCCCCC=O,InChI=1S/C16H30O/c1-2-3-4-5-6-7-8-9-10-11-12-1...,59725570,ZINC
442140,239.224915,5.29160,ON=C1CCCCCCCCCCCCCC1,InChI=1S/C15H29NO/c17-16-15-13-11-9-7-5-3-1-2-...,5225592,ZINC
442141,246.211252,5.53000,CCCCCCCCCCCCP(C)(C)=O,InChI=1S/C14H31OP/c1-4-5-6-7-8-9-10-11-12-13-1...,2040264,ZINC
442142,241.186421,5.01838,CCC[C@H](CC)SCCCCC(C)(C)C#N,InChI=1S/C14H27NS/c1-5-9-13(6-2)16-11-8-7-10-1...,477035098,ZINC


In [58]:
#Check duplicates in ZINC dataframe
ids = df_ZINC['zinc_id']
df_ZINC[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')
#Drop duplicates
df_ZINC = df_ZINC.drop_duplicates(subset=['zinc_id'])
df_ZINC
#Check duplicates
ids = df_ZINC['zinc_id']
df_ZINC[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')

#Join ZINC and COCONUT data
df_merge = pd.concat([df_COCO,df_ZINC], axis=0)
df_merge

#Check dplicate in both dataframes by Inchi
ids = df_merge['Inchi']
df_merge = df_merge[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Leave out COCO columns
df_merge = df_merge[df_merge.name != 'COCO']
df_merge = df_merge.drop('coconut_id', 1)

#get zinc_ids
list_ids = []
for x in df_merge['zinc_id']:
    list_ids.append(x)
 
#Get rid of duplicates in df_ZINC
df_ZINC = df_ZINC[~df_ZINC['zinc_id'].isin(list_ids)]

#Look at the numbers in dataframe
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC.loc[(df_ZINC['logP'] > intervals_logP[count][0]) & (df_ZINC['logP']<= intervals_logP[count][1])]))
numbers


# shuffle the DataFrame rows
df_ZINC = df_ZINC.sample(frac = 1)
df_ZINC

  df_merge = df_merge.drop('coconut_id', 1)


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
297219,216.115030,2.90090,O=C(O)[C@H]1[C@H]2CC[C@H](C2)[C@@H]1c1ccccc1,InChI=1S/C14H16O2/c15-14(16)13-11-7-6-10(8-11)...,4769260,ZINC
158286,247.143310,1.68390,CC(C)n1cc(-c2cn([C@H]3CCOC3)nn2)cn1,InChI=1S/C12H17N5O/c1-9(2)16-6-10(5-13-16)12-7...,630595894,ZINC
315212,234.082684,2.58582,COc1cc(-c2scnc2C)ccc1CN,InChI=1S/C12H14N2OS/c1-8-12(16-7-14-8)9-3-4-10...,642882604,ZINC
149995,247.143310,1.55982,CC[C@H](C)n1[nH]c(=NC(=O)c2cn[nH]c2)cc1C,InChI=1S/C12H17N5O/c1-4-8(2)17-9(3)5-11(16-17)...,330702592,ZINC
39257,216.101111,0.77720,O=C(NCCc1c[nH]cn1)c1ccncc1,InChI=1S/C11H12N4O/c16-11(9-1-4-12-5-2-9)14-6-...,34431504,ZINC
...,...,...,...,...,...,...
404312,245.141579,3.51530,C=CCCCCCNc1cccc2c1COC2=O,InChI=1S/C15H19NO2/c1-2-3-4-5-6-10-16-14-9-7-8...,809200771,ZINC
75749,218.123070,0.99610,O=C(C(F)F)N1CCC2(CCCN2)CC1,InChI=1S/C10H16F2N2O/c11-8(12)9(15)14-6-3-10(4...,96049369,ZINC
432222,243.162314,4.63194,Cc1cc(CNc2ccccc2C(C)C)c(C)o1,InChI=1S/C16H21NO/c1-11(2)15-7-5-6-8-16(15)17-...,1474047624,ZINC
310555,247.120843,2.76900,CC(C)CC(=O)Nc1cccc(/C=C/C(=O)O)c1,InChI=1S/C14H17NO3/c1-10(2)8-13(16)15-12-5-3-4...,49591952,ZINC


In [92]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def lgP(intervals):
    return df_ZINC.loc[(df_ZINC['logP'] > intervals[0]) & (df_ZINC['logP'] <= intervals[1])]  

df_lgP_1 = lgP(intervals_logP[0])[:1000]
df_lgP_2 = lgP(intervals_logP[1])[:1200]
df_lgP_3 = lgP(intervals_logP[2])[:2500]
df_lgP_4 = lgP(intervals_logP[3])[:4800]
df_lgP_5 = lgP(intervals_logP[4])[:3100]
df_lgP_6 = lgP(intervals_logP[5])[:3400]
df_lgP_7 = lgP(intervals_logP[6])[:2600]
df_lgP_8 = lgP(intervals_logP[7])[:2200]
df_lgP_9 = lgP(intervals_logP[8])[:1400]
df_lgP_10 = lgP(intervals_logP[9])[:850]
df_lgP_11 = lgP(intervals_logP[10])[:400]

#Merge all dataframes
frames = [df_lgP_1, df_lgP_2, df_lgP_3,df_lgP_4,df_lgP_5,df_lgP_6,df_lgP_7,df_lgP_8,df_lgP_9,df_lgP_10,df_lgP_11]
df_ZINC2 = pd.concat(frames)

In [93]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC2.loc[(df_ZINC2['logP'] > intervals_logP[count][0]) & (df_ZINC2['logP']<= intervals_logP[count][1])]))
numbers

[1000, 1200, 2500, 4800, 3100, 3400, 2600, 2200, 1400, 850, 400]

In [94]:
df_ZINC2 = df_ZINC2.drop('MW', 1).drop('name', 1).drop('Inchi', 1).drop('logP', 1)
df_ZINC2.to_csv('ZINC250-0.csv', sep=' ', index = False)

  df_ZINC2 = df_ZINC2.drop('MW', 1).drop('name', 1).drop('Inchi', 1).drop('logP', 1)


### ZINC 300

In [18]:
suppl_csv = pd.read_csv('Data_ZINC/full300.csv', delimiter = ' ')
suppl_csv1 = suppl_csv[suppl_csv.zinc_id != 'zinc_id']
suppl_csv1.drop(suppl_csv.tail(1).index,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [19]:
ligandm_database = []
for ligand in suppl_csv1["smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))
#List of MW from csv
db_MW_z = []
for substance in ligandm_database:
		db_MW_z.append(rdescriptors.CalcExactMolWt(substance))
print(len(db_MW_z))
#List of logP from csv
db_logP_z = []
for substance in ligandm_database:
		db_logP_z.append(Descriptors.MolLogP(substance))
print(len(db_logP_z))
#List of INCHI from csv
db_Inchi_z = []
for substance in ligandm_database:
		db_Inchi_z.append(Chem.MolToInchi((substance)))
print(len(db_Inchi_z))

305952
305952
305952


In [20]:
#Creating dataframe of logP and MW value of COCONUT products
df_ZINC = pd.DataFrame(data={'MW': db_MW_z,'logP': db_logP_z,'Smiles':suppl_csv1["smiles"], 'Inchi': db_Inchi_z, 'zinc_id' : suppl_csv1['zinc_id'] })

df_ZINC['name'] = "ZINC"
df_ZINC

Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
0,275.148121,-1.12310,N[C@H](CCCCNC(=O)CC[C@@H](N)C(=O)O)C(=O)O,InChI=1S/C11H21N3O5/c12-7(10(16)17)3-1-2-6-14-...,4545889,ZINC
1,254.137890,-1.08460,COCCN1CNc2c(c(=O)n(C)c(=O)n2C)C1,InChI=1S/C11H18N4O3/c1-13-9-8(10(16)14(2)11(13...,55227771,ZINC
2,254.126657,-1.67580,O=C1CC[C@H](C(=O)N2C[C@@H]3[C@@H](C2)C3(CO)CO)N1,InChI=1S/C12H18N2O4/c15-5-12(6-16)7-3-14(4-8(7...,1772813580,ZINC
3,272.057926,-1.26660,COC(=O)CNC(=O)CSc1nc(N)cc(=O)[nH]1,InChI=1S/C9H12N4O4S/c1-17-8(16)3-11-7(15)4-18-...,8671294,ZINC
4,286.080101,-2.28110,CC(=O)OC[C@@H]1O[C@H](n2ccc(=O)[nH]c2=O)[C@H](...,InChI=1S/C11H14N2O7/c1-5(14)19-4-6-8(16)9(17)1...,100935907,ZINC
...,...,...,...,...,...,...
305959,284.271530,6.18840,CC[C@@H](C)CCCCCCCCCCCCCC(=O)O,InChI=1S/C18H36O2/c1-3-17(2)15-13-11-9-7-5-4-6...,4556918,ZINC
305960,292.276616,6.34680,CC[C@](C)(O)CC/C=C(\C)CC/C=C(/C)CCC=C(C)C,"InChI=1S/C20H36O/c1-7-20(6,21)16-10-15-19(5)14...",38655889,ZINC
305961,296.271530,5.45280,CCCC(=O)O[C@@H](C)CC[C@H]1[C@@H](C(C)C)C[C@@H]...,InChI=1S/C19H36O2/c1-7-9-19(20)21-14(5)10-11-1...,4783332,ZINC
305962,297.151750,5.73412,Cc1cccc2c(-c3ccccc3)c(Cc3ccccc3)[nH]c12,InChI=1S/C22H19N/c1-16-9-8-14-19-21(18-12-6-3-...,1675789,ZINC


In [21]:
#Check duplicates in ZINC dataframe
ids = df_ZINC['zinc_id']
df_ZINC[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')
#Drop duplicates
df_ZINC = df_ZINC.drop_duplicates(subset=['zinc_id'])
df_ZINC
#Check duplicates
ids = df_ZINC['zinc_id']
df_ZINC[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')

#Join ZINC and COCONUT data
df_merge = pd.concat([df_COCO,df_ZINC], axis=0)
df_merge

#Check dplicate in both dataframes by Inchi
ids = df_merge['Inchi']
df_merge = df_merge[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Leave out COCO columns
df_merge = df_merge[df_merge.name != 'COCO']
df_merge = df_merge.drop('coconut_id', 1)

#get zinc_ids
list_ids = []
for x in df_merge['zinc_id']:
    list_ids.append(x)
 
#Get rid of duplicates in df_ZINC
df_ZINC = df_ZINC[~df_ZINC['zinc_id'].isin(list_ids)]

#Look at the numbers in dataframe
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC.loc[(df_ZINC['logP'] > intervals_logP[count][0]) & (df_ZINC['logP']<= intervals_logP[count][1])]))
numbers


# shuffle the DataFrame rows
df_ZINC = df_ZINC.sample(frac = 1)
df_ZINC

  df_merge = df_merge.drop('coconut_id', 1)


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
181504,281.105193,2.79110,O=C(O)[C@H]1CC(=O)N(c2ccc(-c3ccccc3)cc2)C1,InChI=1S/C17H15NO3/c19-16-10-14(17(20)21)11-18...,45954013,ZINC
204296,268.124549,2.61882,Cc1cc(CSCCC(=O)N2CCCCC2)on1,InChI=1S/C13H20N2O2S/c1-11-9-12(17-14-11)10-18...,96500735,ZINC
49936,255.147058,1.26880,C[C@]1(C(=O)O)CCCN1C(=O)CC[C@@H]1CCOC1,InChI=1S/C13H21NO4/c1-13(12(16)17)6-2-7-14(13)...,263551823,ZINC
192712,283.104228,2.65300,CC(C)S(=O)(=O)N1CC=C(c2ccc(F)cc2)CC1,"InChI=1S/C14H18FNO2S/c1-11(2)19(17,18)16-9-7-1...",19450992,ZINC
250136,295.157229,3.45580,CC(=O)C[C@@H](CC(=O)NCc1ccccc1)c1ccccc1,InChI=1S/C19H21NO2/c1-15(21)12-18(17-10-6-3-7-...,2735333,ZINC
...,...,...,...,...,...,...
153634,250.077599,2.00044,Cc1ccc(CS(=O)(=O)c2ncc[nH]2)cc1C,InChI=1S/C12H14N2O2S/c1-9-3-4-11(7-10(9)2)8-17...,190836750,ZINC
4994,266.112738,-1.27880,Nc1nc(O)c2ncn([C@@H]3C[C@@H](N)[C@@H](CO)O3)c2n1,InChI=1S/C10H14N6O3/c11-4-1-6(19-5(4)2-17)16-3...,256673484,ZINC
297338,291.138992,4.95450,CCCCN1c2cc(Cl)c(C=O)cc2C(C)=CC1(C)C,InChI=1S/C17H22ClNO/c1-5-6-7-19-16-9-15(18)13(...,223791011,ZINC
221941,256.030648,3.05650,O=C(O)c1ccc2sc(-c3ccncc3)nc2c1,InChI=1S/C13H8N2O2S/c16-13(17)9-1-2-11-10(7-9)...,33420417,ZINC


In [25]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def lgP(intervals):
    return df_ZINC.loc[(df_ZINC['logP'] > intervals[0]) & (df_ZINC['logP'] <= intervals[1])]  

df_lgP_1 = lgP(intervals_logP[0])[:1400]
df_lgP_2 = lgP(intervals_logP[1])[:1400]
df_lgP_3 = lgP(intervals_logP[2])[:4000]
df_lgP_4 = lgP(intervals_logP[3])[:8000]
df_lgP_5 = lgP(intervals_logP[4])[:5000]
df_lgP_6 = lgP(intervals_logP[5])[:5000]
df_lgP_7 = lgP(intervals_logP[6])[:5000]
df_lgP_8 = lgP(intervals_logP[7])[:3000]
df_lgP_9 = lgP(intervals_logP[8])[:2000]
df_lgP_10 = lgP(intervals_logP[9])[:1400]
df_lgP_11 = lgP(intervals_logP[10])[:2300]

#Merge all dataframes
frames = [df_lgP_1, df_lgP_2, df_lgP_3,df_lgP_4,df_lgP_5,df_lgP_6,df_lgP_7,df_lgP_8,df_lgP_9,df_lgP_10,df_lgP_11]
df_ZINC3 = pd.concat(frames)



In [23]:
df_ZINC3['MW'].describe()

count    23450.000000
mean       278.035598
std         14.190247
min        248.905939
25%        266.143056
50%        279.993511
75%        290.145284
max        318.230728
Name: MW, dtype: float64

In [26]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC3.loc[(df_ZINC3['logP'] > intervals_logP[count][0]) & (df_ZINC3['logP']<= intervals_logP[count][1])]))
numbers

[1400, 1400, 4000, 8000, 5000, 5000, 5000, 3000, 2000, 1400, 2300]

In [27]:
df_ZINC3 = df_ZINC3.drop('MW', 1).drop('name', 1).drop('Inchi', 1).drop('logP', 1)
df_ZINC3.to_csv('ZINC_csv/ZINC300-0.csv', sep=' ', index = False)

  df_ZINC3 = df_ZINC3.drop('MW', 1).drop('name', 1).drop('Inchi', 1).drop('logP', 1)


### ZINC 325

In [105]:
suppl_csv = pd.read_csv('full325.csv', delimiter = ' ')
suppl_csv1 = suppl_csv[suppl_csv.zinc_id != 'zinc_id']
suppl_csv1.drop(suppl_csv.tail(1).index,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [106]:
ligandm_database = []
for ligand in suppl_csv1["smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))
#List of MW from csv
db_MW_z = []
for substance in ligandm_database:
		db_MW_z.append(rdescriptors.CalcExactMolWt(substance))
print(len(db_MW_z))
#List of logP from csv
db_logP_z = []
for substance in ligandm_database:
		db_logP_z.append(Descriptors.MolLogP(substance))
print(len(db_logP_z))
#List of INCHI from csv
db_Inchi_z = []
for substance in ligandm_database:
		db_Inchi_z.append(Chem.MolToInchi((substance)))
print(len(db_Inchi_z))

89501
89501
89501


In [107]:
#Creating dataframe of logP and MW value of COCONUT products
df_ZINC = pd.DataFrame(data={'MW': db_MW_z,'logP': db_logP_z,'Smiles':suppl_csv1["smiles"], 'Inchi': db_Inchi_z, 'zinc_id' : suppl_csv1['zinc_id'] })

df_ZINC['name'] = "ZINC"
df_ZINC

Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
0,307.156577,-1.04810,CC[C@H](CO)N1CCN(S(=O)(=O)N2CCOCC2)CC1,InChI=1S/C12H25N3O4S/c1-2-12(11-16)13-3-5-14(6...,89943537,ZINC
1,318.099791,-1.87530,CS(=O)(=O)N1CCC[C@@H](NC(=O)C[C@@H]2NC(=O)NC2=...,"InChI=1S/C11H18N4O5S/c1-21(19,20)15-4-2-3-7(6-...",108483283,ZINC
2,302.104876,-1.13598,Cc1c(S(=O)(=O)NC[C@H]2CN(C)C(=O)CO2)cnn1C,InChI=1S/C11H18N4O4S/c1-8-10(5-12-15(8)3)20(17...,885939216,ZINC
3,317.152161,-1.01790,CCn1ncnc1CN1C[C@@H](O)[C@H](CS(=O)(=O)N(C)C)C1,InChI=1S/C12H23N5O3S/c1-4-17-12(13-9-14-17)7-1...,1529195917,ZINC
4,323.122969,-1.86960,O=C(NCCCN1CCCC1=O)C(=O)Nc1c[nH]c(=O)[nH]c1=O,InChI=1S/C13H17N5O5/c19-9-3-1-5-18(9)6-2-4-14-...,12389828,ZINC
...,...,...,...,...,...,...
89506,317.141579,5.04700,C=C(C)COc1cccc(C(=O)Nc2cccc3ccccc23)c1,InChI=1S/C21H19NO2/c1-15(2)14-24-18-10-5-9-17(...,8570619,ZINC
89507,323.201592,6.58950,CCCCCCCCCCCCC(=O)Nc1ccc(Cl)cc1,InChI=1S/C19H30ClNO/c1-2-3-4-5-6-7-8-9-10-11-1...,100286102,ZINC
89508,319.157229,5.38780,CCCOc1ccc2ccccc2c1/C=N/c1ccc(OC)cc1,InChI=1S/C21H21NO2/c1-3-14-24-21-13-8-16-6-4-5...,6942611,ZINC
89509,315.102606,5.05410,C=CCOc1c(Cl)cc(/C=N/c2ccccc2)cc1OCC,InChI=1S/C18H18ClNO2/c1-3-10-22-18-16(19)11-14...,6959801,ZINC


In [108]:
#Check duplicates in ZINC dataframe
ids = df_ZINC['zinc_id']
df_ZINC[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')
#Drop duplicates
df_ZINC = df_ZINC.drop_duplicates(subset=['zinc_id'])
df_ZINC
#Check duplicates
ids = df_ZINC['zinc_id']
df_ZINC[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')

#Join ZINC and COCONUT data
df_merge = pd.concat([df_COCO,df_ZINC], axis=0)
df_merge

#Check dplicate in both dataframes by Inchi
ids = df_merge['Inchi']
df_merge = df_merge[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Leave out COCO columns
df_merge = df_merge[df_merge.name != 'COCO']
df_merge = df_merge.drop('coconut_id', 1)

#get zinc_ids
list_ids = []
for x in df_merge['zinc_id']:
    list_ids.append(x)
 
#Get rid of duplicates in df_ZINC
df_ZINC = df_ZINC[~df_ZINC['zinc_id'].isin(list_ids)]

#Look at the numbers in dataframe
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC.loc[(df_ZINC['logP'] > intervals_logP[count][0]) & (df_ZINC['logP']<= intervals_logP[count][1])]))
numbers


# shuffle the DataFrame rows
df_ZINC = df_ZINC.sample(frac = 1)
df_ZINC

  df_merge = df_merge.drop('coconut_id', 1)


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
20688,320.072784,2.30550,NC(=O)[C@H](Cc1ccccc1)NC(=O)c1cccc(F)c1Cl,InChI=1S/C16H14ClFN2O2/c17-14-11(7-4-8-12(14)1...,79763243,ZINC
69883,318.022976,4.38790,Oc1ccc(-c2nnc(SCc3ccc(Cl)cc3)o2)cc1,InChI=1S/C15H11ClN2O2S/c16-12-5-1-10(2-6-12)9-...,411387,ZINC
48107,313.167794,3.89194,COc1cccc(OC)c1CCC(=O)Nc1ccc(C)c(C)c1,InChI=1S/C19H23NO3/c1-13-8-9-15(12-14(13)2)20-...,225993961,ZINC
14361,300.089226,1.22530,CSC[C@](C)(O)CNC(=O)Nc1ccncc1[N+](=O)[O-],"InChI=1S/C11H16N4O4S/c1-11(17,7-20-2)6-13-10(1...",272770984,ZINC
5929,300.077993,-0.02790,O=C(O)c1cccn(CCS(=O)(=O)N2CCCC2)c1=O,InChI=1S/C12H16N2O5S/c15-11-10(12(16)17)4-3-5-...,225792068,ZINC
...,...,...,...,...,...,...
75993,299.962396,4.29410,O=C(Nc1ccc(Cl)cc1Cl)c1cnccc1Cl,InChI=1S/C12H7Cl3N2O/c13-7-1-2-11(10(15)5-7)17...,163281,ZINC
24710,318.155513,2.66312,Cc1cccc([C@H](C)CNC(=O)N(C)C[C@H](O)C(F)(F)F)c1,InChI=1S/C15H21F3N2O2/c1-10-5-4-6-12(7-10)11(2...,96502865,ZINC
33706,311.879318,3.94030,O=S(=O)(c1cc(Cl)c(Cl)c(Cl)c1)C(F)(F)F,InChI=1S/C7H2Cl3F3O2S/c8-4-1-3(2-5(9)6(4)10)16...,2540420,ZINC
57124,324.183778,3.55042,COc1cccc(C(=O)NCC[C@@H]2CCc3ccccc3N2)c1C,InChI=1S/C20H24N2O2/c1-14-17(7-5-9-19(14)24-2)...,662584461,ZINC


In [109]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def lgP(intervals):
    return df_ZINC.loc[(df_ZINC['logP'] > intervals[0]) & (df_ZINC['logP'] <= intervals[1])]  

df_lgP_1 = lgP(intervals_logP[0])[:1000]
df_lgP_2 = lgP(intervals_logP[1])[:1200]
df_lgP_3 = lgP(intervals_logP[2])[:2500]
df_lgP_4 = lgP(intervals_logP[3])[:4800]
df_lgP_5 = lgP(intervals_logP[4])[:3100]
df_lgP_6 = lgP(intervals_logP[5])[:3400]
df_lgP_7 = lgP(intervals_logP[6])[:3010]
df_lgP_8 = lgP(intervals_logP[7])[:3200]
df_lgP_9 = lgP(intervals_logP[8])[:2700]
df_lgP_10 = lgP(intervals_logP[9])[:1800]
df_lgP_11 = lgP(intervals_logP[10])[:2000]

#Merge all dataframes
frames = [df_lgP_1, df_lgP_2, df_lgP_3,df_lgP_4,df_lgP_5,df_lgP_6,df_lgP_7,df_lgP_8,df_lgP_9,df_lgP_10,df_lgP_11]
df_ZINC4 = pd.concat(frames)

In [110]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC4.loc[(df_ZINC4['logP'] > intervals_logP[count][0]) & (df_ZINC4['logP']<= intervals_logP[count][1])]))
numbers

[1000, 1200, 2500, 4800, 3100, 3400, 3010, 3200, 2700, 1800, 2000]

In [111]:
df_ZINC4 = df_ZINC4.drop('MW', 1).drop('name', 1).drop('Inchi', 1).drop('logP', 1)
df_ZINC4.to_csv('ZINC325-0.csv', sep=' ', index = False)

  df_ZINC4 = df_ZINC4.drop('MW', 1).drop('name', 1).drop('Inchi', 1).drop('logP', 1)


### ZINC 350

In [98]:
suppl_csv = pd.read_csv('full350.csv', delimiter = ' ')
suppl_csv1 = suppl_csv[suppl_csv.zinc_id != 'zinc_id']
suppl_csv1.drop(suppl_csv.tail(1).index,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [99]:
ligandm_database = []
for ligand in suppl_csv1["smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))
#List of MW from csv
db_MW_z = []
for substance in ligandm_database:
		db_MW_z.append(rdescriptors.CalcExactMolWt(substance))
print(len(db_MW_z))
#List of logP from csv
db_logP_z = []
for substance in ligandm_database:
		db_logP_z.append(Descriptors.MolLogP(substance))
print(len(db_logP_z))
#List of INCHI from csv
db_Inchi_z = []
for substance in ligandm_database:
		db_Inchi_z.append(Chem.MolToInchi((substance)))
print(len(db_Inchi_z))

72244
72244
72244


In [100]:
#Creating dataframe of logP and MW value of COCONUT products
df_ZINC = pd.DataFrame(data={'MW': db_MW_z,'logP': db_logP_z,'Smiles':suppl_csv1["smiles"], 'Inchi': db_Inchi_z, 'zinc_id' : suppl_csv1['zinc_id'] })

df_ZINC['name'] = "ZINC"
df_ZINC

Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
0,342.172562,-1.22260,C#CCN1CCN(C(=O)CN2CC[C@@H](S(=O)(=O)NCC)C2)CC1,InChI=1S/C15H26N4O3S/c1-3-6-17-8-10-19(11-9-17...,92843693,ZINC
1,331.175673,-1.08522,CN1CCC[C@@H]1C(=O)N1CCN(C(=O)Cn2cnc(C#N)n2)CC1,InChI=1S/C15H21N7O2/c1-19-4-2-3-12(19)15(24)21...,330210951,ZINC
2,335.159354,-0.63208,C=CCNC(=O)CN1CCN(C(=O)C(=O)Nc2cc(C)on2)CC1,InChI=1S/C15H21N5O4/c1-3-4-16-13(21)10-19-5-7-...,340572537,ZINC
3,340.131760,-1.24532,C[C@H](NS(=O)(=O)c1cnn(C)c1)C(=O)N1CCN(CC#N)CC1,"InChI=1S/C13H20N6O3S/c1-11(16-23(21,22)12-9-15...",446314287,ZINC
4,349.102233,-1.36120,C=CCN1C(=O)C(=O)N(CC(=O)N[C@H](C(=O)O)c2ccnn2C...,InChI=1S/C14H15N5O6/c1-3-6-18-11(21)12(22)19(1...,647772990,ZINC
...,...,...,...,...,...,...
72249,348.220164,5.09718,CC[C@@H](C#N)Oc1cccc(CN[C@@H](C)c2ccc3c(c2)CCC...,InChI=1S/C23H28N2O/c1-3-22(15-24)26-23-10-6-7-...,772982152,ZINC
72250,343.004547,5.02780,O=C1/C(=C\c2ccc(Cl)s2)CCc2ncc(C(F)(F)F)cc21,InChI=1S/C15H9ClF3NOS/c16-13-4-2-10(22-13)5-8-...,777469307,ZINC
72251,338.088912,5.06868,N#CCOc1cccc(NCc2ccc(-c3cccc(F)c3)s2)c1,InChI=1S/C19H15FN2OS/c20-15-4-1-3-14(11-15)19-...,779586094,ZINC
72252,347.160932,5.18618,C[C@@H](N[C@H](CCCC#N)c1ccccc1)c1ccc(C(F)(F)F)cn1,InChI=1S/C19H20F3N3/c1-14(17-11-10-16(13-24-17...,903001658,ZINC


In [101]:
#Check duplicates in ZINC dataframe
ids = df_ZINC['zinc_id']
df_ZINC[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')
#Drop duplicates
df_ZINC = df_ZINC.drop_duplicates(subset=['zinc_id'])
df_ZINC
#Check duplicates
ids = df_ZINC['zinc_id']
df_ZINC[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')

#Join ZINC and COCONUT data
df_merge = pd.concat([df_COCO,df_ZINC], axis=0)
df_merge

#Check dplicate in both dataframes by Inchi
ids = df_merge['Inchi']
df_merge = df_merge[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Leave out COCO columns
df_merge = df_merge[df_merge.name != 'COCO']
df_merge = df_merge.drop('coconut_id', 1)

#get zinc_ids
list_ids = []
for x in df_merge['zinc_id']:
    list_ids.append(x)
 
#Get rid of duplicates in df_ZINC
df_ZINC = df_ZINC[~df_ZINC['zinc_id'].isin(list_ids)]

#Look at the numbers in dataframe
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC.loc[(df_ZINC['logP'] > intervals_logP[count][0]) & (df_ZINC['logP']<= intervals_logP[count][1])]))
numbers


# shuffle
# shuffle the DataFrame rows
df_ZINC = df_ZINC.sample(frac = 1)
df_ZINC

  df_merge = df_merge.drop('coconut_id', 1)


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
3752,343.175673,-0.20040,Cc1ccc(C#N)c(N(C)CCNC(=O)CCn2ncn(C)c2=O)n1,InChI=1S/C16H21N7O2/c1-12-4-5-13(10-17)15(20-1...,1100235074,ZINC
64852,346.123676,5.65188,C[C@H](N[C@H](c1ccccc1)c1cccc(Cl)c1)c1cccc(C#N)c1,InChI=1S/C22H19ClN2/c1-16(19-10-5-7-17(13-19)1...,553398215,ZINC
64624,334.117355,5.00998,N#Cc1c(N=C(O)C2CCSCC2)sc2c1CCCCCC2,InChI=1S/C17H22N2OS2/c18-11-14-13-5-3-1-2-4-6-...,329415714,ZINC
9175,329.173942,1.71508,C[C@H](c1cccc(C#N)c1)N(C)C(=O)C(=O)N1CCOC(C)(C)C1,InChI=1S/C18H23N3O3/c1-13(15-7-5-6-14(10-15)11...,425391874,ZINC
57831,337.161269,5.02502,Cc1ccc(C)c(NC(=S)N[C@@H](CCCC#N)c2ccccc2)c1,InChI=1S/C20H23N3S/c1-15-11-12-16(2)19(14-15)2...,814210965,ZINC
...,...,...,...,...,...,...
32686,338.100146,3.93790,Cc1ccc(CSc2ncn(Cc3ccc(C#N)cc3F)n2)cc1,InChI=1S/C18H15FN4S/c1-13-2-4-14(5-3-13)11-24-...,478937401,ZINC
7254,325.153875,0.51888,Cn1cccc1C(=O)NC[C@@H]1CN(c2ncccc2C#N)C[C@@H]1O,InChI=1S/C17H19N5O2/c1-21-7-3-5-14(21)17(24)20...,1067649897,ZINC
231,330.144038,-1.63500,NC(=O)Cn1cc(C(=O)NCC2CN(C(=O)C#CC3CC3)C2)nn1,InChI=1S/C15H18N6O3/c16-13(22)9-21-8-12(18-19-...,1002476023,ZINC
46473,341.189198,5.08598,N#Cc1ccc(C2(N[C@H]3CCCc4c3[nH]c3ccccc43)CCC2)cc1,InChI=1S/C23H23N3/c24-15-16-9-11-17(12-10-16)2...,594519688,ZINC


In [102]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def lgP(intervals):
    return df_ZINC.loc[(df_ZINC['logP'] > intervals[0]) & (df_ZINC['logP'] <= intervals[1])]  

df_lgP_1 = lgP(intervals_logP[0])[:1100]
df_lgP_2 = lgP(intervals_logP[1])[:1000]
df_lgP_3 = lgP(intervals_logP[2])[:2000]
df_lgP_4 = lgP(intervals_logP[3])[:4000]
df_lgP_5 = lgP(intervals_logP[4])[:3000]
df_lgP_6 = lgP(intervals_logP[5])[:3800]
df_lgP_7 = lgP(intervals_logP[6])[:3210]
df_lgP_8 = lgP(intervals_logP[7])[:2700]
df_lgP_9 = lgP(intervals_logP[8])[:2000]
df_lgP_10 = lgP(intervals_logP[9])[:1800]
df_lgP_11 = lgP(intervals_logP[10])[:2300]

#Merge all dataframes
frames = [df_lgP_1, df_lgP_2, df_lgP_3,df_lgP_4,df_lgP_5,df_lgP_6,df_lgP_7,df_lgP_8,df_lgP_9,df_lgP_10,df_lgP_11]
df_ZINC5 = pd.concat(frames)

In [103]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC5.loc[(df_ZINC5['logP'] > intervals_logP[count][0]) & (df_ZINC5['logP']<= intervals_logP[count][1])]))
numbers

[1100, 1000, 2000, 4000, 3000, 3800, 3210, 2700, 2000, 1800, 2300]

In [104]:
df_ZINC5 = df_ZINC5.drop('MW', 1).drop('name', 1).drop('Inchi', 1).drop('logP', 1)
df_ZINC5.to_csv('ZINC350-0.csv', sep=' ', index = False)

  df_ZINC5 = df_ZINC5.drop('MW', 1).drop('name', 1).drop('Inchi', 1).drop('logP', 1)


### ZINC 375

In [112]:
suppl_csv = pd.read_csv('full375.csv', delimiter = ' ')
suppl_csv1 = suppl_csv[suppl_csv.zinc_id != 'zinc_id']
suppl_csv1.drop(suppl_csv.tail(1).index,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [113]:
ligandm_database = []
for ligand in suppl_csv1["smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))
#List of MW from csv
db_MW_z = []
for substance in ligandm_database:
		db_MW_z.append(rdescriptors.CalcExactMolWt(substance))
print(len(db_MW_z))
#List of logP from csv
db_logP_z = []
for substance in ligandm_database:
		db_logP_z.append(Descriptors.MolLogP(substance))
print(len(db_logP_z))
#List of INCHI from csv
db_Inchi_z = []
for substance in ligandm_database:
		db_Inchi_z.append(Chem.MolToInchi((substance)))
print(len(db_Inchi_z))

102442
102442
102442


In [114]:
#Creating dataframe of logP and MW value of COCONUT products
df_ZINC = pd.DataFrame(data={'MW': db_MW_z,'logP': db_logP_z,'Smiles':suppl_csv1["smiles"], 'Inchi': db_Inchi_z, 'zinc_id' : suppl_csv1['zinc_id'] })

df_ZINC['name'] = "ZINC"
df_ZINC

Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
0,355.131425,-1.28956,Cc1nn(C)c(C)c1S(=O)(=O)N1CCN2C(=O)CN(C)C(=O)[C...,InChI=1S/C14H21N5O4S/c1-9-13(10(2)17(4)15-9)24...,65423162,ZINC
1,354.190320,-1.67260,O=C(CN1C(=O)CNC1=O)N1C[C@@H](CO)C[C@H](CN2CCOC...,InChI=1S/C16H26N4O5/c21-11-13-5-12(7-18-1-3-25...,91985397,ZINC
2,354.142701,-1.03250,CC(=O)Nc1ccc(O[C@@H]2O[C@H](CO)[C@H](O)[C@H](O...,InChI=1S/C16H22N2O7/c1-8(20)17-10-3-5-11(6-4-1...,239433518,ZINC
3,354.142701,-1.03250,CC(=O)Nc1ccccc1O[C@@H]1O[C@H](CO)[C@H](O)[C@@H...,InChI=1S/C16H22N2O7/c1-8(20)17-10-5-3-4-6-11(1...,245377209,ZINC
4,370.074719,-1.13340,O=C(CNS(=O)(=O)c1cccc(F)c1)N1CC(N2C(=O)CNC2=O)C1,InChI=1S/C14H15FN4O5S/c15-9-2-1-3-11(4-9)25(23...,253471034,ZINC
...,...,...,...,...,...,...
102449,357.045397,5.13460,O=C(Nc1cccc(Cl)c1)C1[C@H]2CC[C@H]3[C@@H](CC[C@...,InChI=1S/C17H18Cl3NO/c18-9-2-1-3-10(8-9)21-16(...,18205557,ZINC
102450,367.004175,5.01592,Cc1ccc(S[C@@H](C)C(=O)Nc2ccc(Br)cc2F)cc1,InChI=1S/C16H15BrFNOS/c1-10-3-6-13(7-4-10)21-1...,21601520,ZINC
102451,361.204179,5.76070,CCc1cccc(CC)c1NC(=O)[C@@H](CC)Oc1cccc2ccccc12,InChI=1S/C24H27NO2/c1-4-17-12-9-13-18(5-2)23(1...,44947951,ZINC
102452,368.209993,5.12270,CC(C)(C)OC(=O)N1CCCC[C@@H]1c1cccnc1OCc1ccccc1,"InChI=1S/C22H28N2O3/c1-22(2,3)27-21(25)24-15-8...",72212107,ZINC


In [115]:
#Check duplicates in ZINC dataframe
ids = df_ZINC['zinc_id']
df_ZINC[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')
#Drop duplicates
df_ZINC = df_ZINC.drop_duplicates(subset=['zinc_id'])
df_ZINC
#Check duplicates
ids = df_ZINC['zinc_id']
df_ZINC[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')

#Join ZINC and COCONUT data
df_merge = pd.concat([df_COCO,df_ZINC], axis=0)
df_merge

#Check dplicate in both dataframes by Inchi
ids = df_merge['Inchi']
df_merge = df_merge[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Leave out COCO columns
df_merge = df_merge[df_merge.name != 'COCO']
df_merge = df_merge.drop('coconut_id', 1)

#get zinc_ids
list_ids = []
for x in df_merge['zinc_id']:
    list_ids.append(x)
 
#Get rid of duplicates in df_ZINC
df_ZINC = df_ZINC[~df_ZINC['zinc_id'].isin(list_ids)]

#Look at the numbers in dataframe
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC.loc[(df_ZINC['logP'] > intervals_logP[count][0]) & (df_ZINC['logP']<= intervals_logP[count][1])]))
numbers


# shuffle
# shuffle the DataFrame rows
df_ZINC = df_ZINC.sample(frac = 1)
df_ZINC

  df_merge = df_merge.drop('coconut_id', 1)


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
83530,360.150764,3.71422,CCC[C@@H](C)NC(=O)c1ccc(C)c(NS(=O)(=O)c2ccccc2)c1,InChI=1S/C19H24N2O3S/c1-4-8-15(3)20-19(22)16-1...,10369592,ZINC
59182,374.141262,1.35890,CS(=O)(=O)N1Cc2ccccc2C2(CCN(C(=O)c3c[nH]cn3)CC...,"InChI=1S/C18H22N4O3S/c1-26(24,25)22-11-14-4-2-...",96242382,ZINC
100853,361.204179,5.43700,CCc1ccc([C@H](C)NC(=O)[C@H](CC)Oc2ccc3ccccc3c2...,InChI=1S/C24H27NO2/c1-4-18-10-12-19(13-11-18)1...,78458588,ZINC
128,355.131425,-1.07530,Cn1cc(N2C[C@H](C(=O)N3CCNS(=O)(=O)CC3)CCC2=O)cn1,InChI=1S/C14H21N5O4S/c1-17-10-12(8-15-17)19-9-...,1875318473,ZINC
68398,354.121572,2.32700,COc1ccc(NC(=O)[C@@H](C)N2C(=O)c3ccccc3C2=O)c(O...,InChI=1S/C19H18N2O5/c1-11(21-18(23)13-6-4-5-7-...,851356,ZINC
...,...,...,...,...,...,...
96141,353.181335,4.98442,Cc1ccc(CSCC(=O)N[C@@H](C)c2ccc3c(c2)CCCC3)cc1,InChI=1S/C22H27NOS/c1-16-7-9-18(10-8-16)14-25-...,222904460,ZINC
40182,368.159689,1.54872,COc1ccccc1C(=O)N1CCC[C@@H]1Cn1nnc2c(O)nc(C)nc21,InChI=1S/C18H20N6O3/c1-11-19-16-15(17(25)20-11...,72324955,ZINC
94300,363.194677,4.58774,Cc1cc(C)n(Cc2ccc(C(=O)Nc3ccccc3OC(C)C)cc2)n1,InChI=1S/C22H25N3O2/c1-15(2)27-21-8-6-5-7-20(2...,149357341,ZINC
80519,369.033810,3.13334,Cc1cc(C)n(CCC(=O)Nc2ccccc2I)n1,InChI=1S/C14H16IN3O/c1-10-9-11(2)18(17-10)8-7-...,24257173,ZINC


In [119]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def lgP(intervals):
    return df_ZINC.loc[(df_ZINC['logP'] > intervals[0]) & (df_ZINC['logP'] <= intervals[1])]  

df_lgP_1 = lgP(intervals_logP[0])[:1100]
df_lgP_2 = lgP(intervals_logP[1])[:1000]
df_lgP_3 = lgP(intervals_logP[2])[:2000]
df_lgP_4 = lgP(intervals_logP[3])[:4000]
df_lgP_5 = lgP(intervals_logP[4])[:3000]
df_lgP_6 = lgP(intervals_logP[5])[:3800]
df_lgP_7 = lgP(intervals_logP[6])[:3210]
df_lgP_8 = lgP(intervals_logP[7])[:2700]
df_lgP_9 = lgP(intervals_logP[8])[:2000]
df_lgP_10 = lgP(intervals_logP[9])[:1800]
df_lgP_11 = lgP(intervals_logP[10])[:2300]

#Merge all dataframes
frames = [df_lgP_1, df_lgP_2, df_lgP_3,df_lgP_4,df_lgP_5,df_lgP_6,df_lgP_7,df_lgP_8,df_lgP_9,df_lgP_10,df_lgP_11]
df_ZINC6 = pd.concat(frames)

In [120]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC6.loc[(df_ZINC6['logP'] > intervals_logP[count][0]) & (df_ZINC6['logP']<= intervals_logP[count][1])]))
numbers

[1100, 1000, 2000, 4000, 3000, 3800, 3210, 2700, 2000, 1800, 2300]

In [121]:
df_ZINC6 = df_ZINC6.drop('MW', 1).drop('name', 1).drop('Inchi', 1).drop('logP', 1)
df_ZINC6.to_csv('ZINC375-0.csv', sep=' ', index = False)

  df_ZINC6 = df_ZINC6.drop('MW', 1).drop('name', 1).drop('Inchi', 1).drop('logP', 1)


### ZINC 400

In [19]:
suppl_csv = pd.read_csv('full400.csv', delimiter = ' ')
suppl_csv1 = suppl_csv[suppl_csv.zinc_id != 'zinc_id']
suppl_csv1.drop(suppl_csv.tail(1).index,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [20]:
ligandm_database = []
for ligand in suppl_csv1["smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))
#List of MW from csv
db_MW_z = []
for substance in ligandm_database:
		db_MW_z.append(rdescriptors.CalcExactMolWt(substance))
print(len(db_MW_z))
#List of logP from csv
db_logP_z = []
for substance in ligandm_database:
		db_logP_z.append(Descriptors.MolLogP(substance))
print(len(db_logP_z))
#List of INCHI from csv
db_Inchi_z = []
for substance in ligandm_database:
		db_Inchi_z.append(Chem.MolToInchi((substance)))
print(len(db_Inchi_z))

325515
325515
325515


In [21]:
#Creating dataframe of logP and MW value of COCONUT products
df_ZINC = pd.DataFrame(data={'MW': db_MW_z,'logP': db_logP_z,'Smiles':suppl_csv1["smiles"], 'Inchi': db_Inchi_z, 'zinc_id' : suppl_csv1['zinc_id'] })

df_ZINC['name'] = "ZINC"
df_ZINC

Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
0,392.180818,-1.63473,N=C(N)NCCC[C@@H](NC(=O)c1ccccc1)C(=O)NCC(=O)NC...,InChI=1S/C17H24N6O5/c18-17(19)20-8-4-7-12(23-1...,4546695,ZINC
1,386.145101,-1.13900,Nc1ncnc2c1nc(N/N=C/c1cccnc1)n2[C@@H]1O[C@H](CO...,InChI=1S/C16H18N8O4/c17-13-10-14(20-7-19-13)24...,12660345,ZINC
2,390.116212,-3.42700,CC(=O)[C@H](O)[C@H]1O[C@@](O)(C(C)=O)[C@@](O)(...,"InChI=1S/C16H22O11/c1-6(17)11(22)12-13(23,7(2)...",87492653,ZINC
3,386.170253,-1.38160,Cn1cc(CC(=O)N[C@H]2[C@H]3C[C@H]2N(C(=O)CCn2cc[...,InChI=1S/C18H22N6O4/c1-22-9-11(8-20-22)6-14(25...,1615611744,ZINC
4,390.079826,-2.00280,COC(=O)C1(C(=O)OC)C(C(=O)OC)(C(=O)OC)C1(C(=O)O...,InChI=1S/C15H18O12/c1-22-7(16)13(8(17)23-2)14(...,4016120,ZINC
...,...,...,...,...,...,...
325522,395.135528,5.18702,COc1cccc(C(=S)NCc2ccc(C)cc2)c1OCc1ccc(F)cc1,InChI=1S/C23H22FNO2S/c1-16-6-8-17(9-7-16)14-25...,408577604,ZINC
325523,378.036039,6.25470,CC(C)(C)C(=O)Nc1c(Cl)cc(Cl)cc1-c1nc2ccccc2s1,"InChI=1S/C18H16Cl2N2OS/c1-18(2,3)17(23)22-15-1...",408628625,ZINC
325524,397.134779,5.10900,CC[C@@H](C)N1C(=O)S/C(=C/c2ccc(OC)c(OCc3ccccc3...,InChI=1S/C22H23NO4S/c1-4-15(2)23-21(24)20(28-2...,409012693,ZINC
325525,392.086245,5.69630,CCc1ccc(-n2c(Sc3ccccn3)nnc2-c2ccc(Cl)cc2)cc1,InChI=1S/C21H17ClN4S/c1-2-15-6-12-18(13-7-15)2...,409097356,ZINC


In [22]:
#Check duplicates in ZINC dataframe
ids = df_ZINC['zinc_id']
df_ZINC[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')
#Drop duplicates
df_ZINC = df_ZINC.drop_duplicates(subset=['zinc_id'])
df_ZINC
#Check duplicates
ids = df_ZINC['zinc_id']
df_ZINC[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')

#Join ZINC and COCONUT data
df_merge = pd.concat([df_COCO,df_ZINC], axis=0)
df_merge

#Check dplicate in both dataframes by Inchi
ids = df_merge['Inchi']
df_merge = df_merge[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Leave out COCO columns
df_merge = df_merge[df_merge.name != 'COCO']
df_merge = df_merge.drop('coconut_id', 1)

#get zinc_ids
list_ids = []
for x in df_merge['zinc_id']:
    list_ids.append(x)
 
#Get rid of duplicates in df_ZINC
df_ZINC = df_ZINC[~df_ZINC['zinc_id'].isin(list_ids)]

#Look at the numbers in dataframe
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC.loc[(df_ZINC['logP'] > intervals_logP[count][0]) & (df_ZINC['logP']<= intervals_logP[count][1])]))
numbers

# shuffle
# shuffle the DataFrame rows
df_ZINC = df_ZINC.sample(frac = 1)
df_ZINC

  df_merge = df_merge.drop('coconut_id', 1)


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
45020,395.195740,1.46830,O=C(CCN1CCN(Cc2ccc3c(c2)OCO3)CC1)NN=Cc1cccnc1,InChI=1S/C21H25N5O3/c27-21(24-23-14-18-2-1-6-2...,22030468,ZINC
58763,382.211724,2.40402,Cc1c(C(=O)N2CC[C@H](c3cc(O)n4nc(C(C)(C)C)cc4n3...,InChI=1S/C20H26N6O2/c1-12-14(10-21-24(12)5)19(...,12741959,ZINC
325364,383.188529,5.42960,COc1ccc(/C=C(/C(=O)N[C@@H]2CCCc3ccccc32)c2cccc...,InChI=1S/C26H25NO2/c1-29-22-16-14-19(15-17-22)...,216472745,ZINC
6542,393.120382,-0.09950,Cn1c(=O)c2c(nc(NC[C@@H](O)CO)n2Cc2ccc(Cl)cc2)n...,InChI=1S/C17H20ClN5O4/c1-21-14-13(15(26)22(2)1...,1127226,ZINC
323528,389.199094,5.41580,CC[C@H](Oc1cccc2ccccc12)C(=O)N[C@H]1CC(C)(C)Oc...,InChI=1S/C25H27NO3/c1-4-21(28-22-15-9-11-17-10...,219337141,ZINC
...,...,...,...,...,...,...
111826,387.161663,2.67282,Cc1nc(-c2cccc(NC(=O)CN3CC[C@H](O)[C@]4(CCCO4)C...,InChI=1S/C20H25N3O3S/c1-14-21-17(12-27-14)15-4...,257218359,ZINC
182017,388.182064,3.56368,Cc1ccc([C@H](C)NC(=O)CN(c2c(C)cccc2C)S(C)(=O)=...,InChI=1S/C21H28N2O3S/c1-14-10-11-19(12-17(14)4...,22338782,ZINC
119139,384.051299,3.37160,CC(=O)Nc1ccc(C2=N/C(=C\c3cc4c(cc3Cl)OCO4)C(=O)...,InChI=1S/C19H13ClN2O5/c1-10(23)21-13-4-2-11(3-...,5291502,ZINC
302180,381.230394,5.10618,Cc1ccc2c(c1)OC(C)(C)C[C@H]2NC(=O)[C@@H](C)Oc1c...,"InChI=1S/C24H31NO3/c1-14-8-9-19-20(13-24(6,7)2...",219302733,ZINC


In [23]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def lgP(intervals):
    return df_ZINC.loc[(df_ZINC['logP'] > intervals[0]) & (df_ZINC['logP'] <= intervals[1])]  

df_lgP_1 = lgP(intervals_logP[0])[:819]
df_lgP_2 = lgP(intervals_logP[1])[:948]
df_lgP_3 = lgP(intervals_logP[2])[:1747]
df_lgP_4 = lgP(intervals_logP[3])[:3537]
df_lgP_5 = lgP(intervals_logP[4])[:2608]
df_lgP_6 = lgP(intervals_logP[5])[:3138]
df_lgP_7 = lgP(intervals_logP[6])[:3256]
df_lgP_8 = lgP(intervals_logP[7])[:2967]
df_lgP_9 = lgP(intervals_logP[8])[:2644]
df_lgP_10 = lgP(intervals_logP[9])[:1973]
df_lgP_11 = lgP(intervals_logP[10])[:4100]

#Merge all dataframes
frames = [df_lgP_1, df_lgP_2, df_lgP_3,df_lgP_4,df_lgP_5,df_lgP_6,df_lgP_7,df_lgP_8,df_lgP_9,df_lgP_10,df_lgP_11]
df_ZINC7 = pd.concat(frames)

In [24]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC7.loc[(df_ZINC7['logP'] > intervals_logP[count][0]) & (df_ZINC7['logP']<= intervals_logP[count][1])]))
numbers

[819, 948, 1747, 3537, 2608, 3138, 3256, 2967, 2644, 1973, 4100]

In [25]:
df_ZINC7 = df_ZINC7.drop('MW', 1).drop('name', 1).drop('Inchi', 1).drop('logP', 1)
df_ZINC7.to_csv('ZINC400-0.csv', sep=' ', index = False)

  df_ZINC7 = df_ZINC7.drop('MW', 1).drop('name', 1).drop('Inchi', 1).drop('logP', 1)


### ZINC  425

In [80]:
suppl_csv = pd.read_csv('Data_ZINC/full425.csv', delimiter = ' ')
suppl_csv1 = suppl_csv[suppl_csv.zinc_id != 'zinc_id']
suppl_csv1.drop(suppl_csv.tail(1).index,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [81]:
ligandm_database = []
for ligand in suppl_csv1["smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))
#List of MW from csv
db_MW_z = []
for substance in ligandm_database:
		db_MW_z.append(rdescriptors.CalcExactMolWt(substance))
print(len(db_MW_z))
#List of logP from csv
db_logP_z = []
for substance in ligandm_database:
		db_logP_z.append(Descriptors.MolLogP(substance))
print(len(db_logP_z))
#List of INCHI from csv
db_Inchi_z = []
for substance in ligandm_database:
		db_Inchi_z.append(Chem.MolToInchi((substance)))
print(len(db_Inchi_z))

67022
67022
67022


In [82]:
#Creating dataframe of logP and MW value of COCONUT products
df_ZINC = pd.DataFrame(data={'MW': db_MW_z,'logP': db_logP_z,'Smiles':suppl_csv1["smiles"], 'Inchi': db_Inchi_z, 'zinc_id' : suppl_csv1['zinc_id'] })

df_ZINC['name'] = "ZINC"
df_ZINC

Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
0,424.136947,-1.36950,C[C@@H]1C(=O)O[C@H]2[C@H](O)[C@]34[C@H]5C[C@@H...,InChI=1S/C20H24O10/c1-6-12(23)28-11-9(21)18-8-...,256002083,ZINC
1,400.197137,-1.59320,Cn1c(=O)c2c(ncn2C[C@H](O)CN2CCN(c3ncccn3)CC2)n...,InChI=1S/C18H24N8O3/c1-22-15-14(16(28)23(2)18(...,19938680,ZINC
2,417.163496,-3.27740,OC[C@@H]1O[C@H](Nc2ccccc2)[C@H](O)[C@H](O)[C@@...,InChI=1S/C18H27NO10/c20-6-9-11(22)12(23)15(26)...,1857525952,ZINC
3,412.145027,-1.64250,C[C@H](NS(C)(=O)=O)C(=O)NC1CC(N(C)C(=O)C(C)(C)...,"InChI=1S/C14H28N4O6S2/c1-9(16-25(5,21)22)12(19...",1875322338,ZINC
4,414.116212,-2.29600,CC(=O)OCC1=C[C@@H]2OC(=O)C3=CO[C@@H](O[C@@H]4O...,InChI=1S/C18H22O11/c1-6(20)25-4-7-2-9-12-8(16(...,4098332,ZINC
...,...,...,...,...,...,...
67029,415.074199,6.58180,CC(C)(Oc1ccc(Cl)cc1)C(=O)Nc1cc(Cl)ccc1Oc1ccccc1,"InChI=1S/C22H19Cl2NO3/c1-22(2,28-18-11-8-15(23...",6221170,ZINC
67030,404.130697,5.09606,Cc1cc(C)c2[nH]c(-c3ccccc3)c(/C=C(\Sc3nnc(C)[nH...,InChI=1S/C22H20N4O2S/c1-12-9-13(2)19-16(10-12)...,6456543,ZINC
67031,406.078742,5.22120,O=C(OCc1nnc(-c2ccccc2)o1)/C(=C/c1ccc(F)cc1)c1c...,InChI=1S/C22H15FN2O3S/c23-17-10-8-15(9-11-17)1...,7705564,ZINC
67032,413.199094,5.43690,CCc1cccc(CC)c1N1C[C@@H](C(=O)Oc2ccc(-c3ccccc3)...,InChI=1S/C27H27NO3/c1-3-19-11-8-12-20(4-2)26(1...,7929346,ZINC


In [83]:
#Check duplicates in ZINC dataframe
ids = df_ZINC['zinc_id']
df_ZINC[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')
#Drop duplicates
df_ZINC = df_ZINC.drop_duplicates(subset=['zinc_id'])
df_ZINC
#Check duplicates
ids = df_ZINC['zinc_id']
df_ZINC[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')

#Join ZINC and COCONUT data
df_merge = pd.concat([df_COCO,df_ZINC], axis=0)
df_merge

#Check dplicate in both dataframes by Inchi
ids = df_merge['Inchi']
df_merge = df_merge[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Leave out COCO columns
df_merge = df_merge[df_merge.name != 'COCO']
df_merge = df_merge.drop('coconut_id', 1)

#get zinc_ids
list_ids = []
for x in df_merge['zinc_id']:
    list_ids.append(x)
 
#Get rid of duplicates in df_ZINC
df_ZINC = df_ZINC[~df_ZINC['zinc_id'].isin(list_ids)]

#Look at the numbers in dataframe
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC.loc[(df_ZINC['logP'] > intervals_logP[count][0]) & (df_ZINC['logP']<= intervals_logP[count][1])]))
numbers

# shuffle
# shuffle the DataFrame rows
df_ZINC = df_ZINC.sample(frac = 1)
df_ZINC

  df_merge = df_merge.drop('coconut_id', 1)


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
36173,416.222369,2.31780,CCN1CCN(c2ccc(CNC(=O)c3cncc(OCCOC)c3)cc2F)CC1,InChI=1S/C22H29FN4O3/c1-3-26-6-8-27(9-7-26)21-...,800883599,ZINC
23225,402.091928,1.90460,COC(=O)c1c(C(C)(C)C)n(S(C)(=O)=O)c2ccc(NS(C)(=...,"InChI=1S/C16H22N2O6S2/c1-16(2,3)14-13(15(19)24...",5516181,ZINC
33391,404.077514,2.04410,O=C(C(=O)N1CCCC12CN(C(=O)c1cc(O)c(O)c(Cl)c1)C2...,InChI=1S/C19H17ClN2O6/c20-12-7-11(8-13(23)15(1...,1841954102,ZINC
52527,400.156912,4.38602,CCCCn1c(SCC(=O)Nc2ccc(OC)cc2)nnc1-c1ccoc1C,InChI=1S/C20H24N4O3S/c1-4-5-11-24-19(17-10-12-...,4860516,ZINC
48835,410.206639,3.57654,Cc1cc(C(=O)N2CCC(c3nc(NC(C)C)ncc3-c3cc(C)no3)C...,InChI=1S/C21H26N6O3/c1-12(2)23-21-22-11-16(17-...,12156488,ZINC
...,...,...,...,...,...,...
48724,414.197714,3.76960,CCc1ccc(N([C@@H](C)C(=O)N[C@H](C)c2ccc3c(c2)CC...,InChI=1S/C23H30N2O3S/c1-5-18-9-13-22(14-10-18)...,260930968,ZINC
6090,424.222289,0.90510,COCCn1c(N2CCN(C/C=C/c3ccccc3)CC2)nc2c1c(=O)[nH...,InChI=1S/C22H28N6O3/c1-25-19-18(20(29)24-22(25...,4802756,ZINC
959,405.104949,-1.53090,CC(C)(Nc1nc(O)c2ncn([C@@H]3O[C@@H](CO)[C@@H](O...,"InChI=1S/C13H20N5O8P/c1-13(2,27(23,24)25)17-12...",5496661,ZINC
25610,404.218541,1.66028,Cc1cc(C)n(-c2ccc(N3CCN(c4c(C)c(C)nc5ncnn45)CC3...,InChI=1S/C20H24N10/c1-13-11-14(2)29(26-13)18-6...,1889213058,ZINC


In [89]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def lgP(intervals):
    return df_ZINC.loc[(df_ZINC['logP'] > intervals[0]) & (df_ZINC['logP'] <= intervals[1])]  

df_lgP_1 = lgP(intervals_logP[0])[:891]
df_lgP_2 = lgP(intervals_logP[1])[:922]
df_lgP_3 = lgP(intervals_logP[2])[:1605]
df_lgP_4 = lgP(intervals_logP[3])[:2995]
df_lgP_5 = lgP(intervals_logP[4])[:2307]
df_lgP_6 = lgP(intervals_logP[5])[:2750]
df_lgP_7 = lgP(intervals_logP[6])[:2980]
df_lgP_8 = lgP(intervals_logP[7])[:2932]
df_lgP_9 = lgP(intervals_logP[8])[:2702]
df_lgP_10 = lgP(intervals_logP[9])[:2508]
df_lgP_11 = lgP(intervals_logP[10])[:6318]

#Merge all dataframes
frames = [df_lgP_1, df_lgP_2, df_lgP_3,df_lgP_4,df_lgP_5,df_lgP_6,df_lgP_7,df_lgP_8,df_lgP_9,df_lgP_10,df_lgP_11]
df_ZINC8 = pd.concat(frames)

In [90]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC8.loc[(df_ZINC8['logP'] > intervals_logP[count][0]) & (df_ZINC8['logP']<= intervals_logP[count][1])]))
numbers

[891, 922, 1605, 2995, 2307, 2750, 2980, 2932, 2702, 2508, 5000]

In [91]:
df_ZINC8 = df_ZINC8.drop('MW', 1).drop('name', 1).drop('Inchi', 1).drop('logP', 1)
df_ZINC8.to_csv('ZINC_csv/ZINC425-0.csv', sep=' ', index = False)

  df_ZINC8 = df_ZINC8.drop('MW', 1).drop('name', 1).drop('Inchi', 1).drop('logP', 1)


### ZINC 450

In [170]:
suppl_csv = pd.read_csv('full450.csv', delimiter = ' ')
suppl_csv1 = suppl_csv[suppl_csv.zinc_id != 'zinc_id']
suppl_csv1.drop(suppl_csv.tail(1).index,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [171]:
ligandm_database = []
for ligand in suppl_csv1["smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))
#List of MW from csv
db_MW_z = []
for substance in ligandm_database:
		db_MW_z.append(rdescriptors.CalcExactMolWt(substance))
print(len(db_MW_z))
#List of logP from csv
db_logP_z = []
for substance in ligandm_database:
		db_logP_z.append(Descriptors.MolLogP(substance))
print(len(db_logP_z))
#List of INCHI from csv
db_Inchi_z = []
for substance in ligandm_database:
		db_Inchi_z.append(Chem.MolToInchi((substance)))
print(len(db_Inchi_z))

241698
241698
241698


In [172]:
#Creating dataframe of logP and MW value of COCONUT products
df_ZINC = pd.DataFrame(data={'MW': db_MW_z,'logP': db_logP_z,'Smiles':suppl_csv1["smiles"], 'Inchi': db_Inchi_z, 'zinc_id' : suppl_csv1['zinc_id'] })

df_ZINC['name'] = "ZINC"
df_ZINC

Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
0,444.153266,-1.46800,CN(C)[C@@H]1C(=O)[C@@H](C(N)=O)C(=O)[C@@]2(O)C...,InChI=1S/C22H24N2O8/c1-21(31)8-5-4-6-11(25)12(...,169686734,ZINC
1,447.269299,-3.85150,CN[C@H]1[C@@H](O)[C@H](O[C@@H]2[C@@H](N)C[C@@H...,InChI=1S/C19H37N5O7/c1-19(27)7-28-18(13(26)16(...,38139452,ZINC
2,426.204924,-1.26008,Cc1ncn(C)c1C(=O)N1C[C@H]2CN(C(=O)CNS(C)(=O)=O)...,InChI=1S/C18H30N6O4S/c1-13-16(22(4)12-19-13)17...,1772705120,ZINC
3,446.142426,-2.68750,COC(=O)C1=CO[C@@H](O[C@@H]2O[C@H](CO)[C@@H](O)...,InChI=1S/C19H26O12/c1-7(21)28-5-8-3-10(22)13-9...,31159654,ZINC
4,427.029415,-1.74600,Nc1ncnc2c1ncn2[C@@H]1O[C@H](CO[P@@](=O)(O)OP(=...,InChI=1S/C10H15N5O10P2/c11-8-5-9(13-2-12-8)15(...,12360703,ZINC
...,...,...,...,...,...,...
241706,446.074223,5.56594,Cc1ccc(-n2nc(C(=O)Nc3cccc(C)c3)nc2-c2ccc(Br)cc...,InChI=1S/C23H19BrN4O/c1-15-6-12-20(13-7-15)28-...,408699055,ZINC
241707,431.983525,6.00410,O=C1N/C(=C/c2ccc(-c3cc(Cl)ccc3Cl)o2)C(=O)N1c1c...,InChI=1S/C20H11Cl3N2O3/c21-11-2-1-3-13(8-11)25...,409074243,ZINC
241708,434.214033,5.37830,CCc1ccc(-n2c(SCC(=O)N3[C@@H](C)CCC[C@@H]3C)nnc...,InChI=1S/C25H30N4OS/c1-4-20-13-15-22(16-14-20)...,409093827,ZINC
241709,438.123584,6.44794,CCCc1sc2nc(SCc3ccc(C)cc3)n(-c3ccc(F)cc3)c(=O)c...,InChI=1S/C24H23FN2OS2/c1-4-5-20-16(3)21-22(30-...,409120073,ZINC


In [173]:
#Check duplicates in ZINC dataframe
ids = df_ZINC['zinc_id']
df_ZINC[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')
#Drop duplicates
df_ZINC = df_ZINC.drop_duplicates(subset=['zinc_id'])
df_ZINC
#Check duplicates
ids = df_ZINC['zinc_id']
df_ZINC[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')

#Join ZINC and COCONUT data
df_merge = pd.concat([df_COCO,df_ZINC], axis=0)
df_merge

#Check dplicate in both dataframes by Inchi
ids = df_merge['Inchi']
df_merge = df_merge[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Leave out COCO columns
df_merge = df_merge[df_merge.name != 'COCO']
df_merge = df_merge.drop('coconut_id', 1)

#get zinc_ids
list_ids = []
for x in df_merge['zinc_id']:
    list_ids.append(x)
 
#Get rid of duplicates in df_ZINC
df_ZINC = df_ZINC[~df_ZINC['zinc_id'].isin(list_ids)]

#Look at the numbers in dataframe
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC.loc[(df_ZINC['logP'] > intervals_logP[count][0]) & (df_ZINC['logP']<= intervals_logP[count][1])]))
numbers

# shuffle
# shuffle the DataFrame rows
df_ZINC = df_ZINC.sample(frac = 1)
df_ZINC

  df_merge = df_merge.drop('coconut_id', 1)


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
38305,436.247441,2.42610,CN1CCN(C(=O)Cn2cc(C(=O)NC3CCCCC3)ccc2=O)[C@@H]...,InChI=1S/C25H32N4O3/c1-27-14-15-29(22(17-27)19...,19751662,ZINC
186329,436.051211,5.91920,COc1cc(CNCc2cccnc2)cc(Cl)c1OCc1ccc(Cl)cc1Cl,InChI=1S/C21H19Cl3N2O2/c1-27-20-8-15(12-26-11-...,16195657,ZINC
12172,430.221620,0.61730,CCOC(=O)[C@@H]1CCCN(CC(=O)N2CCNC(=O)[C@H]2CC(=...,InChI=1S/C22H30N4O5/c1-2-31-22(30)16-7-6-11-25...,250657109,ZINC
213097,442.205311,6.88426,Cc1cccc(N=[P@@](c2c(C)nn(-c3ccccc3)c2C)(N(C)C)...,InChI=1S/C24H32ClN4P/c1-17-13-12-16-21(22(17)2...,12579219,ZINC
28130,427.213201,1.35794,COc1ccc(CN2CCN(C(=O)Cc3c(C)nc4nc(N)nn4c3C)CC2)...,InChI=1S/C21H26FN7O2/c1-13-16(14(2)29-21(24-13...,24637977,ZINC
...,...,...,...,...,...,...
23114,434.241687,1.73880,CC(C)(CNC(=O)[C@@H]1[C@H]2CC[C@@H](C2)[C@@H]1C...,"InChI=1S/C23H34N2O6/c1-23(2,9-24-19(26)15-11-3...",102305997,ZINC
165041,441.168856,4.82162,COc1ccc(NC(=O)c2ccccc2NC(=O)COc2cc(C)c3ccccc3n...,InChI=1S/C26H23N3O4/c1-17-15-25(29-22-9-5-3-7-...,15981956,ZINC
109659,448.219844,4.12364,CCc1cc(C(F)(F)F)n2nc([C@H]3CCCN3C(=O)[C@@H](C)...,"InChI=1S/C22H27F3N6O/c1-5-16-10-19(22(23,24)25...",19522248,ZINC
199824,427.815935,5.51740,Brc1cccc(-c2nc3cc(Br)cc(Br)c3[nH]2)c1,InChI=1S/C13H7Br3N2/c14-8-3-1-2-7(4-8)13-17-11...,226343095,ZINC


In [174]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def lgP(intervals):
    return df_ZINC.loc[(df_ZINC['logP'] > intervals[0]) & (df_ZINC['logP'] <= intervals[1])]  

df_lgP_1 = lgP(intervals_logP[0])[:832]
df_lgP_2 = lgP(intervals_logP[1])[:925]
df_lgP_3 = lgP(intervals_logP[2])[:1523]
df_lgP_4 = lgP(intervals_logP[3])[:2183]
df_lgP_5 = lgP(intervals_logP[4])[:1694]
df_lgP_6 = lgP(intervals_logP[5])[:2069]
df_lgP_7 = lgP(intervals_logP[6])[:2470]
df_lgP_8 = lgP(intervals_logP[7])[:2619]
df_lgP_9 = lgP(intervals_logP[8])[:2599]
df_lgP_10 = lgP(intervals_logP[9])[:2306]
df_lgP_11 = lgP(intervals_logP[10])[:6796]

#Merge all dataframes
frames = [df_lgP_1, df_lgP_2, df_lgP_3,df_lgP_4,df_lgP_5,df_lgP_6,df_lgP_7,df_lgP_8,df_lgP_9,df_lgP_10,df_lgP_11]
df_ZINC9 = pd.concat(frames)

In [175]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC9.loc[(df_ZINC9['logP'] > intervals_logP[count][0]) & (df_ZINC9['logP']<= intervals_logP[count][1])]))
numbers

[832, 925, 1523, 2183, 1694, 2069, 2470, 2619, 2599, 2306, 6796]

In [176]:
df_ZINC9 = df_ZINC9.drop('MW', 1).drop('name', 1).drop('Inchi', 1).drop('logP', 1)
df_ZINC9.to_csv('ZINC450-0.csv', sep=' ', index = False)

  df_ZINC9 = df_ZINC9.drop('MW', 1).drop('name', 1).drop('Inchi', 1).drop('logP', 1)


### ZINC 500

In [4]:
suppl_csv = pd.read_csv('full500.csv', delimiter = ' ')
suppl_csv1 = suppl_csv[suppl_csv.zinc_id != 'zinc_id']
suppl_csv1.drop(suppl_csv.tail(1).index,inplace=True)
suppl_csv1

Unnamed: 0,smiles,zinc_id
0,O=C1NCC(=O)N2CCC[C@@H]2C(=O)NCC(=O)N2CCC[C@H]2...,4536545
1,Cc1ccoc1CC(=O)N1CC2(C1)[C@@H](NC(=O)Cn1ccc(O)n...,1875346121
2,N=C1N=C(O)[C@@H]2[C@H](NC[C@H](CNc3ccc(C(=O)N[...,38300505
3,O=C(CN1C(=O)CNC1=O)NC1(c2ccccc2)CN(C(=O)[C@H]2...,1875358550
4,O=c1ccn([C@@H]2O[C@H](CO[P@](=O)(O)O[P@](=O)(O...,12959005
...,...,...
412290,Cc1cc(C)c(C(=O)Nc2cc(Cl)ccc2Oc2ccc(C(=O)c3cccc...,9256798
412291,CC(=O)N(c1nc(CSc2nnc(Nc3cccc(Cl)c3)s2)cs1)c1cc...,9056470
412292,CC(=O)c1cccc(NC(=O)[C@@H](Sc2nnc(NCc3ccc(F)cc3...,9476743
412293,Cc1nc2ccc(NC(=O)c3cc(S(=O)(=O)N(C)c4ccc(F)cc4)...,9632772


In [164]:
ligandm_database = []
for ligand in suppl_csv1["smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))
#List of MW from csv
db_MW_z = []
for substance in ligandm_database:
		db_MW_z.append(rdescriptors.CalcExactMolWt(substance))
print(len(db_MW_z))
#List of logP from csv
db_logP_z = []
for substance in ligandm_database:
		db_logP_z.append(Descriptors.MolLogP(substance))
print(len(db_logP_z))
#List of INCHI from csv
db_Inchi_z = []
for substance in ligandm_database:
		db_Inchi_z.append(Chem.MolToInchi((substance)))
print(len(db_Inchi_z))

412279
412279
412279


In [165]:
#Creating dataframe of logP and MW value of COCONUT products
df_ZINC = pd.DataFrame(data={'MW': db_MW_z,'logP': db_logP_z,'Smiles':suppl_csv1["smiles"], 'Inchi': db_Inchi_z, 'zinc_id' : suppl_csv1['zinc_id'] })

df_ZINC['name'] = "ZINC"
df_ZINC

Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
0,462.222683,-2.67840,O=C1NCC(=O)N2CCC[C@@H]2C(=O)NCC(=O)N2CCC[C@H]2...,InChI=1S/C21H30N6O6/c28-16-10-23-20(32)14-5-2-...,4536545,ZINC
1,450.120920,-1.02268,Cc1ccoc1CC(=O)N1CC2(C1)[C@@H](NC(=O)Cn1ccc(O)n...,InChI=1S/C19H22N4O7S/c1-12-3-6-30-13(12)8-17(2...,1875346121,ZINC
2,475.181546,-1.23433,N=C1N=C(O)[C@@H]2[C@H](NC[C@H](CNc3ccc(C(=O)N[...,InChI=1S/C20H25N7O7/c21-20-25-16-15(18(32)26-2...,38300505,ZINC
3,453.176067,-1.04820,O=C(CN1C(=O)CNC1=O)NC1(c2ccccc2)CN(C(=O)[C@H]2...,InChI=1S/C21H23N7O5/c29-16(10-28-17(30)8-22-19...,1875358550,ZINC
4,483.968528,-2.50090,O=c1ccn([C@@H]2O[C@H](CO[P@](=O)(O)O[P@](=O)(O...,InChI=1S/C9H15N2O15P3/c12-5-1-2-11(9(15)10-5)8...,12959005,ZINC
...,...,...,...,...,...,...
412290,472.118985,5.92074,Cc1cc(C)c(C(=O)Nc2cc(Cl)ccc2Oc2ccc(C(=O)c3cccc...,InChI=1S/C27H21ClN2O4/c1-16-14-17(2)29-26(32)2...,9256798,ZINC
412291,491.011129,6.50760,CC(=O)N(c1nc(CSc2nnc(Nc3cccc(Cl)c3)s2)cs1)c1cc...,InChI=1S/C20H15ClFN5OS3/c1-12(28)27(17-8-3-2-7...,9056470,ZINC
412292,492.108996,5.96400,CC(=O)c1cccc(NC(=O)[C@@H](Sc2nnc(NCc3ccc(F)cc3...,InChI=1S/C25H21FN4O2S2/c1-16(31)19-8-5-9-21(14...,9476743,ZINC
412293,489.038389,5.47462,Cc1nc2ccc(NC(=O)c3cc(S(=O)(=O)N(C)c4ccc(F)cc4)...,InChI=1S/C22H17ClFN3O3S2/c1-13-25-20-10-5-15(1...,9632772,ZINC


In [166]:
#Check duplicates in ZINC dataframe
ids = df_ZINC['zinc_id']
df_ZINC[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')
#Drop duplicates
df_ZINC = df_ZINC.drop_duplicates(subset=['zinc_id'])
df_ZINC
#Check duplicates
ids = df_ZINC['zinc_id']
df_ZINC[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')

#Join ZINC and COCONUT data
df_merge = pd.concat([df_COCO,df_ZINC], axis=0)
df_merge

#Check dplicate in both dataframes by Inchi
ids = df_merge['Inchi']
df_merge = df_merge[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Leave out COCO columns
df_merge = df_merge[df_merge.name != 'COCO']
df_merge = df_merge.drop('coconut_id', 1)

#get zinc_ids
list_ids = []
for x in df_merge['zinc_id']:
    list_ids.append(x)
 
#Get rid of duplicates in df_ZINC
df_ZINC = df_ZINC[~df_ZINC['zinc_id'].isin(list_ids)]

#Look at the numbers in dataframe
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC.loc[(df_ZINC['logP'] > intervals_logP[count][0]) & (df_ZINC['logP']<= intervals_logP[count][1])]))
numbers

# shuffle
# shuffle the DataFrame rows
df_ZINC = df_ZINC.sample(frac = 1)
df_ZINC

  df_merge = df_merge.drop('coconut_id', 1)


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
175110,452.231122,3.49242,COc1ccc(CN(C(=O)COC(=O)[C@H](NC(=O)c2ccc(C)cc2...,InChI=1S/C26H32N2O5/c1-17(2)24(27-25(30)20-9-5...,32745433,ZINC
313441,498.078298,4.06390,CC(C)CNS(=O)(=O)c1cc(C(=O)OCC(=O)N2CCCc3ccccc3...,"InChI=1S/C22H24Cl2N2O5S/c1-14(2)12-25-32(29,30...",2632197,ZINC
169721,459.219178,3.09462,CC[C@@H](C(=O)NC)N(Cc1cccc(C)c1)C(=O)CCCN(c1cc...,InChI=1S/C24H33N3O4S/c1-5-22(24(29)25-3)26(18-...,223050172,ZINC
184942,496.267233,3.19470,CC(C)(C)c1cc([C@H]2COC(=O)COCCOCC(=O)OCCOCCO2)...,"InChI=1S/C26H40O9/c1-25(2,3)19-13-18(14-20(24(...",8665702,ZINC
252610,461.195071,3.66900,COc1cc2c(cc1OC)[C@H](c1ccccc1)N(CN1C(=O)N[C@@]...,InChI=1S/C26H27N3O5/c1-26(22-10-7-13-34-22)24(...,23967619,ZINC
...,...,...,...,...,...,...
69302,463.167811,2.27980,Cn1cnc2c1c(=O)n(CC(=O)N[C@H](c1ccc3c(c1)CCCC3)...,InChI=1S/C24H25N5O3S/c1-27-14-25-22-21(27)23(3...,24658688,ZINC
297823,454.088469,4.19380,CN(Cc1ccc(Cl)cc1)C(=O)C1CCN(S(=O)(=O)Cc2ccc(Cl...,InChI=1S/C21H24Cl2N2O3S/c1-24(14-16-2-6-19(22)...,20452587,ZINC
348770,463.166269,4.67650,O=C(C[C@H](NC(=O)c1ccccc1Cl)c1ccccc1)Nc1ccc(N2...,InChI=1S/C26H26ClN3O3/c27-23-9-5-4-8-22(23)26(...,9492650,ZINC
7784,461.125671,0.58500,O=C(NCc1ccc2c(c1)OCO2)C(=O)NC[C@H]1OCCCN1S(=O)...,InChI=1S/C21H23N3O7S/c25-20(22-12-15-7-8-17-18...,9576706,ZINC


In [167]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def lgP(intervals):
    return df_ZINC.loc[(df_ZINC['logP'] > intervals[0]) & (df_ZINC['logP'] <= intervals[1])]  

df_lgP_1 = lgP(intervals_logP[0])[:1500]
df_lgP_2 = lgP(intervals_logP[1])[:1600]
df_lgP_3 = lgP(intervals_logP[2])[:2400]
df_lgP_4 = lgP(intervals_logP[3])[:3200]
df_lgP_5 = lgP(intervals_logP[4])[:2200]
df_lgP_6 = lgP(intervals_logP[5])[:3000]
df_lgP_7 = lgP(intervals_logP[6])[:3500]
df_lgP_8 = lgP(intervals_logP[7])[:3700]
df_lgP_9 = lgP(intervals_logP[8])[:4000]
df_lgP_10 = lgP(intervals_logP[9])[:4000]
df_lgP_11 = lgP(intervals_logP[10])[:15000]

#Merge all dataframes
frames = [df_lgP_1, df_lgP_2, df_lgP_3,df_lgP_4,df_lgP_5,df_lgP_6,df_lgP_7,df_lgP_8,df_lgP_9,df_lgP_10,df_lgP_11]
df_ZINC10 = pd.concat(frames)

In [168]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC10.loc[(df_ZINC10['logP'] > intervals_logP[count][0]) & (df_ZINC10['logP']<= intervals_logP[count][1])]))
numbers

[1500, 1600, 2400, 3200, 2200, 3000, 3500, 3700, 4000, 4000, 14808]

In [169]:
df_ZINC10 = df_ZINC10.drop('MW', 1).drop('name', 1).drop('Inchi', 1).drop('logP', 1)
df_ZINC10.to_csv('ZINC500-0.csv', sep=' ', index = False)

  df_ZINC10 = df_ZINC10.drop('MW', 1).drop('name', 1).drop('Inchi', 1).drop('logP', 1)


### ZINC 500 up

In [23]:
suppl_csv = pd.read_csv('full500up.csv', delimiter = ' ')
suppl_csv1 = suppl_csv[suppl_csv.zinc_id != 'zinc_id']
suppl_csv1.drop(suppl_csv.tail(1).index,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [24]:
ligandm_database = []
for ligand in suppl_csv1["smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))
#List of MW from csv
db_MW_z = []
for substance in ligandm_database:
		db_MW_z.append(rdescriptors.CalcExactMolWt(substance))
print(len(db_MW_z))
#List of logP from csv
db_logP_z = []
for substance in ligandm_database:
		db_logP_z.append(Descriptors.MolLogP(substance))
print(len(db_logP_z))
#List of INCHI from csv
db_Inchi_z = []
for substance in ligandm_database:
		db_Inchi_z.append(Chem.MolToInchi((substance)))
print(len(db_Inchi_z))

372726
372726
372726


In [25]:
#Creating dataframe of logP and MW value of COCONUT products
df_ZINC = pd.DataFrame(data={'MW': db_MW_z,'logP': db_logP_z,'Smiles':suppl_csv1["smiles"], 'Inchi': db_Inchi_z, 'zinc_id' : suppl_csv1['zinc_id'] })

df_ZINC['name'] = "ZINC"
df_ZINC

Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
0,524.174120,-5.8624,OCC1=C[C@@H](O)[C@]2(O[C@@H]3O[C@H](CO)[C@@H](...,InChI=1S/C21H32O15/c22-4-7-3-10(25)21(36-20-17...,28536449,ZINC
1,752.258638,-9.6188,C[C@@H]1O[C@H](O[C@@H]2CO[C@@H](O)[C@H](O)[C@@...,InChI=1S/C28H48O23/c1-6-22(15(35)20(40)26(45-6...,1857525859,ZINC
2,558.122085,-1.8040,COC(=O)C1=C(C(=O)OC)[C@H](C(=O)OC)[C@@H](C(=O)...,InChI=1S/C23H26O16/c1-32-15(24)9-10(16(25)33-2...,5813216,ZINC
3,610.189770,-1.1566,COc1ccc([C@@H]2CC(=O)c3c(O)cc(O[C@@H]4O[C@H](C...,InChI=1S/C28H34O15/c1-10-21(32)23(34)25(36)27(...,8382286,ZINC
4,920.478974,-2.3974,CSCC[C@H](NC(=O)[C@H](CC(C)C)N1CC[C@H](NC(=O)[...,InChI=1S/C42H68N10O11S/c1-23(2)19-32(40(61)46-...,169289394,ZINC
...,...,...,...,...,...,...
372845,693.129690,8.9729,CC(C)c1ccc(NC(=O)CSc2ccc(NC(=O)/C(=C/c3ccc(-c4...,InChI=1S/C37H32BrN3O4S/c1-24(2)25-10-14-29(15-...,150483738,ZINC
372846,523.208219,9.2781,C(=C1\CCC[C@@H]2C1=NN(c1nc(-c3ccc(-c4ccccc4)cc...,InChI=1S/C35H29N3S/c1-4-11-25(12-5-1)23-30-17-...,4257754,ZINC
372847,538.251481,6.6504,C=CCn1c(CNc2cccc3ccccc23)nnc1SCC(=O)NN=C/C(C)=...,InChI=1S/C31H34N6OS/c1-5-17-37-29(20-32-28-12-...,8397298,ZINC
372848,567.178062,6.4608,O=c1/c(=C\c2cn(Cc3ccccc3F)c3ccccc23)sc2n1[C@@H...,InChI=1S/C36H26FN3OS/c37-30-16-8-5-13-25(30)21...,8407324,ZINC


In [26]:
#Check duplicates in ZINC dataframe
ids = df_ZINC['zinc_id']
df_ZINC[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')
#Drop duplicates
df_ZINC = df_ZINC.drop_duplicates(subset=['zinc_id'])
df_ZINC
#Check duplicates
ids = df_ZINC['zinc_id']
df_ZINC[ids.isin(ids[ids.duplicated()])].sort_values('zinc_id')

#Join ZINC and COCONUT data
df_merge = pd.concat([df_COCO,df_ZINC], axis=0)
df_merge

#Check dplicate in both dataframes by Inchi
ids = df_merge['Inchi']
df_merge = df_merge[ids.isin(ids[ids.duplicated()])].sort_values('Inchi')

#Leave out COCO columns
df_merge = df_merge[df_merge.name != 'COCO']
df_merge = df_merge.drop('coconut_id', 1)

#get zinc_ids
list_ids = []
for x in df_merge['zinc_id']:
    list_ids.append(x)
 
#Get rid of duplicates in df_ZINC
df_ZINC = df_ZINC[~df_ZINC['zinc_id'].isin(list_ids)]

#Look at the numbers in dataframe
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC.loc[(df_ZINC['logP'] > intervals_logP[count][0]) & (df_ZINC['logP']<= intervals_logP[count][1])]))
numbers

# shuffle
# shuffle the DataFrame rows
df_ZINC = df_ZINC.sample(frac = 1)
df_ZINC

  df_merge = df_merge.drop('coconut_id', 1)


Unnamed: 0,MW,logP,Smiles,Inchi,zinc_id,name
259378,525.160138,4.54134,Cc1ccc(C)c(N(C(=O)c2snc(C(N)=O)c2N)[C@@H](C(=O...,InChI=1S/C26H28ClN5O3S/c1-14-7-8-15(2)19(13-14...,12702433,ZINC
101020,520.114374,3.10350,O=C(c1cccc(NS(=O)(=O)c2ccc(Br)cc2)c1)N1CCN(CCN...,InChI=1S/C23H29BrN4O3S/c24-20-6-8-22(9-7-20)32...,514388729,ZINC
212266,542.055064,4.89380,COc1cc(/C=C2\SC(=O)N(CC(=O)Nc3ccc(Cl)cc3)C2=O)...,InChI=1S/C25H19ClN2O8S/c1-34-20-10-14(2-8-18(2...,409384365,ZINC
179751,634.136964,4.29430,CCOC(=O)C1=C(C)n2c(s/c(=C\c3cc(OCC)c(OCc4ccccc...,InChI=1S/C30H26N4O10S/c1-4-41-22-13-19(20(33(3...,408731428,ZINC
235892,504.071105,4.59780,COc1ccccc1N(CCCC(=O)NCCSCc1ccc(Cl)cc1Cl)S(C)(=...,InChI=1S/C21H26Cl2N2O4S2/c1-29-20-7-4-3-6-19(2...,225051855,ZINC
...,...,...,...,...,...,...
174390,513.186832,4.40710,CCN(CC)c1ccc(S(=O)(=O)N2CCCCC2)cc1NC(=O)Cc1csc...,InChI=1S/C25H31N5O3S2/c1-3-29(4-2)23-11-10-21(...,514383918,ZINC
266016,508.087113,4.94810,CCOc1ccc(S(=O)(=O)Nc2cc(OC)c(Cl)cc2OC)cc1NC(=O...,InChI=1S/C23H22ClFN2O6S/c1-4-33-20-10-9-14(11-...,408593789,ZINC
214545,547.086610,4.66472,CCCNC(=O)[C@@H](C)N(Cc1c(Cl)cccc1Cl)C(=O)CN(c1...,InChI=1S/C23H28Cl3N3O4S/c1-5-12-27-23(31)16(3)...,224958528,ZINC
94708,540.035455,3.48610,CCOC(=O)C1=C(C)N=c2s/c(=C\c3cccc(Br)c3)c(=O)n2...,InChI=1S/C25H21BrN2O5S/c1-4-32-24(31)21-14(2)2...,2058862,ZINC


In [27]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)

def lgP(intervals):
    return df_ZINC.loc[(df_ZINC['logP'] > intervals[0]) & (df_ZINC['logP'] <= intervals[1])]  

df_lgP_1 = lgP(intervals_logP[0])[:17135]
df_lgP_2 = lgP(intervals_logP[1])[:6104]
df_lgP_3 = lgP(intervals_logP[2])[:7440]
df_lgP_4 = lgP(intervals_logP[3])[:8867]
df_lgP_5 = lgP(intervals_logP[4])[:4958]
df_lgP_6 = lgP(intervals_logP[5])[:5642]
df_lgP_7 = lgP(intervals_logP[6])[:6114]
df_lgP_8 = lgP(intervals_logP[7])[:6254]
df_lgP_9 = lgP(intervals_logP[8])[:6613]
df_lgP_10 = lgP(intervals_logP[9])[:6427]
df_lgP_11 = lgP(intervals_logP[10])[:48732]

#Merge all dataframes
frames = [df_lgP_1, df_lgP_2, df_lgP_3,df_lgP_4,df_lgP_5,df_lgP_6,df_lgP_7,df_lgP_8,df_lgP_9,df_lgP_10,df_lgP_11]
df_ZINC11 = pd.concat(frames)

In [28]:
intervals_logP = (-34,-1),(-1,0),(0,1),(1,2),(2,2.5),(2.5,3),(3,3.5),(3.5,4),(4,4.5),(4.5,5),(5, 50)
numbers = []
for count, value in enumerate(intervals_logP):
    numbers.append(len(df_ZINC11.loc[(df_ZINC11['logP'] > intervals_logP[count][0]) & (df_ZINC11['logP']<= intervals_logP[count][1])]))
numbers

[6169, 4324, 7440, 8867, 4958, 5642, 6114, 6254, 6613, 6427, 48732]

In [29]:
df_ZINC11 = df_ZINC11.drop('MW', 1).drop('name', 1).drop('Inchi', 1).drop('logP', 1)
df_ZINC11.to_csv('ZINC500up-0.csv', sep=' ', index = False)

  df_ZINC11 = df_ZINC11.drop('MW', 1).drop('name', 1).drop('Inchi', 1).drop('logP', 1)


In [30]:
df_all = pd.concat([df_ZINC1,df_ZINC2,df_ZINC3,df_ZINC4,df_ZINC5,df_ZINC6,df_ZINC7,df_ZINC8,df_ZINC9,df_ZINC10,df_ZINC11], axis=0)
df_all

NameError: name 'df_ZINC1' is not defined