# Data Collection

As a starting point, a set of over 100,000 named chemical structures was downloaded from ZINC (https://zinc.docking.org/).
Using pubchem's pugrest apis, the structures were matched with CID and predicted logp values (xlogp3).
The CID can be mapped to DSSTox for filling in experimental values.

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('named.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,zinc_id,smiles,cid,Canonicalized Compound,Hydrogen Bond Acceptor Count,Hydrogen Bond Donor Count,Rotatable Bond Count,Allowed IUPAC Name,CAS-like Style IUPAC Name,...,Standard InChI,Standard InChIKey,XLogP3-AA Log P,Exact Mass,Canonical SMILES,Isomeric SMILES,Polar Surface Area Topological,MonoIsotopic Weight,XLogP3 Log P,logp_experimental
0,0,ZINC000030727788,C=C[C@]1(C)C[C@@H](OC(=O)CSC(C)(C)CNC(=O)[C@H]...,9850878.0,1.0,7.0,3.0,10.0,"[(1S,2R,3S,4S,6R,7R,8R,14R)-3-hydroxy-2,4,7,14...",2-[[1-[[(2R)-2-amino-3-methyl-1-oxobutyl]amino...,...,InChI=1S/C31H52N2O5S/c1-10-29(8)15-22(38-23(35...,LLYYNOVSVPBRGV-MVNKZKPCSA-N,5.3,564.359694,CC1CCC23CCC(=O)C2C1(C(CC(C(C3C)O)(C)C=C)OC(=O)...,C[C@@H]1CC[C@@]23CCC(=O)[C@H]2[C@@]1([C@@H](C[...,144.0,564.359694,,
1,1,ZINC000150377216,CCCCCC/C=C\C/C=C\CCCCCCCC(=O)OC[C@H](COCCCCCCC...,,,,,,,,...,,,,,,,,,,
2,2,ZINC000100780125,CC(=O)O[C@H]1C[C@](C)(O)[C@@H]2CC=C(C)[C@@H]2[...,,,,,,,,...,,,,,,,,,,
3,3,ZINC000006580536,O=C(O)[C@H](Cc1ccccc1)N(CCCl)CCCl,9971439.0,1.0,3.0,1.0,8.0,(2S)-2-[bis(2-chloroethyl)amino]-3-phenyl-prop...,(2S)-2-[bis(2-chloroethyl)amino]-3-phenylpropa...,...,InChI=1S/C13H17Cl2NO2/c14-6-8-16(9-7-15)12(13(...,QEGIMPLUXVXFNQ-LBPRGKRZSA-N,0.8,289.063634,C1=CC=C(C=C1)CC(C(=O)O)N(CCCl)CCCl,C1=CC=C(C=C1)C[C@@H](C(=O)O)N(CCCl)CCCl,40.5,289.063634,,
4,4,ZINC000150351802,O=C1C[C@H](c2ccc(O)c(O)c2)Oc2c1c(O)cc(O[C@H]1O...,,,,,,,,...,,,,,,,,,,


In [4]:
import urllib.request
import urllib.parse
import json
import re
import statistics

In [5]:
def extract_value(info):
    if ('Number' in info['Value']):
        return float(info['Value']['Number'][0])
    elif ('StringWithMarkup' in info['Value']):
        match = re.search('(-?\d+\.?\d{0,4})', info['Value']['StringWithMarkup'][0]['String'])
        if (match):
            return float(match.group())
    return None

In [9]:
def get_experimental_logp(cid):
    try:
        response = urllib.request.urlopen(
            "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/%s/JSON/?heading=LogP" % str(cid)).read()
        data = json.loads(response.decode('utf-8'))
        values = map(extract_value, data['Record']['Section'][0]['Section'][0]['Section'][0]['Information'])
        return round(statistics.mean(filter(lambda v: v is not None, values)), 2)
    except Exception as e:
        print('%s (%s)' % (str(cid), e))
        pass
    return ''

In [13]:
import time
import numpy as np

def throttled_pubchem_logp(cid):
    if (not cid or np.isnan(cid)):
        return None
    start = time.time()
    result = get_experimental_logp(str(int(cid)))
    end = time.time()
    elapsed = end - start
    if (elapsed < 0.2):
        time.sleep(0.2 - elapsed)
    return result

In [None]:
df['logp_experimental'] = df['cid'].apply(throttled_pubchem_logp)
df_combined.isna().sum()

In [15]:
df.isna().sum()

Unnamed: 0                            0
zinc_id                              48
smiles                                0
cid                               64116
Canonicalized Compound            64116
Hydrogen Bond Acceptor Count      64116
Hydrogen Bond Donor Count         64116
Rotatable Bond Count              64116
Allowed IUPAC Name                64177
CAS-like Style IUPAC Name         64177
Markup IUPAC Name                 64177
Preferred IUPAC Name              64177
Systematic IUPAC Name             64177
Traditional IUPAC Name            64177
Standard InChI                    64116
Standard InChIKey                 64116
XLogP3-AA Log P                   73123
Exact Mass                        64116
Canonical SMILES                  64116
Isomeric SMILES                   64116
Polar Surface Area Topological    64116
MonoIsotopic Weight               64116
XLogP3 Log P                      95867
logp_experimental                 64116
dtype: int64

In [11]:
df['xlogp'] = df['XLogP3-AA Log P']
df['xlogp'].fillna(df['XLogP3 Log P'])
df.head(10)

Unnamed: 0.1,Unnamed: 0,zinc_id,smiles,cid,Canonicalized Compound,Hydrogen Bond Acceptor Count,Hydrogen Bond Donor Count,Rotatable Bond Count,Allowed IUPAC Name,CAS-like Style IUPAC Name,...,Standard InChI,Standard InChIKey,XLogP3-AA Log P,Exact Mass,Canonical SMILES,Isomeric SMILES,Polar Surface Area Topological,MonoIsotopic Weight,XLogP3 Log P,xlogp
0,0,ZINC000030727788,C=C[C@]1(C)C[C@@H](OC(=O)CSC(C)(C)CNC(=O)[C@H]...,9850878.0,1.0,7.0,3.0,10.0,"[(1S,2R,3S,4S,6R,7R,8R,14R)-3-hydroxy-2,4,7,14...",2-[[1-[[(2R)-2-amino-3-methyl-1-oxobutyl]amino...,...,InChI=1S/C31H52N2O5S/c1-10-29(8)15-22(38-23(35...,LLYYNOVSVPBRGV-MVNKZKPCSA-N,5.3,564.359694,CC1CCC23CCC(=O)C2C1(C(CC(C(C3C)O)(C)C=C)OC(=O)...,C[C@@H]1CC[C@@]23CCC(=O)[C@H]2[C@@]1([C@@H](C[...,144.0,564.359694,,5.3
1,1,ZINC000150377216,CCCCCC/C=C\C/C=C\CCCCCCCC(=O)OC[C@H](COCCCCCCC...,,,,,,,,...,,,,,,,,,,
2,2,ZINC000100780125,CC(=O)O[C@H]1C[C@](C)(O)[C@@H]2CC=C(C)[C@@H]2[...,,,,,,,,...,,,,,,,,,,
3,3,ZINC000006580536,O=C(O)[C@H](Cc1ccccc1)N(CCCl)CCCl,9971439.0,1.0,3.0,1.0,8.0,(2S)-2-[bis(2-chloroethyl)amino]-3-phenyl-prop...,(2S)-2-[bis(2-chloroethyl)amino]-3-phenylpropa...,...,InChI=1S/C13H17Cl2NO2/c14-6-8-16(9-7-15)12(13(...,QEGIMPLUXVXFNQ-LBPRGKRZSA-N,0.8,289.063634,C1=CC=C(C=C1)CC(C(=O)O)N(CCCl)CCCl,C1=CC=C(C=C1)C[C@@H](C(=O)O)N(CCCl)CCCl,40.5,289.063634,,0.8
4,4,ZINC000150351802,O=C1C[C@H](c2ccc(O)c(O)c2)Oc2c1c(O)cc(O[C@H]1O...,,,,,,,,...,,,,,,,,,,
5,5,ZINC000230052473,CC[C@@H]1[C@H](Cc2[nH]c(Cc3[nH]c(C[C@@H]4NC(=O...,,,,,,,,...,,,,,,,,,,
6,6,ZINC000085875971,O=C(O)/C=C/c1cc2ccoc2cc1O[C@H]1O[C@@H](CO)[C@@...,,,,,,,,...,,,,,,,,,,
7,7,ZINC000015206001,COc1ccc2c([nH]c3cc(O)c(C)cc32)c1C/C=C(\C)CCC=C...,,,,,,,,...,,,,,,,,,,
8,8,ZINC000150357141,CC/C=C\C/C=C\C/C=C\C/C=C\CCCCCCC(=O)O[C@@H](CO...,,,,,,,,...,,,,,,,,,,
9,9,ZINC000150366233,CC/C=C\C/C=C\C/C=C\C/C=C\C/C=C\C/C=C\CCC(=O)OC...,,,,,,,,...,,,,,,,,,,


In [28]:
df_pc = df[df.cid.notna()]

In [30]:
df_pc['xlogp'] = df_pc['xlogp'].fillna(df_pc['XLogP3 Log P'])
df_pc['xlogp'].isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pc['xlogp'] = df_pc['xlogp'].fillna(df_pc['XLogP3 Log P'])


82

In [None]:
df['xlogp'] = df['XLogP3-AA Log P']
df['xlogp'].fillna(df['XLogP3 Log P'])

In [34]:
df.to_csv('chem_data.csv')