# Program to Clean and Extract Data from the Lowe et al. Paper

https://pubs.acs.org/doi/10.1021/acs.chemrestox.2c00379

The property value is given in mol dm⁻³, so it needs to be log10-transformed (stored below as `logS`).

Version: 0.0.0

In [31]:
import pandas as pd
import numpy as np
import re

In [32]:
# Load the raw source table and list its columns, so the fields available
# for selection in the following cells are visible.
source_path = "../Data/SourceData/LoweSol-sourceData.csv"
df = pd.read_csv(source_path)
print(df.columns)

Index(['Compound', 'SOURCE_CASRN', 'SOURCE_SMILES', 'SOURCE_DTXSID',
       'SOURCE_DTXCID', 'SOURCE_DTXRID', 'SOURCE_NAME', 'PROPERTY', 'UNIT',
       'PROPERTY_VALUE', 'Temperature', 'MAPPED_DTXSID', 'INPUT', 'FOUND_BY',
       'PREFERRED_NAME', 'CASRN', 'INCHIKEY', 'SMILES', 'INCHI_STRING',
       'MOLECULAR_FORMULA', 'AVERAGE_MASS', 'MS_READY_SMILES',
       'QSAR_READY_SMILES', 'PFASMASTER', 'PROPERTY_VALUE_CONVERTED'],
      dtype='object')


In [33]:
# Keep only the fields used downstream. The mapping doubles as the column
# selection (its keys, in order) and the rename table (its values); columns
# mapped to themselves keep their original name.
column_names = {
    "Compound": "Compound",
    "SMILES": "SMILES",
    "INCHI_STRING": "InChI",
    "Temperature": "Temperature",
    "PROPERTY_VALUE_CONVERTED": "Solubility",
}
df = df[list(column_names)].rename(columns=column_names)

In [49]:
# Turn the free-text "Temperature" column into plain floats, then discard
# rows whose temperature falls outside the accepted window.

TEMP_DEFAULT = 25.0  # value used when a row carries no "Temperature:" tag
TEMP_LOWER = 10      # exclusive lower bound of the accepted window
TEMP_UPPER = 40      # exclusive upper bound of the accepted window

# Leading decimal number, optionally signed and/or with an exponent
# (e.g. "25", "24.85", "-3.5e1"). Anything after the number is ignored.
_LEADING_NUMBER = re.compile(r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?")


def parse_temperature(raw, default=TEMP_DEFAULT):
    """Extract the numeric temperature from one raw "Temperature" entry.

    Parameters
    ----------
    raw : object
        Raw cell value. Tagged entries look like "Temperature: <number>...".
    default : float
        Returned when the entry has no "Temperature:" tag or is missing (NaN).

    Returns
    -------
    float
        The number following the tag; `default` when there is no tag;
        NaN when the tag is present but is not followed by a number
        (such rows are removed by the window filter below).
    """
    # Values that are already numeric are passed through unchanged, so that
    # re-running this cell does not reset every temperature to the default.
    if isinstance(raw, (int, float, np.number)) and not isinstance(raw, bool):
        return default if np.isnan(raw) else float(raw)

    text = str(raw)
    if "Temperature:" not in text:
        return default

    # Parse the whole leading number instead of a fixed 4-character slice,
    # which truncated longer values and failed on short ones with a suffix.
    found = _LEADING_NUMBER.match(text.split("Temperature:")[1].strip())
    return float(found.group()) if found else np.nan


cleanTemps = [parse_temperature(t) for t in df["Temperature"]]
df = df.assign(Temperature=cleanTemps)

# Dropping things where string parsing must've failed (per the original note,
# the paper says only values between 20 and 30c were used). NaN temperatures
# fail both comparisons and are dropped as well.
in_window = (df["Temperature"] > TEMP_LOWER) & (df["Temperature"] < TEMP_UPPER)
# .copy() gives later cells an independent frame to add columns to.
df = df[in_window].copy()

In [51]:
# Add the log10-transformed solubility as a new "logS" column.
# The values are taken as a NumPy array (no Python-list round trip) and the
# column is attached with assign(), which returns a new frame instead of
# writing into one that was produced by filtering -- the pattern that raises
# SettingWithCopyWarning on older pandas versions.
sol = df["Solubility"].to_numpy(dtype=float)
logSol = np.log10(sol)
df = df.assign(logS=logSol)

# Checking for any weird values: zero or negative solubilities would show up
# here as -inf / NaN in the summary.
print(df["logS"].describe())

count    39853.000000
mean        -3.048113
std          2.202369
min        -13.171985
25%         -4.303644
50%         -2.758610
75%         -1.522000
max          2.409933
Name: logS, dtype: float64


In [53]:
# Remove rows that have no InChI, then write the processed table.
# Rebinding the result (rather than inplace=True) avoids mutating a frame
# derived from earlier filtering, and passing the subset as a list also works
# on pandas versions that do not accept a single label there.
df = df.dropna(subset=["InChI"])
df.to_csv("../Data/Processed/0.0.0-LoweSol.csv", index=False)