# Solubility

Version: 0.1.0

Notes: All values are in celcius.

## Importing libraries

In [22]:
import pandas as pd
from rdkit import Chem
from rdkit import RDLogger
import numpy as np
RDLogger.DisableLog('rdApp.*')

from MyFunctions.smilesToInChI import *

## Reading in Data

All previously collected features are cut off, hence why taking only 0:6 columns.

In [23]:
data = pd.read_csv("../Data/SourceData/solubility-sourceData.csv")
print(data.shape)
data = data[data["MoleFraction"] <= 1.0] #Removing mole fraction values that are definitely wrong
data = data[data["MoleFraction"] >= 0.0]

data = data[data["SMILES2"] == "O"] #Taking aqueous data only
print(data.shape)
cleanedData = data.iloc[:, 0:6]
del cleanedData["SMILES2"]; del cleanedData["Compound2"]
print(cleanedData.head())

(18567, 236)
(10423, 236)
            Compound1                            SMILES1  MoleFraction  \
0   hexachlorobenzene  C1(=C(C(=C(C(=C1Cl)Cl)Cl)Cl)Cl)Cl  3.170000e-10   
1  pentachlorobenzene      C1=C(C(=C(C(=C1Cl)Cl)Cl)Cl)Cl  3.950000e-08   
2   pentachlorophenol   C1(=C(C(=C(C(=C1Cl)Cl)Cl)Cl)Cl)O  3.400000e-07   
3   pentachlorophenol   C1(=C(C(=C(C(=C1Cl)Cl)Cl)Cl)Cl)O  1.220000e-06   
4   pentachlorophenol   C1(=C(C(=C(C(=C1Cl)Cl)Cl)Cl)Cl)O  2.370000e-06   

   Temperature  
0         25.0  
1         25.0  
2          0.0  
3         27.0  
4         50.0  


## Converting SMILES to InChI

In [24]:
cleanedData["InChI"] = smilesToInChI(cleanedData["SMILES1"])

Converting SMILES to InChI: 100%|██████████| 10423/10423 [00:00<00:00, 23590.00it/s]

8193 occurred





### Converting Mole Fraction to LogS

Solubility = $\frac{A}{volume}$  
Mole Fraction = $\frac{A}{A + water}$  
Moles water = $1-A$  
Mass water = $0.018(1-A)$ dm $^3$  
Solubility = $\frac{A}{0.018(1-A)}$

Checked against original AqSolDB dataset and seems to match up.

In [25]:
molFracs = cleanedData["MoleFraction"].values

def calcLogS(molFracs):
    solubility = molFracs/(0.01801528*(1-molFracs))
    return np.log10(solubility)

logS = calcLogS(molFracs)
cleanedData["logS"] = logS

del cleanedData["MoleFraction"]
cleanedData = cleanedData.rename(columns={"SMILES1": "SMILES", "Compound1": "Compound"})

print(cleanedData.columns.to_list())

cleanedData.to_csv("../Data/Processed/0.2.0-solubility.csv", index=False)

['Compound', 'SMILES', 'Temperature', 'InChI', 'logS']


  solubility = molFracs/(0.01801528*(1-molFracs))
  return np.log10(solubility)
