In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdSLNParse, MolToSmiles

In [2]:
## Load both solubility datasets: data_delaney.csv -> df_d, data_wang.csv -> df_w
df_d, df_w = (pd.read_csv(csv_path) for csv_path in ("data_delaney.csv", "data_wang.csv"))

In [5]:
## Preview the first three rows of df_d
df_d.head(n=3)

Unnamed: 0,Compound ID,measured log(solubility:mol/L),ESOL predicted log(solubility:mol/L),SMILES
0,"1,1,1,2-Tetrachloroethane",-2.18,-2.794,ClCC(Cl)(Cl)Cl
1,"1,1,1-Trichloroethane",-2.0,-2.232,CC(Cl)(Cl)Cl
2,"1,1,2,2-Tetrachloroethane",-1.74,-2.549,ClC(Cl)C(Cl)Cl


In [7]:
## Preview the first three rows of df_w
df_w.head(n=3)

Unnamed: 0,Expt.,SLN
0,-4.35,BrC[2]C[3]=C(CH=CHCH=@2)CH=CHCH=CH@3
1,-3.33,BrC[2]=C(C(=CHC(=CH@2)C#N)Br)OH
2,-6.09,BrC[2]=C(CH=C(C(=CH@2)Cl)OP(=S)(OCH3)OCH3)Cl


In [3]:
## Need to standardize notation for the datasets before combining --> will convert SLN notation to SMILES
## Define function to convert notation
def sln_to_smiles(sln_notation):
    """Convert one SLN string into a SMILES string using RDKit.

    Parameters
    ----------
    sln_notation : str
        A molecule written in SLN notation.

    Returns
    -------
    str
        The SMILES string that ``MolToSmiles`` produces for the parsed molecule.

    Raises
    ------
    TypeError
        If ``sln_notation`` is not a string (for example a NaN coming from a
        missing cell when the function is applied to a DataFrame column).
    ValueError
        If RDKit cannot parse the SLN string and hands back no molecule.
    """
    # Reject non-string cells up front so the error names the offending value
    # instead of surfacing as an opaque argument error from RDKit.
    if not isinstance(sln_notation, str):
        raise TypeError(
            f"SLN notation must be a string, got {type(sln_notation).__name__}: {sln_notation!r}"
        )

    mol = rdSLNParse.MolFromSLN(sln_notation)

    # MolFromSLN signals a parse failure by returning None rather than raising;
    # passing that None on to MolToSmiles would fail without saying which input was bad.
    if mol is None:
        raise ValueError(f"RDKit could not parse SLN notation: {sln_notation!r}")

    return MolToSmiles(mol)

In [8]:
## Convert every SLN string in df_w and store the results in a new 'SMILES' column
converted_smiles = df_w['SLN'].map(sln_to_smiles)
df_w['SMILES'] = converted_smiles

In [10]:
## Preview the first five rows of df_w, now including the SMILES column
df_w.head(n=5)

Unnamed: 0,Expt.,SLN,SMILES
0,-4.35,BrC[2]C[3]=C(CH=CHCH=@2)CH=CHCH=CH@3,Brc1cccc2ccccc12
1,-3.33,BrC[2]=C(C(=CHC(=CH@2)C#N)Br)OH,N#Cc1cc(Br)c(O)c(Br)c1
2,-6.09,BrC[2]=C(CH=C(C(=CH@2)Cl)OP(=S)(OCH3)OCH3)Cl,COP(=S)(OC)Oc1cc(Cl)c(Br)cc1Cl
3,-4.5,BrC[2]=C(CH=CHC(=CH@2)Br)Br,Brc1ccc(Br)c(Br)c1
4,-5.04,BrC[2]=C(CH=CHCH=C@2Br)Br,Brc1cccc(Br)c1Br


In [11]:
## Give the measured-solubility column the same name, 'solubility', in both dataframes
df_d = df_d.rename({'measured log(solubility:mol/L)': 'solubility'}, axis='columns')
df_w = df_w.rename({'Expt.': 'solubility'}, axis='columns')

In [17]:
## Keep only the two columns shared by both datasets: solubility and SMILES
df_d, df_w = (frame[['solubility', 'SMILES']] for frame in (df_d, df_w))

In [20]:
## Stack the df_d rows on top of the df_w rows; each frame keeps its own index labels at this point
df_combined = pd.concat((df_d, df_w), axis=0)

In [21]:
## Display the combined dataframe to check the result of the concatenation
df_combined

Unnamed: 0,solubility,SMILES
0,-2.1800,ClCC(Cl)(Cl)Cl
1,-2.0000,CC(Cl)(Cl)Cl
2,-1.7400,ClC(Cl)C(Cl)Cl
3,-1.4800,ClCC(Cl)Cl
4,-3.0400,FC(F)(Cl)C(F)(Cl)Cl
...,...,...
3346,-3.3319,NS(=O)(=O)c1ccc(C(=O)c2ccc(CNCc3ccccc3)cc2)s1
3347,-2.1669,CCCCNCc1ccc(C(=O)c2ccc(S(N)(=O)=O)s2)cc1
3348,-1.4812,NS(=O)(=O)c1ccc(C(=O)c2ccc(CN3CCOCC3)cc2)s1
3349,-1.8802,CN1CCN(Cc2ccc(C(=O)c3ccc(S(N)(=O)=O)s3)cc2)CC1


In [24]:
## Renumber the rows with a fresh default index; drop=True discards the old labels instead of keeping them as a column
df_combined = df_combined.reset_index(drop=True)

In [25]:
## Display the combined dataframe again to check the renumbered index
df_combined

Unnamed: 0,solubility,SMILES
0,-2.1800,ClCC(Cl)(Cl)Cl
1,-2.0000,CC(Cl)(Cl)Cl
2,-1.7400,ClC(Cl)C(Cl)Cl
3,-1.4800,ClCC(Cl)Cl
4,-3.0400,FC(F)(Cl)C(F)(Cl)Cl
...,...,...
4490,-3.3319,NS(=O)(=O)c1ccc(C(=O)c2ccc(CNCc3ccccc3)cc2)s1
4491,-2.1669,CCCCNCc1ccc(C(=O)c2ccc(S(N)(=O)=O)s2)cc1
4492,-1.4812,NS(=O)(=O)c1ccc(C(=O)c2ccc(CN3CCOCC3)cc2)s1
4493,-1.8802,CN1CCN(Cc2ccc(C(=O)c3ccc(S(N)(=O)=O)s3)cc2)CC1


In [26]:
## Write the combined dataset to disk, leaving the row index out of the file
df_combined.to_csv(path_or_buf='data_combined.csv', index=False)