In [None]:
import pandas as pd
import requests
from tqdm.notebook import tqdm
from math import log10
from rdkit import Chem
from rdkit.Chem.Descriptors import MolWt

# Canonical spellings of solvent abbreviations. Solvent names are lower-cased
# before matching, so the mapping is keyed by the lower-case form and returns
# the spelling used in the main table.
_CANONICAL_SOLVENT_NAMES = (
    'DMF',
    'DMSO',
    'THF',
    'NMP',
    'MTBE',
    'MIBK',
    'DMAc',
    'PEG-400',
    'PEG-300',
    'PEG-200',
    'PEG-600',
    'PEGDME 250',
)
solvent_dict = {name.lower(): name for name in _CANONICAL_SOLVENT_NAMES}

In [None]:
# Load the main solubility table. Later cells read the columns 'Source',
# 'Compound_Name', 'SMILES_Solute', 'Solvent', 'SMILES_Solvent',
# 'Temperature_K' and 'Solubility(mole_fraction)' from it.
df = pd.read_csv('BigSolDBv2.0.csv')

In [None]:
# Sanity check: show every record whose mole fraction is 1 or larger.
# An empty frame is the expected outcome; x == 1 would also divide by zero
# in the x/(1-x) term of the mol/L conversion.
mole_fraction_too_large = df['Solubility(mole_fraction)'] >= 1
df[mole_fraction_too_large]

In [None]:
#checking that all DOIs are valid
# Every distinct value of df['Source'] is looked up in the Crossref REST API.
# Only problems are printed (the progress bar already shows progress), and the
# problematic DOIs are kept in two lists so they can be inspected afterwards:
#   missing_dois    - Crossref answered 404, i.e. the DOI is not registered there
#   unverified_dois - the request failed or returned another status (rate limit,
#                     server error, ...), so nothing can be concluded about the DOI
dois = df['Source'].unique().tolist()
missing_dois = []
unverified_dois = []
with requests.Session() as session:  # reuse one connection for all look-ups
    for doi in tqdm(dois):
        try:
            # Without a timeout a single stalled connection would hang the cell forever.
            d = session.get(f'https://api.crossref.org/works/{doi}', timeout=30)
        except requests.RequestException as err:
            unverified_dois.append(doi)
            print(f'Could not check DOI {doi}: {err}')
            continue
        if d.status_code == 200:
            continue
        if d.status_code == 404:
            missing_dois.append(doi)
            print(f'This DOI does not exist: {doi}')
        else:
            unverified_dois.append(doi)
            print(f'Could not check DOI {doi}: HTTP {d.status_code}')

In [None]:
#checking that every Compound_Name maps to a single SMILES_Solute:
#shows the 10 names with the most distinct SMILES (every count should be 1)
dict(df.groupby('Compound_Name')['SMILES_Solute'].nunique().nlargest(10))

In [None]:
#checking that every SMILES_Solute maps to a single Compound_Name:
#shows the 10 SMILES with the most distinct names (every count should be 1)
dict(df.groupby('SMILES_Solute')['Compound_Name'].nunique().nlargest(10))

In [None]:
def convert_to_mol_l(smi_solvent, x, ro):
    """
    Convert a mole-fraction solubility into a molar solubility.

    The solute/solvent mole ratio x/(1-x) is multiplied by the amount of solvent
    contained in one litre of pure solvent (1000*ro/M), so the result is
    expressed per litre of solvent rather than per litre of solution.

    smi_solvent: solvent SMILES
    x: mole fraction solubility of the solute; must satisfy 0 < x < 1
       (NaN is passed through and yields NaN results)
    ro: solvent density (assumed to be in g/cm^3, as in the densities table)

    Returns a tuple (S, log10(S)).

    Raises ValueError if x is outside the open interval (0, 1) or if the
    solvent SMILES cannot be parsed.
    """
    # x == 1 would divide by zero and x <= 0 has no logarithm; fail early with
    # a message that names the offending value. NaN fails neither comparison.
    if x <= 0 or x >= 1:
        raise ValueError(f'Mole fraction must be between 0 and 1 (exclusive), got {x!r}')
    mol = Chem.MolFromSmiles(smi_solvent)
    # RDKit signals an unparsable SMILES by returning None instead of raising.
    if mol is None:
        raise ValueError(f'Cannot parse solvent SMILES: {smi_solvent!r}')
    M = MolWt(mol)
    S = x/(1-x)*1000*ro/M
    return S, log10(S)

In [None]:
# Measured solvent densities, one row per (solvent, temperature) measurement.
densities = pd.read_csv('BigSolDBv2.0_densities.csv')
# Normalise solvent names the same way as the coefficient table: strip stray
# whitespace, lower-case, then restore the canonical spelling of abbreviations
# so the names match df['Solvent'].
densities['Solvent'] = (
    densities['Solvent']
    .str.strip()
    .str.lower()
    .map(lambda name: solvent_dict.get(name, name))
)
# Densities are written with a decimal comma. Going through str first also
# covers a column that pandas already parsed as float or that contains NaN.
densities['Density_g/cm^3'] = (
    densities['Density_g/cm^3']
    .astype(str)
    .str.replace(',', '.', regex=False)
    .astype(float)
)
solvents_with_density = densities['Solvent'].unique().tolist()

In [None]:
# Per-solvent coefficients 'a' and 'b'; the conversion cell uses them as
# ro = a*temp + b when no measured density matches.
coeff = pd.read_csv('Coeffs.csv')
# Lower-case and trim the names, then restore the canonical spelling of
# abbreviations listed in solvent_dict.
cleaned_names = coeff['Solvent'].map(lambda raw: raw.lower().strip())
coeff['Solvent'] = cleaned_names.map(lambda key: solvent_dict.get(key, key))

In [None]:
# Lookup table: solvent name -> solvent SMILES, built from the distinct
# (Solvent, SMILES_Solvent) pairs of df. If a solvent name occurs with more
# than one SMILES, dict() keeps the pair that comes last.
smiles_solvents_dict = dict(df[['Solvent', 'SMILES_Solvent']].drop_duplicates().values)

In [None]:
#conversion from mole fraction to mol/L
# For every row of df the solvent density is taken from, in this order:
#   1. the mean of the measured densities at exactly that temperature,
#   2. the linear fit ro = a*temp + b from the coefficient table.
# Rows whose solvent has no usable density get None in both result lists.

# Mean measured density keyed by (solvent, temperature). A dict lookup applies
# the same exact-match criterion as filtering the densities table, but the
# table is scanned once instead of once per row.
measured_density = (
    densities.groupby(['Solvent', 'Temperature_K'])['Density_g/cm^3'].mean().to_dict()
)
# First (a, b) pair listed for each solvent.
linear_coeffs = {}
for name, a, b in zip(coeff['Solvent'], coeff['a'], coeff['b']):
    linear_coeffs.setdefault(name, (a, b))
density_solvents = set(solvents_with_density)

logs_list = []
solub_list = []
solvents_without_coeffs = set()
rows = zip(df['Solvent'], df['Solubility(mole_fraction)'], df['Temperature_K'])
# zip() has no length, so the total is passed explicitly for the progress bar.
for solv, solub, temp in tqdm(rows, total=len(df)):
    ro = None
    if solv in density_solvents:
        ro = measured_density.get((solv, temp))
        if ro is None:
            if solv in linear_coeffs:
                a, b = linear_coeffs[solv]
                ro = a*temp + b
            else:
                # Densities exist but not at this temperature, and there is no fit.
                solvents_without_coeffs.add(solv)
    if ro is None:
        solub_list.append(None)
        logs_list.append(None)
        continue
    smi_solvent = smiles_solvents_dict[solv]
    S, logs = convert_to_mol_l(smi_solvent, solub, ro)
    solub_list.append(S)
    logs_list.append(logs)

if solvents_without_coeffs:
    print(f'No density coefficients for: {sorted(solvents_without_coeffs)}')

In [None]:
import re
import pubchempy as pcp
# Matches text that starts with 2-7 digits, a dash, 2 digits, a dash and 1 digit.
# Raw string: '\d' inside a normal string literal is an invalid escape sequence.
_CAS_RN_PATTERN = re.compile(r'(\d{2,7}-\d\d-\d)')


def get_substructure_cas(smiles):
    """
    Getting CAS and PubChem CID from PubChem API

    smiles: SMILES string of the compound to look up

    Returns a tuple (cas, cid):
      - cas is the first synonym (in the order PubChem returns them) that
        starts with a CAS-number-shaped token, or None if there is none
      - cid is the 'CID' of the first PubChem record
    If PubChem returns no record, the SMILES is printed and (None, None) is
    returned.

    NOTE(review): match() only anchors at the start of the synonym, so a longer
    string that merely begins with a CAS-like prefix is accepted as well.
    """
    results = pcp.get_synonyms(smiles, 'smiles')
    if not results:
        print(smiles)
        return None, None
    cid = results[0]['CID']
    for result in results:
        for syn in result.get('Synonym', []):
            match = _CAS_RN_PATTERN.match(syn)
            if match:
                # Only the first hit is ever used, so stop searching here.
                return match.group(1), cid
    return None, cid

In [None]:
#getting data for Aspirin
# The call queries PubChem and displays a (CAS number, PubChem CID) tuple.
get_substructure_cas('CC(=O)Oc1ccccc1C(=O)O')

In [None]:
from chembl_webresource_client.new_client import new_client

# Handle to the molecule resource of the ChEMBL web-service client.
molecule = new_client.molecule
# Keep only molecules with max_phase == 4 and request just the ChEMBL ID and
# the structure record for each of them.
# NOTE(review): max_phase=4 is presumably ChEMBL's "approved" phase, as the
# variable name suggests - confirm against the ChEMBL documentation.
approved_drugs = molecule.filter(max_phase=4).only(['molecule_chembl_id', 'molecule_structures'])

In [None]:
# Number of molecules returned by the max_phase=4 query.
len(approved_drugs)

In [None]:
# Canonical SMILES of every molecule in approved_drugs, in query order.
# Molecules without a structure record contribute None, so the list stays
# aligned with approved_drugs.
# The progress bar is `tqdm` (imported from tqdm.notebook at the top of the
# notebook); `tqdm_notebook` was never imported and raised NameError.
fda_smiles_list = [
    drug['molecule_structures']['canonical_smiles']
    if drug['molecule_structures'] is not None
    else None
    for drug in tqdm(approved_drugs)
]

In [None]:
# Flag each row 'Yes' when its SMILES string is literally one of the SMILES
# collected in fda_smiles_list, otherwise 'No'. The comparison is plain string
# equality. A set is used so each row costs one hash lookup instead of a scan
# of the whole list.
# NOTE(review): the other cells read the solute structure from 'SMILES_Solute';
# confirm that df really has a 'SMILES' column at this point.
fda_smiles_set = set(fda_smiles_list)
df['FDA Approved'] = df['SMILES'].apply(lambda x: 'Yes' if x in fda_smiles_set else 'No')