In [1]:
import pandas as pd
import requests
import numpy as np
import pubchempy as pcp
from tqdm import tqdm, trange
import time as t
from MyFunctions.IdentifierConversion import convertors

In [2]:
def IDToSmiles(idValue, idType):
    """ 
    This function converts a compound identifier to a canonical SMILES string using PubChemPy.

    Parameters:
    idValue: string
        Identifier to search PubChem for (e.g. a compound name or an InChIKey).
    idType: string
        PubChemPy namespace that idValue belongs to (e.g. "name", "inchikey").

    Returns:
    smile: string or float
        canonical_smiles of the first match, or np.nan when idValue is missing,
        the lookup raises, or PubChem returns no match.
    """
    # A missing identifier (None or a NaN cell) cannot match anything, so no request is made.
    if idValue is None or (isinstance(idValue, float) and np.isnan(idValue)):
        return np.nan

    try:
        # The request is inside the try so that a failed lookup yields NaN
        # instead of aborting the caller's loop.
        compound = pcp.get_compounds(idValue, idType)
        smile = compound[0].canonical_smiles
    except Exception:
        # Best effort: an empty result list (IndexError) or a PubChem/network
        # error both mean "no SMILES available".
        smile = np.nan

    return smile

def tryConvFloat(val):
    """ 
    This function converts a value to float when possible and returns it unchanged otherwise.

    Parameters:
    val: any

    Returns:
    float(val) if the conversion succeeds, otherwise val itself.
    """
    try:
        return float(val)
    except (TypeError, ValueError, OverflowError):
        # Only the errors float() raises for unconvertible input are handled;
        # KeyboardInterrupt and other unrelated errors propagate.
        return val
    
def MCMRetrieval(compound, timeout=30):
    """ 
    This function retrieves the InChI of a species from its Master Chemical Mechanism (MCM) web page.

    Parameters:
    compound: string
        Species name, inserted into the MCM species URL.
    timeout: float
        Seconds to wait for the server before giving up (passed to requests.get).

    Returns:
    inchi: string or float
        The text following "InChI: " on the first line of the page that contains it,
        cut at the next "<". np.nan if the request fails or no such line exists.
    """
    try:
        # Without a timeout a stalled connection would block the calling loop indefinitely.
        page = requests.get(f"https://mcm.york.ac.uk/MCM/species/{compound}", timeout=timeout)
    except requests.RequestException:
        print("Request error.")
        return np.nan

    marker = "InChI: "
    for line in page.text.splitlines(): #Only reached if the page is successfully retrieved
        # Test for the exact marker that is split on, so a line that merely
        # mentions "InChI" cannot raise IndexError.
        if marker in line:
            inchi = line.split(marker)[1]
            return inchi.split("<")[0]
    print("InChI not found.")
    return np.nan

In [3]:
vers = "2.7.1"
dataset = pd.read_csv(f"../Data/Processed/HenrysLaw/{vers}-HenrysLaw.csv")

# Split the rows by whether an InChI is already present.
missingInchiMask = dataset["InChI"].isnull()
noInchi = dataset[missingInchiMask]
hasInchi = dataset[~missingInchiMask]

# One row per compound name among the rows that still need an InChI.
uniqueDataset = noInchi.drop_duplicates(subset=["Compound"])
print(uniqueDataset.shape)

names = uniqueDataset["Compound"].values

# Names containing "MCM:" keep only the part after that tag; every other name is kept whole.
# Each list holds NaN where the name belongs to the other group.
MCMNames = [name.split("MCM:")[1] if "MCM:" in name else np.nan for name in names]
noMCMNames = [np.nan if "MCM:" in name else name for name in names]

mcmDataset = uniqueDataset.assign(Compounds=MCMNames).dropna(subset=["Compounds"])
noMCMDataset = uniqueDataset.assign(Compounds=noMCMNames).dropna(subset=["Compounds"])
print(mcmDataset.shape, noMCMDataset.shape)

(2621, 7)
(2358, 8) (263, 8)


In [4]:
smilesCheckpoint = f"../Data/Processed/HenrysLaw/CheckPoints/{vers}-missing_SMILES_nonMCM.csv"

try:
    smilesdf = pd.read_csv(smilesCheckpoint)
    print("Loaded SMILES from checkpoint.")
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing or empty checkpoint triggers the PubChem queries; any other
    # error (or an interrupt) is raised instead of silently starting the lookups.
    compounds = list(noMCMDataset["Compound"].values)
    inchikeys = list(noMCMDataset["InChIKey"].values)

    smilesDict = {}

    for compound, inchikey in tqdm(zip(compounds, inchikeys), total=len(compounds), desc="Converting identifiers to SMILES"):
        smile = np.nan

        # A missing InChIKey cannot be looked up, so go straight to the name.
        if pd.notna(inchikey):
            smile = tryConvFloat(IDToSmiles(inchikey, "inchikey"))

        if isinstance(smile, float):
            smile = tryConvFloat(IDToSmiles(compound, "name")) #If InChIKey conversion fails, try converting the compound name

        if isinstance(smile, float):
            print("No SMILES available for", compound)

        smilesDict[compound] = smile

    smilesdf = pd.DataFrame(list(smilesDict.items()), columns=["Compound", "SMILES"])
    smilesdf.to_csv(smilesCheckpoint, index=False)

Loaded SMILES from checkpoint.


In [6]:
mcmCheckpoint = f"../Data/Processed/HenrysLaw/CheckPoints/{vers}-missing_InChI_MCM.csv"

try:
    # Reuse a previous scrape when its checkpoint exists, so a full re-run does not
    # repeat every request. Only empty cells are read as missing, so a species
    # name such as "NA" stays a string.
    inchidf = pd.read_csv(mcmCheckpoint, dtype=str, keep_default_na=False, na_values=[""])
    print("Loaded InChIs from checkpoint.")
except (FileNotFoundError, pd.errors.EmptyDataError):
    inchiDict = {}
    compounds = list(mcmDataset["Compounds"].values)

    myTrange = trange(len(compounds), desc="Retrieving InChIs")
    for i in myTrange:
        comp = compounds[i]
        if comp in inchiDict:
            # Already fetched under the same species name; no second request.
            continue

        # Random pause between requests.
        napTime = np.random.uniform(0.5, 3)
        myTrange.set_description(f"Retrieving InChI for {comp}, sleeping for {napTime:.2f} seconds")
        t.sleep(napTime)

        inchiDict[comp] = MCMRetrieval(comp)

    # One row per entry of `compounds`, in the same order, so the frame stays
    # row-aligned with mcmDataset even if a species name repeats.
    inchidf = pd.DataFrame({"Compounds": compounds, "InChI": [inchiDict[comp] for comp in compounds]})
    inchidf.to_csv(mcmCheckpoint, index=False)

Retrieving InChI for C514OOH, sleeping for 1.35 seconds:  12%|█▏        | 287/2358 [09:22<53:47,  1.56s/it]     

InChI not found.


Retrieving InChI for C522CO3H, sleeping for 0.96 seconds:  12%|█▏        | 293/2358 [09:30<43:20,  1.26s/it]  

InChI not found.


Retrieving InChI for C1H4C5CO3H, sleeping for 1.60 seconds:  14%|█▍        | 330/2358 [10:38<50:40,  1.50s/it]  

InChI not found.


Retrieving InChI for NO3CH2CHO, sleeping for 1.17 seconds:  74%|███████▎  | 1734/2358 [56:58<12:51,  1.24s/it]  

InChI not found.


Retrieving InChI for CH3SOO2NO2, sleeping for 0.98 seconds: 100%|██████████| 2358/2358 [1:17:27<00:00,  1.97s/it]


In [7]:
# Look identifiers up by compound name rather than by row position, so a lookup
# table whose rows are ordered differently from the dataset cannot attach an
# identifier to the wrong compound.
smilesByCompound = smilesdf.drop_duplicates(subset=["Compound"]).set_index("Compound")["SMILES"]
inchiByCompound = inchidf.drop_duplicates(subset=["Compounds"]).set_index("Compounds")["InChI"]

# Report compounds that have no entry at all instead of leaving silent gaps.
missingSmiles = int((~noMCMDataset["Compound"].isin(smilesByCompound.index)).sum())
missingInchis = int((~mcmDataset["Compounds"].isin(inchiByCompound.index)).sum())
if missingSmiles or missingInchis:
    print(f"Compounds missing from lookup tables: {missingSmiles} SMILES, {missingInchis} InChI")

smiles = noMCMDataset["Compound"].map(smilesByCompound).values
inchis = mcmDataset["Compounds"].map(inchiByCompound).values

noMCMDataset["SMILES"] = smiles
noMCMInChI = convertors.smilesToInChI(smiles)
noMCMDataset["InChI"] = noMCMInChI

mcmDataset["InChI"] = inchis

# Still to do: these frames hold one row per unique compound, so the results
# need to be re-expanded onto every row of the full dataset.

Converting SMILES to InChI: 100%|██████████| 263/263 [00:00<00:00, 4281.98it/s]

73 errors occurred





In [9]:
def createInchiDict(df):
    """ 
    This function pairs each entry of the "Compound" column with the "InChI" entry on the same row.

    Parameters:
    df: DataFrame with "Compound" and "InChI" columns

    Returns:
    dict mapping compound -> InChI; when a compound occurs on several rows,
    the InChI from the last of those rows is kept.
    """
    compoundColumn = list(df["Compound"].values)
    inchiColumn = list(df["InChI"].values)
    return dict(zip(compoundColumn, inchiColumn))

# Row counts of the two subsets, then the size of their combined lookup table.
subsetSizes = (len(noMCMDataset), len(mcmDataset))
print(*subsetSizes)

# Entries from mcmDataset are applied second, so they win on a shared compound name.
inchiDict = {**createInchiDict(noMCMDataset), **createInchiDict(mcmDataset)}
print(len(inchiDict))

263 2358
2621


In [12]:
inchiList = []
compoundList = noInchi["Compound"].values

# Look up every row that lacked an InChI (duplicates included) in the per-compound table.
for compound in compoundList:
    try:
        inchiList.append(inchiDict[compound])
    except KeyError:
        # Only a compound absent from the table is tolerated; other errors surface.
        inchiList.append(np.nan)
        print("InChI not found for", compound)

# assign() builds a new frame, so no column is set on a filtered slice of
# `dataset` (which raises SettingWithCopyWarning).
noInchi = noInchi.assign(InChI=inchiList)
dataset = pd.concat([hasInchi, noInchi])
# NOTE(review): this writes to the same versioned path the dataset is read from
# earlier in the notebook, so the input file is overwritten — confirm that is intended.
dataset.to_csv(f"../Data/Processed/HenrysLaw/{vers}-HenrysLaw.csv", index=False)