In [35]:
import time
from io import StringIO
from urllib.parse import quote

import deepsmiles
import pubchempy as pcp
import requests
import selfies
from rdkit import Chem
from tucan.canonicalization import canonicalize_molecule
from tucan.io.molfile_reader import graph_from_molfile_text
from tucan.serialization import serialize_molecule

In [None]:
# Reference: https://github.com/volkamerlab/maxsmi/blob/main/maxsmi/utils/utils_smiles.py

In [25]:
def augment_smiles(smiles, int_aug=50, deduplicate=True):
    """
    Generate randomized SMILES strings for one molecule.

    Parameters
    ----------
    smiles : str
        Input SMILES (not necessarily canonical).
    int_aug : int
        Number of random SMILES to draw. Must be greater than zero.
    deduplicate : bool
        If True, repeated strings are removed, so fewer than `int_aug`
        entries may be returned. The order of first appearance is kept.

    Returns
    -------
    list of str or None
        The generated SMILES, or None when `Chem.MolFromSmiles` could not
        parse the input.

    Raises
    ------
    ValueError
        If the input parses but `int_aug` is not greater than zero.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    if int_aug <= 0:
        raise ValueError("int_aug must be greater than zero.")

    augmented = [
        Chem.MolToSmiles(mol, canonical=False, doRandom=True)
        for _ in range(int_aug)
    ]
    if deduplicate:
        # dict.fromkeys drops repeats while keeping first-seen order; going
        # through set() would shuffle the result differently in every kernel
        # because string hashing is randomized per process.
        augmented = list(dict.fromkeys(augmented))
    return augmented

In [43]:
def smiles_to_max_random(smiles, max_duplication=10):
    """
    Collect distinct random SMILES until no new one shows up for a while.

    Random SMILES are drawn one at a time. Sampling stops once
    `max_duplication` consecutive draws have produced only strings that were
    already seen; every new string resets that count.

    Parameters
    ----------
    smiles : str
        Input SMILES.
    max_duplication : int
        Number of consecutive already-seen draws that ends the sampling.

    Returns
    -------
    list of str or None
        The distinct random SMILES in the order they were first drawn (the
        length of this list is the estimated maximum number of random
        SMILES), or None when `Chem.MolFromSmiles` could not parse the input.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    smi_unique = []
    seen = set()  # constant-time membership test; the list keeps the draw order
    counter = 0
    while counter < max_duplication:
        rand = Chem.MolToSmiles(mol, canonical=False, doRandom=True)
        if rand in seen:
            counter += 1
        else:
            seen.add(rand)
            smi_unique.append(rand)
            counter = 0
    return smi_unique


In [31]:
def smiles_to_selfies(smiles):
    """
    Encode a SMILES with `selfies.encoder`.

    The encoded value is returned wrapped in a one-element list.
    """
    encoded = selfies.encoder(smiles)
    return [encoded]


def smiles_to_deepsmiles(smiles):
    """
    Encode a SMILES as DeepSMILES.

    A fresh `deepsmiles.Converter` is built on every call with both the
    `rings` and `branches` options switched on, and its `encode` result is
    returned unchanged.
    """
    return deepsmiles.Converter(rings=True, branches=True).encode(smiles)


def smiles_to_canoncial(smiles):
    """
    Takes a SMILES and returns the canonical SMILES.

    Returns None when `Chem.MolFromSmiles` could not parse the input, the
    same convention `augment_smiles` and `smiles_to_max_random` follow.
    (The misspelled function name is kept so existing callers keep working.)
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Without this guard the None would be handed on to MolToSmiles.
        return None
    return Chem.MolToSmiles(mol)

def smiles_to_inchi(smiles):
    """
    Takes a SMILES and returns the InChI produced by `Chem.MolToInchi`.

    Returns None when `Chem.MolFromSmiles` could not parse the input, the
    same convention `augment_smiles` and `smiles_to_max_random` follow.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Without this guard the None would be handed on to MolToInchi.
        return None
    return Chem.MolToInchi(mol)

In [27]:
def smiles_to_tucan(smiles: str):
    """
    Takes a SMILES and returns the TUCAN encoding.

    The molecule is written to a molfile text block with RDKit, that text is
    read into a graph with `graph_from_molfile_text`, and the graph is
    canonicalized and serialized.

    Returns None when `Chem.MolFromSmiles` could not parse the input, the
    same convention `augment_smiles` and `smiles_to_max_random` follow.
    """
    rdkit_mol = Chem.MolFromSmiles(smiles)
    if rdkit_mol is None:
        # Without this guard the None would be handed on to MolToMolBlock.
        return None

    # NOTE(review): the mol block is written from the molecule exactly as
    # parsed; nothing here adds hydrogens explicitly. Confirm that this is
    # the intended input for the TUCAN canonicalization.
    molfile = Chem.MolToMolBlock(rdkit_mol)
    graph = graph_from_molfile_text(molfile)
    graph = canonicalize_molecule(graph)
    return serialize_molecule(graph)

In [28]:
smiles_to_tucan('CCO')

'C2O/(1-2)(2-3)'

In [29]:
smiles_to_deepsmiles('CCO')

'CCO'

In [30]:
augment_smiles('CCO', int_aug=10)

['CCO', 'C(C)O', 'C(O)C', 'OCC']

In [32]:
smiles_to_inchi('CCO')

'InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3'

In [44]:
smiles_to_max_random('CCO')

['OCC', 'CCO', 'C(C)O', 'C(O)C']

In [36]:
CACTUS = "https://cactus.nci.nih.gov/chemical/structure/{0}/{1}"
# Upper bound, in seconds, on how long a single resolver request may take.
CACTUS_TIMEOUT = 10


def smiles_to_iupac_name(smiles: str):
    """Look up the IUPAC name for a SMILES.

    The chemical name resolver at https://cactus.nci.nih.gov/chemical/structure
    is asked first. If that request raises, or its answer contains "html"
    (i.e. it is a page rather than a plain name), PubChem is queried through
    pubchempy instead.

    Parameters
    ----------
    smiles : str
        SMILES of the compound to name.

    Returns
    -------
    str or None
        The name from the first source that yields one, or None when both
        lookups fail. Lookup errors are swallowed on purpose (best effort).
    """
    try:
        time.sleep(0.001)  # brief pause before each resolver request
        # Percent-encode the SMILES: characters such as '#' (triple bond)
        # would otherwise be read as URL syntax and cut the identifier short.
        # quote() leaves '/' untouched by default.
        url = CACTUS.format(quote(smiles), "iupac_name")
        # Without a timeout a stalled connection would block indefinitely.
        response = requests.get(url, timeout=CACTUS_TIMEOUT)
        response.raise_for_status()
        name = response.text
        if "html" not in name:
            return name
    except Exception:
        # Best effort: any resolver failure falls through to PubChem below.
        pass

    try:
        compounds = pcp.get_compounds(smiles, "smiles")
        if not compounds:
            return None
        return compounds[0].iupac_name
    except Exception:
        return None

In [37]:
smiles_to_iupac_name('CCO')

'ethanol'