In [4]:
from typing import Tuple, Dict
import warnings

import pandas as pd 
pd.options.mode.chained_assignment = None

import numpy as np 
from joblib import load, dump

import selfies as sf

In [5]:
def encode_smiles(smiles: pd.DataFrame, strict: bool = True) -> pd.DataFrame:
    '''
    Haley Scolati SMILES -> SELFIES encoder

    Encode every entry of the 'SMILES' column into SELFIES and return a new
    DataFrame that carries the result in a 'SELFIES' column.

    Parameters
    ----------
    smiles : pd.DataFrame
        Frame with a 'SMILES' column. It is NOT modified; a copy is returned.
        Any index is accepted (rows are handled by position, not by label).
    strict : bool, default True
        Forwarded to ``sf.encoder``. Pass ``strict=False`` for inputs such as
        resonance structures that the strict check rejects.

    Returns
    -------
    pd.DataFrame
        Copy of ``smiles`` restricted to the rows that were encoded, with the
        'SELFIES' column added (or overwritten if it already existed). The
        original index labels of the kept rows are preserved.

    Raises
    ------
    KeyError
        If ``smiles`` has no 'SMILES' column.

    Notes
    -----
    Entries that cannot be encoded are dropped from the result and printed as a
    list, so they can be checked for typos. Surrounding whitespace is stripped
    from string entries before encoding, because it makes the parser fail.
    '''
    encoded = []         # one entry per input row; None where encoding failed
    kept_positions = []  # positional indices of the rows that encoded cleanly
    removed = []         # raw SMILES values that could not be encoded

    for position, raw in enumerate(smiles['SMILES']):
        # Only strings can be stripped; anything else (e.g. NaN) is passed
        # through so the encoder rejects it and it is reported below.
        candidate = raw.strip() if isinstance(raw, str) else raw
        try:
            encoded.append(sf.encoder(candidate, strict=strict))
        except Exception:
            # Best-effort: record the failure and carry on with the next row.
            # `Exception` (not a bare except) lets KeyboardInterrupt through.
            encoded.append(None)
            removed.append(raw)
        else:
            kept_positions.append(position)

    print('smiles not encoded: ', removed)

    new_df = smiles.copy()
    # Assigning a list is positional, so this is correct for any index and
    # does not require the 'SELFIES' column to exist beforehand.
    new_df['SELFIES'] = encoded
    # Positional selection keeps all columns even when nothing was encoded.
    new_df = new_df.iloc[kept_positions]

    return new_df


In [6]:
# The two trailing spaces make this SMILES unparseable. Catch the expected EncoderError and
# print its message so the failure is demonstrated without aborting "Restart & Run All".
try:
    sf.encoder('C12=C3C4=C5C6=C1C7=C8C9=C1C%10=C%11C(=C29)C3=C2C3=C4C4=C5C5=C9C6=C7C6=C7C8=C1C1=C8C%10=C%10C%11=C2C2=C3C3=C4C4=C5C5=C%11C%12=C(C6=C95)C7=C1C1=C%12C5=C%11C4=C3C3=C5C(=C81)C%10=C23  ')
except sf.EncoderError as err:
    print(f'EncoderError: {err}')

EncoderError: failed to parse input
	SMILES: C12=C3C4=C5C6=C1C7=C8C9=C1C%10=C%11C(=C29)C3=C2C3=C4C4=C5C5=C9C6=C7C6=C7C8=C1C1=C8C%10=C%10C%11=C2C2=C3C3=C4C4=C5C5=C%11C%12=C(C6=C95)C7=C1C1=C%12C5=C%11C4=C3C3=C5C(=C81)C%10=C23  

In [17]:
# Decode the single bracketed token '[14N]'; the recorded output shows it comes back unchanged.
sf.decoder('[14N]')

'[14N]'

In [9]:
# Same SMILES as the call that fails to parse earlier in the notebook, but without the
# trailing spaces; the recorded output shows it encodes successfully.
sf.encoder('C12=C3C4=C5C6=C1C7=C8C9=C1C%10=C%11C(=C29)C3=C2C3=C4C4=C5C5=C9C6=C7C6=C7C8=C1C1=C8C%10=C%10C%11=C2C2=C3C3=C4C4=C5C5=C%11C%12=C(C6=C95)C7=C1C1=C%12C5=C%11C4=C3C3=C5C(=C81)C%10=C23')

'[C][=C][C][=C][C][=C][Ring1][=Branch1][C][=C][C][=C][C][=C][C][=Branch1][=Branch1][=C][Ring1][=C][Ring1][=Branch1][C][Ring1][=C][=C][C][=C][Ring1][S][C][=C][Ring1][P][C][=C][C][Ring2][Ring1][Ring1][=C][Ring2][Ring1][C][C][=C][C][Ring2][Ring1][Ring2][=C][Ring2][Ring1][Ring1][C][=C][C][Ring2][Ring1][Branch1][=C][C][Ring2][Ring1][=Branch1][=C][Ring2][Ring1][Ring1][C][=C][Ring2][Ring1][Ring2][C][=C][Ring2][Ring1][Ring2][C][=C][Ring2][Ring1][Ring2][C][=C][C][=C][Branch1][O][C][Ring2][Ring1][Branch1][=C][Ring2][Ring1][=Branch2][Ring1][=Branch1][C][Ring2][Ring1][=Branch1][=C][Ring2][Ring1][Ring2][C][=C][Ring1][Branch2][C][=C][Ring1][O][C][Ring1][#C][=C][Ring2][Ring1][C][C][=C][Ring1][=Branch1][C][=Branch1][#Branch1][=C][Ring2][Ring1][=N][Ring1][#Branch2][C][Ring2][Ring1][N][=C][Ring2][Ring1][#Branch2][Ring1][=Branch1]'

In [7]:
# Encode a SMILES in which every carbon is written with a 13C label
# (presumably a check that isotope labels survive encoding -- confirm intent).
mol = sf.encoder('[13CH3][13C](=O)[13CH]=[13CH][13CH]1[13CH2][13CH]=[13CH][13CH2][13CH2]1')

In [8]:
# Display the SELFIES string stored in `mol`.
mol

'[13CH3][13C][=Branch1][C][=O][13CH1][=13CH1][13CH1][13CH2][13CH1][=13CH1][13CH2][13CH2][Ring1][=Branch1]'

In [7]:
# Encode a SMILES whose terminal nitrogen is written as a bracket atom ('#[N]');
# the recorded output renders that atom as '[#NH0]'.
ind = sf.encoder('C1=CC=CC2=C1C=C(C2)C#[N]')

In [8]:
# Print the encoded string as plain text (no surrounding quotes, unlike the cell repr).
print(ind)

[C][=C][C][=C][C][=C][Ring1][=Branch1][C][=C][Branch1][Ring2][C][Ring1][Branch1][C][#NH0]
