In [5]:
import selfies as sf
import pandas as pd
import numpy as np

In [6]:
def get_selfie_encodings_for_dataset(file_path):
    """
    Translate the SMILES molecules stored in a file to SELFIES and return
    the translated strings together with their alphabet and maximum length.

    input:
        csv file with molecules. Column's name must be 'smiles'.
    output:
        - selfies encoding: list with one SELFIES string per row of the file
        - selfies alphabet: sorted list of every symbol found in the
          encodings, plus the extra '[nop]' symbol
        - length of the longest selfies string, as reported by
          sf.len_selfies (0 when the file contains no molecules)
    """

    df = pd.read_csv(file_path)

    smiles_list = np.asanyarray(df.smiles)

    print('--> Translating SMILES to SELFIES...')
    selfies_list = list(map(sf.encoder, smiles_list))

    # Copy into a fresh set so adding '[nop]' never mutates an object owned
    # by the selfies library.
    all_selfies_symbols = set(sf.get_alphabet_from_selfies(selfies_list))
    # NOTE(review): '[nop]' is presumably the padding symbol used when the
    # strings are later brought to a common length -- confirm against callers.
    all_selfies_symbols.add('[nop]')
    # Sets of strings iterate in a different order on every interpreter run
    # (hash randomisation); sorting keeps the symbol order -- and therefore
    # any index mapping built from it -- reproducible between runs.
    selfies_alphabet = sorted(all_selfies_symbols)

    # default=0 keeps an empty dataset from raising ValueError in max().
    largest_selfies_len = max(
        (sf.len_selfies(s) for s in selfies_list), default=0
    )

    print('Finished translating SMILES to SELFIES.')

    return selfies_list, selfies_alphabet, largest_selfies_len

In [7]:
# Use the helper under the name it is defined with in this notebook, so the
# cell also runs on a fresh kernel.
get_selfie_encodings_for_dataset("../data/raw/test_smiles.txt")

--> Translating SMILES to SELFIES...
Finished translating SMILES to SELFIES.


(['[C]',
  '[N]',
  '[O]',
  '[C][#C]',
  '[C][#N]',
  '[C][=O]',
  '[C][C]',
  '[C][O]',
  '[C][C][#C]',
  '[C][C][#N]',
  '[C][C][=O]',
  '[N][C][=O]',
  '[C][C][C]',
  '[C][C][O]',
  '[C][O][C]',
  '[C][C][C][Ring1][Ring1]',
  '[C][C][O][Ring1][Ring1]',
  '[C][C][Branch1][C][C][=O]',
  '[C][C][Branch1][C][N][=O]',
  '[N][C][Branch1][C][N][=O]',
  '[C][C][Branch1][C][C][C]',
  '[C][C][Branch1][C][C][O]',
  '[C][#C][C][#C]',
  '[C][#C][C][#N]',
  '[N][#C][C][#N]',
  '[O][=C][C][#C]',
  '[O][=C][C][#N]',
  '[O][=C][C][=O]',
  '[C][C][#C][C]',
  '[C][C][C][#C]',
  '[C][C][C][#N]',
  '[N][C][C][#N]',
  '[O][C][C][#C]',
  '[O][C][C][#N]',
  '[C][C][C][=O]',
  '[C][N][C][=O]',
  '[C][O][C][=O]',
  '[O][C][C][=O]',
  '[C][C][C][C]',
  '[C][C][C][O]',
  '[C][C][O][C]',
  '[O][C][C][O]',
  '[C][C][C][C][Ring1][Ring1]',
  '[C][C][C][O][Ring1][Ring1]',
  '[C][N][C][C][Ring1][Ring1]',
  '[O][C][C][C][Ring1][Ring1]',
  '[C][C][C][C][Ring1][Ring2]',
  '[C][C][O][C][Ring1][Ring2]',
  '[C][C][Branch