# SMILES Augmentation Example

#### This example shows how to augment a SMILES dataset using SMILESAugmentation.

In [9]:
import pandas as pd

from smiles_augmentation.smiles_enumerator import SmilesEnumerator

*Load the SMILES data:*

In [6]:
# Read the compounds table and keep only its SMILES column as a NumPy array.
# Bracket indexing avoids any clash between a column name and a DataFrame
# attribute, and `to_numpy()` is the accessor pandas recommends over `.values`.
smiles = pd.read_csv('data/compounds.csv')['SMILES'].to_numpy()
smiles

array(['C1=CC=C(C=C1)C2=CC=CC=C2C(=O)NC3=CC(=C(C=C3)F)F',
       'CC(=C1C2CCC1C3C2C(=O)N(C3=O)NC(=O)C4=CC=CO4)C',
       'CC1CCCC(N1NC(=S)NC2=CC=C(C=C2)OC)C',
       'C1CCC(C1)NC(=O)CSC2=NC=CN2CC3=CC=CC=C3',
       'COC1=CC=CC(=C1O)C=NNC(=O)C2=C3CCC4=CC=CC=C4C3=NN2',
       'C1=CC=C(C=C1)CCCN(CCOC2C(C(OC2N3C=NC4=C(N=CN=C43)N)CO)O)CC5C(C(C(O5)N6C=NC7=C(N=CN=C76)N)O)O',
       'CC(=O)O', 'CCC(=O)OCC',
       'COC(=O)CCCN(CCOC1C(C(OC1N2C=NC3=C(N=CN=C32)N)CO)O)CC4C(C(C(O4)N5C=NC6=C(N=CN=C65)N)O)O',
       'C1=CC(=CC=C1S(=O)(=O)N(CCOC2C(C(OC2N3C=NC4=C(N=CN=C43)N)CO)O)CC5C(C(C(O5)N6C=NC7=C(N=CN=C76)N)O)O)Cl'],
      dtype=object)

*Create a `SmilesEnumerator` object and enumerate the SMILES by calling its `enumerate` method:*

When creating the enumerator you can choose whether to keep or remove duplicates, set a seed for reproducibility, the number of jobs to run in parallel, and the level of verbosity. The maximum number of SMILES to enumerate (`n_max`) is passed to `enumerate`.

In [11]:
# Configure the enumerator: remove duplicates, fix the seed for
# reproducibility, run a single job, and keep verbosity at 0.
enumerator = SmilesEnumerator(
    smiles=smiles,
    remove_duplicates=True,
    seed=123,
    n_jobs=1,
    verbose=0,
)

# n_max caps how many SMILES are enumerated.
enumerated_smiles = enumerator.enumerate(n_max=10)
enumerated_smiles

[['c1(c(F)cc(cc1)NC(c1c(-c2ccccc2)cccc1)=O)F',
  'c1ccccc1-c1ccccc1C(=O)Nc1ccc(c(c1)F)F',
  'N(C(=O)c1ccccc1-c1ccccc1)c1cc(F)c(F)cc1',
  'Fc1ccc(NC(c2c(-c3ccccc3)cccc2)=O)cc1F',
  'c1c(c(F)ccc1NC(=O)c1ccccc1-c1ccccc1)F',
  'c1(-c2c(cccc2)C(Nc2cc(F)c(cc2)F)=O)ccccc1',
  'c1(cc(ccc1F)NC(=O)c1c(cccc1)-c1ccccc1)F',
  'C(Nc1cc(c(F)cc1)F)(c1c(cccc1)-c1ccccc1)=O',
  'O=C(c1ccccc1-c1ccccc1)Nc1ccc(c(c1)F)F',
  'c1cc(NC(c2c(-c3ccccc3)cccc2)=O)cc(c1F)F'],
 ['C1(=C(C)C)C2C3C(=O)N(C(=O)C3C1CC2)NC(=O)c1ccco1',
  'O=C1C2C3CCC(C3=C(C)C)C2C(=O)N1NC(c1ccco1)=O',
  'C1C2C3C(N(C(=O)C3C(C1)C2=C(C)C)NC(=O)c1ccco1)=O',
  'N1(C(=O)C2C3CCC(C3=C(C)C)C2C1=O)NC(c1ccco1)=O',
  'o1cccc1C(NN1C(C2C(C3CCC2C3=C(C)C)C1=O)=O)=O',
  'C12C(C3CCC2C3=C(C)C)C(N(C1=O)NC(=O)c1ccco1)=O',
  'o1c(ccc1)C(NN1C(=O)C2C3CCC(C2C1=O)C3=C(C)C)=O',
  'o1cccc1C(=O)NN1C(C2C3C(=C(C)C)C(C2C1=O)CC3)=O',
  'c1cc(oc1)C(=O)NN1C(=O)C2C3C(=C(C)C)C(CC3)C2C1=O',
  'C12C3C(=C(C)C)C(C1C(=O)N(C2=O)NC(=O)c1occc1)CC3'],
 ['N(c1ccc(cc1)OC)C(NN1C(CCCC1C)C)=S

*Let's see the enumerated SMILES for the first compound:*

In [13]:
# Take the first input SMILES and the matching first entry of the
# enumeration result, then print them one after the other for comparison.
original_smiles = smiles[0]
new_enumerated_smiles = enumerated_smiles[0]

print(f"Original SMILES: {original_smiles}")
print(f"New enumerated SMILES: {new_enumerated_smiles}")

Original SMILES: C1=CC=C(C=C1)C2=CC=CC=C2C(=O)NC3=CC(=C(C=C3)F)F
New enumerated SMILES: ['c1(c(F)cc(cc1)NC(c1c(-c2ccccc2)cccc1)=O)F', 'c1ccccc1-c1ccccc1C(=O)Nc1ccc(c(c1)F)F', 'N(C(=O)c1ccccc1-c1ccccc1)c1cc(F)c(F)cc1', 'Fc1ccc(NC(c2c(-c3ccccc3)cccc2)=O)cc1F', 'c1c(c(F)ccc1NC(=O)c1ccccc1-c1ccccc1)F', 'c1(-c2c(cccc2)C(Nc2cc(F)c(cc2)F)=O)ccccc1', 'c1(cc(ccc1F)NC(=O)c1c(cccc1)-c1ccccc1)F', 'C(Nc1cc(c(F)cc1)F)(c1c(cccc1)-c1ccccc1)=O', 'O=C(c1ccccc1-c1ccccc1)Nc1ccc(c(c1)F)F', 'c1cc(NC(c2c(-c3ccccc3)cccc2)=O)cc(c1F)F']
