# Canonicalize SMILES strings

Jacqueline R. M. A. Maasch | May 2022

## Preamble

In [17]:
from rdkit import Chem
import pandas as pd

## Define functions

In [18]:
def canonicalize(smiles_strings):
    """Return the RDKit canonical SMILES for every input SMILES string.

    Also prints how many of the inputs differed from their canonical form.

    Parameters
    ----------
    smiles_strings : iterable of str
        SMILES strings to canonicalize (for example a list or a pandas Series).

    Returns
    -------
    list of str
        Canonical SMILES, one per input, in input order.

    Raises
    ------
    ValueError
        If an entry is not a string or cannot be parsed by RDKit. The message
        names the offending entry and its position in the input.
    """
    canonical_smiles = []
    noncanonical = 0
    for position, smiles in enumerate(smiles_strings):
        # MolFromSmiles signals a parse failure by returning None rather than
        # raising; passing that None on to MolToSmiles would fail with an
        # opaque argument error. Non-string entries (e.g. NaN from a CSV)
        # are rejected through the same path.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            raise ValueError(
                f"Invalid SMILES at position {position}: {smiles!r}"
            )
        canonical = Chem.MolToSmiles(mol, canonical=True)
        canonical_smiles.append(canonical)
        if smiles != canonical:
            noncanonical += 1
    print("\nTotal SMILES that were not canonical:", noncanonical)
    return canonical_smiles

## Read data

In [19]:
# Load the training and test splits from their CSV files (training first).
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ("train_100k.csv", "test_25k.csv")
)

In [20]:
# Summarize each split and preview its first rows.
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() also emits a trailing "None" line.
for split_frame in (df_train, df_test):
    print(split_frame.info())
    display(split_frame.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   SMILES  100000 non-null  object
 1   SPLIT   100000 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB
None


Unnamed: 0,SMILES,SPLIT
0,CNC(=O)c1ccc(Sc2ncc(C(C)C)n2C2CC2)nn1,train
1,CCC(C)(C)C(=O)Nc1ccc(NC(=O)OC(C)(C)C)nc1,train
2,COc1ccc2c(c1)CN(C(=O)NCc1cccc(C)n1)CC2,train
3,COc1cccc(OC)c1CNC1COc2ccccc2C1,train
4,Cc1cccc(C)c1CCNC(=O)c1ccc(-n2ccnn2)cc1,train


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SMILES  25000 non-null  object
 1   SPLIT   25000 non-null  object
dtypes: object(2)
memory usage: 390.8+ KB
None


Unnamed: 0,SMILES,SPLIT
0,CC(C)Cc1noc(C2CCN(S(=O)(=O)c3ccccc3)CC2)n1,test
1,CC(C)CC(=O)Nc1cc2c(cc1N)OCCO2,test
2,CC(Oc1cccc(C#N)c1)C(=O)N1CCc2ccc(Cl)cc2C1,test
3,Cc1ccc(NC(=O)CN(C)S(C)(=O)=O)c(O)c1,test
4,COCc1cc(N2CCc3ccccc3C2)n2nccc2n1,test


In [21]:
# Report the number of rows in each split.
print(f"Total actives for training: {len(df_train)}")
print(f"Total actives for validation: {len(df_test)}")

Total actives for training: 100000
Total actives for validation: 25000


## Canonicalize SMILES

In [22]:
# Canonicalize the SMILES column of the training set. canonicalize() returns
# a list of canonical SMILES and prints how many inputs were not already
# in canonical form.
train_canonical = canonicalize(df_train["SMILES"])


Total SMILES that were not canonical: 12


In [23]:
# Canonicalize the SMILES column of the test set. canonicalize() returns
# a list of canonical SMILES and prints how many inputs were not already
# in canonical form.
test_canonical = canonicalize(df_test["SMILES"])


Total SMILES that were not canonical: 6


In [24]:
# Wrap each list of canonical SMILES in a single-column dataframe.
train_canonical = pd.DataFrame({"SMILES": train_canonical})
test_canonical = pd.DataFrame({"SMILES": test_canonical})

# Summarize and preview both results, training set first.
# info() prints its own report and returns None, hence the trailing "None".
for canonical_frame in (train_canonical, test_canonical):
    print(canonical_frame.info())
    display(canonical_frame.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   SMILES  100000 non-null  object
dtypes: object(1)
memory usage: 781.4+ KB
None


Unnamed: 0,SMILES
0,CNC(=O)c1ccc(Sc2ncc(C(C)C)n2C2CC2)nn1
1,CCC(C)(C)C(=O)Nc1ccc(NC(=O)OC(C)(C)C)nc1
2,COc1ccc2c(c1)CN(C(=O)NCc1cccc(C)n1)CC2
3,COc1cccc(OC)c1CNC1COc2ccccc2C1
4,Cc1cccc(C)c1CCNC(=O)c1ccc(-n2ccnn2)cc1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SMILES  25000 non-null  object
dtypes: object(1)
memory usage: 195.4+ KB
None


Unnamed: 0,SMILES
0,CC(C)Cc1noc(C2CCN(S(=O)(=O)c3ccccc3)CC2)n1
1,CC(C)CC(=O)Nc1cc2c(cc1N)OCCO2
2,CC(Oc1cccc(C#N)c1)C(=O)N1CCc2ccc(Cl)cc2C1
3,Cc1ccc(NC(=O)CN(C)S(C)(=O)=O)c(O)c1
4,COCc1cc(N2CCc3ccccc3C2)n2nccc2n1


## Export data

In [25]:
# Write the canonical training SMILES to CSV, omitting the row index column.
train_canonical.to_csv("train_100k_canonical.csv", index = False)

In [26]:
# Write the canonical test SMILES to CSV, omitting the row index column.
test_canonical.to_csv("test_25k_canonical.csv", index = False)

## End of document