# Canonicalize SMILES strings

Jacqueline R. M. A. Maasch | May 2022

## Preamble

In [27]:
from rdkit import Chem
import pandas as pd

## Define functions

In [28]:
def canonicalize(smiles_strings):
    """Return the RDKit canonical SMILES for each input string, in order.

    Also prints how many inputs were not already identical to their
    canonical form.

    Args:
        smiles_strings: Iterable of SMILES strings (e.g. a list or a
            pandas Series).

    Returns:
        A list with one canonical SMILES string per input, in input order.

    Raises:
        ValueError: If RDKit cannot parse one of the inputs.
    """
    canonical_smiles = []
    noncanonical = 0
    for position, smiles in enumerate(smiles_strings):
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals a parse failure by returning None; passing
        # that on to MolToSmiles would fail with an opaque argument error
        # that does not identify the offending input.
        if mol is None:
            raise ValueError(
                f"Could not parse SMILES at position {position}: {smiles!r}"
            )
        canonical = Chem.MolToSmiles(mol, canonical=True)
        canonical_smiles.append(canonical)
        if smiles != canonical:
            noncanonical += 1
    print("\nTotal SMILES that were not canonical:", noncanonical)
    return canonical_smiles

## Read data

In [29]:
# Load the training and testing tables from their CSV files, in that order.
df_train, df_test = map(pd.read_csv, ("train.csv", "test.csv"))

In [30]:
# Explore data.
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() emits a stray "None" line.
df_train.info()
display(df_train.head())

df_test.info()
display(df_test.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1584663 entries, 0 to 1584662
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   SMILES  1584663 non-null  object
 1   SPLIT   1584663 non-null  object
dtypes: object(2)
memory usage: 24.2+ MB
None


Unnamed: 0,SMILES,SPLIT
0,CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1,train
1,CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1,train
2,Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO,train
3,Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C,train
4,CC1Oc2ccc(Cl)cc2N(CC(O)CO)C1=O,train


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176074 entries, 0 to 176073
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   SMILES  176074 non-null  object
 1   SPLIT   176074 non-null  object
dtypes: object(2)
memory usage: 2.7+ MB
None


Unnamed: 0,SMILES,SPLIT
0,CC1C2CCC(C2)C1CN(CCO)C(=O)c1ccc(Cl)cc1,test
1,COc1ccc(-c2cc(=O)c3c(O)c(OC)c(OC)cc3o2)cc1O,test
2,CCOC(=O)c1ncn2c1CN(C)C(=O)c1cc(F)ccc1-2,test
3,Clc1ccccc1-c1nc(-c2ccncc2)no1,test
4,CC(C)(Oc1ccc(Cl)cc1)C(=O)OCc1cccc(CO)n1,test


In [31]:
# Report the row count of each table (shape[0] is the number of rows).
# NOTE(review): the second label says "validation" but the frame is df_test;
# confirm the intended wording.
print("Total actives for training:", df_train.shape[0])
print("Total actives for validation:", df_test.shape[0])

Total actives for training: 1584663
Total actives for validation: 176074


## Canonicalize SMILES

In [32]:
# Canonicalize training set molecules.
# canonicalize() returns a list of canonical SMILES (one per input row) and
# prints how many inputs differed from their canonical form.
train_canonical = canonicalize(df_train["SMILES"])


Total SMILES that were not canonical: 276


In [33]:
# Canonicalize testing set molecules.
# canonicalize() returns a list of canonical SMILES (one per input row) and
# prints how many inputs differed from their canonical form.
test_canonical = canonicalize(df_test["SMILES"])


Total SMILES that were not canonical: 36


In [34]:
# Wrap each list of canonical SMILES in a single-column ("SMILES") dataframe,
# rebinding the same names from list to DataFrame.
train_canonical = pd.DataFrame(train_canonical, columns=["SMILES"])
test_canonical = pd.DataFrame(test_canonical, columns=["SMILES"])

# Inspect the canonicalized tables.
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() emits a stray "None" line.
train_canonical.info()
display(train_canonical.head())

test_canonical.info()
display(test_canonical.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1584663 entries, 0 to 1584662
Data columns (total 1 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   SMILES  1584663 non-null  object
dtypes: object(1)
memory usage: 12.1+ MB
None


Unnamed: 0,SMILES
0,CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1
1,CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1
2,Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO
3,Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C
4,CC1Oc2ccc(Cl)cc2N(CC(O)CO)C1=O


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176074 entries, 0 to 176073
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   SMILES  176074 non-null  object
dtypes: object(1)
memory usage: 1.3+ MB
None


Unnamed: 0,SMILES
0,CC1C2CCC(C2)C1CN(CCO)C(=O)c1ccc(Cl)cc1
1,COc1ccc(-c2cc(=O)c3c(O)c(OC)c(OC)cc3o2)cc1O
2,CCOC(=O)c1ncn2c1CN(C)C(=O)c1cc(F)ccc1-2
3,Clc1ccccc1-c1nc(-c2ccncc2)no1
4,CC(C)(Oc1ccc(Cl)cc1)C(=O)OCc1cccc(CO)n1


## Export data

In [35]:
# Write the canonical training SMILES to CSV; index = False leaves the
# dataframe's row index out of the file.
train_canonical.to_csv("train_canonical.csv", index = False)

In [36]:
# Write the canonical testing SMILES to CSV; index = False leaves the
# dataframe's row index out of the file.
test_canonical.to_csv("test_canonical.csv", index = False)

## End of document