# Canonicalize SMILES strings

Jacqueline R. M. A. Maasch | May 2022

## Preamble

In [3]:
from rdkit import Chem
import pandas as pd

## Define functions

In [13]:
def canonicalize(smiles_strings):
    """Return the RDKit canonical SMILES for each input SMILES string.

    Also prints how many inputs differ textually from their canonical form.

    Args:
        smiles_strings: Iterable of SMILES strings (e.g. a pandas Series).

    Returns:
        List of canonical SMILES strings, in the same order as the input.

    Raises:
        ValueError: If RDKit cannot parse one of the input strings.
    """
    canonical_smiles = []
    noncanonical = 0
    for position, smiles in enumerate(smiles_strings):
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending entry instead of deep inside MolToSmiles.
        if mol is None:
            raise ValueError(
                f"Could not parse SMILES at position {position}: {smiles!r}"
            )
        canonical = Chem.MolToSmiles(mol, canonical=True)
        canonical_smiles.append(canonical)
        if smiles != canonical:
            noncanonical += 1
    print("\nTotal SMILES that were not canonical:", noncanonical)
    return canonical_smiles

## Read data

In [4]:
# Load the SMILES tables: actives and inactives, each split into a training
# file and a validation file.
df_actives_train, df_actives_val = (
    pd.read_csv(csv_name)
    for csv_name in ("actives_train.csv", "actives_val.csv")
)

df_inactives_train, df_inactives_val = (
    pd.read_csv(csv_name)
    for csv_name in ("inactives_train.csv", "inactives_val.csv")
)

In [5]:
# Summarize each table and preview its first rows.
# DataFrame.info() writes its report itself and returns None, so the
# surrounding print() also emits a "None" line after each summary.
for frame in (
    df_actives_train,
    df_actives_val,
    df_inactives_train,
    df_inactives_val,
):
    print(frame.info())
    display(frame.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SMILES  132 non-null    object
dtypes: object(1)
memory usage: 1.2+ KB
None


Unnamed: 0,SMILES
0,NC(=O)c1ccc(NC(=O)[C@@H]2CCCO2)cc1
1,Cc1nccn1CC1CCc2c(c3ccccc3n2C)C1=O.Cl
2,COC(=O)Nc1sc(C)nc1-c1ccccc1
3,OC1CCN(Cc2ccsc2)CC1
4,CC(C(O)c1ccc(O)cc1)N1CCC(Cc2ccccc2)CC1.CC(C(O)...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34 entries, 0 to 33
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SMILES  34 non-null     object
dtypes: object(1)
memory usage: 400.0+ bytes
None


Unnamed: 0,SMILES
0,O=C(CCl)N1CCN(Cc2cccc3ccccc23)CC1
1,O=C(CCl)N1CCN(S(=O)(=O)c2cccs2)CC1
2,C[C@H]1CN(c2c(F)c(N)c3c(=O)c(C(=O)O)cn(C4CC4)c...
3,C/C=C/c1ccc(OC)cc1
4,Clc1cccc(CN2CCOCC2)c1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1758 entries, 0 to 1757
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SMILES  1758 non-null   object
dtypes: object(1)
memory usage: 13.9+ KB
None


Unnamed: 0,SMILES
0,Nc1ccn([C@@H]2O[C@H](CO)[C@@H](O)[C@@H]2O)c(=O)n1
1,COc1cc(Br)c(C[N+]2(CCOCCC3CCC4CC3C4(C)C)CCOCC2...
2,O=C1CCc2ccc(OCCCCN3CCN(c4cccc(Cl)c4Cl)CC3)cc2N1
3,COCC1=C(C(=O)OC(C)OC(=O)OC(C)C)N2C(=O)[C@@H](N...
4,c1nc(N2CCC2)c2[nH]cnc2n1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   SMILES  440 non-null    object
dtypes: object(1)
memory usage: 3.6+ KB
None


Unnamed: 0,SMILES
0,CCC1(C)CC(=O)NC1=O
1,CC(=O)N[C@@H](CCC(N)=O)C(=O)O
2,O=C1O[C@H]([C@@H](O)CO)C(O)=C1O
3,CC(C)n1cnc(S(N)(=O)=O)c1
4,Cc1ccc(=O)n(-c2ccccc2)c1


In [6]:
# Report the number of rows in each table.
for label, frame in (
    ("Total actives for training:", df_actives_train),
    ("Total actives for validation:", df_actives_val),
    ("Total inactives for training:", df_inactives_train),
    ("Total inactives for validation:", df_inactives_val),
):
    print(label, len(frame))

Total actives for training: 132
Total actives for validation: 34
Total inactives for training: 1758
Total inactives for validation: 440


## Canonicalize SMILES

In [14]:
# Canonical SMILES for the active molecules (training, then validation).
actives_train_canonical, actives_val_canonical = (
    canonicalize(frame["SMILES"])
    for frame in (df_actives_train, df_actives_val)
)


Total SMILES that were not canonical: 0

Total SMILES that were not canonical: 0


In [15]:
# Canonical SMILES for the inactive molecules (training, then validation).
inactives_train_canonical, inactives_val_canonical = (
    canonicalize(frame["SMILES"])
    for frame in (df_inactives_train, df_inactives_val)
)


Total SMILES that were not canonical: 0

Total SMILES that were not canonical: 0


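## Export data

*Note: this section contains no code; the canonicalized SMILES lists computed above are not written to disk in this notebook.*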

## End of document