Structural similarity investigation of the GDB-13 fragrance-like molecule database 
Input molecule as SMILES, sort by structural similarity 
Play around with fingerprint representation 
Generate structural similarity maps 

In [2]:
# first attempt - fingerprint similarity comparison, no 3D structure 
import rdkit
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
import pandas as pd

In [3]:
# load the .smi text file into a single-column dataframe of SMILES strings
column_name = ["Smiles"]
smi_path = '/home/martyn0000/fragrance_investigation/GDB-13-FL.smi'
# NOTE(review): the '\r\n' separator is presumably there so that no line is
# ever split into several fields - confirm against the file contents
df = pd.read_csv(smi_path, names=column_name, sep='\r\n', engine='python')
# discard the first row that was read from the file
df = df.iloc[1:]
print(df.head(2))

  Smiles
1    C#C
2  C1CC1


In [None]:
# sanitise input - SMILES are canonical; do they need to be aromatised? 

In [4]:
# build RDKit molecule objects from the SMILES column, then fingerprint each one
PandasTools.AddMoleculeColumnToFrame(
    df,
    smilesCol='Smiles',
    molCol='molecule',
    includeFingerprints=True,
)


def _morgan_bitvect(mol):
    """Return the radius-2 Morgan bit-vector fingerprint of *mol*."""
    return GetMorganFingerprintAsBitVect(mol, 2)


# one fingerprint per row, stored alongside the molecule it was computed from
df['morgan'] = df['molecule'].map(_morgan_bitvect)

In [None]:
# preview the first two rows of the dataframe
print(df.head(n=2))

In [None]:
# similarity search of database
# DataStructs is a subpackage of rdkit, so import it from the top-level package
from rdkit import DataStructs

# query molecule that every database entry is compared against
similarity_target = Chem.MolFromSmiles("C[C@@]1(C(C)=O)CC=C2CCCC(C)(C)[C@]2(C)[C@H]1C")
# MolFromSmiles returns None instead of raising when a SMILES cannot be parsed,
# so fail here with a clear message rather than later inside the fingerprinting
if similarity_target is None:
    raise ValueError("could not parse the similarity target SMILES")
def get_dataframe_with_x_most_similar_compounds_to_query(query, mol_df, molCol='molecule', x=None):
    """Return the rows of *mol_df* most similar to *query*, best match first.

    Similarity is the Tanimoto similarity between radius-2 Morgan bit-vector
    fingerprints of the query and of each row.

    Side effects: a 'similarity' column is added to (or overwritten in)
    *mol_df*, and *mol_df* itself is sorted in place by descending similarity.

    Args:
        query: RDKit molecule to compare every row against.
        mol_df: dataframe of candidates. Its 'morgan' column is used when
            present; otherwise fingerprints are computed from *molCol*.
        molCol: name of the column holding RDKit molecules; only read when
            *mol_df* has no 'morgan' column.
        x: number of top rows to return. ``None`` (the default) returns all
            rows.

    Returns:
        A copy of the first *x* rows of the sorted dataframe.
    """
    query_fp = GetMorganFingerprintAsBitVect(query, 2)

    if 'morgan' in mol_df.columns:
        fingerprints = mol_df['morgan']
    else:
        # no precomputed fingerprints: derive them from the molecule column
        fingerprints = mol_df[molCol].map(
            lambda mol: GetMorganFingerprintAsBitVect(mol, 2)
        )

    mol_df['similarity'] = fingerprints.map(
        lambda fp: DataStructs.TanimotoSimilarity(query_fp, fp)
    )
    mol_df.sort_values(['similarity'], ascending=False, inplace=True)
    # slicing with None keeps every row, so the default needs no special case
    return mol_df[:x].copy()
# keep the 50 rows ranked most similar to the target molecule
similarity_df = get_dataframe_with_x_most_similar_compounds_to_query(
    query=similarity_target,
    mol_df=df,
    x=50,
)
similarity_df.head()

In [None]:
# generate similarity maps of most suitable candidates
# loop over the molecules of the most similar hits, drawing one map per hit
from rdkit.Chem import Draw
from rdkit.Chem.Draw import SimilarityMaps

# the map function is given molecule objects, not the precomputed fingerprints,
# so iterate over the 'molecule' column rather than 'morgan'
similar_molecules = similarity_df["molecule"].to_list()
for similar_structure in similar_molecules:
    # NOTE(review): newer RDKit releases may expect an extra drawing argument
    # here - confirm against the installed version
    fig, maxweight = SimilarityMaps.GetSimilarityMapForFingerprint(
        similarity_target,
        similar_structure,
        SimilarityMaps.GetMorganFingerprint,
    )