### Imports 

In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs

### Function definitions

In [2]:
# Tanimoto similarity between two molecules given as SMILES strings
# (adapted from https://medium.com/data-professor/how-to-calculate-molecular-similarity-25d543ea7f40).
def tanimoto_calc(smi1, smi2, radius=5, n_bits=2048):
    """Return the Tanimoto similarity of two molecules, rounded to 3 decimals.

    Each SMILES string is parsed with RDKit, converted to a Morgan
    fingerprint bit vector, and the two bit vectors are compared.

    Parameters
    ----------
    smi1, smi2 : str
        The two molecules, as SMILES strings.
    radius : int, optional
        Morgan fingerprint radius. Defaults to 5, the value this notebook
        has always used, so existing scores are unchanged.
    n_bits : int, optional
        Length of the fingerprint bit vector. Defaults to 2048.

    Returns
    -------
    float
        Tanimoto similarity, rounded to three decimal places.

    Raises
    ------
    ValueError
        If either SMILES string cannot be parsed by RDKit.
    """
    fingerprints = []
    for smiles in (smi1, smi2):
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals a parse failure by returning None rather than
        # raising; fail here with a message that names the offending input
        # instead of letting the fingerprint call reject the None.
        if mol is None:
            raise ValueError(f"Could not parse SMILES string: {smiles!r}")
        fingerprints.append(
            AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        )
    return round(DataStructs.TanimotoSimilarity(fingerprints[0], fingerprints[1]), 3)

# Rank the molecules in the database by their Tanimoto score against the input molecule.
def tanimoto_ranker(query, db_path='./TF_DB_clean_pathway.csv'):
    """Score every database molecule against ``query`` and sort best-first.

    Parameters
    ----------
    query : str
        The query molecule; must be a SMILES string.
    db_path : str or path-like, optional
        CSV file to rank. It must contain a ``SMILES`` column. Defaults to
        ``'./TF_DB_clean_pathway.csv'``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents plus a ``Tanimoto_score_vs_query`` column (rounded to
        three decimals), sorted by that column in descending order with a
        fresh 0..n-1 index. Rows whose SMILES cell is missing or cannot be
        parsed get NaN as score and are therefore placed at the end.

    Raises
    ------
    ValueError
        If ``query`` is not a parsable SMILES string.
    """
    def _fingerprint(smiles):
        # Morgan fingerprint with the same settings tanimoto_calc uses
        # (radius 5, 2048 bits); None when the input cannot be fingerprinted.
        if not isinstance(smiles, str):
            return None  # e.g. NaN produced by an empty CSV cell
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:  # RDKit returns None on a parse failure
            return None
        return AllChem.GetMorganFingerprintAsBitVect(mol, 5, nBits=2048)

    # The query fingerprint does not depend on the row, so compute it once
    # instead of re-parsing the query for every database entry.
    query_fp = _fingerprint(query)
    if query_fp is None:
        raise ValueError(f"query is not a valid SMILES string: {query!r}")

    df_mol = pd.read_csv(db_path)

    # The same molecule can appear in many rows; score each distinct SMILES once.
    score_by_smiles = {}

    def _score(smiles):
        if not isinstance(smiles, str):
            return float('nan')
        if smiles not in score_by_smiles:
            fp = _fingerprint(smiles)
            if fp is None:
                score_by_smiles[smiles] = float('nan')
            else:
                score_by_smiles[smiles] = float(
                    round(DataStructs.TanimotoSimilarity(query_fp, fp), 3)
                )
        return score_by_smiles[smiles]

    df_mol['Tanimoto_score_vs_query'] = [_score(smi) for smi in df_mol['SMILES']]
    df_mol = df_mol.sort_values(by=['Tanimoto_score_vs_query'], ascending=False).reset_index(drop=True)
    return df_mol

### Usage

In [3]:
# Query molecule, given as a SMILES string; every database entry is ranked against it.
input_molecule = "C1C(OC2=CC(=CC(=C2C1=O)O)O)C3=CC=CC=C3"  # pinocembrin

In [4]:
# Score every database entry against the query molecule; the result is sorted
# by Tanimoto score, highest first.
dataframe = tanimoto_ranker(input_molecule)
# Bare last expression so the notebook renders the ranked table.
dataframe

Unnamed: 0,Molecule,SMILES,InChI,Species,TF,Bibliographic_ref,Database_ref,NCBI_Accession,UniProt,AA_sequence,Pathways,Tanimoto_score_vs_query
0,naringenin,O=C1CC(c2ccc(O)cc2)Oc2cc(O)cc(O)c21,InChI=1S/C15H12O5/c16-9-3-1-8(2-4-9)13-7-12(19...,Sinorhizobium meliloti,nodD1,1021/acssynbio.8b00326,,WP_010967456,,MRFRGLDLNLLVALDALMTERKLTAAARRINLSQPAMSAAIARLRT...,Biosynthesis of other secondary metabolites;Me...,0.648
1,naringenin,O=C1CC(c2ccc(O)cc2)Oc2cc(O)cc(O)c21,InChI=1S/C15H12O5/c16-9-3-1-8(2-4-9)13-7-12(19...,R.leguminosarum,nodD,PMID:12799442,RegTransBase v20120406 (20170227),WP_207159894.1,,MRFKGLDLNLLVALDALMTERKLTAAARSINLSQPAMSAAISRLRA...,Biosynthesis of other secondary metabolites;Me...,0.648
2,naringenin,O=C1CC(c2ccc(O)cc2)Oc2cc(O)cc(O)c21,InChI=1S/C15H12O5/c16-9-3-1-8(2-4-9)13-7-12(19...,Azorhizobium caulinodans,nodD,PMID:7590297,RegTransBase v20120406 (20170227),WP_012172315.1,,MRFKGLDLNLLVALNALLSEHSVTSAAKSINLSQPAMSAAVQRLRI...,Biosynthesis of other secondary metabolites;Me...,0.648
3,naringenin,O=C1CC(c2ccc(O)cc2)Oc2cc(O)cc(O)c21,InChI=1S/C15H12O5/c16-9-3-1-8(2-4-9)13-7-12(19...,Azorhizobium caulinodans ORS571,nodD,PMID:2158977,RegTransBase v20120406 (20170227),CAA88827.1,,MRFKGLDLNLLVALNALLSEHSVTSAAKSINLSQPAMSAAVQRLRI...,Biosynthesis of other secondary metabolites;Me...,0.648
4,naringenin,O=C1CC(c2ccc(O)cc2)Oc2cc(O)cc(O)c21,InChI=1S/C15H12O5/c16-9-3-1-8(2-4-9)13-7-12(19...,Pseudomonas putida,TtgR,PMID:16407274,RegTransBase v20120406 (20170227),BAN52789.1,,MVRRTKEEAQETRAQIIEAAEKAFYKRGVARTTLADIAELAGVTRG...,Biosynthesis of other secondary metabolites;Me...,0.648
...,...,...,...,...,...,...,...,...,...,...,...,...
5385,iron(fe2+),[Fe+2],InChI=1S/Fe/q+2,Lactobacillus reuteri JCM 1112,perr,,https://regprecise.lbl.gov/sites.jsp?regulog_i...,YP_001842228.1,,MAEAEFDRALDHLRENKVRLTPQRKTILNYLINHHTHPSVEMIYDD...,Energy metabolism;Carbohydrate metabolism;Meta...,0.000
5386,iron(fe2+),[Fe+2],InChI=1S/Fe/q+2,Lactobacillus sakei subsp. sakei 23K,perr,,https://regprecise.lbl.gov/sites.jsp?regulog_i...,YP_395079.1,,MSQTIVEQTLRKLKVNNVRITPQRQAVLEFMIGTHMHPTADDVYQA...,Energy metabolism;Carbohydrate metabolism;Meta...,0.000
5387,iron(fe2+),[Fe+2],InChI=1S/Fe/q+2,Lactobacillus salivarius UCC118,perr,,https://regprecise.lbl.gov/sites.jsp?regulog_i...,YP_536255.1,,MNNQEQLMMAAEKLKKRHIKNTPQRQVILAYLMSSKEHPSIEMIYS...,Energy metabolism;Carbohydrate metabolism;Meta...,0.000
5388,iron(fe2+),[Fe+2],InChI=1S/Fe/q+2,Staphylococcus aureus subsp. aureus N315,perr,,https://regprecise.lbl.gov/sites.jsp?regulog_i...,NP_374968.1,,MSVEIESIEHELEESIASLRQAGIRITPQRQAILRYLISSHTHPTA...,Energy metabolism;Carbohydrate metabolism;Meta...,0.000
