Jupyter Notebook version of ECScore (RDKit Fingerprints) with additional fingerprints to try (Morgan, AtomPair, MHFP6, Avalon, MACCS) 
Please follow the CSV template provided in the GitHub repository to avoid preprocessing errors!

In [4]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import RDKFingerprint, AllChem, DataStructs, MACCSkeys
from rdkit.Chem.AtomPairs import Pairs
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect, GetHashedAtomPairFingerprintAsBitVect
from rdkit.Chem.rdMHFPFingerprint import MHFPEncoder
from rdkit.Avalon import pyAvalonTools
import mhfp

In [26]:
# Path to the input CSV of SMILES; it must follow the template on GitHub.
# The processing loop further down expects a header row, a label column first,
# and then one SMILES string per step column.
df = pd.read_csv('strychnine_smiles.csv')

In [None]:
# Single MHFP encoder built with default arguments; mhfp6_fp() calls it for every molecule
mhfp_encoder = MHFPEncoder() #just initializer

# sim calc function
def _mol_from_cell(cell):
    """Parse one CSV cell as SMILES; return None for non-text or unparsable cells."""
    # Blank cells arrive from pandas as float NaN, and MolFromSmiles raises on
    # non-string input instead of returning None.
    if not isinstance(cell, str):
        return None
    return Chem.MolFromSmiles(cell)


def calc_sim(row, last_step, final_fp_func, step_fp_func, fp_name):
    """Score every step of one row against that row's final molecule.

    Args:
        row: One CSV row, accessed positionally through ``.iloc``. Positions
            ``1..last_step`` are read as SMILES strings.
        last_step: Position in ``row`` of the final molecule.
        final_fp_func: Callable that builds the fingerprint of the final
            molecule from an RDKit mol.
        step_fp_func: Callable that builds the fingerprint of each step
            molecule from an RDKit mol.
        fp_name: Fingerprint key. ``'mhfp6'`` is scored with ``jaccard_sim``;
            every other value uses ``DataStructs.FingerprintSimilarity``.

    Returns:
        A list holding one similarity per position ``1..last_step`` (the last
        entry compares the final molecule with itself). An entry is ``None``
        when that step could not be parsed or scored. Returns an empty list
        when ``last_step < 1`` and ``None`` when the final molecule cannot be
        parsed.
    """
    if last_step < 1:
        # Position 0 is the label column in the CSV template, so there is no
        # molecule to score; never try to parse the label as SMILES.
        return []
    final_mol = _mol_from_cell(row.iloc[last_step])
    if final_mol is None:
        return None
    final_fp = final_fp_func(final_mol)
    sims = []
    for i in range(1, last_step + 1): #+1 ensures final molecule is calculated against itself which gives 1 at end
        step_mol = _mol_from_cell(row.iloc[i])
        if step_mol is None:
            sims.append(None)
            continue
        try:
            step_fp = step_fp_func(step_mol)
            if fp_name == 'mhfp6':
                # mhfp6 needs the manual jaccard calc
                sim = jaccard_sim(final_fp, step_fp)
            else:
                sim = DataStructs.FingerprintSimilarity(final_fp, step_fp) #tan-sim calc for remaining fps
        except Exception:
            # Best effort: one failing step is recorded as None, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            sim = None
        sims.append(sim)
    return sims

#fingerprint setup
#all standard parameters, see respective documentation to change individual parameters
def rdkit_fp(mol):
    """Return the ``RDKFingerprint`` of ``mol`` using the library defaults."""
    fingerprint = RDKFingerprint(mol)
    return fingerprint
def morgan_fp(mol):
    """Return the Morgan bit-vector fingerprint of ``mol`` at radius 2."""
    fingerprint = GetMorganFingerprintAsBitVect(mol, radius=2)
    return fingerprint
def atom_pair_fp(mol):
    """Return the hashed atom-pair bit-vector fingerprint of ``mol`` (defaults)."""
    fingerprint = GetHashedAtomPairFingerprintAsBitVect(mol)
    return fingerprint
def mhfp6_fp(mol):
    """Encode ``mol`` with the module-level ``mhfp_encoder`` and return the result."""
    encoded = mhfp_encoder.EncodeMol(mol)
    return encoded
def avalon_fp(mol):
    """Return the Avalon fingerprint of ``mol`` using the library defaults."""
    fingerprint = pyAvalonTools.GetAvalonFP(mol)
    return fingerprint
def maccs_fp(mol):
    """Return the MACCS keys fingerprint of ``mol``."""
    fingerprint = MACCSkeys.GenMACCSKeys(mol)
    return fingerprint

# used only for mhfp6 tanimoto/jaccard score calc
def jaccard_sim(fp1, fp2):
    """Return the Jaccard index of the distinct values in two fingerprints.

    Both arguments are converted to sets, so duplicates and ordering are
    ignored. The result is ``|A & B| / |A | B|``, or ``0`` when both inputs
    are empty.
    """
    # NOTE(review): MHFP vectors are MinHash signatures, and RDKit's
    # MHFPEncoder.Distance compares them position by position. Confirm that
    # this set-based estimate is the intended score.
    values_a, values_b = set(fp1), set(fp2)
    combined = values_a.union(values_b)
    if len(combined) == 0:
        return 0
    shared = values_a.intersection(values_b)
    return len(shared) / len(combined)

# Maps the fingerprint name to the function that builds that fingerprint
# from an RDKit mol. The name is used in the output file name and to select
# the similarity measure ('mhfp6' is scored differently in calc_sim).
#comment out whatever you don't want to run
fp_types = {
    'rdkit': rdkit_fp,
    'morgan': morgan_fp,
    'atom_pair': atom_pair_fp,
    'mhfp6': mhfp6_fp,
    'avalon': avalon_fp,
    'maccs': maccs_fp,
}

# looping thru csv
#psa you must have header row and column, see csv template
def _find_last_step(row):
    """Return the position of the last filled cell before the first blank one.

    Position 0 is the label column and is never inspected. When no blank cell
    is found, the last position of the row is returned.
    """
    for i in range(1, len(row)): #skips header column set to 0 if no header column just smiles
        cell = row.iloc[i]
        if pd.isna(cell) or str(cell).strip() == '':
            return i - 1
    return len(row) - 1


# The last filled position depends only on the CSV contents, not on the
# fingerprint, so it is computed once instead of once per fingerprint type.
last_steps = [_find_last_step(row) for _, row in df.iterrows()]

for fp_name, fp_func in fp_types.items():
    sim_data = []
    for (_, row), last_step in zip(df.iterrows(), last_steps):
        # A row with no SMILES at all (last_step == 0) is not scored, so the
        # label cell is never treated as a molecule.
        sims = None
        if last_step >= 1:
            sims = calc_sim(row, last_step, fp_func, fp_func, fp_name)
        # Keep exactly one entry per input row. The output is joined to the
        # label column by position, so dropping a failed row would pair every
        # later label with the wrong similarities.
        sim_data.append(sims if sims is not None else [])

    #output results to csvs
    sim_df = pd.DataFrame(sim_data)
    # Name the similarity columns after the matching step columns of the input
    sim_df.columns = df.columns[1:1 + sim_df.shape[1]]
    output = pd.concat([df.iloc[:, [0]].reset_index(drop=True), sim_df], axis=1)
    output.to_csv(f'ecscore_{fp_name}.csv', index=False)