In [None]:
'''This code takes a CSV in which each row holds the SMILES strings for the intermediates of one total synthesis,
identifies the target molecule (the final SMILES in the row) and iteratively calculates the Tanimoto similarity of
each intermediate to that target molecule. The workflow is intended for both RDKit fingerprints and Avalon fingerprints
(only the RDKit fingerprint is registered in the code below); the similarities for each fingerprint type are written
to their own new CSV.'''

import pandas as pd
from rdkit import Chem
from rdkit.Chem import RDKFingerprint, AllChem, DataStructs
from rdkit.Avalon import pyAvalonTools


In [None]:
# Path to the input CSV. Left empty here: set it before running, because it is
# passed unchanged to pd.read_csv further down.
file = ''

#tanimoto function
def calculate_similarity(row, last_valid_step, final_fp_func, step_fp_func, fingerprint_name):
    """Score every intermediate of one synthesis against its target molecule.

    Args:
        row: Mapping-like record (e.g. a pandas Series) whose keys '1', '2', ...
            hold the SMILES string of each synthesis step.
        last_valid_step: Step number of the target molecule; steps
            1 .. last_valid_step - 1 are compared against it.
        final_fp_func: Callable that turns the target molecule into a fingerprint.
        step_fp_func: Callable that turns an intermediate molecule into a fingerprint.
        fingerprint_name: Label of the fingerprint type; used in diagnostics only.

    Returns:
        A dict mapping 'Step_<i>' to the value returned by
        DataStructs.FingerprintSimilarity for step i versus the target, or
        None when the target SMILES is missing, blank or cannot be parsed.
        Steps that cannot be parsed or fingerprinted are reported and left out.
    """
    # A pandas Series carries its row label in .name; plain mappings do not.
    row_label = getattr(row, 'name', None)

    try:
        final_structure_smiles = row[str(last_valid_step)]
    except KeyError:
        # Happens e.g. when the very first step is empty and last_valid_step is 0.
        print(f"No column '{last_valid_step}' holding a target SMILES for row {row_label}")
        return None

    # NaN (empty CSV cell) and other non-string values would make
    # Chem.MolFromSmiles raise instead of returning None, so reject them here.
    if not isinstance(final_structure_smiles, str) or not final_structure_smiles.strip():
        print(f"Missing target SMILES at step {last_valid_step} for row {row_label}")
        return None

    final_mol = Chem.MolFromSmiles(final_structure_smiles)
    if final_mol is None:
        print(f"Invalid target SMILES '{final_structure_smiles}' at step {last_valid_step} for row {row_label}")
        return None

    final_fp = final_fp_func(final_mol)

    row_similarities = {}
    for i in range(1, last_valid_step):
        step_smiles = row[str(i)]
        # Only strings are handed to RDKit; anything else is treated as invalid.
        step_mol = Chem.MolFromSmiles(step_smiles) if isinstance(step_smiles, str) else None
        if step_mol is None:
            print(f"Invalid SMILES '{step_smiles}' at step {i} for row {row_label}")
            continue

        try:
            step_fp = step_fp_func(step_mol)
            similarity = DataStructs.FingerprintSimilarity(final_fp, step_fp)
        except Exception as e:
            # Best effort: one failing step must not discard the rest of the
            # row, but the failure is reported instead of silently dropped.
            print(f"Could not compute {fingerprint_name} similarity at step {i} for row {row_label}: {e}")
            continue

        row_similarities[f'Step_{i}'] = similarity

    return row_similarities

# Fingerprint generators (only the RDKit one is defined here)
def rdkit_fp(mol):
    """Return the fingerprint that RDKFingerprint computes for ``mol``."""
    fingerprint = RDKFingerprint(mol)
    return fingerprint

# Registry of fingerprint generators; each key is also used in the name of the
# CSV written for that fingerprint type.
fingerprint_types = {
    'rdkit': rdkit_fp
}
df = pd.read_csv(file)

# Step columns are looked up by the names '1', '2', ...; the highest step
# number tried is one less than the number of columns in the CSV.
max_step = len(df.columns) - 1

# Iterate over every fingerprint type and every row (synthesis) of the CSV.
for fingerprint_name, fingerprint_func in fingerprint_types.items():
    similarities = []
    # Names are collected together with the rows that are actually kept, so
    # that a skipped row cannot shift the names against the similarities.
    kept_names = []

    for idx, row in df.iterrows():
        # The target is the last step before the first empty cell; when no
        # cell is empty it is the final step column.
        last_valid_step = max_step
        for i in range(1, max_step + 1):
            cell = row[str(i)]
            # str() keeps a non-string cell from crashing the blank check.
            if pd.isna(cell) or str(cell).strip() == '':
                last_valid_step = i - 1
                break

        # E.g. an empty first step gives step 0, for which no column exists.
        if str(last_valid_step) not in row.index:
            print(f"No target step found for row {idx}; skipping row")
            continue

        row_similarities = calculate_similarity(row, last_valid_step, fingerprint_func, fingerprint_func, fingerprint_name)
        if row_similarities is not None:
            similarities.append(row_similarities)
            # Second CSV column is used as the row's name, as before.
            # NOTE(review): confirm that this column really holds the names.
            kept_names.append(row.iloc[1])

    # One output row per kept synthesis, with the name in an unnamed first column.
    similarity_df = pd.DataFrame(similarities)
    similarity_df.insert(0, '', kept_names)
    similarity_df.to_csv(f'ECScore_{fingerprint_name}.csv', index=False)