# Molecular fingerprints

First, let's install and import the packages we will need:

In [None]:
! pip install rdkit

from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import AllChem
from rdkit.Chem import MolFromSmiles,MolToSmiles
from rdkit.Chem import Draw
from rdkit.Chem.Draw import rdMolDraw2D
from IPython.display import SVG, display
import numpy as np
from collections import defaultdict

Like before, we need to load in our SMILES string and convert it to an RDKit mol object:

In [None]:
# SMILES string for the example molecule used in the rest of the notebook.
smi = "c1ccc(C[CH]2CO2)cc1"
mol = MolFromSmiles(smi)
# MolFromSmiles returns None (it does not raise) when the SMILES cannot be
# parsed, so fail here with a clear message rather than in a later cell.
if mol is None:
    raise ValueError(f"Could not parse SMILES string: {smi!r}")

Before generating our fingerprints, let's visualize the molecule:

In [None]:
# Render the molecule to a 300x300 SVG with atom indices shown, so atoms can
# be referred to by number in the cells below.
d2d = rdMolDraw2D.MolDraw2DSVG(300, 300)
draw_options = d2d.drawOptions()
draw_options.addAtomIndices = True

rdMolDraw2D.PrepareAndDrawMolecule(d2d, mol)
d2d.FinishDrawing()

# Pull the finished SVG text out of the drawer and show it inline.
svg = d2d.GetDrawingText()
display(SVG(svg))

Now let's generate our fingerprint. We can easily do this in RDKit (here with a radius of 2):

In [None]:
# Build the Morgan fingerprint (radius 2) with the generator API instead of
# the deprecated AllChem.GetMorganFingerprint, matching the generator usage
# elsewhere in this notebook.
morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2)

# Ask the generator to record, for every bit it sets, which
# (atom index, radius) environments were responsible.
additional_output = rdFingerprintGenerator.AdditionalOutput()
additional_output.AllocateBitInfoMap()

# Sparse (unfolded) count fingerprint, so bit ids are the raw identifiers.
fp = morgan_gen.GetSparseCountFingerprint(mol, additionalOutput=additional_output)

# info: {bit id: ((atom index, radius), ...)}
info = additional_output.GetBitInfoMap()

The `info` dictionary we have made maps each bit id in our fingerprint to the (atom index, radius) pairs that set it.

In [None]:
# Show the raw bit-info mapping: each key is a bit id and each value holds the
# (atom index, radius) pairs recorded for that bit.
print(info)

Let's have a look at the first bit and see what atoms and radii are involved:

In [None]:
# Pick the first bit id (in the dictionary's iteration order) once, then
# report every (atom index, radius) environment stored under it.
first_key = list(info)[0]
print(f"The first bit set in the fingerprint is bit number {first_key} and it corresponds to the following substructures:")

substructures = info[first_key]
for i, (atom_idx, radius) in enumerate(substructures):
    print(f"Substructure {i+1}: atom index {atom_idx}, radius {radius}")

All of these bits are combined together to give a string of 0s and 1s which encodes the 2D structure of the molecule:

In [None]:
# Morgan fingerprint generator: radius 2, folded to a fixed length of 1024 bits.
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)
# Compute the bit vector for our molecule as a NumPy array of 0s and 1s.
# The following cell prints `fp_array`, so it has to be defined here.
fp_array = mfpgen.GetFingerprintAsNumPy(mol)

In [None]:
# Raise NumPy's summarisation threshold to infinity so the whole array is
# printed instead of being abbreviated with "...". This changes the global
# print options for the rest of the session.
np.set_printoptions(threshold=np.inf)
print(fp_array)

Let's dig into how the bits are generated at each of the different atoms in the molecule. Have a play around by changing the `target_atom` variable to another atom in the molecule (use the image we generated above to get the atom numbers).

In [None]:
target_atom = 3

# For every environment centred on the target atom, gather the bond indices
# of that environment and the atom indices those bonds touch, keyed by radius.
atoms_by_radius = defaultdict(set)
bonds_by_radius = defaultdict(set)
for environments in info.values():
    for center, radius in environments:
        if center == target_atom:
            bond_ids = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, center)
            bonds_by_radius[radius].update(bond_ids)

            # The centre atom is always part of its own environment, even
            # when the environment contains no bonds.
            env_atoms = atoms_by_radius[radius]
            env_atoms.add(center)
            for bond_id in bond_ids:
                bond = mol.GetBondWithIdx(bond_id)
                env_atoms.update((bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()))

radii_sorted = sorted(atoms_by_radius)
if len(radii_sorted) == 0:
    raise ValueError(f"No radius information found in 'info' for atom index {target_atom}.")

# color per radius
def _hex_to_rgb_tuple(hex_color: str):
    """Convert a hex colour string into an (r, g, b) tuple of floats.

    Leading ``#`` characters are stripped. A three-digit form such as
    ``"#0a0"`` is expanded by doubling each digit. Each two-digit channel is
    parsed as hexadecimal and divided by 255.0.
    """
    digits = hex_color.lstrip('#')
    if len(digits) == 3:
        # shorthand: "abc" -> "aabbcc"
        digits = ''.join(ch + ch for ch in digits)
    channels = [digits[0:2], digits[2:4], digits[4:6]]
    return tuple(int(pair, 16) / 255.0 for pair in channels)

# Palette entries may be hex strings or ready-made RGB triples; normalise all
# of them to tuples.
_raw_palette = ["#BFD9F0", "#F1C1C1", "#BFDFC0", (0.30, 0.30, 0.90)]
palette = []
for entry in _raw_palette:
    if isinstance(entry, str):
        palette.append(_hex_to_rgb_tuple(entry))
    else:
        palette.append(tuple(entry))

# Assign one colour per radius, wrapping around if there are more radii than
# palette entries.
radius_colors = {}
for position, radius_value in enumerate(radii_sorted):
    radius_colors[radius_value] = palette[position % len(palette)]

# Group the bit ids by radius, keeping only environments centred on the
# target atom.
bitids_by_radius = defaultdict(set)
for identifier, environments in info.items():
    matching_radii = {radius for center, radius in environments if center == target_atom}
    for radius in matching_radii:
        bitids_by_radius[radius].add(identifier)

# For each radius: print the identifiers produced by the target atom at that
# radius, then render and display the molecule with that environment
# highlighted (nothing is written to disk).
for r in radii_sorted:
    bit_ids = sorted(bitids_by_radius.get(r, ()))
    if bit_ids:
        print(f"atom {target_atom}, radius {r} identifiers:")
        for bid in bit_ids:
            # mask to 32 bits so the hex form is at most eight digits
            print(f"  {bid} (0x{bid & 0xFFFFFFFF:08X})")

    # Every atom and bond in this environment shares the radius colour.
    colour = radius_colors[r]
    highlight_atoms = sorted(atoms_by_radius[r])
    highlight_bonds = sorted(bonds_by_radius[r])
    h_atom_colors = dict.fromkeys(highlight_atoms, colour)
    h_bond_colors = dict.fromkeys(highlight_bonds, colour)

    d2d = rdMolDraw2D.MolDraw2DSVG(300, 300)
    draw_options = d2d.drawOptions()
    draw_options.addAtomIndices = True
    d2d.DrawMolecule(
        mol,
        highlightAtoms=highlight_atoms,
        highlightBonds=highlight_bonds,
        highlightAtomColors=h_atom_colors,
        highlightBondColors=h_bond_colors,
        legend=f"atom {target_atom}, radius {r}",
    )
    d2d.FinishDrawing()

    svg_single = d2d.GetDrawingText()
    display(SVG(svg_single))