# MACCSKeys analysis

In [None]:
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import MACCSkeys, Draw

In [None]:
# Load the first data set (the one labelled "PRIME" in the comparison figure
# further down) and preview its first five rows.
df = pd.read_csv("../data/dataS4_with_fps.csv.bz2")
df.head(n=5)

In [None]:
# Build the MACCS matrix: every entry of the "MACCSKeysFP" column is iterated
# element by element, each element is converted with int(), and the resulting
# rows are stacked into one 2-D array (one row per compound).
maccs = np.stack([np.array(list(map(int, fp))) for fp in df["MACCSKeysFP"]])
maccs.shape

In [None]:
# Sum of the column-wise maxima; for a 0/1 matrix this is the number of bit
# positions (out of 167) that are set in at least one compound.
maccs.max(axis=0).sum()

In [None]:
# Repeat the check on a reproducible (fixed-seed) random subset of 50000 rows.
# `random_idx` is reused by the comparison figure further down.
random.seed(567)
random_idx = random.sample(range(maccs.shape[0]), 50000)
maccs[random_idx].max(axis=0).sum()

In [None]:
# Per-bit frequency: column sum divided by the number of compounds, rendered
# as "<bit index>: <percentage>" with no decimals.
[
    f"{bit}: {fraction:.0%}"
    for bit, fraction in enumerate(maccs.sum(axis=0) / len(maccs))
]

In [None]:
# Bar chart of the per-bit frequency (column sum / number of compounds),
# one bar per bit index.
plt.bar(range(maccs.shape[1]), maccs.sum(axis=0) / len(maccs))

In [None]:
# Structural rationale for a single key: look up entry 66 in RDKit's
# MACCSkeys.smartsPatts, take its first element (the SMARTS string) and draw it.
# From memory (not verified here): the first bit is always 0, and bits 125 and
# 166 have non-SMARTS definitions.
Draw.MolToImage(
    Chem.MolFromSmarts(
        MACCSkeys.smartsPatts[66][0]
    )
)

In [None]:
# Load the ChEMBL reference set used for the comparison and preview its
# first five rows.
df_chembl = pd.read_csv("../data/chembl_34_50k-random_with_FP.csv")
df_chembl.head(n=5)

In [None]:
# Build the ChEMBL MACCS matrix: every entry of the "MACCSKeysFP" column is
# iterated element by element, each element is converted with int(), and the
# resulting rows are stacked into one 2-D array (one row per compound).
maccs_chembl = np.stack(
    [np.array(list(map(int, fp))) for fp in df_chembl["MACCSKeysFP"]]
)
maccs_chembl.shape

In [None]:
# Sum of the column-wise maxima; for a 0/1 matrix this is the number of bit
# positions (out of 167) that are set in at least one ChEMBL compound.
maccs_chembl.max(axis=0).sum()

In [None]:
# Bar chart of the per-bit frequency in the ChEMBL matrix
# (column sum / number of compounds), one bar per bit index.
plt.bar(range(maccs_chembl.shape[1]), maccs_chembl.sum(axis=0) / len(maccs_chembl))

In [None]:
# Quick overlay of both frequency profiles on the same axes. The red ChEMBL
# bars are drawn second, at the same positions, so they sit on top of the
# bars from `maccs`.
plt.bar(range(maccs.shape[1]), maccs.sum(axis=0) / len(maccs))
plt.bar(
    range(maccs_chembl.shape[1]),
    maccs_chembl.sum(axis=0) / len(maccs_chembl),
    color="red",
)

In [None]:
# Grouped bar chart comparing the per-bit frequency of the two data sets.
# Note: the first series is computed on the rows selected by `random_idx`
# (the 50000-compound subsample drawn earlier), not on the full `maccs` matrix.
maccs_mean = np.mean(maccs[random_idx], axis=0)
maccs_chembl_mean = np.mean(maccs_chembl, axis=0)

# Two bars per bit index, each shifted by half a bar width to either side
bar_width = 0.4
index = np.arange(maccs.shape[1])

plt.figure(figsize=(6.75, 4))

# Label each series with the name that should appear in the legend, so the
# legend cannot silently go out of sync with the plotting order.
plt.bar(index - bar_width/2, maccs_mean, bar_width, label="PRIME", color="#4b4c68")
plt.bar(index + bar_width/2, maccs_chembl_mean, bar_width, label="ChEMBL", color=(175/256, 87/256, 38/256))

# Axis labels
plt.xlabel('MACCSKeys index', fontsize=8)
plt.ylabel('Mean', fontsize=8)

# Tick label sizes
plt.xticks(fontsize=6)
plt.yticks(fontsize=6)

# Legend entries are taken from the `label=` arguments above
plt.legend(fontsize=6)

# Horizontal gridlines for easier reading of bar heights
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Fix the x-range. With a left limit of 0 the first-series bar at index 0
# (which spans -0.4 to 0) lies outside the visible range.
plt.xlim(0, 167)

plt.tight_layout()

# Write the figure to disk (SVG with transparent background)
plt.savefig("../results/2024-04-18/MACCSKeysAnalysis.svg", transparent=True)

In [None]:
# Global Matplotlib style settings. They apply to figures created after this
# cell has run; in a top-to-bottom run that is after the savefig cell above.
plt.rcParams.update({
    # --- output ---
    "savefig.transparent": True,   # save figures with a transparent background
    "svg.fonttype": "none",        # keep text as text in SVGs (editable)

    # --- axes: grid, spines, margins ---
    "axes.grid": False,            # no grid
    "axes.spines.bottom": True,    # only the bottom spine is drawn
    "axes.spines.left": False,
    "axes.spines.right": False,
    "axes.spines.top": False,
    "axes.xmargin": 0.05,          # x-axis margin

    # --- axes: labels and titles ---
    "axes.labelweight": "bold",    # bold axis labels
    "axes.labelpad": 2.5,          # padding between axis and label
    "axes.labelsize": 6,
    "axes.labelcolor": "black",
    "axes.titlesize": 6,

    # --- fonts and text ---
    "font.size": 6,                # base font size
    "font.family": "sans-serif",
    "font.sans-serif": ["Helvetica", "Arial"],
    "text.color": "black",
    "legend.fontsize": 6,

    # --- ticks ---
    "xtick.major.pad": 0.0,        # no padding between ticks and tick labels
    "xtick.minor.pad": 0.0,
    "ytick.major.pad": 0.0,
    "ytick.minor.pad": 0.0,
    "xtick.labelsize": 6,
    "ytick.labelsize": 6,
    "xtick.color": "black",
    "ytick.color": "black",
})

# To go back to the Matplotlib defaults, use: plt.style.use('default')

In [None]:
# Bit indices whose mean differs between the two matrices by more than 0.5
# in absolute terms (all bits considered, full `maccs` matrix).
np.nonzero(np.abs(maccs.mean(axis=0) - maccs_chembl.mean(axis=0)) > 0.5)

In [None]:
# Draw the SMARTS pattern stored for entry 88 of MACCSkeys.smartsPatts.
Draw.MolToImage(
    Chem.MolFromSmarts(
        MACCSkeys.smartsPatts[88][0]
    )
)

The biggest absolute differences are on bits that are "always on" for synferm, 
e.g. 
- bit 36 (sulfur in a ring)
- bit 47 (sulfur and nitrogen connected to the same atom)
- bit 81 (sulfur connected to an atom with two other substituents)
- bit 88 (sulfur)

It is not very informative to look at these bits that are always on for synferm compounds, because the substructure is contained in the backbone. We therefore remove all bits that are always on for synferm and repeat the analysis.

In [None]:
# Indices of the bits whose column mean in `maccs` is not exactly 1,
# i.e. the bits that are not set in every compound.
mask = np.nonzero(maccs.mean(axis=0) != 1)[0]
mask

In [None]:
# Among the bits kept in `mask`: those whose mean in `maccs` exceeds the
# ChEMBL mean by more than 0.3 (one-sided), reported as bit indices.
mask[(maccs.mean(axis=0) - maccs_chembl.mean(axis=0))[mask] > 0.3]

In [None]:
# Repeat the |difference| > 0.5 search, this time restricted to the bits in
# `mask` (the bits that are not always on in `maccs`). Without the mask this
# cell only reproduced the earlier, unmasked result. The hits are mapped back
# to bit indices via `mask`.
mask[
    np.abs(np.mean(maccs, axis=0) - np.mean(maccs_chembl, axis=0))[mask] > 0.5
]

In [None]:
# Draw the SMARTS pattern stored for entry 52 of MACCSkeys.smartsPatts.
Draw.MolToImage(
    Chem.MolFromSmarts(
        MACCSkeys.smartsPatts[52][0]
    )
)

Ignoring the "always on" bits, the biggest remaining differences are, e.g.:
- bit 52 (a nitrogen-nitrogen bond); overrepresented in PRIME
- bit 77 (two nitrogens connected to the same atom); underrepresented in PRIME
- bit 104 (a complex pattern involving a hydrogen-carrying heteroatom with a methylene group in the beta position); overrepresented in PRIME
- bit 135 (nitrogen substituent on an aromatic ring); underrepresented in PRIME


In [None]:
# How many bits are within 10% deviation (difference of the means below 0.1
# in either direction)? The absolute value is needed: the signed difference
# is also below 0.1 for every bit that is under-represented in `maccs`,
# however large that deviation is.
np.where(np.abs(np.mean(maccs, axis=0) - np.mean(maccs_chembl, axis=0)) < 0.1)[0].shape