In [None]:
import pathlib
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import Lipinski

%matplotlib inline


In [None]:
# Matplotlib defaults shared by every figure drawn in this notebook.
figure_style = {
    # --- saving, grid and spines ---
    "savefig.transparent": True,   # saved figures get a transparent background
    "axes.grid": False,            # no grid lines
    "axes.spines.bottom": True,    # keep only the bottom spine ...
    "axes.spines.left": False,     # ... and hide the left,
    "axes.spines.right": False,    # right
    "axes.spines.top": False,      # and top ones

    # --- fonts ---
    "font.size": 6,                # base font size
    "font.family": "sans-serif",
    "font.sans-serif": ["Helvetica", "Arial"],
    "text.color": "black",         # default text colour

    # --- tick padding ---
    "xtick.major.pad": 1,
    "xtick.minor.pad": 1,
    "ytick.major.pad": 1,
    "ytick.minor.pad": 1,

    # --- axis labels and margins ---
    "axes.labelweight": "bold",    # bold axis labels
    "axes.labelpad": 2.5,          # distance between axis and its label
    "axes.xmargin": 0.05,          # margin on the x-axis

    # --- text sizes and colours of individual elements ---
    "axes.labelsize": 6,
    "axes.titlesize": 6,
    "xtick.labelsize": 6,
    "ytick.labelsize": 6,
    "legend.fontsize": 6,
    "svg.fonttype": "none",        # keep text editable in exported SVGs
    "axes.labelcolor": "black",
    "xtick.color": "black",
    "ytick.color": "black",
}
plt.rcParams.update(figure_style)

In [None]:
# Read the 23 numbered SDF chunks (data_chunk01 ... data_chunk23) and stack them into one frame.
# The file names are passed as plain strings, exactly as before.
df_list = []
for chunk_no in range(1, 24):
    chunk_file = f"../data/PRIME_VL_2024-04-18_annotated/data_chunk{chunk_no:02d}.sdf"
    df_list.append(PandasTools.LoadSDF(chunk_file))
df_all = pd.concat(df_list)
df_all.head()

In [None]:
# QikProp does not report the fraction of sp3 carbons, so compute it with RDKit
# from the molecule objects stored in the "ROMol" column.
fraction_sp3 = df_all["ROMol"].map(Lipinski.FractionCSP3)
df_all["fsp3"] = fraction_sp3

In [None]:
# Reduce the frame to the columns analysed below and cast each to an explicit dtype.
# One mapping drives both steps; its key order is the column order of `df`.
selected_dtypes = {
    "ID": object,                     # vl member ID
    "r_qp_mol_MW": float,             # mol weight
    "r_qp_QPlogPo/w": float,          # logP
    "r_qp_donorHB": float,            # HB donors
    "r_qp_accptHB": float,            # HB acceptors
    "i_qp_#rotor": float,             # rotatable bonds
    "i_qp_RuleOfFive": float,         # Ro5 violations
    "fsp3": float,                    # fraction of sp3 carbons
    # n.b. PMIs are calculated with Schrödinger's calculate_pmi.py script
    "r_user_PMI_ratio_I1/I3": float,
    "r_user_PMI_ratio_I2/I3": float,
    # Epik can generate multiple ionization states per molecule;
    # this column gives the relative population of each state.
    "r_epik_Population": float,
}
df = df_all[list(selected_dtypes)].astype(selected_dtypes)

In [None]:
# reset df index (for unique access with df.loc in weighted mean function)
# df_all comes from a plain pd.concat of the per-chunk frames (no ignore_index), so row
# labels presumably repeat across chunks -- TODO confirm. A fresh 0..n-1 index gives every
# row its own label, and makes labels coincide with row positions.
df = df.reset_index(drop=True)

In [None]:
def weighted_mean(x, population=None):
    """Population-weighted average of the values in ``x``.

    Parameters
    ----------
    x : pandas.Series
        Values of one property for a set of rows (e.g. one group of a groupby).
    population : pandas.Series, optional
        Weights, looked up by the index labels of ``x``. Defaults to the
        ``"r_epik_Population"`` column of the notebook-level frame ``df``.

    Returns
    -------
    float
        ``sum(x * w) / sum(w)`` over the rows of ``x``.
    """
    if population is None:
        population = df["r_epik_Population"]
    return np.average(x, weights=population.loc[x.index])


def pick_max(x, population=None):
    """Value of ``x`` at the row with the highest population among the rows of ``x``.

    The maximum is searched only within ``x.index``. Taking ``idxmax()`` over the
    whole population column (as the previous lambda did) returns a label that is
    usually not part of ``x``, which raises ``KeyError`` or picks a foreign row.

    Parameters
    ----------
    x : pandas.Series
        Values of one property for a set of rows (e.g. one group of a groupby).
    population : pandas.Series, optional
        Populations, looked up by the index labels of ``x``. Defaults to the
        ``"r_epik_Population"`` column of the notebook-level frame ``df``.

    Returns
    -------
    scalar
        The entry of ``x`` belonging to the most populated row.
    """
    if population is None:
        population = df["r_epik_Population"]
    return x.loc[population.loc[x.index].idxmax()]

In [None]:
# Scratch check of the lookup pattern used by weighted_mean: take the index of a
# small hand-picked selection and use it to fetch the matching populations.
probe_index = df.loc[[5, 4, 299], "fsp3"].index
df.loc[probe_index, "r_epik_Population"]

In [None]:
# Population-weighted mean of every property, per molecule ID:
#     mean = sum_i(population_i * value_i) / sum_i(population_i)
# where i runs over the rows (ionization states) sharing one ID.
averaged_columns = [
    "r_qp_mol_MW",
    "r_qp_QPlogPo/w",
    "r_qp_donorHB",
    "r_qp_accptHB",
    "i_qp_#rotor",
    "i_qp_RuleOfFive",
    "fsp3",
    "r_user_PMI_ratio_I1/I3",
    "r_user_PMI_ratio_I2/I3",
]

# 1. weight each row's properties by that row's population
df[averaged_columns] = df[averaged_columns].mul(df["r_epik_Population"], axis=0)

# 2. sum the weighted properties (and the populations themselves) per molecule ID
df = df.groupby("ID").sum()

# 3. normalise by the summed population of each molecule
df[averaged_columns] = df[averaged_columns].div(df["r_epik_Population"], axis=0)

# 4. turn the ID index back into a regular column
df = df.reset_index()

In [None]:
# Colour cycle for the figures (chosen to work for colour-blind readers).
palette = [
    "#5790fc",
    "#f89c20",
    "#e42536",
    "#a1212c",
]

In [None]:
# Histogram of molecular weight with a dashed reference line at 500.
# Create the output folder first: savefig does not create missing directories,
# and this is the first of several cells that write into it, so a fresh checkout
# would otherwise fail here.
output_dir = pathlib.Path("../results/2024-04-18/properties")
output_dir.mkdir(parents=True, exist_ok=True)

plt.figure(figsize=(3.25, 3))
plt.hist(df["r_qp_mol_MW"], bins=20, rwidth=0.9, color=palette[0])
plt.axvline(500, color="black", ls="--")
plt.xlabel('Molecular weight')
plt.ylabel('Count')
plt.tight_layout()
plt.savefig("../results/2024-04-18/properties/mw.svg")
plt.show()

In [None]:
# Histogram of log P (20 equal-width bins) with a dashed reference line at 5.
fig, ax = plt.subplots(figsize=(3.25, 3))
ax.hist(df["r_qp_QPlogPo/w"], bins=20, rwidth=0.9, color=palette[0])
ax.axvline(5, color="black", ls="--")
ax.set_xlabel('Log P')
ax.set_ylabel('Count')
fig.tight_layout()
fig.savefig("../results/2024-04-18/properties/logp.svg")
plt.show()

In [None]:
# Histogram of H-bond donors in unit-wide bins centred on integers, with a dashed
# reference line at 5.5.
# The bins always reach the reference line and grow further if the data require it.
# A fixed upper edge of 4.5 would silently drop every molecule with more donors
# from the plot.
donor_counts = df["r_qp_donorHB"]
highest = donor_counts.max()
if pd.isna(highest):
    n_unit_bins = 6  # no data: still span -0.5 ... 5.5
else:
    n_unit_bins = max(6, int(np.ceil(highest)) + 1)
donor_bins = [-0.5 + i for i in range(n_unit_bins + 1)]

plt.figure(figsize=(3.25, 3))
plt.hist(donor_counts, bins=donor_bins, rwidth=0.9, color=palette[0])
plt.axvline(5.5, color="black", ls="--")
plt.xlabel('H bond donors')
plt.ylabel('Count')
plt.tight_layout()
plt.savefig("../results/2024-04-18/properties/HBdonor.svg")
plt.show()

In [None]:
# Histogram of H-bond acceptors: unit-wide bins with edges -0.5 ... 17.5,
# dashed reference line at 10.5, ticks on every second integer.
acceptor_edges = np.arange(19) - 0.5
fig, ax = plt.subplots(figsize=(3.25, 3))
ax.hist(df["r_qp_accptHB"], bins=acceptor_edges, rwidth=0.9, color=palette[0])
ax.axvline(10.5, color="black", ls="--")
ax.set_xlabel('H bond acceptors')
ax.set_ylabel('Count')
ax.set_xticks(list(range(0, 19, 2)))
fig.tight_layout()
fig.savefig("../results/2024-04-18/properties/HBacceptor.svg")
plt.show()

In [None]:
# Histogram of rotatable bonds: unit-wide bins with edges -0.5 ... 19.5,
# dashed reference line at 10.5, ticks on every second integer.
rotor_edges = np.arange(21) - 0.5
fig, ax = plt.subplots(figsize=(3.25, 3))
ax.hist(df["i_qp_#rotor"], bins=rotor_edges, rwidth=0.9, color=palette[0])
ax.axvline(10.5, color="black", ls="--")
ax.set_xlabel('Rotatable bonds')
ax.set_ylabel('Count')
ax.set_xticks(list(range(0, 21, 2)))
fig.tight_layout()
fig.savefig("../results/2024-04-18/properties/rotbonds.svg")
plt.show()

In [None]:
# Histogram of the fraction of sp3 carbons: 20 bins of width 0.05 on [0, 1],
# dashed reference line at 0.5.
fsp3_edges = np.arange(21) / 20
fig, ax = plt.subplots(figsize=(3.25, 3))
ax.hist(df["fsp3"], bins=fsp3_edges, rwidth=0.9, color=palette[0])
ax.axvline(0.5, color="black", ls="--")
ax.set_xlabel('Fraction of sp3 carbons')
ax.set_ylabel('Count')
ax.set_xlim(0, 1)
fig.tight_layout()
fig.savefig("../results/2024-04-18/properties/fsp3.svg")
plt.show()

In [None]:
# Rotatable-bond histogram drawn from an explicit float NumPy array.
# NOTE(review): this repeats the earlier rotatable-bonds cell and writes to the same
# rotbonds.svg, overwriting that file -- one of the two cells is probably redundant.
rotor_values = df["i_qp_#rotor"].astype(float).to_numpy()
rotor_edges = np.arange(21) - 0.5
fig, ax = plt.subplots(figsize=(3.25, 3))
ax.hist(rotor_values, bins=rotor_edges, rwidth=0.9, color=palette[0])
ax.axvline(10.5, color="black", ls="--")
ax.set_xlabel('Rotatable bonds')
ax.set_ylabel('Count')
ax.set_xticks(list(range(0, 21, 2)))
fig.tight_layout()
fig.savefig("../results/2024-04-18/properties/rotbonds.svg")
plt.show()

In [None]:
# How many molecules are Ro5 compliant?
# True = fewer than 2 violations. The column holds population-weighted averages
# at this point, so values need not be integers.
is_ro5_compliant = df["i_qp_RuleOfFive"].astype(float).lt(2)
is_ro5_compliant.value_counts()

In [None]:
# Which fraction of all molecules is Ro5 compliant (fewer than 2 violations)?
compliance_counts = (df["i_qp_RuleOfFive"].astype(float) < 2).value_counts()
compliance_counts / len(df)

In [None]:
# Two-bar histogram of rule-of-five violations: "0 or 1" versus ">1".
# Bin edges are -0.5, 1.5 and 3.5. Values are capped at 3 before binning so that
# molecules above the last edge (e.g. 4 violations -- presumably the maximum, since
# the rule of five has four criteria; TODO confirm against QikProp docs) are
# counted in the ">1" bar instead of being dropped from the plot.
ro5_values = df["i_qp_RuleOfFive"].astype(float).to_numpy()
ro5_capped = np.clip(ro5_values, None, 3)

plt.figure(figsize=(3.25, 3))
plt.hist(ro5_capped, bins=[-0.5 + i for i in range(0,5,2)], rwidth=0.9, color=palette[0])
plt.axvline(1.5, color="black", ls="--")
plt.xlabel('Rule-of-5 violations')
plt.ylabel('Count')
# ticks sit at the centres of the two bins
plt.xticks([0.5, 2.5], ["0 or 1", ">1"])
plt.xlim(-0.5, 3.5)
plt.tight_layout()
plt.savefig("../results/2024-04-18/properties/ro5_violations.svg")
plt.show()

In [None]:
# prepare pmi plot

# Estimate the local point density on a 100 x 100 grid over [0, 1] x [0, 1]
x = df["r_user_PMI_ratio_I1/I3"]
y = df["r_user_PMI_ratio_I2/I3"]
hist, xedges, yedges = np.histogram2d(x, y, bins=100, range=[[0, 1], [0, 1]])

# Find the bin indices for each point.
# np.digitize uses half-open bins, so a value equal to the last edge (1.0) maps to
# index 100 although histogram2d counts it in the last bin (99); values below 0 map
# to -1, which would silently wrap around. Clipping keeps every index inside the grid.
x_bin_idx = np.clip(np.digitize(x, xedges) - 1, 0, hist.shape[0] - 1)
y_bin_idx = np.clip(np.digitize(y, yedges) - 1, 0, hist.shape[1] - 1)

# Assign density (from the histogram) to each point
density = hist[x_bin_idx, y_bin_idx]

# Sort points by density (lowest density first, highest last) so that dense
# regions are drawn on top in the scatter plot.
sorted_indices = np.argsort(density)
# argsort yields row positions, so select positionally (.iloc) instead of by label;
# this no longer relies on the index being exactly 0..n-1.
x_sorted = x.iloc[sorted_indices]
y_sorted = y.iloc[sorted_indices]
density_sorted = density[sorted_indices]


In [None]:
fig, ax = plt.subplots(figsize=(5, 4))

# Triangle boundary for sphere-like, rod-like, and disc-like shapes;
# the first corner is repeated to close the outline.
corners = np.array([[0, 1], [0.5, 0.5], [1, 1], [0, 1]])
ax.plot(corners[:, 0], corners[:, 1], color="black")

# Normalized moments of inertia, coloured by the local point density
points = ax.scatter(x_sorted, y_sorted, c=density_sorted, cmap="viridis", s=.5)
fig.colorbar(points, label='Density')

# Axis limits, labels and spines (the bottom spine is hidden for this figure)
ax.set_xlim(0, 1)
ax.set_ylim(0.5, 1)
ax.spines["bottom"].set_visible(False)
ax.set_ylabel("NPR2")
ax.set_xlabel("NPR1")

fig.tight_layout()

# Save as vector and as 300 dpi raster image
fig.savefig("../results/2024-04-18/properties/pmi.svg")
fig.savefig("../results/2024-04-18/properties/pmi.png", dpi=300)
plt.show()