In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import colors


In [None]:
# Read the patent table from disk and discard the listed columns in one chain.
df = (
    pd.read_csv("patent_table.csv")
    .drop(columns=["cid", "smiles", "patent", "descriptor", "rank"])
)
# Bare trailing expression so the notebook renders the trimmed frame.
df

In [None]:
# Binary flag: 1 where the fingerprint value is at least 0.6, otherwise 0.
df["structure"] = (df["fingerprint"] >= 0.6).astype("int64")

In [None]:
# Query molecules analysed in this notebook; later cells iterate over this
# list in order and match its entries against the "query" column.
molecules = [
    "Penicillin",
    "Nirmatrelvir",
    "Zidovudine",
    "LSD",
    "Fentanyl",
    "Acid Blue 25 FA",
    "Avobenzone",
    "2-dPAC",
]

In [None]:
# Canonicalization schemes compared for each query molecule.
canons = ["RDKit Atom 0", "RDKit Atom n", "OEChem"]

# Parallel lists: position i of every list describes the same
# (molecule, canon) pair, appended in loop order.
cans = []
mols = []
pp = []  # structure == 1 and function == 1
mp = []  # structure == 0 and function == 1
pm = []  # structure == 1 and function == 0
mm = []  # structure == 0 and function == 0
druglike = []  # druglike == 1
dyelike = []  # dyelike == 1
unk_drugdyelike = []  # druglike == 0 and dyelike == 0

for molecule in molecules:
    for canon in canons:
        # Combine the conditions into ONE mask. Chaining `df[m1][m2]` applies a
        # mask built on the parent frame to an already-filtered frame, which
        # forces pandas to reindex the mask and emits a UserWarning each time.
        sub_df = df[(df["query"] == molecule) & (df["canon"] == canon)]

        struct_yes = sub_df["structure"] == 1
        struct_no = sub_df["structure"] == 0
        func_yes = sub_df["function"] == 1
        func_no = sub_df["function"] == 0

        mols.append(molecule)
        cans.append(canon)
        # Summing a boolean mask counts the matching rows; int() keeps plain
        # Python integers in the lists, as `.shape[0]` did.
        pp.append(int((struct_yes & func_yes).sum()))
        mp.append(int((struct_no & func_yes).sum()))
        pm.append(int((struct_yes & func_no).sum()))
        mm.append(int((struct_no & func_no).sum()))
        druglike.append(int((sub_df["druglike"] == 1).sum()))
        dyelike.append(int((sub_df["dyelike"] == 1).sum()))
        unk_drugdyelike.append(
            int(((sub_df["druglike"] == 0) & (sub_df["dyelike"] == 0)).sum())
        )

In [None]:
# Wide table: one row per (molecule, canon) pair holding every count column.
new_df = pd.DataFrame(
    {
        "molecule": mols,
        "canon": cans,
        "++": pp,
        "-+": mp,
        "+-": pm,
        "--": mm,
        "druglike": druglike,
        "dyelike": dyelike,
        "unk_drugdyelike": unk_drugdyelike,
    }
)

# First melt: fold the four structure/function columns into
# ("strfun", "strfun_count") pairs.
new_df = pd.melt(
    new_df,
    id_vars=["molecule", "canon", "druglike", "dyelike", "unk_drugdyelike"],
    var_name="strfun",
    value_name="strfun_count",
)

# Second melt: fold the three drug/dye-likeness columns into
# ("drug_dyelike", "drug_dyelike_count") pairs.
# NOTE: after this step every strfun_count value is repeated once per
# drug_dyelike category, and every drug_dyelike_count value once per strfun
# category, so the rows are not independent observations of either measure.
new_df = pd.melt(
    new_df,
    id_vars=["molecule", "canon", "strfun", "strfun_count"],
    var_name="drug_dyelike",
    value_name="drug_dyelike_count",
)

In [None]:
# Split the long-format table by query class. The names must match entries of
# `molecules` exactly, because new_df["molecule"] is populated from that list.
# "Zidovudine" is the spelling used there; the synonym "Azidothymidine" never
# matches any row and would silently drop that query from the drug-like set.
drug_like_queries = new_df[new_df["molecule"].isin(["Penicillin", "Nirmatrelvir", "Zidovudine", "LSD", "Fentanyl"])]
dye_like_queries = new_df[new_df["molecule"].isin(["Acid Blue 25 FA", "Avobenzone", "2-dPAC"])]

In [None]:
from itertools import combinations

from statannotations.Annotator import Annotator

plt.rcParams["font.family"] = "serif"
plt.rcParams["mathtext.fontset"] = "dejavuserif"

# Columns of the long-format table used by this figure.
x = "strfun"
y = "strfun_count"
hue = "canon"
hue_order = ["RDKit Atom 0", "RDKit Atom n", "OEChem"]
order = ["++", "+-", "-+", "--"]

# new_df was melted twice, so each (molecule, canon, strfun) count is repeated
# once per drug_dyelike category. Keep a single row per combination; otherwise
# the t-tests below treat every observation as several independent samples.
data_df = new_df.drop_duplicates(subset=["molecule", "canon", x])
label = "all"

# Number of distinct query molecules behind each box, shown in the legend.
n_queries = data_df["molecule"].nunique()

# Compare every pair of canonicalizations within each x category.
pairs = [
    ((category, first), (category, second))
    for category in order
    for first, second in combinations(hue_order, 2)
]

# Named `palette` so the `colors` module imported from matplotlib at the top of
# the notebook is not shadowed by a dict.
palette = {"RDKit Atom 0": "#E69F00", "RDKit Atom n": "#56B4E9", "OEChem": "#009E73"}

fig, ax = plt.subplots()
sns.boxplot(data=data_df, x=x, y=y, hue=hue, order=order, hue_order=hue_order, ax=ax, palette=palette)

# Independent t-tests between canonicalizations, drawn as star annotations.
annotator = Annotator(ax, pairs, data=data_df, x=x, y=y, hue=hue, order=order, hue_order=hue_order)
annotator.configure(test="t-test_ind", text_format='star', loc='inside', verbose=2)
annotator.apply_test()
annotator.annotate()

# Rebuild the legend with a descriptive title.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels, title=f"Canonicalization (n={n_queries})", loc="upper left")

# Axis labels.
ax.set_xlabel("(Structurally Similar, Known Relevant Function)")
ax.set_ylabel("# of Molecules in Top 20")

# Math-text tick labels in the same order as `order`; wrap long labels.
ax.set_xticklabels([r"($+$,$+$)", r"($+$,$-$)", r"($-$,$+$)", r"($-$,$-$)"], wrap=True)

# Fixed y ticks covering the 0-20 range.
ax.set_yticks([0, 5, 10, 15, 20])

plt.savefig(f"{label}_strfun.png", dpi=300, bbox_inches="tight")


In [None]:
from itertools import combinations

from statannotations.Annotator import Annotator

plt.rcParams["font.family"] = "serif"
plt.rcParams["mathtext.fontset"] = "dejavuserif"

# Columns of the long-format table used by this figure.
x = "drug_dyelike"
y = "drug_dyelike_count"
hue = "canon"
hue_order = ["RDKit Atom 0", "RDKit Atom n", "OEChem"]
order = ["druglike", "dyelike", "unk_drugdyelike"]

# Figure configuration. For the dye-like figure use instead:
#   source_df, label, legend_loc = dye_like_queries, "dye_like_queries", "upper left"
source_df = drug_like_queries
label = "drug_like_queries"
legend_loc = "upper center"

# The long table was melted twice, so each (molecule, canon, drug_dyelike)
# count is repeated once per strfun category. Keep a single row per
# combination; otherwise the confidence intervals and t-tests below treat
# every observation as several independent samples.
data_df = source_df.drop_duplicates(subset=["molecule", "canon", x])

# Derive the legend's sample size from the data so it cannot drift from the
# set of queries that was actually selected.
n_sample = data_df["molecule"].nunique()

# Compare every pair of canonicalizations within each x category.
pairs = [
    ((category, first), (category, second))
    for category in order
    for first, second in combinations(hue_order, 2)
]

# Named `palette` so the `colors` module imported from matplotlib at the top of
# the notebook is not shadowed by a dict.
palette = {"RDKit Atom 0": "#E69F00", "RDKit Atom n": "#56B4E9", "OEChem": "#009E73"}

fig, ax = plt.subplots()

# Bar plot with capped error bars and black outlines around the bars.
sns.barplot(data=data_df, x=x, y=y, hue=hue, order=order, hue_order=hue_order, ax=ax, palette=palette, capsize=.1, errwidth=1.5, edgecolor="black", linewidth=1)

# Independent t-tests between canonicalizations, drawn as star annotations.
annotator = Annotator(ax, pairs, data=data_df, x=x, y=y, hue=hue, order=order, hue_order=hue_order)
annotator.configure(test="t-test_ind", text_format='star', loc='inside', verbose=2)
annotator.apply_test()
annotator.annotate()

# Rebuild the legend with a descriptive title.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels, title=f"Canonicalization (n={n_sample})", loc=legend_loc)

# The tick labels already name the categories, so the x axis label is blank.
ax.set_xlabel("")
ax.set_ylabel("# of Molecules in Top 20")

# Human-readable tick labels in the same order as `order`; wrap long labels.
ax.set_xticklabels(["Drug-like", "Dye-like", "Unknown"], wrap=True)

# Fixed y ticks covering the 0-20 range.
ax.set_yticks([0, 5, 10, 15, 20])

plt.savefig(f"{label}_drugdyelikeness.png", dpi=300, bbox_inches="tight")


In [None]:
import numpy as np

# Only the columns needed to count SDFA / NDFA hits per query and canon.
sndfa_df = df[["query", "canon", "NDFA", "SDFA"]]

# One entry per query molecule (in `molecules` order) for each canonicalization.
sdfa_rdkit_0 = []
sdfa_rdkit_n = []
sdfa_oechem = []
ndfa_rdkit_0 = []
ndfa_rdkit_n = []
ndfa_oechem = []

# Row masks that do not depend on the query molecule.
is_sdfa = sndfa_df["SDFA"] == 1
is_ndfa = sndfa_df["NDFA"] == 1

# Canon label paired with the SDFA and NDFA lists that collect its counts.
count_targets = (
    ("RDKit Atom 0", sdfa_rdkit_0, ndfa_rdkit_0),
    ("RDKit Atom n", sdfa_rdkit_n, ndfa_rdkit_n),
    ("OEChem", sdfa_oechem, ndfa_oechem),
)

for molecule in molecules:
    is_query = sndfa_df["query"] == molecule
    for canon_name, sdfa_counts, ndfa_counts in count_targets:
        # One combined mask per lookup. Chaining `frame[m1][m2][m3]` applies
        # masks built on the full frame to already-filtered frames, which makes
        # pandas reindex them and emit a UserWarning every time.
        in_group = is_query & (sndfa_df["canon"] == canon_name)
        sdfa_counts.append(int((in_group & is_sdfa).sum()))
        ndfa_counts.append(int((in_group & is_ndfa).sum()))

# Totals across all query molecules: SDFA first, then NDFA, each in the order
# RDKit Atom 0, RDKit Atom n, OEChem.
print(np.array(sdfa_rdkit_0).sum())
print(np.array(sdfa_rdkit_n).sum())
print(np.array(sdfa_oechem).sum())
print(np.array(ndfa_rdkit_0).sum())
print(np.array(ndfa_rdkit_n).sum())
print(np.array(ndfa_oechem).sum())

In [None]:
from itertools import combinations

from statannotations.Annotator import Annotator

plt.rcParams["font.family"] = "serif"
plt.rcParams["mathtext.fontset"] = "dejavuserif"

# Column names of the long-format table built below.
x = "label"
y = "count"
hue = "canon"
hue_order = ["RDKit Atom 0", "RDKit Atom n", "OEChem"]
order = ["Non-Derivative Functional Analogue"]

# One column of per-query NDFA counts per canonicalization, melted to long
# format; every row belongs to the single x category in `order`.
struct_distinct = pd.DataFrame(
    {
        "RDKit Atom 0": ndfa_rdkit_0,
        "RDKit Atom n": ndfa_rdkit_n,
        "OEChem": ndfa_oechem,
    }
).melt(var_name="canon", value_name="count")
struct_distinct["label"] = order[0]

# Number of query molecules behind each bar, shown in the legend.
n_queries = len(ndfa_rdkit_0)

# Compare every pair of canonicalizations within the single x category.
pairs = [
    ((category, first), (category, second))
    for category in order
    for first, second in combinations(hue_order, 2)
]

# Named `palette` so the `colors` module imported from matplotlib at the top of
# the notebook is not shadowed by a dict.
palette = {"RDKit Atom 0": "#E69F00", "RDKit Atom n": "#56B4E9", "OEChem": "#009E73"}

fig, ax = plt.subplots()

# Bar plot with capped error bars and black outlines around the bars.
sns.barplot(data=struct_distinct, x=x, y=y, hue=hue, order=order, hue_order=hue_order, ax=ax, palette=palette, capsize=.1, errwidth=1.5, edgecolor="black", linewidth=1)

# Independent t-tests between canonicalizations, drawn as star annotations.
annotator = Annotator(ax, pairs, data=struct_distinct, x=x, y=y, hue=hue, order=order, hue_order=hue_order)
annotator.configure(test="t-test_ind", text_format='star', loc='inside', verbose=2)
annotator.apply_test()
annotator.annotate()

# Rebuild the legend with a descriptive title.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels, title=f"Canonicalization (n={n_queries})", loc="center left")

# The tick label already names the category, so the x axis label is blank.
ax.set_xlabel("")
ax.set_ylabel("# of Molecules in Top 20")

# Two-line tick label for the single category.
ax.set_xticklabels(["Non-Derivative\nFunctional Analogue"], wrap=True)

# Anchor the y axis at zero; the upper limit is left to matplotlib.
ax.set_ylim(bottom=0)

plt.savefig("non_deriv_func_analogue.png", dpi=300, bbox_inches="tight")
