In [None]:
# Query SMILES for the eight reference molecules, written once per
# canonicalization scheme. The three lists are positionally aligned: entry i
# of every list is the molecule named in its trailing comment.
# Charges adjusted to match RDKit Standardized Database

# "rdkit_atom_0" variant of each query.
# NOTE(review): presumably RDKit canonical SMILES rooted at atom 0 -- confirm.
rdkit_atom_0_query_adjusted_smiles = [
    "CC1(C)SC2C(NC(=O)Cc3ccccc3)C(=O)N2C1C(=O)[O-]", # penicillin
    "CC(C)(C)C(NC(=O)C(F)(F)F)C(=O)N1CC2C(C1C(=O)NC(C#N)CC1CCNC1=O)C2(C)C", # nirmatrelvir
    "Cc1cn(C2CC(N=[N+]=[N-])C(CO)O2)c(=O)[nH]c1=O", # zidovudine
    "CCN(CC)C(=O)C1C=C2c3cccc4[nH]cc(c34)CC2[NH+](C)C1", # lsd
    "CCC(=O)N(c1ccccc1)C1CC[NH+](CCc2ccccc2)CC1", # fentanyl
    "Nc1c(S(=O)(=O)[O-])cc(Nc2ccccc2)c2c1C(=O)c1ccccc1C2=O", # acid blue 25 free acid
    "COc1ccc(C(=O)CC(=O)c2ccc(C(C)(C)C)cc2)cc1", # avobenzone
    "c1ccc(N(c2ccccc2)c2ccc3c(c2)[nH]c2ccccc23)cc1", # 2-diphenylaminocarbazole
]


# "rdkit_atom_n" variant: the same molecules written with a different atom
# ordering than the list above.
# NOTE(review): presumably RDKit SMILES rooted at a non-zero atom -- confirm.
rdkit_atom_n_query_adjusted_smiles = [
    "c1ccc(CC(=O)NC2C(=O)N3C2SC(C)(C)C3C(=O)[O-])cc1", # penicillin
    "N(C(=O)C1C2C(CN1C(=O)C(NC(=O)C(F)(F)F)C(C)(C)C)C2(C)C)C(C#N)CC1CCNC1=O", # nirmatrelvir
    "O=c1[nH]c(=O)c(C)cn1C1CC(N=[N+]=[N-])C(CO)O1", # zidovudine
    "C12=CC(C(=O)N(CC)CC)C[NH+](C)C1Cc1c[nH]c3cccc2c13", # lsd
    "c1(CC[NH+]2CCC(N(C(=O)CC)c3ccccc3)CC2)ccccc1", # fentanyl
    "C1(=O)c2ccccc2C(=O)c2c(N)c(S(=O)(=O)[O-])cc(Nc3ccccc3)c21", # acid blue 25 free acid
    "C(C(=O)c1ccc(OC)cc1)C(=O)c1ccc(C(C)(C)C)cc1", # avobenzone
    "c12ccccc1c1ccc(N(c3ccccc3)c3ccccc3)cc1[nH]2", # 2-diphenylaminocarbazole
]

# "oechem" variant: the same molecules written in Kekule form (uppercase ring
# atoms with explicit double bonds) rather than lowercase aromatic notation.
oechem_query_adjusted_smiles = [
    "CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)[O-])C", # penicillin
    "CC1(C2C1C(N(C2)C(=O)C(C(C)(C)C)NC(=O)C(F)(F)F)C(=O)NC(CC3CCNC3=O)C#N)C", # nirmatrelvir
    "CC1=CN(C(=O)NC1=O)C2CC(C(O2)CO)N=[N+]=[N-]", # zidovudine
    "CCN(CC)C(=O)C1C[NH+](C2CC3=CNC4=CC=CC(=C34)C2=C1)C", # lsd
    "CCC(=O)N(C1CC[NH+](CC1)CCC2=CC=CC=C2)C3=CC=CC=C3", # fentanyl
    "C1=CC=C(C=C1)NC2=CC(=C(C3=C2C(=O)C4=CC=CC=C4C3=O)N)S(=O)(=O)[O-]", # acid blue 25 free acid
    "CC(C)(C)C1=CC=C(C=C1)C(=O)CC(=O)C2=CC=C(C=C2)OC", # avobenzone
    "C1=CC=C(C=C1)N(C2=CC=CC=C2)C3=CC4=C(C=C3)C5=CC=CC=C5N4", # 2-diphenylaminocarbazole
]

# File-name stems of the result CSVs, one per query molecule. The order matches
# names_list below; note the first stem is abbreviated ("pen").
_result_stems = (
    "pen",
    "nirmatrelvir",
    "zidovudine",
    "lsd",
    "fentanyl",
    "acid_blue_25",
    "avobenzone",
    "2-diphenylaminocarbazole",
)


def _result_paths(canon_tag):
    """Return the result CSV path of every query molecule for one canonicalization tag."""
    return [f"../results/similarity_{stem}_{canon_tag}.csv" for stem in _result_stems]


# One list of result files per canonicalization, all in the same molecule order.
rdkit_atom_0_results_path_list = _result_paths("rdkit_atom_0")
rdkit_atom_n_results_path_list = _result_paths("rdkit_atom_n")
oechem_results_path_list = _result_paths("oechem")

# Identifier of each query molecule, in the same order as the path lists.
names_list = [
    "penicillin",
    "nirmatrelvir",
    "zidovudine",
    "lsd",
    "fentanyl",
    "acid_blue_25",
    "avobenzone",
    "2-diphenylaminocarbazole",
]

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.figure import figaspect
from difflib import SequenceMatcher
from rdkit import Chem, DataStructs
from transformers import AutoTokenizer


In [None]:

def tanimoto(smi1, smi2):
    """
    Tanimoto similarity between the RDKit fingerprints of two SMILES strings.

    Adapted from https://medium.com/data-professor/how-to-calculate-molecular-similarity-25d543ea7f40

    Parameters
    ----------
    smi1, smi2 : str
        SMILES strings of the two molecules to compare.

    Returns
    -------
    float
        Tanimoto similarity of the two ``Chem.RDKFingerprint`` bit vectors,
        rounded to two decimal places.

    Raises
    ------
    ValueError
        If RDKit cannot parse either SMILES string.
    """
    mols = []
    for smi in (smi1, smi2):
        mol = Chem.MolFromSmiles(smi)
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending string instead of deep inside the fingerprint call.
        if mol is None:
            raise ValueError(f"RDKit could not parse SMILES: {smi!r}")
        mols.append(mol)
    fp1, fp2 = (Chem.RDKFingerprint(mol) for mol in mols)
    return round(DataStructs.TanimotoSimilarity(fp1, fp2), 2)

In [None]:
def token_similarity(smi1, smi2, tokenizer):
    """
    Compare two SMILES strings at the token level.

    Both strings are encoded with ``tokenizer`` and the resulting ``input_ids``
    are compared exactly as returned (no ids are filtered out here).

    Parameters
    ----------
    smi1, smi2 : str
        The two strings to compare.
    tokenizer : callable
        Called as ``tokenizer(text)``; the result must expose ``input_ids``.

    Returns
    -------
    tuple of float
        ``(shared, length_ratio)``: the number of distinct token ids common to
        both encodings divided by the number of distinct ids in either
        (Jaccard/Tanimoto on id sets), and ``len(ids of smi1) / len(ids of smi2)``.
    """
    ids_a = tokenizer(smi1).input_ids
    ids_b = tokenizer(smi2).input_ids

    length_ratio = len(ids_a) / len(ids_b)

    distinct_a = set(ids_a)
    distinct_b = set(ids_b)
    common = distinct_a.intersection(distinct_b)
    either = distinct_a.union(distinct_b)
    shared = len(common) / len(either)

    return shared, length_ratio

In [None]:
# Number of top-ranked hits kept from each results file.
TOP_K = 20

tokenizer = AutoTokenizer.from_pretrained(
    "seyonec/PubChem10M_SMILES_BPE_450k", model_max_length=512,
)


def _collect_similarity_frames(results_paths, mol_names, query_smiles):
    """
    Build the per-metric frames for one canonicalization scheme.

    The three arguments are iterated in lockstep: for each query molecule the
    first ``TOP_K`` rows of its results CSV are read, and every hit SMILES is
    compared against the query SMILES.

    Parameters
    ----------
    results_paths : list of str
        CSV files with a ``SMILES`` and a ``similarity`` column.
    mol_names : list of str
        Column name to use for each query molecule.
    query_smiles : list of str
        Query SMILES for each molecule.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(smiles, fsim, gestalt, fingerprint, token_len_ratio,
        token_similarity)``; each frame has one column per molecule and one
        row per hit.
    """
    hit_smiles = pd.DataFrame()
    feature_sim = pd.DataFrame()
    gestalt_sim = pd.DataFrame()
    fingerprint_sim = pd.DataFrame()
    token_len = pd.DataFrame()
    token_sim = pd.DataFrame()

    for path, mol_name, query_smi in zip(results_paths, mol_names, query_smiles):
        hits = pd.read_csv(path).head(TOP_K)
        hit_smiles[mol_name] = hits["SMILES"]
        # Similarity score stored in the results file itself.
        feature_sim[mol_name] = hits["similarity"]

        candidates = hit_smiles[mol_name]
        # Gestalt (difflib) similarity of the raw SMILES strings.
        gestalt_sim[mol_name] = candidates.map(
            lambda smi: SequenceMatcher(None, query_smi, smi).ratio()
        )
        fingerprint_sim[mol_name] = candidates.map(lambda smi: tanimoto(query_smi, smi))
        # Tokenize each pair once; token_similarity returns
        # (shared-token ratio, token-length ratio).
        token_pairs = candidates.map(
            lambda smi: token_similarity(query_smi, smi, tokenizer)
        )
        token_sim[mol_name] = token_pairs.map(lambda pair: pair[0])
        token_len[mol_name] = token_pairs.map(lambda pair: pair[1])

    return hit_smiles, feature_sim, gestalt_sim, fingerprint_sim, token_len, token_sim


(
    rdkit_atom_0_smiles,
    rdkit_atom_0_fsim,
    rdkit_atom_0_gestalt,
    rdkit_atom_0_fingerprint,
    rdkit_atom_0_token_len_ratio,
    rdkit_atom_0_token_similarity,
) = _collect_similarity_frames(
    rdkit_atom_0_results_path_list, names_list, rdkit_atom_0_query_adjusted_smiles
)

(
    rdkit_atom_n_smiles,
    rdkit_atom_n_fsim,
    rdkit_atom_n_gestalt,
    rdkit_atom_n_fingerprint,
    rdkit_atom_n_token_len_ratio,
    rdkit_atom_n_token_similarity,
) = _collect_similarity_frames(
    rdkit_atom_n_results_path_list, names_list, rdkit_atom_n_query_adjusted_smiles
)

(
    oechem_smiles,
    oechem_fsim,
    oechem_gestalt,
    oechem_fingerprint,
    oechem_token_len_ratio,
    oechem_token_similarity,
) = _collect_similarity_frames(
    oechem_results_path_list, names_list, oechem_query_adjusted_smiles
)


In [None]:
# Group the per-canonicalization frames by metric. Position i in every list
# belongs to canon_list[i] (rdkit_atom_0, rdkit_atom_n, oechem).
fsim = [rdkit_atom_0_fsim, rdkit_atom_n_fsim, oechem_fsim]
gestalt = [rdkit_atom_0_gestalt, rdkit_atom_n_gestalt, oechem_gestalt]
fingerprint = [rdkit_atom_0_fingerprint, rdkit_atom_n_fingerprint, oechem_fingerprint]
token_len_ratio = [rdkit_atom_0_token_len_ratio, rdkit_atom_n_token_len_ratio, oechem_token_len_ratio]
# NOTE: this rebinds the name `token_similarity`, which until this point refers
# to the function defined earlier in the notebook. Once this cell has run, the
# cell that calls token_similarity(...) cannot be re-run without first
# re-running the function definition.
token_similarity = [rdkit_atom_0_token_similarity, rdkit_atom_n_token_similarity, oechem_token_similarity]

# Identifier of each canonicalization, in the same order as the lists above.
canon_list = ["rdkit_atom_0", "rdkit_atom_n", "oechem"]


In [None]:
# Render all figure text in a serif typeface and use the DejaVu Serif font set
# for mathtext.
plt.rcParams.update({
    "font.family": "serif",
    "mathtext.fontset": "dejavuserif",
})


In [None]:
# Row labels of every heatmap, in the column order of the metric frames.
HEATMAP_ROW_LABELS = [
    "Penicillin G", "Nirmatrelvir", "Zidovudine", "LSD", "Fentanyl",
    "Acid Blue 25 FA", "Avobenzone", "2-dPAC",
]


def plot_similarity_heatmap(df, canon, metric_slug, cbar_label, vmax=1):
    """
    Draw and save one annotated heatmap (molecules x result rank).

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame with one column per molecule and one row per result; it is
        transposed so that molecules become heatmap rows.
    canon : str
        Canonicalization identifier, used as the file-name prefix.
    metric_slug : str
        Metric identifier, used as the file-name suffix.
    cbar_label : str
        Label of the colour bar.
    vmax : float, default 1
        Upper end of the colour scale (the lower end is fixed at 0).

    Returns
    -------
    (matplotlib.figure.Figure, matplotlib.axes.Axes)
        The figure, also written to ``{canon}_results_{metric_slug}.png``.
    """
    data = df.transpose()
    # Derive the number of result columns from the data instead of assuming 20,
    # so a shorter results file does not produce a tick/label length mismatch.
    n_results = data.shape[1]

    fig, ax = plt.subplots()
    sns.heatmap(
        data,
        vmin=0,
        vmax=vmax,
        cmap=sns.color_palette("viridis", as_cmap=True),
        annot=True,
        xticklabels=list(range(1, n_results + 1)),
        yticklabels=HEATMAP_ROW_LABELS,
        cbar=True,
        square=True,
        linewidths=0.5,
        annot_kws={"size": 5},
        cbar_kws={"shrink": 0.5, "label": cbar_label},
        ax=ax,
    )
    ax.set(xlabel="Result #")
    # Label every second result (1, 3, 5, ...), centred on its cell.
    ax.set_xticks(np.arange(0, n_results, 2) + 0.5)
    ax.set_xticklabels(np.arange(1, n_results + 1, 2))
    ax.tick_params(axis="both", which="both", length=0)
    # Put the x axis ticks and label on top of the heatmap.
    ax.xaxis.tick_top()
    ax.xaxis.set_label_position("top")
    fig.savefig(f"{canon}_results_{metric_slug}.png", dpi=300, bbox_inches="tight")
    return fig, ax


# (frames per canonicalization, file-name slug, colour bar label, colour scale max)
heatmap_specs = [
    (fsim, "fsim", "Cosine Similarity", 1),
    (gestalt, "gestalt", "Gestalt Similarity", 1),
    (fingerprint, "fingerprint", "Fingerprint Tanimoto Similarity", 1),
    # The length ratio is not bounded by 1, hence the wider colour scale.
    (token_len_ratio, "token_len_ratio", "Token Length Ratio", 2.0),
    (token_similarity, "token_similarity", "Token Tanimoto Similarity", 1),
]

for frames, metric_slug, cbar_label, vmax in heatmap_specs:
    for df, canon in zip(frames, canon_list):
        plot_similarity_heatmap(df, canon, metric_slug, cbar_label, vmax=vmax)

In [None]:
def _melt_once(frame, value_name):
    """
    Reshape a wide metric frame (one column per molecule) to long format.

    Returns a frame with the columns ``molecule`` and ``value_name``. A frame
    that already has exactly those columns is returned unchanged, so re-running
    this cell does not melt the data a second time.
    """
    if list(frame.columns) == ["molecule", value_name]:
        return frame
    return frame.melt(var_name="molecule", value_name=value_name)


# The list elements are replaced in place: after this cell the per-metric lists
# hold long-format frames, so cells that expect the wide layout must be run
# before this one.
for i in range(len(fsim)):
    fsim[i] = _melt_once(fsim[i], "fsim")
    gestalt[i] = _melt_once(gestalt[i], "gestalt")
    fingerprint[i] = _melt_once(fingerprint[i], "fingerprint")
    token_len_ratio[i] = _melt_once(token_len_ratio[i], "token_len_ratio")
    token_similarity[i] = _melt_once(token_similarity[i], "token_similarity")

In [None]:
from statannotations.Annotator import Annotator

# Display label of each canonicalization; position i matches index i of the
# per-metric lists.
hue_order = ["RDKit Atom 0", "RDKit Atom n", "OEChem"]

# Long-format frames of every metric; the key is also the value column name.
metric_frames = {
    "fsim": fsim,
    "gestalt": gestalt,
    "fingerprint": fingerprint,
    "token_len_ratio": token_len_ratio,
    "token_similarity": token_similarity,
}


def _tidy_canon_frame(idx, canon_label):
    """
    Combine all metrics of one canonicalization into a tidy frame.

    Returns one row per (molecule, metric, result) with the columns
    ``molecule``, ``metric``, ``value`` and ``canon`` (set to ``canon_label``).
    """
    combined = pd.DataFrame()
    combined["molecule"] = fsim[idx]["molecule"]
    for metric_name, frames in metric_frames.items():
        combined[metric_name] = frames[idx][metric_name]
    # Convert the metric column names into values of a single column.
    tidy = combined.melt(id_vars="molecule", var_name="metric", value_name="value")
    tidy["canon"] = canon_label
    return tidy


df_rdkit_atom_0, df_rdkit_atom_n, df_oechem = (
    _tidy_canon_frame(idx, label) for idx, label in enumerate(hue_order)
)

df_all = pd.concat([df_rdkit_atom_0, df_rdkit_atom_n, df_oechem]).reset_index(drop=True)

# Violin plot of every metric, split by canonicalization.
fig, ax = plt.subplots()
x = "metric"
y = "value"
hue = "canon"
# "fsim" is present in df_all but is left out of `order` and `pairs`, so it is
# neither drawn nor tested.
order = ["fingerprint", "gestalt", "token_similarity", "token_len_ratio"]

# Every pairwise comparison between canonicalizations, within each metric.
pairs = [
    ((metric, first), (metric, second))
    for metric in ["gestalt", "fingerprint", "token_len_ratio", "token_similarity"]
    for pos, first in enumerate(hue_order)
    for second in hue_order[pos + 1:]
]

colors = {"RDKit Atom 0": "#E69F00", "RDKit Atom n": "#56B4E9", "OEChem": "#009E73"}

sns.violinplot(data=df_all, x=x, y=y, hue=hue, order=order, hue_order=hue_order, ax=ax, palette=colors)

# Add statistical annotations (independent t-test) between the hues.
annotator = Annotator(ax, pairs, data=df_all, x=x, y=y, hue=hue, order=order, hue_order=hue_order)
annotator.configure(test="t-test_ind", text_format='star', loc='inside', verbose=2)
annotator.apply_test()
annotator.annotate()

# Set the legend title and position.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels, title="Canonicalization (n=160)", loc="lower right")

# Remove the x axis label; the tick labels name the metrics.
ax.set_xlabel("")

# Label the y axis.
ax.set_ylabel("Similarity")

# Readable, two-line tick labels in the same order as `order`.
ax.set_xticklabels(["Fingerprint\nTanimoto", "Gestalt", "Token\nTanimoto", "Token\nLength"], wrap=True)

# Dashed horizontal reference line at y=1.
ax.axhline(y=1, color="black", linestyle="--")

fig.savefig("results_violin_all.png", dpi=300, bbox_inches="tight")


In [None]:
# For each canonicalization, print the number of fingerprint similarities that
# are >= 0.80, followed by the total number of fingerprint rows.
# The conditions are combined into one boolean mask; chaining df[m1][m2][m3]
# applies full-length masks to already-filtered frames, which pandas only
# tolerates by reindexing the mask (with a UserWarning).
for canon_label in ("RDKit Atom 0", "RDKit Atom n", "OEChem"):
    in_group = (df_all["metric"] == "fingerprint") & (df_all["canon"] == canon_label)
    is_high = df_all["value"] >= 0.80
    print(len(df_all[in_group & is_high]))
    print(len(df_all[in_group]))

In [None]:

def _metric_values(metric, canon_label):
    """Return the ``value`` column of df_all for one metric and canonicalization."""
    # A single combined mask with .loc avoids chained boolean indexing, which
    # makes pandas reindex each mask against an already-filtered frame.
    in_group = (df_all["metric"] == metric) & (df_all["canon"] == canon_label)
    return df_all.loc[in_group, "value"]


print("Gestalt")
print(_metric_values("gestalt", "RDKit Atom 0").mean())
print(_metric_values("gestalt", "RDKit Atom n").mean())
print(_metric_values("gestalt", "OEChem").mean())

print("\nFingerprint")
# NOTE(review): quartiles are printed for RDKit Atom 0 while the other two
# canonicalizations print the mean -- confirm this asymmetry is intended.
print(_metric_values("fingerprint", "RDKit Atom 0").quantile([0.25,0.5,0.75]))
print(_metric_values("fingerprint", "RDKit Atom n").mean())
print(_metric_values("fingerprint", "OEChem").mean())

print("\ntoken_similarity")
print(_metric_values("token_similarity", "RDKit Atom 0").mean())
print(_metric_values("token_similarity", "RDKit Atom n").mean())
print(_metric_values("token_similarity", "OEChem").mean())

print("\ntoken_len_ratio")
print(_metric_values("token_len_ratio", "RDKit Atom 0").mean())
print(_metric_values("token_len_ratio", "RDKit Atom n").mean())
print(_metric_values("token_len_ratio", "OEChem").mean())