# 3. Excipient Selection

This notebook describes the use of association rule learning and exploratory hypothesis testing to inform excipient selection. Association rule learning applies a frequent itemset approach to identify interesting or meaningful excipient patterns in oral tablet formulation data, independent of the active ingredient. Exploratory hypothesis testing aims to detect differences in the distribution of chemical descriptors between the set of drugs formulated with each excipient and the set of drugs not formulated with the excipient of interest.

In [None]:
# arulespy is a Python interface for the arules R package and is built with rpy2. As rpy2 does not fully support Windows, Windows users may
# experience an error where the kernel dies when importing the libraries/packages below. Information on possible workarounds is available via
# the GitHub page https://github.com/mhahsler/arulespy and the Python Package Index page https://pypi.org/project/arulespy/
# import os
# os.environ["R_HOME"] = #R_Path
# os.environ["PATH"] = ""

In [None]:
# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Display
from IPython.core.display import display, HTML
from IPython import get_ipython as get_ipython
from IPython.display import IFrame
import rpy2.robjects.packages as packages

# Frequent itemset mining
from arulespy.arules import Transactions, apriori, parameters
from arulespy.arulesViz import plot, inspectDT, ruleExplorer
from rpy2.ipython.ggplot import image_png
from rpy2 import robjects as ro
from mlxtend.preprocessing import TransactionEncoder

# Exploratory hypothesis testing
from scipy.stats import mannwhitneyu
from scipy.stats import false_discovery_control

# Cheminformatics
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

# Others
import ast
from collections import Counter
import pandas as pd
import numpy as np
import networkx as nx
from matplotlib import pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.cm as cm

## Association Rule Learning

In [None]:
# Generate dataframe suitable for analysis:
# oral tablet excipient lists -> one-hot encoded transactions for arules.

original_df = pd.read_csv("../csv_files/final_master_df.csv", index_col = 0)
oral = original_df[original_df["Route"] == "Oral"][["Dosage Form", "Excipients_Final"]]
oral_tablets = oral[oral["Dosage Form"] == "Tablet"]["Excipients_Final"].apply(ast.literal_eval)
# De-duplicate on a hashable tuple key. Series.drop_duplicates hashes every element and
# raises "TypeError: unhashable type: 'list'" when the parsed values are lists
# (presumably the case here, since they are concatenated with sum() later - TODO confirm).
# The first occurrence of each excipient sequence is kept, as drop_duplicates would do.
oral_tablets = oral_tablets[~oral_tablets.apply(tuple).duplicated()]
te = TransactionEncoder()
te_ary = te.fit(oral_tablets).transform(oral_tablets)
# Second de-duplication on the encoded rows, so formulations listing the same excipients
# in a different order collapse to a single transaction.
oral_df = pd.DataFrame(te_ary, columns=te.columns_).drop_duplicates()
trans = Transactions.from_df(oral_df)

In [None]:
# Mine two-item rules with permissive support/confidence thresholds and export them
# (rounded to two decimals) to the CSV named for the Streamlit app.
streamlit_mining_args = dict(
    parameter=parameters({"supp": 0.01, "conf": 0.01}),
    control=parameters({"verbose": False}),
    minlen=2,
    maxlen=2,
)
streamlit_rules = apriori(trans, **streamlit_mining_args)
streamlit_rules_df = streamlit_rules.as_df().round(2)
streamlit_rules_df.to_csv("../csv_files/streamlit_app_data.csv")



In [None]:
# Mine two-item rules again, this time keeping only rules with confidence >= 0.6;
# these feed the network plots below.
graph_mining_args = dict(
    parameter=parameters({"supp": 0.01, "conf": 0.6}),
    control=parameters({"verbose": False}),
    minlen=2,
    maxlen=2,
)
rules_graph = apriori(trans, **graph_mining_args)
rules_df = rules_graph.as_df()
rules_df

In [None]:
# Printing all instances that contain both Copovidone and Sorbitan Monolaurate
# (searched among the de-duplicated oral tablet excipient lists)
oral1 = original_df[original_df["Route"] == "Oral"][["Dosage Form", "Excipients_Final", "Active substance", "Marketing authorisation holder/company name"]]
oral_tablets1 = oral1[oral1["Dosage Form"] == "Tablet"]["Excipients_Final"].apply(ast.literal_eval)
# De-duplicate on a tuple key: Series.drop_duplicates cannot hash list elements and would
# raise TypeError if the parsed values are lists. The first occurrence is kept.
oral_tablets1 = oral_tablets1[~oral_tablets1.apply(tuple).duplicated()]
def get_indices(lst):
    """Return True when the excipient collection contains both excipients of interest."""
    return "COPOVIDONE" in lst and "SORBITAN MONOLAURATE" in lst
mask = oral_tablets1.apply(get_indices)

# Get the indices where the mask is True
indices = mask[mask].index.tolist()
# Single .loc call selects rows and columns together instead of chaining two indexers
oral1.loc[indices, ["Active substance", "Marketing authorisation holder/company name"]]

In [None]:
# Printing all instances that contain both Hypromellose Acetate Succinate and Croscarmellose Sodium
# (searched among the de-duplicated oral tablet excipient lists)
oral1 = original_df[original_df["Route"] == "Oral"][["Dosage Form", "Excipients_Final", "Active substance", "Marketing authorisation holder/company name"]]
oral_tablets1 = oral1[oral1["Dosage Form"] == "Tablet"]["Excipients_Final"].apply(ast.literal_eval)
# De-duplicate on a tuple key: Series.drop_duplicates cannot hash list elements and would
# raise TypeError if the parsed values are lists. The first occurrence is kept.
oral_tablets1 = oral_tablets1[~oral_tablets1.apply(tuple).duplicated()]
def get_indices(lst):
    """Return True when the excipient collection contains both excipients of interest."""
    return "HYPROMELLOSE ACETATE SUCCINATE" in lst and "CROSCARMELLOSE SODIUM" in lst
mask = oral_tablets1.apply(get_indices)

# Get the indices where the mask is True
indices = mask[mask].index.tolist()
# Single .loc call selects rows and columns together instead of chaining two indexers
oral1.loc[indices, ["Active substance", "Marketing authorisation holder/company name"]]

In [None]:
# Rules that have one of the two polymers of interest on either side
polymers_of_interest = ["{COPOVIDONE}", "{HYPROMELLOSE ACETATE SUCCINATE}"]
lhs = rules_df[rules_df["LHS"].isin(polymers_of_interest)]
rhs = rules_df[rules_df["RHS"].isin(polymers_of_interest)]
lhs_and_rhs = pd.concat([lhs, rhs])
lhs_and_rhs = lhs_and_rhs[lhs_and_rhs["lift"] > 1]
# Every item appearing in those rules becomes a node of the network
list_for_network = list(set(lhs_and_rhs["LHS"]) | set(lhs_and_rhs["RHS"]))
# Keep the rules whose both ends are network nodes and whose lift exceeds 1
mask = rules_df["LHS"].isin(list_for_network) & rules_df["RHS"].isin(list_for_network)
full_network = rules_df[mask & (rules_df["lift"] > 1)]
def clean_text(text):
    """Turn an itemset label such as "{A B}" into a multi-line node label.

    Leading/trailing braces and surrounding whitespace are removed, every space becomes
    a newline, and the result is capitalised (first character upper-case, rest lower-case).
    """
    item = text.strip("{}").strip()
    return "\n".join(item.split(" ")).capitalize()
# Tidy the item labels for plotting. assign() builds a new frame instead of writing into
# the filtered slice of rules_df, so the chained-assignment (SettingWithCopy) pattern is
# avoided; column order and values are the same as with direct assignment.
full_network = full_network.assign(
    RHS=full_network["RHS"].apply(clean_text),
    LHS=full_network["LHS"].apply(clean_text),
)
full_network.to_csv("../results/association_rules_hpmcas_pvpva64.csv")
full_network

In [None]:
# The two excipients of interest; they occupy the first two entries of `positions`
excip_of_interest = [
    "Hypromellose\nacetate\nsuccinate",
    "Copovidone"
]

# Node labels in layout order: excipients of interest first, then the remaining nodes
positions = excip_of_interest + [
    "Sorbitan\nmonolaurate",
    "Polyethylene\nglycol",
    "Iron\noxide",
    "Silicon\ndioxide,\nanhydrous",
    "Talc",
    "Titanium\ndioxide",
    "Croscarmellose\nsodium",
    "Magnesium\nstearate",
    "Microcrystalline\ncellulose"
]

# Everything in `positions` that is not an excipient of interest, in the same order
other_excip = positions[len(excip_of_interest):]

In [None]:
# Generate Figure 7A: rule network with edges coloured by confidence
G = nx.MultiDiGraph()

# One directed edge per rule (LHS -> RHS) carrying that rule's confidence
for index, row in full_network.iterrows():
    G.add_edge(row["LHS"], row["RHS"], confidence=row["confidence"])

# Setting up positioning: positions[2:] are spread evenly over a semicircle,
# the two excipients of interest sit below it
num_nodes_semicircle = 9
radius = 1
angle_between_nodes = np.pi / (num_nodes_semicircle - 1)
pos = dict()
for slot, label in enumerate(positions[2:2 + num_nodes_semicircle]):
    angle = slot * angle_between_nodes
    pos[label] = (radius * np.cos(angle), radius * np.sin(angle) + 0.38)
pos[positions[0]] = (pos[positions[7]][0] + 0.1, 0.1) # HPMCAS
pos[positions[1]] = (pos[positions[5]][0] - 0.1, 0.1) # PVPVA

# Map each edge's confidence onto the colormap; 0.6 is the confidence threshold used for mining
norm = mcolors.Normalize(vmin=0.6, vmax=1)
cmap = cm.Reds
colors = [cmap(norm(d["confidence"])) for u, v, d in G.edges(data=True)]

# Plotting (labels are drawn separately below so the excipients of interest can be bold;
# the previous `labels=` argument was ignored because with_labels=False)
nx.draw(G,
        pos,
        node_size=2500,
        node_color="w",
        edgecolors="k",
        edge_color=colors,
        width=0.7,
        with_labels=False,
        arrows=True,
        connectionstyle="arc3,rad=0.1"
       )
nx.draw_networkx_labels(G.subgraph(other_excip),
                        {k: v for k, v in pos.items() if k in other_excip},
                        font_size=5.8)

nx.draw_networkx_labels(G.subgraph(excip_of_interest),
                        {k: v for k, v in pos.items() if k in excip_of_interest},
                        font_weight="bold",
                        font_size=5.8)


# Panel letter
plt.text(-1.1, 1.4, "A", fontsize=20, fontweight="bold")
# Colorbar
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
cbar_ax = plt.gcf().add_axes([1.03, 0.35, 0.015, 0.35])
cbar = plt.colorbar(sm, cax=cbar_ax, orientation="vertical")
cbar.ax.tick_params(labelsize=8)
cbar.set_label("Confidence", fontsize=7)


# Saving figure
plt.savefig("../figures/network_plot_confidence.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
# Generate Figure 7B: rule network with edges coloured by lift
G = nx.MultiDiGraph()

# One directed edge per rule (LHS -> RHS) carrying that rule's lift
for index, row in full_network.iterrows():
    G.add_edge(row["LHS"], row["RHS"], lift=row["lift"])

# Setting up positioning: positions[2:] are spread evenly over a semicircle,
# the two excipients of interest sit below it
num_nodes_semicircle = 9
radius = 1
angle_between_nodes = np.pi / (num_nodes_semicircle - 1)
pos = dict()
for slot, label in enumerate(positions[2:2 + num_nodes_semicircle]):
    angle = slot * angle_between_nodes
    pos[label] = (radius * np.cos(angle), radius * np.sin(angle) + 0.38)
pos[positions[0]] = (pos[positions[7]][0] + 0.1, 0.1) # HPMCAS
pos[positions[1]] = (pos[positions[5]][0] - 0.1, 0.1) # PVPVA

# Normalizing the lift values for the colormap; values above 2 saturate
# (the colorbar is drawn with extend="max")
norm = mcolors.Normalize(vmin=1, vmax=2)
cmap = cm.Blues
colors = [cmap(norm(d["lift"])) for u, v, d in G.edges(data=True)]

# Plotting (labels are drawn separately below so the excipients of interest can be bold;
# the previous `labels=` argument was ignored because with_labels=False)
nx.draw(G,
        pos,
        node_size=2500,
        node_color="w",
        edgecolors="k",
        edge_color=colors,
        width=0.7,
        with_labels=False,
        arrows=True,
        connectionstyle="arc3,rad=0.1"
       )
nx.draw_networkx_labels(G.subgraph(other_excip),
                        {k: v for k, v in pos.items() if k in other_excip},
                        font_size=5.8)

nx.draw_networkx_labels(G.subgraph(excip_of_interest),
                        {k: v for k, v in pos.items() if k in excip_of_interest},
                        font_weight="bold",
                        font_size=5.8)
# Panel letter
plt.text(-1.1, 1.4, "B", fontsize=20, fontweight="bold")
# Colorbar
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
cbar_ax = plt.gcf().add_axes([1.03, 0.35, 0.015, 0.35])
cbar = plt.colorbar(sm, cax=cbar_ax, orientation="vertical", extend="max")
cbar.ax.tick_params(labelsize=8)
cbar.set_label("Lift", fontsize=7)

# Saving figure
plt.savefig("../figures/network_plot_lift.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
# Mine all rules with at least two items (no maxlen is passed here, so longer rules are
# allowed), export them, and keep those with lift > 1 for plotting.
# NOTE: this rebinds `rules_df`, which the network-plot cells above derived from the
# two-item rule set; re-run those cells from the top rather than after this one.
trans = Transactions.from_df(oral_df)
rules = apriori(trans,
               parameter = parameters({"supp": 0.01, "conf": 0.6}),
               control = parameters({"verbose": False}),
               minlen=2)
rules_df = rules.as_df().round(2)
# Filter and sort in one chain: sorting a filtered slice in place triggers pandas'
# SettingWithCopy warning and depends on copy semantics.
to_plot = rules_df[rules_df["lift"] > 1].sort_values(by="lift")
rules_df.to_csv("../results/all_association_rules.csv")
to_plot

In [None]:
# Support/confidence scatter of the rules with lift > 1, coloured by lift.
# A seeded generator keeps the jitter (and therefore the figure) reproducible across runs.
rng = np.random.default_rng(0)
jitter_support = to_plot["support"] + rng.normal(loc=0, scale=0.002, size=len(to_plot))
jitter_confidence = to_plot["confidence"] + rng.normal(loc=0, scale=0.002, size=len(to_plot))

fig, ax = plt.subplots(figsize=(10, 5))
# Lift values above 5 saturate the colormap (colorbar drawn with extend="max")
scatter = ax.scatter(x=jitter_support, y=jitter_confidence, c=to_plot["lift"], cmap="Reds", s=10, norm=mcolors.Normalize(vmin=1, vmax=5))

plt.colorbar(scatter, label="Lift", extend="max")

ax.set_xlabel("Support")
ax.set_ylabel("Confidence")
ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)
ax.grid(True)
ax.set_xlim(left=0.0)

# tight_layout must run before savefig, otherwise the saved PDF does not use it
plt.tight_layout()
plt.savefig("../figures/rules_plot.pdf", format="pdf", bbox_inches="tight")
plt.show()


## Exploratory Hypothesis Testing

In [None]:
# Build the per-drug descriptor table: one row per active substance with RDKit descriptors.
active_substance_df = pd.read_csv("../csv_files/adding_chembl_to_actives.csv", index_col=0)
# Get rid of multicomponents: drop SMILES containing a literal ".".
# regex=False matches the dot literally; the previous pattern "\." relied on an invalid
# string escape sequence, which newer Python versions warn about.
is_multicomponent = active_substance_df["IsomericSMILES"].str.contains(".", regex=False)
# Chaining (rather than in-place edits on a filtered slice) yields an independent frame,
# so the column assignments below do not write into a view of the CSV frame.
active_substance_df = (
    active_substance_df.loc[~is_multicomponent, ["active_PSS", "IsomericSMILES"]]
    .drop_duplicates(subset="active_PSS")
    .copy()
)
active_set = set(active_substance_df["active_PSS"])
# NOTE(review): Chem.MolFromSmiles returns None for unparsable SMILES, which would make the
# descriptor calls below fail - assumes every SMILES in the file is valid; confirm.
active_substance_df["Mol"] = active_substance_df["IsomericSMILES"].apply(Chem.MolFromSmiles)
descriptor_functions = {
    "MolWt": Descriptors.MolWt,
    "HDonors": Lipinski.NumHDonors,
    "HAcceptors": Lipinski.NumHAcceptors,
    "RotB": Lipinski.NumRotatableBonds,
    "TPSA": Descriptors.TPSA,
    "LogP": Descriptors.MolLogP,
}
for descriptor_name, descriptor_function in descriptor_functions.items():
    active_substance_df[descriptor_name] = active_substance_df["Mol"].apply(descriptor_function)
active_substance_df = active_substance_df.rename(columns={"active_PSS": "drug"})

In [None]:
# One row per active substance in oral tablets, with the union of excipients it is formulated with.
is_oral_tablet = (original_df["Route"] == "Oral") & (original_df["Dosage Form"] == "Tablet")
# .copy() gives an independent frame, so the column assignments below do not write into a
# filtered slice of original_df (SettingWithCopy pattern).
exploratory_testing = original_df.loc[is_oral_tablet, ["actives_in_dosage_form", "Excipients_Final"]].copy()
# Split comma-separated actives into one row per active, upper-cased and stripped
exploratory_testing["actives_in_dosage_form"] = exploratory_testing["actives_in_dosage_form"].apply(lambda x: x.upper().strip().split(","))
exploratory_testing = exploratory_testing.explode("actives_in_dosage_form")
exploratory_testing["actives_in_dosage_form"] = exploratory_testing["actives_in_dosage_form"].apply(lambda x: x.strip())
exploratory_testing["Excipients_Final"] = exploratory_testing["Excipients_Final"].apply(ast.literal_eval)
# Keep actives present in the descriptor table; sum() concatenates the per-product
# excipient collections and set() removes repeats within each active
exploratory_testing = (exploratory_testing[exploratory_testing["actives_in_dosage_form"].isin(active_set)]
             .groupby("actives_in_dosage_form")["Excipients_Final"]
             .sum()
             .apply(lambda x: list(set(x)))
             .reset_index())
# Getting excipients with a count >= 10 (i.e. used with at least 10 actives)
list_of_excipients = list(exploratory_testing["Excipients_Final"].explode())
excipient_counts = Counter(list_of_excipients)
selected_excipients = [item for item, count in excipient_counts.items() if count >= 10]
# One boolean indicator column per selected excipient
for excipient in selected_excipients:
    exploratory_testing[excipient] = exploratory_testing["Excipients_Final"].apply(lambda x: excipient in x)
exploratory_testing = exploratory_testing.rename(columns={"actives_in_dosage_form": "drug"})

In [None]:
# Attach the chemical descriptors to each drug's excipient indicator row (left join on "drug")
df_for_testing = exploratory_testing.merge(active_substance_df, how="left", on="drug")
# Frame of indicator columns; iterating over it yields the excipient column names
output_columns = df_for_testing[selected_excipients]
# Descriptor columns compared between the with/without groups
predictors = [
    "MolWt",
    "HDonors",
    "HAcceptors",
    "RotB",
    "TPSA",
    "LogP",
]

In [None]:
def perform_mannwhitneyu_test(df, input_col, output_col):
    """Two-sided Mann-Whitney U test of `input_col` between the two groups of `output_col`.

    Rows where `output_col` equals 1 (True) form the first sample and rows where it
    equals 0 (False) form the second.

    Returns a tuple (output_col, input_col, U statistic, p-value).
    """
    in_group = df[output_col] == 1
    out_group = df[output_col] == 0
    outcome = mannwhitneyu(
        df.loc[in_group, input_col],
        df.loc[out_group, input_col],
        alternative="two-sided",
    )
    statistic, p_value = outcome
    return (output_col, input_col, statistic, p_value)

# Run the test for every (descriptor, excipient) pair; each result is
# (excipient column, descriptor column, U statistic, p-value)
results = [
    perform_mannwhitneyu_test(df_for_testing, input_col, output_col)
    for input_col in predictors
    for output_col in output_columns
]

# Extract p-values for BH correction
p_values = [result[3] for result in results]

# Apply FDR once and reuse the result (it was previously computed twice)
corrected_p_values = false_discovery_control(p_values)

df_results = pd.DataFrame(results, columns=["Output Column", "Input Column", "Statistic", "P-Value"])
df_results["Corrected P"] = corrected_p_values
df_results.to_csv("../results/exploratory_hypothesis_testing.csv")
# Show the significant results. This must be the last expression of the cell to be
# displayed; it used to sit before to_csv, where its value was silently discarded.
df_results[df_results["Corrected P"] < 0.05]

In [None]:
# Generating lists of values to plot normalised against the Ro5 or Veber's Rules as appropriate

def _split_normalised(excipient, descriptor, limit):
    """Return (with, without): `descriptor` divided by `limit` for rows where the
    `excipient` indicator column of df_for_testing is True and False respectively."""
    formulated_with = df_for_testing[excipient] == True
    formulated_without = df_for_testing[excipient] == False
    return (
        df_for_testing.loc[formulated_with, descriptor] / limit,
        df_for_testing.loc[formulated_without, descriptor] / limit,
    )

# Copovidone: TPSA, HDonors, HAcceptors, MolWt
copovidone_tpsa, not_copovidone_tpsa = _split_normalised("COPOVIDONE", "TPSA", 140)
copovidone_hdonors, not_copovidone_hdonors = _split_normalised("COPOVIDONE", "HDonors", 5)
copovidone_hacceptors, not_copovidone_hacceptors = _split_normalised("COPOVIDONE", "HAcceptors", 10)
copovidone_molwt, not_copovidone_molwt = _split_normalised("COPOVIDONE", "MolWt", 500)

# Maize Starch: TPSA, HAcceptors
maize_tpsa, not_maize_tpsa = _split_normalised("MAIZE STARCH", "TPSA", 140)
maize_hacceptors, not_maize_hacceptors = _split_normalised("MAIZE STARCH", "HAcceptors", 10)

In [None]:
# Plotting data using boxplots: for each significant (excipient, descriptor) pair, the
# normalised descriptor values of drugs formulated with vs. without the excipient

data = [
    copovidone_molwt, not_copovidone_molwt,
    copovidone_hdonors, not_copovidone_hdonors,
    copovidone_hacceptors, not_copovidone_hacceptors,
    copovidone_tpsa, not_copovidone_tpsa,
    maize_hacceptors, not_maize_hacceptors,
    maize_tpsa, not_maize_tpsa
]

# Consecutive (with, without) pairs
grouped_data = [data[i:i+2] for i in range(0, len(data), 2)]

# f-strings made code too long and complicated, values copied manually
x_labels = [r"$\bf{MolWt}$" "\np<0.001\nAdjusted p<0.001\nU=2056.0", 
          r"$\bf{HBD}$" "\np=0.001\nAdjusted p=0.034\nU=1787.5", 
          r"$\bf{HBA}$" "\np<0.001\nAdjusted p=0.002\nU=1971.0", 
          r"$\bf{TPSA}$" "\np<0.001\nAdjusted p<0.001\nU=2057.0", 
          r"$\bf{HBA}$" "\np<0.001\nAdjusted p=0.013\nU=972.0", 
          r"$\bf{TPSA}$" "\np<0.001\nAdjusted p=0.013\nU=951.5"]

plt.figure(figsize=(15, 8))

# Positioning boxes side-by-side: each pair sits at x = 4i+1 and 4i+2.
# Named box_positions so the node-label list `positions` used by the network-plot
# cells is not overwritten (re-running those cells after this one used to break).
box_positions = []
for i in range(len(grouped_data)):
    box_positions.extend([i*4+1, i*4+2])

# Jittered raw points behind the boxes; a seeded generator keeps the figure reproducible
rng = np.random.default_rng(0)
for i, group in enumerate(grouped_data):
    for j, var in enumerate(group):
        x = rng.normal(box_positions[i*2 + j], 0.05, size=len(var))
        plt.scatter(x, var, color="gray", alpha=0.4, s=6)

boxplot = plt.boxplot(
    [item for sublist in grouped_data for item in sublist],
    positions=box_positions,
    patch_artist=True,
    medianprops=dict(color="black"),
    widths=0.75
)

# Customising the colouring and layout of the plot: the "with excipient" box of each pair
# takes the excipient colour (C0 copovidone, C1 maize starch), the "without" box is black
edge_colors = ["C0", "black", "C0", "black", "C0", "black", "C0", "black", "C1", "black", "C1", "black"]
line_width = 1.6
for i, box in enumerate(boxplot["boxes"]):
    color = edge_colors[i]
    box.set_edgecolor(color)
    box.set_linewidth(line_width)
    box.set_facecolor("none")
    # Each box owns two whiskers and two caps, stored consecutively
    box_lines = (
        boxplot["whiskers"][2*i], boxplot["whiskers"][2*i + 1],
        boxplot["caps"][2*i], boxplot["caps"][2*i + 1],
        boxplot["medians"][i],
    )
    for line in box_lines:
        line.set_color(color)
        line.set_linewidth(line_width)
    boxplot["fliers"][i].set_markeredgecolor(color)


# One tick label per pair, centred between its two boxes
plt.xticks([i*4 + 1.5 for i in range(len(grouped_data))], x_labels, fontsize=12)
plt.ylabel("Normalized Values", fontsize=16)

plt.xlim(-0.5, max(box_positions) + 1.5)  # Adding additional whitespace on each side
plt.gca().yaxis.grid(True)
plt.gca().xaxis.grid(False)

legend_elements = [
    plt.Line2D([0], [0], color="C0", lw=2, label="Formulated with copovidone"),
    plt.Line2D([0], [0], color="C1", lw=2, label="Formulated with maize starch"),
    plt.Line2D([0], [0], color="black", lw=2, label="Not formulated with excipient of interest")
]
plt.legend(handles=legend_elements, loc="upper right", fontsize = 14)

# tight_layout must run before savefig, otherwise the saved PDF does not use it
plt.tight_layout()
plt.savefig("../figures/hypothesis_testing.pdf", format="pdf", bbox_inches="tight")
plt.show()
