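Where possible, this notebook contains code to replicate the visualizations presented in the paper. File paths appear as placeholders in angle brackets (e.g., `<aggregate results filepath>`) and must be replaced with the locations of your own input and output files before a cell is run. Note that the various subparts of each figure are made by rerunning the corresponding piece of visualization code with slightly different inputs and parameters.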

In [None]:
# imports
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from matplotlib.colors import to_hex
from adjustText import adjust_text
from scipy.stats import pearsonr
from tqdm import tqdm

### Figure 2a

In [None]:
# read the aggregate results and attach ICD-10 chapter information, ordered by chapter
results = pd.read_csv("<aggregate results filepath>")
conversion_table = pd.read_csv("<SNOMED to ICD-10 chapter filepath>")
merged = (
    results
    .merge(conversion_table, on="snomed")
    .sort_values(by="chapter")
    .reset_index(drop=True)
)

# tiny offset added before the log so that p-values of exactly zero stay finite
correction = 1e-323
merged["-log(p)"] = -np.log10(merged["agg-p-adj"] + correction)

# y-coordinates: every chapter gets its own 1000-unit band, stacked downward from 22000
# (this relies on `merged` already being sorted by chapter, as done above)
y = []
for band, chapter in enumerate(merged["chapter"].unique()):
    n_in_chapter = len(merged[merged["chapter"] == chapter])
    offsets = np.linspace(1000 * band, 1000 * (band + 1), num=n_in_chapter)
    y.extend(22000 - offsets)
merged["y"] = y

# conditions listed in the filtered results file are drawn as "significant"
sig = pd.read_csv("<filtered results, desired condition set>")
in_sig = merged["snomed"].isin(sig["snomed"])
merged_sig = merged[in_sig]
merged_nonsig = merged[~in_sig]

# conditions that receive a text label (edit the threshold as desired)
to_annot = merged_sig[merged_sig["-log(p)"] > 100]

# per-chapter colors: "base" for significant points, "light" for the rest
color_df = pd.read_csv("<CSV containing desired colors>")
chapter_colors = color_df.set_index("chapter")
sig_color_dict = chapter_colors["base"].to_dict()
nonsig_color_dict = chapter_colors["light"].to_dict()

# create canvas
fig, ax = plt.subplots(figsize=(20, 11))

# non-significant points go down first so the significant ones are drawn on top
for subset, lookup in ((merged_nonsig, nonsig_color_dict), (merged_sig, sig_color_dict)):
    ax.scatter(
        subset["-log(p)"],
        subset["y"],
        color=[lookup[c] for c in subset["chapter"]],
        alpha=1,
        edgecolors="none",
    )

# tick label size for both axes
ax.tick_params(axis="both", which="major", labelsize=18)

# y axis: one tick per chapter name, placed at the median y of that chapter's points
chapter_tick_df = merged.groupby("chapter_name")["y"].median()
ax.set_yticks(chapter_tick_df)
ax.set_yticklabels(chapter_tick_df.index)

# x axis
ax.xaxis.set_ticks_position("none")
ax.set_xlabel(r"-log$_{10}$(adj. p)", fontsize=18)
ax.set_xlim(right=330)
x_ticks = [0, 100, 200, 300]
ax.set_xticks(x_ticks, [str(t) for t in x_ticks])

# label the selected conditions, then let adjust_text reposition the labels
texts = [
    ax.annotate(row["name"], (row["-log(p)"], row["y"]), size=14, ha="center", va="center")
    for _, row in to_annot.iterrows()
]
adjust_text(texts, expand=(2, 2), arrowprops=dict(arrowstyle="-", color="gray", lw=0.5))

# finalize and save
sns.despine()
plt.tight_layout()
plt.savefig("<output filepath>", transparent=True)

### Figure 2b

In [None]:
# keep the first 20 rows of the aggregate results, then order them by average odds ratio
results = pd.read_csv("<aggregate results filepath>").iloc[:20, :]
results = results.sort_values(by="avg-odds", ascending=False).reset_index(drop=True)

# attach ICD-10 chapter information and build per-condition lookups
mapping = pd.read_csv("<SNOMED to ICD-10 chapter filepath>")
results = results.merge(mapping, on="snomed")
by_snomed = results.set_index("snomed")
snomed_to_name = by_snomed["name"].to_dict()
snomed_to_chapter = by_snomed["chapter"].to_dict()

# load the per-replicate results (files are numbered replicate-01 ... replicate-30)
n_replicates = 30
replicates = {
    i: pd.read_csv(f"<directory for desired case versus control analysis>/replicate-{i+1:02}.csv")
    for i in range(n_replicates)
}

# one record per (condition, replicate); .item() requires exactly one matching row
records = pd.DataFrame.from_records([
    {
        "snomed": s,
        "name": snomed_to_name[s],
        "chapter": snomed_to_chapter[s],
        "odds": replicates[i].loc[replicates[i]["snomed"] == s, "odds"].item(),
        "replicate": i,
    }
    for s in tqdm(results["snomed"])
    for i in range(n_replicates)
])

# per-chapter colors
color_df = pd.read_csv("<CSV containing desired colors>")
color_dict = color_df.set_index("chapter")["base"].to_dict()

# create figure
fig, ax = plt.subplots(figsize=(20, 16))

# strip plot: one point per replicate for each condition, colored by chapter
sns.stripplot(data=records, x="odds", y="name", ax=ax, size=10, hue="chapter", palette=color_dict)

# horizontal divider below each condition row
for tick in ax.get_yticks():
    ax.axhline(tick + .5, color="darkgray")

# hide tick marks on both axes
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticks_position("none")

# set labels
ax.set_xlabel("odds ratio")
ax.set_ylabel("condition")

# the hue legend is not wanted in the figure
ax.get_legend().remove()

plt.tight_layout()
plt.savefig("<output filepath>", transparent=True)

### Figure 2c

The code for generating the plots for Figure 2c was developed for the UC-wide environment and cannot be made publicly available. Please reach out to the corresponding author if needed.

### Figure 3b / Figure 4a

In [None]:
# load the AnnData object and the mapping from `leiden` label to color
adata = sc.read_h5ad("<desired AnnData filepath>")
colors = pd.read_csv("<CSV containing cluster-to-color mapping>", dtype={"leiden": "str"})
color_dict = colors.set_index("leiden")["color"].to_dict()

# draw the UMAP colored by the `leiden` column, without a legend
plt.clf()
umap_rc = {"figure.figsize": (6, 4), "figure.dpi": 300}
with plt.rc_context(umap_rc):
    sc.pl.umap(adata, color="leiden", palette=color_dict, legend_loc="none")

### Figure 4c

In [None]:
# load data
full_adata = sc.read_h5ad("<AnnData filepath for endo patients, all conditions>")
pre_adata = sc.read_h5ad("<AnnData filepath for endo patients, pre-endo conditions>")

# create DataFrame for pre-to-full cluster changes: after the merge on person_id,
# leiden_x comes from pre_adata and leiden_y from full_adata
df = pre_adata.obs.merge(full_adata.obs, on="person_id")[["person_id", "leiden_x", "leiden_y"]]
# shift every cluster label up by one and store it as a string
# (presumably converting 0-based cluster ids to 1-based labels for display)
df["leiden_x"] = (df["leiden_x"].astype(int) + 1).astype(str)
df["leiden_y"] = (df["leiden_y"].astype(int) + 1).astype(str)

# define color palette: one husl color per cluster label actually present in leiden_x,
# assigned in numeric label order. Building the lookup from the observed labels (instead
# of a fixed count of 31) keeps the palette and the lookup the same size, so the cell
# works for any number of clusters and for non-contiguous labels.
cluster_labels = sorted(df["leiden_x"].unique(), key=int)
palette = sns.color_palette("husl", len(cluster_labels))
palette_dict = dict(zip(cluster_labels, palette))

# generate plot: each ribbon is colored by the patient's pre-endo cluster
fig = px.parallel_categories(
    df,
    dimensions=["leiden_x", "leiden_y"],
    labels={"leiden_x": "", "leiden_y": ""},
    color=[to_hex(palette_dict[label]) for label in df["leiden_x"]],
    width=1900,
    height=1800
)

# save image
fig.update_layout(font={"size": 20})
fig.write_image("<output filepath>")

### Supplementary Figure 1a

In [None]:
def _save_umap_panel(adata, obs_column, color_dict, output_path):
    """Scatter the UMAP embedding colored by a categorical obs column and save the figure.

    Parameters
    ----------
    adata : AnnData object with a computed `X_umap` entry in `.obsm`.
    obs_column : name of the `.obs` column whose values select the point colors.
    color_dict : mapping from each value in `obs_column` to a matplotlib color;
        a value missing from the mapping raises KeyError.
    output_path : where the figure is written (transparent background).

    Returns
    -------
    The `(fig, ax)` pair of the saved figure.
    """
    fig, ax = plt.subplots(figsize=(12, 12))
    ax.scatter(
        adata.obsm["X_umap"][:, 0],
        adata.obsm["X_umap"][:, 1],
        s=2,
        color=[color_dict[x] for x in adata.obs[obs_column]],
    )
    # coordinates of a UMAP embedding are not meaningful, so hide the tick labels
    ax.set_yticklabels([])
    ax.set_xticklabels([])
    fig.tight_layout()
    fig.savefig(output_path, transparent=True)
    return fig, ax


# compute UMAP representation for cases and closest controls
adata = sc.read_h5ad("<AnnData filepath for all patients, all conditions>")
# subsetting returns a view of the full object; copy it so that the neighbors/UMAP
# results are written to an independent AnnData rather than to an implicitly copied view
adata = adata[adata.obs["replicate"].isin((0, 1))].copy()
sc.pp.neighbors(adata)
sc.tl.umap(adata)

# plot endometriosis status
color_dict = {
    0: "tab:blue",
    1: "tab:orange"
}
fig, ax = _save_umap_panel(adata, "endo", color_dict, "<output filepath>")

# plot age (continuous, so it uses a colormap and a colorbar instead of a color lookup)
fig, ax = plt.subplots(figsize=(12, 12))
age_points = ax.scatter(adata.obsm["X_umap"][:, 0], adata.obsm["X_umap"][:, 1], s=2, c=adata.obs["age"], cmap="viridis")
ax.set_yticklabels([])
ax.set_xticklabels([])
fig.colorbar(age_points, ax=ax)
fig.tight_layout()
fig.savefig("<output filepath>", transparent=True)

# plot gender
color_dict = {
    "Female": "tab:blue",
    "Male": "tab:orange",
    "Unknown": "tab:green",
    "Other": "tab:red",
}
fig, ax = _save_umap_panel(adata, "gender", color_dict, "<output filepath>")

# plot race
color_dict = {
    "White": "tab:blue",
    "Asian": "tab:orange",
    "Other": "tab:green",
    "Unknown": "tab:red",
    "Black or African American": "tab:purple",
    "Native Hawaiian or Other Pacific Islander": "tab:brown",
    "American Indian or Alaska Native": "tab:pink",
}
fig, ax = _save_umap_panel(adata, "race", color_dict, "<output filepath>")

# plot ethnicity
color_dict = {
    "Not Hispanic or Latino": "tab:blue",
    "Unknown": "tab:orange",
    "Hispanic or Latino": "tab:green"
}
fig, ax = _save_umap_panel(adata, "ethnicity", color_dict, "<output filepath>")

### Supplementary Figure 1b

The code for generating the plots for Supplementary Figure 1b was developed for the UC-wide environment and cannot be made publicly available. Please reach out to the corresponding author if needed.

### Supplementary Figure 2a

In [None]:
# load the aggregate results and derive the two volcano-plot axes
results = pd.read_csv("<aggregate results filepath>")
# substitute p-values of exactly zero so the log transform stays finite
results["p-nozero"] = results["agg-p-adj"].replace(to_replace=0, value=1e-323)
results["log10(odds)"] = np.log10(results["avg-odds"])
results["-log10(p)"] = -np.log10(results["p-nozero"])

# attach ICD-10 chapter information
mapping = pd.read_csv("<SNOMED to ICD-10 chapter filepath>")
results = results.merge(mapping, on="snomed")

# conditions listed in the filtered results file are drawn as "significant"
sig = pd.read_csv("<filtered results, desired condition set>")
in_sig = results["snomed"].isin(sig["snomed"])
results_sig = results[in_sig]
results_nonsig = results[~in_sig]

# conditions that receive a text label (edit as desired): very small adjusted p-value
# and a name shorter than 25 characters
strongly_sig = results_sig[results_sig["-log10(p)"] > 300]
has_short_name = strongly_sig["name"].apply(lambda name: len(name) < 25)
to_annot = strongly_sig[has_short_name]

# per-chapter colors: "base" for significant points, "light" for the rest
color_df = pd.read_csv("<CSV containing desired colors>")
chapter_colors = color_df.set_index("chapter")
sig_color_dict = chapter_colors["base"].to_dict()
nonsig_color_dict = chapter_colors["light"].to_dict()

# create canvas
fig, ax = plt.subplots(figsize=(15, 15))

# non-significant points go down first so the significant ones are drawn on top
for subset, lookup in ((results_nonsig, nonsig_color_dict), (results_sig, sig_color_dict)):
    ax.scatter(
        subset["log10(odds)"],
        subset["-log10(p)"],
        color=[lookup[c] for c in subset["chapter"]],
        alpha=1,
        edgecolors="none",
    )

# tick label size for both axes
ax.tick_params(axis="both", which="major", labelsize=18)

# y axis
ax.yaxis.set_ticks_position("none")
ax.set_ylabel(r"-log$_{10}$(adj. p)", fontsize=18)
ax.set_ylim(bottom=-30, top=450)
y_ticks = list(range(0, 401, 50))
ax.set_yticks(y_ticks, [str(t) for t in y_ticks])

# x axis
ax.xaxis.set_ticks_position("none")
ax.set_xlabel(r"log$_{10}$(avg. odds ratio)", fontsize=18)
ax.set_xlim(left=-2.4, right=2.4)
x_ticks = [-2, -1, 0, 1, 2]
ax.set_xticks(x_ticks, [str(t) for t in x_ticks])

# guide lines: odds ratio of 1 (vertical) and adjusted p of 0.05 (horizontal)
ax.axvline(x=np.log10(1), ls="--", color="dimgray")
ax.axhline(y=-np.log10(0.05), ls="--", color="dimgray")

# label the selected conditions, then let adjust_text reposition the labels
texts = [
    ax.annotate(row["name"], (row["log10(odds)"], row["-log10(p)"]), size=14, ha="center", va="center")
    for _, row in to_annot.iterrows()
]
adjust_text(texts, expand=(2.5, 2.5), arrowprops=dict(arrowstyle="-", color="gray", lw=0.5))

# finalize and save
sns.despine()
plt.tight_layout()
plt.savefig("<output filepath>", transparent=True)

### Supplementary Figure 2b

The code for generating the plots for Supplementary Figure 2b was developed for the UC-wide environment and cannot be made publicly available. Please reach out to the corresponding author if needed.

### Supplementary Figure 3 (UCSF)

In [None]:
def _load_filtered_results(path):
    """Read a filtered-results CSV and add the log-transformed plotting columns.

    Adds three columns to the loaded frame:
    - "p-nozero": "agg-p-adj" with exact zeros replaced by 1e-323 so the log is finite
    - "log10(odds)": log10 of "avg-odds"
    - "-log10(p)": negative log10 of "p-nozero"
    """
    frame = pd.read_csv(path)
    frame["p-nozero"] = frame["agg-p-adj"].replace(to_replace=0, value=1e-323)
    frame["log10(odds)"] = np.log10(frame["avg-odds"])
    frame["-log10(p)"] = -np.log10(frame["p-nozero"])
    return frame


# load results for the full and the pre-endometriosis condition sets
full_results = _load_filtered_results("<filtered results, full condition set>")
pre_results = _load_filtered_results("<filtered results, pre-endo condition set>")

# get intersection: conditions flagged significant ("agg-sig") in both condition sets
full_sig = full_results[full_results["agg-sig"] == True]["snomed"]
pre_sig = pre_results[pre_results["agg-sig"] == True]["snomed"]
inter = list(set(full_sig) & set(pre_sig))

# one row per shared condition; columns present in both inputs get _full / _pre suffixes
merged = (
    full_results[full_results["snomed"].isin(inter)]
    .merge(
        pre_results[pre_results["snomed"].isin(inter)],
        on="snomed",
        suffixes=("_full", "_pre"),
    )
    .sort_values(by="snomed", ignore_index=True)
)

# add chapter annotations
mapping = pd.read_csv("<SNOMED to ICD-10 chapter filepath>")
merged = merged.merge(mapping, on="snomed")

# load color dictionary
color_df = pd.read_csv("<CSV containing desired colors>")
color_dict = color_df[["chapter", "base"]].set_index("chapter").to_dict()["base"]

# create and save figure
fig, ax = plt.subplots(figsize=(15, 10))
ax.tick_params(axis='both', which='major', labelsize = 18)
# identity line: points on it have the same odds ratio in both condition sets
ax.axline((0, 0), slope=1, ls="--", color="dimgray")
sns.scatterplot(data=merged, x="log10(odds)_full", y="log10(odds)_pre", ax=ax, legend=False, hue="chapter", palette=color_dict)
ax.set_xlim(left=-0.7, right=2.7)
ax.set_ylim(bottom=-0.7, top=2.7)
ax.set_xlabel(r"log$_{10}$(avg. odds ratio) [full]", fontsize = 18)
ax.set_ylabel(r"log$_{10}$(avg. odds ratio) [pre-endo.]", fontsize = 18)
plt.tight_layout()
plt.savefig("<output filepath>", transparent=True)

# run correlation analysis
print(pearsonr(merged["log10(odds)_full"], merged["log10(odds)_pre"]))

### Supplementary Figure 3 (UC-wide)

The code for generating the plots for this part of Supplementary Figure 3 was developed for the UC-wide environment and cannot be made publicly available. Please reach out to the corresponding author if needed.