In this notebook, we use the cohort information and tables we saved to construct AnnData objects containing one-hot encoded condition vectors for each patient, which we will use in subsequent analyses.

In [None]:
# imports
import numpy as np
import pandas as pd
import anndata as ad
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import csr_matrix

# read the endometriosis concept IDs as a plain Python list
endo_concepts = (
    pd.read_csv("<concept filepath>")
    .loc[:, "<concept ID>"]
    .tolist()
)
# read the cohort, order its rows by person ID and renumber them 0..n-1
cohort = (
    pd.read_feather("<cohort filepath>")
    .sort_values("<person ID>")
    .reset_index(drop=True)
)

# add replicate labels to cohort: for every endometriosis patient, the
# non-endometriosis rows sharing that patient's subclass are numbered 1..k in
# row order; every other row keeps the label 0
repl_labels = np.zeros(cohort.shape[0], dtype=int)
controls = cohort[cohort["endo"] == 0]  # loop-invariant, so filtered once
case_subclasses = cohort.loc[cohort["endo"] == 1, "subclass"]
for subclass_id in tqdm(case_subclasses):
    paired_idxs = controls.index[(controls["subclass"] == subclass_id).to_numpy()]
    # number however many controls this subclass actually has; a fixed 1..30
    # fails with a shape mismatch as soon as one subclass has a different count
    repl_labels[paired_idxs] = np.arange(1, len(paired_idxs) + 1)
cohort["replicate"] = repl_labels

# load condition table and retrieve full condition set for each person
condition_table = pd.read_feather("<condition table filepath>")
# Keep the earliest-dated record of every (person, concept) pair. Sorting by
# date before de-duplicating makes keep="first" independent of the row order in
# the file; the result is still ordered by person ID, then concept ID.
condition_table = condition_table.sort_values(
    by=["<person ID>", "<condition concept ID>", "<condition date>"]
)
condition_table = condition_table.drop_duplicates(
    subset=["<person ID>", "<condition concept ID>"], keep="first"
)
# Build exactly one condition list per cohort row, in cohort row order.
# conditions_full is later indexed by cohort row position and used as the rows
# of a matrix whose obs table is the cohort, so it must not depend on which
# persons happen to appear in the condition table.
conditions_by_person = (
    condition_table.groupby("<person ID>")["<condition concept ID>"]
    .agg(list)
    .reindex(cohort["<person ID>"])
)
if len(cohort) > 0 and conditions_by_person.isna().all():
    # typically a dtype mismatch between the two person ID columns
    raise ValueError("no cohort person ID was found in the condition table")
# persons without any recorded condition get an empty list (an all-zero row)
conditions_full = [
    person_conditions if isinstance(person_conditions, list) else []
    for person_conditions in conditions_by_person
]

# retrieve pre-endometriosis conditions for each person
# For each endometriosis patient: the conditions dated strictly before their
# earliest endometriosis record. Patients with no such condition are skipped
# together with their controls. Controls of a kept patient contribute their
# full condition list.
conditions_pre = []
persons_pre = []
controls = cohort[cohort["endo"] == 0]  # loop-invariant, so filtered once
cases = cohort[cohort["endo"] == 1]
for person, subclass_id in tqdm(zip(cases["<person ID>"], cases["subclass"]), total=len(cases)):
    person_subset = condition_table[condition_table["<person ID>"] == person]
    is_endo = person_subset["<condition concept ID>"].isin(endo_concepts)
    # Earliest endometriosis date. With no endometriosis record the minimum is
    # missing and the comparison below selects nothing (assumes datetime-typed
    # dates -- TODO confirm).
    first_endo_date = person_subset.loc[is_endo, "<condition date>"].min()
    before_endo = person_subset["<condition date>"] < first_endo_date
    condition_list = person_subset.loc[before_endo, "<condition concept ID>"].unique().tolist()
    if not condition_list:
        continue
    conditions_pre.append(condition_list)
    persons_pre.append(int(person))
    paired_idxs = controls.index[(controls["subclass"] == subclass_id).to_numpy()]
    for idx in paired_idxs:
        # relies on conditions_full holding one entry per cohort row, in cohort order
        conditions_pre.append(conditions_full[idx])
        # .at reads the single cell; .loc[idx] would build a whole row whose
        # common dtype can turn large integer IDs into floats
        persons_pre.append(int(cohort.at[idx, "<person ID>"]))

# sort data for pre-endometriosis conditions
if persons_pre:  # unzipping an empty sequence cannot be unpacked into two names
    persons_pre, conditions_pre = map(list, zip(*sorted(zip(persons_pre, conditions_pre))))

# one-hot encode information
def one_hot_encode(conditions):
    """Binarize per-person condition lists into an indicator matrix.

    Args:
        conditions: sequence with one collection of condition labels per person.

    Returns:
        A pair ``(encoded, classes)``: ``encoded`` is the matrix produced by
        ``MultiLabelBinarizer.fit_transform`` (one row per entry of
        ``conditions``), and ``classes`` is the fitted binarizer's ``classes_``
        attribute, i.e. the label belonging to each column.
    """
    binarizer = MultiLabelBinarizer()
    indicator_matrix = binarizer.fit_transform(pd.Series(conditions))
    return indicator_matrix, binarizer.classes_
encoded_full, columns_full = one_hot_encode(conditions_full)
encoded_pre, columns_pre = one_hot_encode(conditions_pre)

# construct AnnDatas
concept_table = pd.read_feather("<concept table filepath>")[["<concept ID>", "<concept name>"]]


def _var_for_columns(columns):
    """Return the concept annotations for ``columns``, one row per column, in column order.

    Selecting rows with ``isin`` keeps the concept table's own row order, which
    need not match the column order of the encoded matrix; reindexing on the
    column labels guarantees that row i of the result describes matrix column i.

    Raises:
        ValueError: if an encoded concept ID has no entry in the concept table.
    """
    lookup = concept_table.drop_duplicates(subset="<concept ID>").set_index("<concept ID>")
    wanted = pd.Index(columns, name="<concept ID>")
    missing = wanted.difference(lookup.index)
    if len(missing) > 0:
        raise ValueError(f"{len(missing)} encoded concept IDs are absent from the concept table")
    return lookup.reindex(wanted).reset_index()


full_obs = cohort.reset_index(drop=True)
full_var = _var_for_columns(columns_full)
full_ad = ad.AnnData(X=csr_matrix(encoded_full, dtype=np.float32), obs=full_obs, var=full_var)
# .copy() turns the subset into an independent object instead of a view of
# full_ad, so it can be modified and written without an implicit copy
full_ad_onlyendo = full_ad[full_ad.obs["endo"] == 1].copy()

pre_obs = cohort[cohort["<person ID>"].isin(persons_pre)].reset_index(drop=True)
# rows of encoded_pre follow persons_pre; make sure the obs rows do as well
if pre_obs["<person ID>"].astype(int).tolist() != list(persons_pre):
    raise ValueError("pre-endometriosis obs rows do not line up with persons_pre")
pre_var = _var_for_columns(columns_pre)
pre_ad = ad.AnnData(X=csr_matrix(encoded_pre, dtype=np.float32), obs=pre_obs, var=pre_var)
pre_ad_onlyendo = pre_ad[pre_ad.obs["endo"] == 1].copy()

# modify data types for broader compatibility: in the obs table of each of the
# four AnnData objects, replace the record start/end columns by their string form
for adata in (full_ad, full_ad_onlyendo, pre_ad, pre_ad_onlyendo):
    for column in ("<record start>", "<record end>"):
        adata.obs[column] = adata.obs[column].astype(str)

# save to disk: each AnnData object is written to its own .h5ad file
outputs = (
    (full_ad, "<AnnData filepath for all patients, all conditions>"),
    (full_ad_onlyendo, "<AnnData filepath for endo patients, all conditions>"),
    (pre_ad, "<AnnData filepath for all patients, pre-endo conditions>"),
    (pre_ad_onlyendo, "<AnnData filepath for endo patients, pre-endo conditions>"),
)
for adata, filepath in outputs:
    adata.write_h5ad(filepath)