In [51]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data preparation

In [52]:
from importlib.resources import files

import pandas as pd

from biobank_olink.dataset import load_datasets

# Handle to the resources shipped inside the `biobank_olink.data` package;
# joined with a file name via `/` further down.
DATA_DIR = files("biobank_olink.data")

# load_datasets() yields two tables, unpacked here as `ol_df` and `cov_df`
# (presumably Olink measurements and covariates, judging by the names).
# NOTE(review): its filtering is not visible in this notebook; the recorded
# output mentions dropped columns and rows — confirm in the package.
ol_df, cov_df = load_datasets()

dropping 2 columns
dropping 576 rows


In [75]:
# Restrict both tables to the index labels they share, selected in the same order.
eids = ol_df.index.intersection(cov_df.index)
ol_df, cov_df = ol_df.loc[eids], cov_df.loc[eids]


def median_impute(df):
    """Return a copy of ``df`` with missing values replaced by column medians.

    Medians are computed over numeric columns only, so a frame that also
    carries non-numeric columns (strings, categories, ...) no longer raises;
    those columns are returned untouched. For an all-numeric frame the result
    is the same as ``df.fillna(df.median())``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with NaNs in numeric columns filled by that column's median.
        A column that is entirely NaN stays NaN (its median is NaN).
    """
    # numeric_only=True: DataFrame.median() raises TypeError on non-numeric
    # columns in pandas >= 2.0 when left at its default.
    column_medians = df.median(numeric_only=True)
    return df.fillna(column_medians)


# Fill missing values in both tables with per-column medians.
# NOTE(review): the medians come from the full data set before any
# cross-validation split, and every cov_df column is filled, including the SBP
# column used as the target below — confirm this is intended.
ol_df, cov_df = median_impute(ol_df), median_impute(cov_df)

print(
    "ol_df shape: {}".format(ol_df.shape),
    "cov_df shape: {}".format(cov_df.shape),
    sep="\n",
)

ol_df shape: (39157, 1461)
cov_df shape: (39157, 10)


## Evaluation

In [76]:
# Assay list shipped with the package: one row per assay, with its panel and gene name.
olink_assays = pd.read_csv(DATA_DIR / "olink-explore-3072-assay-list-2023-06-08.csv")

# Panel name -> list of gene names belonging to that panel.
assays_mapping = {
    panel: list(genes)
    for panel, genes in olink_assays.groupby("Explore 384 panel")["Gene name"]
}
list(assays_mapping)

['Cardiometabolic',
 'Cardiometabolic_II',
 'Inflammation',
 'Inflammation_II',
 'Neurology',
 'Neurology_II',
 'Oncology',
 'Oncology_II']

In [77]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

# Columns of cov_df that form the baseline feature set; every experiment below
# starts from these and optionally adds Olink columns.
baseline_cols = ["Sex", "age", "BMI"]


def experiment(x, y, fit_params=None):
    """Cross-validate a fixed XGBoost regressor and return the per-fold R² scores.

    Parameters
    ----------
    x : feature table passed to ``cross_val_score`` as ``X``.
    y : target values passed to ``cross_val_score`` as ``y``.
    fit_params : dict, optional
        Extra keyword arguments forwarded to the estimator's ``fit``. Nothing
        extra is forwarded when this is None.

    Returns
    -------
    The array of scores returned by ``cross_val_score`` (``scoring="r2"``,
    ``cv=5``), one entry per fold.
    """
    import inspect

    model = XGBRegressor(tree_method="hist", random_state=42, n_estimators=1000, learning_rate=0.05)

    # scikit-learn renamed `fit_params` to `params` (deprecated in 1.4, removed
    # in 1.6). Only forward the argument when the caller supplied one, under
    # whichever keyword the installed version accepts, so the default call
    # works on both old and new releases.
    extra_kwargs = {}
    if fit_params is not None:
        accepted = inspect.signature(cross_val_score).parameters
        keyword = "params" if "params" in accepted else "fit_params"
        extra_kwargs[keyword] = fit_params

    scores = cross_val_score(model, x, y, scoring="r2", cv=5, n_jobs=5, **extra_kwargs)
    return scores


# Per-fold R² scores keyed by experiment label.
scores = {}
x_base = cov_df[baseline_cols]

# Baseline covariates only.
scores["baseline"] = experiment(x_base, cov_df.SBP)

# Baseline covariates plus every Olink column.
scores[f"all ({ol_df.shape[1]})"] = experiment(x_base.join(ol_df), cov_df.SBP)

# Baseline covariates plus the Olink columns belonging to one panel at a time.
for assay_name, panel_genes in assays_mapping.items():
    # Build the set once per panel: membership checks stay O(1) per column
    # instead of scanning the gene list for every column. Column order still
    # follows ol_df.columns.
    panel_genes = set(panel_genes)
    cols = [c for c in ol_df.columns if c in panel_genes]
    exp_score = experiment(x_base.join(ol_df.loc[:, cols]), cov_df.SBP)
    scores[f"{assay_name.lower()} ({len(cols)})"] = exp_score

In [80]:
# Mean R² across folds for each experiment.
scores_df = pd.DataFrame(scores).mean().to_frame("r2")
# Square root of the mean R², reported as "corr". Computed with pandas so the
# cell does not depend on an `np` name that no visible cell imports (it would
# raise NameError on a fresh kernel). A negative mean R² yields NaN.
scores_df["corr"] = scores_df["r2"].pow(0.5)
scores_df

Unnamed: 0,r2,corr
baseline,0.11249,0.335395
all (1461),0.35647,0.597051
cardiometabolic (202),0.219468,0.468474
cardiometabolic_ii (164),0.218778,0.467737
inflammation (223),0.238699,0.488568
inflammation_ii (186),0.257549,0.507492
neurology (197),0.220568,0.469647
neurology_ii (150),0.188529,0.434199
oncology (194),0.212533,0.461013
oncology_ii (154),0.185867,0.431123
