In [26]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data preparation

In [27]:
from importlib.resources import files

import pandas as pd
import numpy as np

from biobank_olink.dataset import load_olink_and_covariates

# Location of the data files bundled inside the biobank_olink package.
DATA_DIR = files("biobank_olink.data")

# Load the Olink measurements together with the covariates. The three
# thresholds are forwarded unchanged to the loader, which defines their meaning.
ol_df, cov_df = load_olink_and_covariates(
    cols_na_th=0.3,
    rows_na_th=0.3,
    corr_th=0.9,
)

# Keep only the rows whose HTNgroup is below 2, then restrict the Olink table
# to exactly those rows (same index, same order).
htn_mask = cov_df["HTNgroup"] < 2
cov_df = cov_df[htn_mask]
ol_df = ol_df.loc[cov_df.index]

for label, frame in (("ol_df", ol_df), ("cov_df", cov_df)):
    print(f"{label} shape: {frame.shape}")

[Memory]282.5s, 4.7min  : Loading load_olink_and_covariates...
ol_df shape: (30252, 1355)
cov_df shape: (30252, 10)


## Evaluation

In [28]:
# Assay list shipped with the package data.
olink_assays = pd.read_csv(DATA_DIR / "olink-explore-3072-assay-list-2023-06-08.csv")

# Reduce each panel label to the text before its first underscore so that
# labels sharing that prefix end up in the same group. The vectorised .str
# accessor yields NaN for a missing label instead of raising AttributeError
# the way a per-element `x.split(...)` call does.
olink_assays["Explore 384 panel"] = (
    olink_assays["Explore 384 panel"].str.split("_", n=1).str[0]
)

# Mapping: panel name -> list of the gene names listed for that panel.
assays_mapping = olink_assays.groupby("Explore 384 panel")["Gene name"].apply(list).to_dict()
list(assays_mapping)

['Cardiometabolic', 'Inflammation', 'Neurology', 'Oncology']

In [29]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

# Covariate columns selected from cov_df as the baseline feature set; every
# experiment below uses them, either alone or joined with Olink columns.
baseline_cols = ["Sex", "age", "BMI"]


def experiment(x, y, fit_params=None):
    """Cross-validate an XGBoost regressor and return the per-fold R² scores.

    Parameters
    ----------
    x : feature table passed to ``cross_val_score`` as ``X``.
    y : regression target aligned with ``x``.
    fit_params : dict, optional
        Extra keyword arguments for the estimator's ``fit``. When omitted
        (or empty) the call is identical to one without this argument.

    Returns
    -------
    The array of five R² values (one per fold) produced by ``cross_val_score``.
    """
    import inspect  # local import: only needed to pick the keyword name below

    # NOTE: tree_method="gpu_hist" asks XGBoost for its GPU histogram algorithm,
    # so a GPU-enabled xgboost build is presumably required to run this.
    model = XGBRegressor(tree_method="gpu_hist", random_state=42)

    cv_kwargs = {}
    if fit_params:
        # Previously `fit_params` was accepted but silently dropped. Forward it
        # under whichever keyword the installed scikit-learn exposes: newer
        # releases call it `params`, older ones `fit_params`.
        accepted = inspect.signature(cross_val_score).parameters
        keyword = "params" if "params" in accepted else "fit_params"
        cv_kwargs[keyword] = fit_params

    return cross_val_score(model, x, y, scoring="r2", cv=5, n_jobs=5, **cv_kwargs)


# Per-fold R² scores for every feature set, keyed by a human-readable label
# that includes the number of Olink columns used.
scores = {}
x_base = cov_df[baseline_cols]
sbp = cov_df["SBP"]  # regression target shared by all experiments

# Baseline covariates only.
scores["baseline"] = experiment(x_base, sbp)

# Baseline covariates plus every Olink column.
n_olink_cols = ol_df.shape[1]
scores[f"all ({n_olink_cols})"] = experiment(x_base.join(ol_df), sbp)

# Baseline covariates plus the Olink columns belonging to one panel at a time.
for assay_name, panel_genes in assays_mapping.items():
    cols = [col for col in ol_df.columns if col in panel_genes]
    x_panel = x_base.join(ol_df.loc[:, cols])
    exp_score = experiment(x_panel, sbp)
    scores["{} ({})".format(assay_name.lower(), len(cols))] = exp_score



In [30]:
# Average the per-fold R² of each experiment, then add the square root of that
# mean as a "corr" column.
mean_r2 = pd.DataFrame(scores).mean()
scores_df = mean_r2.to_frame("r2").assign(corr=lambda frame: np.sqrt(frame["r2"]))
scores_df

Unnamed: 0,r2,corr
baseline,0.126743,0.35601
all (1355),0.281072,0.530162
cardiometabolic (348),0.234007,0.483743
inflammation (337),0.173479,0.416508
neurology (335),0.185323,0.430492
oncology (341),0.177878,0.421756
