In [1]:
%load_ext autoreload
%autoreload 2

import warnings
from functools import partial
from typing import Mapping

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
import shap
import wandb
import xgboost as xgb
from sklearn.model_selection import train_test_split

from biobank_olink.constants import PROJECT_DATA

warnings.filterwarnings('ignore', category=UserWarning)
nb_data = PROJECT_DATA / "olink"
olink_xlsx = nb_data / "Olink_whole_caucasian_nonrandom_Zscore.parquet"

sns.set_theme()
pl.__version__

'1.7.1'

In [2]:
import polars.selectors as cs

feat2code = {
    "SBP": "4080",
    "DBP": "4079",
    "ASI": "21021",
    "DATE": "53",
}
instances = [0, 1, 2, 3]

(
    pl.scan_csv(nb_data / "BP_ASI_instances0123_participant.csv")
    .select(
        "eid",
        *[
            pl.concat_list(cs.contains(f"{feat2code[feat]}-{ins}.")).list.mean().alias(f"{feat}/{ins}")
            for feat in ["SBP", "DBP"]
            for ins in instances
        ],
        *[
            pl.col(f"{feat2code[feat]}-{ins}.0").cast(t).alias(f"{feat}/{ins}")
            for feat, t in [("ASI", pl.Float64), ("DATE", pl.Date)]
            for ins in instances
        ],
    )
    .with_columns(
        (pl.col("^DATE/[123]$") - pl.col("DATE/0")).dt.total_days()
    )
    .drop("DATE/0")
    .unpivot(index="eid")
    .filter(pl.col("value").is_not_null())
    .with_columns(
        pl.col("variable").str.split_exact("/", 1)
    )
    .with_columns(
        variable=pl.col("variable").struct[0],
        ins_index=pl.col("variable").struct[1].cast(pl.Int64)
    )
    .collect()
    .pivot(index=["eid", "ins_index"], on="variable")
)


eid,ins_index,SBP,DBP,ASI,DATE
i64,i64,f64,f64,f64,f64
3638897,0,165.5,83.0,,
4415148,0,155.5,96.5,,
5164480,0,152.0,97.5,10.5921,
3753362,0,126.0,79.5,,
1689649,0,100.0,65.0,,
…,…,…,…,…,…
1627458,3,,,,5521.0
1040988,3,,,,4466.0
5950006,3,,,,5439.0
5433150,3,,,,5315.0


In [66]:
df_followups = (
 
 
    pl.read_excel(nb_data / "Olink BP and ASI follow-up Pawel.xlsx")
    # .unpivot(index="eid")
    # .filter(pl.col("value").is_not_null())
    # .with_columns(
    #     pl.col("variable").str.strip_chars("123_"),
    #     ins_index=pl.col("variable").str.slice(-1).cast(pl.Int64)
    # )
    # .pivot(index=["eid", "ins_index"], on="variable", values="value")
    # .rename({"time_to_ins": "followup_time"})
)
df_followups

eid,time_to_ins1,DBP_1,SBP_1,time_to_ins2,DBP_2,SBP_2,time_to_ins3,DBP_3,SBP_3,ASI_2,ASI_3,ASI_1
i64,i64,f64,f64,i64,f64,f64,i64,f64,f64,f64,f64,f64
1000011,,,,,,,,,,,,
1000121,,,,,,,,,,,,
1000366,,,,,,,,,,,,
1000807,,,,,,,,,,,,
1001025,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…
6023060,,,,4878,,,,,,,,
6023200,,,,,,,,,,,,
6023252,,,,,,,,,,,,
6023293,,,,5361,86.0,136.0,,,,15.8182,,


In [41]:
df_followups.filter(pl.col("ASI").is_not_null()).group_by("eid").agg(pl.col("ins_index").len())["ins_index"].value_counts(sort=True)

ins_index,count
u32,u32
1,4963
2,1534
3,194


In [65]:
df_followups.filter(
    pl.col("SBP").is_not_null(), pl.col("DBP").is_not_null()
).group_by("eid").agg("ins_index")["ins_index"].value_counts(sort=True)


ins_index,count
list[i64],u32
[2],3693
[1],928
"[2, 3]",838
"[1, 2]",671
[3],260
"[1, 2, 3]",233
"[1, 3]",20


 - 222 patients with BP measurements performed at instances 0,1,2,3 (4 total measurements during follow-up)
 - 975 patients with BP measurements performed at instances 0,2,3 (3 total measurements during follow-up)
 - 5,016 patients with BP measurements performed at instances 0,2 (2 total measurements during follow-up)
------------------
 - 53 patients with ASI measurements performed at instances 0,1,2,3 (4 total measurements during follow-up)
 - 325 patients with ASI measurements performed at instances 0,2,3 (3 total measurements during follow-up)
 - 1,076 patients with ASI measurements performed at instances 0,2 (2 total measurements during follow-up)