In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

from biobank_olink.dataset import load_datasets

ol_df, cov_df = load_datasets(cols_na_th=0, rows_na_th=0)

In [3]:
# W danych dodałem zmienną HTNgroup gdzie 0 to normotensja, 1 to nieleczone HTN (SBP>=140 lub DBP>=90) 2
# to "zażywanie leków antyHTN"
cov_df = cov_df[cov_df.HTNgroup < 2]
cov_df = cov_df.sort_values(by='SBP')
cov_df.head(5)

Unnamed: 0_level_0,Sex,age,BMI,Smokinstatus,Alcoholintakefrequency,HTNgroup,fastingtime,SBP,DBP,PP
eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1100542,0.0,56.0,25.3302,0.0,2.0,0.0,3.0,76.5,47.0,29.5
2071806,1.0,61.0,32.7542,1.0,4.0,0.0,3.0,84.5,53.0,31.5
4670270,0.0,41.0,21.7004,0.0,4.0,0.0,4.0,84.5,62.0,22.5
3662467,0.0,55.0,20.3475,0.0,3.0,0.0,6.0,85.0,59.0,26.0
5301061,1.0,43.0,21.6512,1.0,6.0,0.0,2.0,85.5,55.5,30.0


In [17]:
ol_df_corr = ol_df.corr()
mask = np.triu(np.ones(ol_df_corr.shape), k=1).astype(bool)
high_corr = ol_df_corr.where(mask)
cols_to_remove = [column for column in high_corr.columns if any(high_corr[column] > 0.9)]
ol_df.drop(columns=cols_to_remove, inplace=True)

In [4]:
threshold = 0.35
lower_bound, upper_bound = cov_df.SBP.quantile([threshold, 1 - threshold]).values
low_cov_df = cov_df[cov_df.SBP < lower_bound]
high_cov_df = cov_df[upper_bound < cov_df.SBP]
pd.concat([low_cov_df.SBP.describe(), high_cov_df.SBP.describe()], axis=1, keys=["low", "high"])

Unnamed: 0,low,high
count,10679.0,10721.0
mean,117.77423,156.075972
std,7.386252,12.429014
min,76.5,142.0
25%,113.0,146.5
50%,119.5,153.0
75%,124.0,162.0
max,127.5,246.0


In [5]:
correction_df = pd.concat([low_cov_df, high_cov_df])
correction_cols = ["Sex", "age", "BMI"]
correction_df = correction_df[correction_cols]
correction_df = (correction_df - correction_df.mean()) / correction_df.std()
correction_df.head()

Unnamed: 0_level_0,Sex,age,BMI
eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1100542,-0.837467,-0.017329,-0.306785
2071806,1.194021,0.593013,1.366313
4670270,-0.837467,-1.848354,-1.124809
3662467,-0.837467,-0.139397,-1.429703
5301061,1.194021,-1.604218,-1.135897


In [6]:
from scipy.spatial.distance import pdist, squareform

similarities = squareform(pdist(correction_df))
np.fill_diagonal(similarities, np.inf)
similarities_df = pd.DataFrame(similarities, index=correction_df.index, columns=correction_df.index)
similarities_sub_df = similarities_df.loc[low_cov_df.index, high_cov_df.index]

paired_up_df = similarities_sub_df.idxmin().to_frame("p2_id")
paired_up_df["dist"] = similarities_df.min()
paired_up_df2 = similarities_sub_df.T.idxmin().to_frame("p2_id")
paired_up_df2["dist"] = similarities_df.T.min()
paired_up_df = pd.concat([paired_up_df, paired_up_df2])
paired_up_df.sort_values(by="dist", inplace=True)

del similarities_df, similarities
paired_up_df.head()

Unnamed: 0_level_0,p2_id,dist
eid,Unnamed: 1_level_1,Unnamed: 2_level_1
3636713,4051783,0.0
2389551,2986916,0.0
1528084,5331657,0.0
2116722,4731896,0.0
2263355,5522598,0.0


In [7]:
p1, p2, _ = paired_up_df.reset_index().iloc[-1].values
cov_df.loc[[p1, p2]]

Unnamed: 0_level_0,Sex,age,BMI,Smokinstatus,Alcoholintakefrequency,HTNgroup,fastingtime,SBP,DBP,PP
eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
5519244,1.0,65.0,50.9975,1.0,3.0,1.0,4.0,150.0,90.0,60.0
1939696,1.0,55.0,48.3703,0.0,3.0,0.0,12.0,127.0,86.5,40.5


In [8]:
chosen = set()
for p1_idx, (p2_idx, _) in paired_up_df.iterrows():
    if p1_idx in chosen or p2_idx in chosen:
        continue
    chosen.add(p1_idx)
    chosen.add(p2_idx)

chosen_cov_df = cov_df.loc[list(chosen)]
low_cov_df = chosen_cov_df[chosen_cov_df.SBP < lower_bound]
high_cov_df = chosen_cov_df[upper_bound < chosen_cov_df.SBP]
pd.concat([low_cov_df.SBP.describe(), high_cov_df.SBP.describe()], axis=1, keys=["low", "high"])

Unnamed: 0,low,high
count,5133.0,5133.0
mean,118.92295,155.348821
std,6.843063,12.17258
min,76.5,142.0
25%,115.0,146.0
50%,120.5,152.0
75%,124.5,161.0
max,127.5,246.0


In [9]:
pd.concat([low_cov_df[correction_cols].describe(), high_cov_df[correction_cols].describe()], axis=1,
          keys=["low", "high"])

Unnamed: 0_level_0,low,low,low,high,high,high
Unnamed: 0_level_1,Sex,age,BMI,Sex,age,BMI
count,5133.0,5133.0,5133.0,5133.0,5133.0,5133.0
mean,0.431522,56.676018,26.796728,0.431522,56.678745,26.798973
std,0.495337,7.627332,4.2893,0.495337,7.626911,4.289026
min,0.0,40.0,15.3535,0.0,40.0,15.6573
25%,0.0,51.0,23.8437,0.0,51.0,23.8572
50%,0.0,58.0,26.2284,0.0,58.0,26.2252
75%,1.0,63.0,29.0392,1.0,63.0,29.0542
max,1.0,70.0,55.1341,1.0,70.0,56.5019


## Classification

In [11]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

x = ol_df.loc[chosen_cov_df.index]
y = chosen_cov_df.index.isin(high_cov_df.index)

est = XGBClassifier(tree_method="gpu_hist", random_state=42)
est.fit(x, y)
# scores = cross_val_score(est, x, y, scoring="accuracy", cv=5, n_jobs=5)
# scores, scores.mean()

In [13]:
pd.Series(est.feature_importances_, index=x.columns).to_dict()

{'AARSD1': 0.0010935893515124917,
 'ABHD14B': 0.00022652916959486902,
 'ABL1': 0.0004242146678734571,
 'ACAA1': 0.0003337656962685287,
 'ACAN': 0.001136269187554717,
 'ACE2': 0.005188326817005873,
 'ACOX1': 0.0004556016647256911,
 'ACP5': 0.0012314956402406096,
 'ACP6': 0.0005510232294909656,
 'ACTA2': 0.0016992379678413272,
 'ACTN4': 0.0005096012610010803,
 'ACVRL1': 0.00026541296392679214,
 'ACY1': 0.005066273733973503,
 'ADA': 0.0006456547416746616,
 'ADA2': 0.0010410953545942903,
 'ADAM15': 0.0006820888957008719,
 'ADAM22': 0.00031960950582288206,
 'ADAM23': 0.0009812968783080578,
 'ADAM8': 0.0012958525912836194,
 'ADAMTS13': 0.0007295347168110311,
 'ADAMTS15': 0.00044374779099598527,
 'ADAMTS16': 0.001064575044438243,
 'ADAMTS8': 0.0010574125917628407,
 'ADCYAP1R1': 0.0,
 'ADGRB3': 0.00014466853463090956,
 'ADGRE2': 0.0012658892665058374,
 'ADGRE5': 0.000879454892128706,
 'ADGRG1': 0.0008473297930322587,
 'ADGRG2': 0.0009096745634451509,
 'ADH4': 0.00024343139375559986,
 'ADM': 0.