In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

from biobank_olink.dataset import load_datasets

ol_df, cov_df = load_datasets(cols_na_th=0, rows_na_th=0)

In [3]:
# W danych dodałem zmienną HTNgroup gdzie 0 to normotensja, 1 to nieleczone HTN (SBP>=140 lub DBP>=90) 2
# to "zażywanie leków antyHTN"
cov_df = cov_df[cov_df.HTNgroup < 2]
cov_df = cov_df.sort_values(by='SBP')
cov_df.head(5)

Unnamed: 0_level_0,Sex,age,BMI,Smokinstatus,Alcoholintakefrequency,HTNgroup,fastingtime,SBP,DBP,PP
eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1100542,0.0,56.0,25.3302,0.0,2.0,0.0,3.0,76.5,47.0,29.5
2071806,1.0,61.0,32.7542,1.0,4.0,0.0,3.0,84.5,53.0,31.5
4670270,0.0,41.0,21.7004,0.0,4.0,0.0,4.0,84.5,62.0,22.5
3662467,0.0,55.0,20.3475,0.0,3.0,0.0,6.0,85.0,59.0,26.0
5301061,1.0,43.0,21.6512,1.0,6.0,0.0,2.0,85.5,55.5,30.0


In [4]:
ol_df_corr = ol_df.corr()
mask = np.triu(np.ones(ol_df_corr.shape), k=1).astype(bool)
high_corr = ol_df_corr.where(mask)
cols_to_remove = [column for column in high_corr.columns if any(high_corr[column] > 0.9)]
ol_df.drop(columns=cols_to_remove, inplace=True)

In [5]:
threshold = 0.3
lower_bound, upper_bound = cov_df.SBP.quantile([threshold, 1 - threshold]).values
low_cov_df = cov_df[cov_df.SBP < lower_bound]
high_cov_df = cov_df[upper_bound < cov_df.SBP]
pd.concat([low_cov_df.SBP.describe(), high_cov_df.SBP.describe()], axis=1, keys=["low", "high"])

Unnamed: 0,low,high
count,9028.0,9014.0
mean,116.17335,158.513035
std,6.918589,12.09532
min,76.5,145.0
25%,112.0,149.5
50%,117.5,155.0
75%,122.0,164.5
max,125.0,246.0


In [6]:
correction_df = pd.concat([low_cov_df, high_cov_df])
correction_cols = ["Sex", "age", "BMI"]
correction_df = correction_df[correction_cols]
correction_df = (correction_df - correction_df.mean()) / correction_df.std()
correction_df.head()

Unnamed: 0_level_0,Sex,age,BMI
eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1100542,-0.820553,-0.017659,-0.291991
2071806,1.218623,0.590351,1.377041
4670270,-0.820553,-1.841688,-1.108027
3662467,-0.820553,-0.139261,-1.41218
5301061,1.218623,-1.598484,-1.119088


In [7]:
from scipy.spatial.distance import pdist, squareform

similarities = squareform(pdist(correction_df))
np.fill_diagonal(similarities, np.inf)
similarities_df = pd.DataFrame(similarities, index=correction_df.index, columns=correction_df.index)
similarities_sub_df = similarities_df.loc[low_cov_df.index, high_cov_df.index]

paired_up_df = similarities_sub_df.idxmin().to_frame("p2_id")
paired_up_df["dist"] = similarities_df.min()
paired_up_df2 = similarities_sub_df.T.idxmin().to_frame("p2_id")
paired_up_df2["dist"] = similarities_df.T.min()
paired_up_df = pd.concat([paired_up_df, paired_up_df2])
paired_up_df.sort_values(by="dist", inplace=True)

del similarities_df, similarities
paired_up_df.head()

Unnamed: 0_level_0,p2_id,dist
eid,Unnamed: 1_level_1,Unnamed: 2_level_1
2692226,5952781,0.0
4402558,4581635,0.0
2278270,5787420,0.0
1092280,4767076,0.0
1084054,2184694,0.0


In [8]:
p1, p2, _ = paired_up_df.reset_index().iloc[-1].values
cov_df.loc[[p1, p2]]

Unnamed: 0_level_0,Sex,age,BMI,Smokinstatus,Alcoholintakefrequency,HTNgroup,fastingtime,SBP,DBP,PP
eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
5519244,1.0,65.0,50.9975,1.0,3.0,1.0,4.0,150.0,90.0,60.0
5865888,1.0,56.0,44.9063,2.0,6.0,0.0,4.0,122.5,77.0,45.5


In [9]:
chosen = set()
for p1_idx, (p2_idx, _) in paired_up_df.iterrows():
    if p1_idx in chosen or p2_idx in chosen:
        continue
    chosen.add(p1_idx)
    chosen.add(p2_idx)

chosen_cov_df = cov_df.loc[list(chosen)]
low_cov_df = chosen_cov_df[chosen_cov_df.SBP < lower_bound]
high_cov_df = chosen_cov_df[upper_bound < chosen_cov_df.SBP]
pd.concat([low_cov_df.SBP.describe(), high_cov_df.SBP.describe()], axis=1, keys=["low", "high"])

Unnamed: 0,low,high
count,4082.0,4082.0
mean,117.208844,157.715948
std,6.410988,11.701788
min,76.5,145.0
25%,113.5,149.0
50%,119.0,154.5
75%,122.5,163.0
max,125.0,246.0


In [10]:
pd.concat([low_cov_df[correction_cols].describe(), high_cov_df[correction_cols].describe()], axis=1,
          keys=["low", "high"])

Unnamed: 0_level_0,low,low,low,high,high,high
Unnamed: 0_level_1,Sex,age,BMI,Sex,age,BMI
count,4082.0,4082.0,4082.0,4082.0,4082.0,4082.0
mean,0.417687,56.854238,26.705207,0.417687,56.861098,26.710776
std,0.493239,7.564025,4.248771,0.493239,7.561465,4.262156
min,0.0,40.0,15.3535,0.0,40.0,15.6573
25%,0.0,51.0,23.777975,0.0,51.0,23.7966
50%,0.0,58.0,26.1503,0.0,58.0,26.1277
75%,1.0,63.0,28.9344,1.0,63.0,28.9257
max,1.0,70.0,46.5171,1.0,70.0,56.263


## Classification

In [16]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

x = ol_df.loc[chosen_cov_df.index]
y = chosen_cov_df.index.isin(high_cov_df.index)

est = XGBClassifier(tree_method="hist", random_state=42, n_estimators=500)
scores = cross_val_score(est, x, y, scoring="accuracy", cv=5, n_jobs=5)
scores, scores.mean()

array([0.73194614, 0.71481028, 0.71603427, 0.73194614, 0.71568627,
       0.70955882, 0.72671569, 0.7377451 , 0.76102941, 0.70588235])

In [None]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(hidden_layer_sizes=(64, 64), max_iter=100, random_state=42)
scores = cross_val_score(est, x, y, scoring="accuracy", cv=5, n_jobs=5)
scores, scores.mean()