In [1]:
import pandas
import warnings
from matplotlib import pyplot
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import OrdinalEncoder
from sksurv.linear_model import CoxnetSurvivalAnalysis
from deps.data import load_metadata
from deps.logger import logger
from deps.data import load_raw_data, load_data
from hcve_lib.functional import pipe
from hcve_lib.data import to_survival_y_records, get_X, get_survival_y
from hcve_lib.cv import train_test
from hcve_lib.utils import list_to_dict_index
from sklearn.model_selection import KFold
from hcve_lib.cv import cross_validate, predict_survival, lm_cv, kfold_cv
from hcve_lib.evaluation_functions import compute_metrics_folds, c_index
from hcve_lib.cv import lco_cv
from hcve_lib.data import format_identifier_long
from hcve_lib.transformers import MiceForest
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Notebook-wide configuration: logging verbosity, DataFrame display, module
# auto-reloading and figure rendering.

# Let the project logger emit DEBUG-level messages.
logger.setLevel('DEBUG')
# Never truncate the column list when a DataFrame is displayed.
pandas.set_option("display.max_columns", None)
# Load the autoreload extension first, then switch it to mode 2 so imported
# modules are re-imported before each cell runs (edits to local packages are
# picked up without restarting the kernel).
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inside the notebook, at retina resolution.
%matplotlib inline
%config InlineBackend.figure_format='retina'
# White figure background so plots stay readable on dark notebook themes.
pyplot.rcParams['figure.facecolor'] = 'white'

In [2]:
# Load the variable metadata and the dataset derived from it.
# NOTE(review): the cohort/individual filtering log shown in this cell's
# output is presumably emitted by `load_data` — confirm in deps.data.
metadata = load_metadata()
data = load_data(metadata)
# Also load the raw data and keep only the rows whose index labels appear in
# `data`. `.loc` raises a KeyError if any label of `data.index` is absent from
# `raw_data`, so this assumes both frames share index labels — TODO confirm.
raw_data = load_raw_data()
raw_data_subset = raw_data.loc[data.index]

Raw data
	n cohorts=21
	n individuals=52,585

Baseline visit kept
 	n individuals removed=5,529

 	n individuals=47,056

HF cohorts removed
	n cohorts removed=3: hfgr, timechf, leitzaran
	n individuals removed=1,073

	n cohorts=18
	n individuals=45,983

No outcome cohorts removed
	n cohorts removed=5: biomarcoeurs, dyda, epath, iblomaved, stophf
	n individuals removed=3,077

	n cohorts=13
	n individuals=42,906

Missing HF data cohorts removed
	n cohorts removed=6: adelhyde, gecoh, r2c2, reve(1-2), stanislas, styrianvitd
	n individuals removed=6,209

	n cohorts=7
	n individuals=36,697

Missing blood pressure measurements
 	n individuals removed=434

 	n individuals=36,263

Providing missing PP for 3075individuals

HF individuals at baseline removed
 	n individuals removed=2,953

 	n individuals=33,310

Missing outcome individuals removed
 	n individuals removed=904

 	n individuals=32,406

Final dataset
	n individuals=32,406
	n cohorts=7



In [107]:
# Cardinality report: one line per column of `data`, giving the number of
# distinct values followed by the column's long, human-readable identifier.
for column in data.columns:
    # dropna=False counts a missing value as one distinct value.
    distinct_count = data[column].nunique(dropna=False)
    print(f'{distinct_count}: {format_identifier_long(column, metadata)}')

32406: [IDNR] IDNR
7: [STUDY_NUM] Numeric study identifier
7: [STUDY] STUDY
1: [VISIT] VISIT
2: [SEX] Gender
16090: [AGE] Age
1878: [BW] Body weight
883: [BH] Body height
13216: [BMI] Body-mass index_data
4: [SMK] Smoking status
3: [DRK] Alcohol intake
3: [DIABETES] History of diabetes
3: [HCV] History of cardiovascular disease
3: [HCAD] History of coronary artery disease
3: [HPAD] History of peripheral artery disease
3: [HAF] History of atrial fibrillation
3: [HCEREBROV] History of cerebrovascular incident
3: [TRT_AH] Intake of antihypertensive medication
3: [TRT_ACE] Intake of angiotensin converting enzyme inhibitors
3: [TRT_CCB] Intake of calcium channel blockers
3: [TRT_DIUR] Intake of diuretics
3: [TRT_BB] Intake of beta-blokcers
3: [TRT_ARB] Intake of angiotensin II receptor blockers
3: [TRT_LIP] Intake of lipid-lowering medication
471: [SBP] Systolic blood pressure
308: [DBP] Diastolic blood pressure
189: [HR] Heart rate
408: [PP] Pulse pressure
557: [SOK] Sokolow-Lyon index_dat

In [108]:
# Columns with fewer than this many distinct values (a missing value counts as
# one distinct value) are treated as categorical.
CATEGORICAL_MAX_UNIQUE = 10

# Convert every low-cardinality column of `data` to the pandas `category`
# dtype (mutating `data` in place), then print each column's resulting dtype
# next to its long, human-readable identifier.
for column in data.columns:
    if data[column].nunique(dropna=False) < CATEGORICAL_MAX_UNIQUE:
        # Replace the whole column instead of `data.loc[:, column] = ...`:
        # since pandas 2.0 the `.loc[:, col]` form writes the new values into
        # the existing array in place, so the column would keep its old dtype
        # and silently never become categorical.
        data[column] = data[column].astype('category')
    print(f'{data[column].dtype}: {format_identifier_long(column, metadata)}')

int64: [IDNR] IDNR
category: [STUDY_NUM] Numeric study identifier
category: [STUDY] STUDY
category: [VISIT] VISIT
category: [SEX] Gender
float64: [AGE] Age
float64: [BW] Body weight
float64: [BH] Body height
float64: [BMI] Body-mass index_data
category: [SMK] Smoking status
category: [DRK] Alcohol intake
category: [DIABETES] History of diabetes
category: [HCV] History of cardiovascular disease
category: [HCAD] History of coronary artery disease
category: [HPAD] History of peripheral artery disease
category: [HAF] History of atrial fibrillation
category: [HCEREBROV] History of cerebrovascular incident
category: [TRT_AH] Intake of antihypertensive medication
category: [TRT_ACE] Intake of angiotensin converting enzyme inhibitors
category: [TRT_CCB] Intake of calcium channel blockers
category: [TRT_DIUR] Intake of diuretics
category: [TRT_BB] Intake of beta-blokcers
category: [TRT_ARB] Intake of angiotensin II receptor blockers
category: [TRT_LIP] Intake of lipid-lowering medication
float6

In [112]:
import pickle

# Load previously pickled results from ./data/prediction.data into `results`.
# SECURITY: `pickle.load` can execute arbitrary code embedded in the file, so
# only run this on a file produced by a trusted run of this project.
with open('./data/prediction.data', 'rb') as file:
    results = pickle.load(file)


In [21]:
from hcve_lib.visualisation import b, h2
from load_test import load_pickled_artifact
import mlflow
from mlflow.tracking import MlflowClient
from typing import Any
import pickle
import yaml
from hcve_lib.data import format_identifier
from toolz.curried import valmap, partial
from toolz import valfilter, identity
from pandas import DataFrame, Series
import plotly.express as px
from hcve_lib.serialization import to_json_serializable
from hcve_lib.evaluation_functions import compute_metric_groups


# For each cross-validation scheme, take the first matching top-level MLflow
# run, list the features flagged in its per-fold column masks, and draw a
# heatmap of the c-index computed per STUDY group.

# The run table does not depend on the evaluation type, so query MLflow once
# instead of once per loop iteration. `search_runs` documents `experiment_ids`
# as a list of ids.
EXPERIMENT_ID = '0'
runs = mlflow.search_runs([EXPERIMENT_ID])
# Nested (child) runs carry a parent run id tag; keep only top-level runs.
is_top_level = runs['tags.mlflow.parentRunId'].isna()

for evaluation_type in ('reproduce', 'lco', 'lm'):
    h2(evaluation_type)

    matching_runs = runs[is_top_level & (runs['params.cv'] == evaluation_type)]
    if matching_runs.empty:
        # Same exception type `.iloc[0]` would raise, but with a message that
        # says which run is missing.
        raise IndexError(
            f'No top-level MLflow run with params.cv == {evaluation_type!r} '
            f'found in experiment {EXPERIMENT_ID!r}'
        )
    # First row of the search result; with MLflow's default ordering
    # (newest start time first) this is the most recent run.
    last_run = matching_runs.iloc[0]
    result = load_pickled_artifact(last_run['run_id'], 'result')

    # Per fold: formatted names of the features whose mask entry is truthy.
    removed_features = {
        fold_name: [
            format_identifier(feature, metadata=metadata)
            for feature, flagged in mask.items()
            if flagged
        ]
        for fold_name, mask in result['column_masks'].items()
    }
    b('Removed features')
    print(yaml.dump(removed_features))

    # NOTE: `metrics_group` is overwritten on every iteration, so after the
    # loop it holds the value for the last evaluation type only.
    metrics_group = compute_metric_groups(c_index, result['predictions'], raw_data_subset.groupby('STUDY'))
    fig = px.imshow(
        DataFrame(metrics_group),
        labels=dict(x="Trained on", y="Tested on", color="c-index"),
    )
    fig.show()



train_test:
- Sokolow-Lyon index_data
- Cornell index_data

95


ASCOT: []
FLEMENGHO: []
HEALTHABC:
- History of atrial fibrillation
- Sokolow-Lyon index_data
- Cornell index_data
HULL_LIFELAB:
- Intake of lipid-lowering medication
- Sokolow-Lyon index_data
- QRS duration
- Cornell index_data
HVC:
- Alcohol intake
- Sokolow-Lyon index_data
- Cornell index_data
PREDICTOR:
- Intake of lipid-lowering medication
- Sokolow-Lyon index_data
- QRS duration
- Cornell index_data
PROSPER:
- History of atrial fibrillation
- Intake of lipid-lowering medication
- Sokolow-Lyon index_data
- QRS duration
- Cornell index_data

94
100
88
92
89
89
84


ASCOT: []
FLEMENGHO: []
HEALTHABC:
- History of atrial fibrillation
- Sokolow-Lyon index_data
- Cornell index_data
HULL_LIFELAB:
- Intake of lipid-lowering medication
- Sokolow-Lyon index_data
- QRS duration
- Cornell index_data
HVC:
- Alcohol intake
- Sokolow-Lyon index_data
- Cornell index_data
PREDICTOR:
- Intake of lipid-lowering medication
- Sokolow-Lyon index_data
- QRS duration
- Cornell index_data
PROSPER:
- History of atrial fibrillation
- Intake of lipid-lowering medication
- Sokolow-Lyon index_data
- QRS duration
- Cornell index_data

36
100
100
100
100
98
100


In [125]:
# Print `metrics_group` as YAML after converting it to JSON-serializable
# values. `metrics_group` is reassigned on every iteration of the
# evaluation-type loop, so this shows the last iteration's value only.
serializable_metrics = to_json_serializable(metrics_group)
metrics_yaml = yaml.dump(serializable_metrics)
print(metrics_yaml)

ASCOT:
  ASCOT: null
  FLEMENGHO: 0.6002263258062857
  HEALTHABC: 0.5
  HULL_LIFELAB: 0.5
  HVC: 0.5
  PREDICTOR: 0.5
  PROSPER: 0.5
FLEMENGHO:
  ASCOT: 0.6594507072020622
  FLEMENGHO: null
  HEALTHABC: 0.5754161396407792
  HULL_LIFELAB: 0.5166962831334222
  HVC: 0.6132763389489565
  PREDICTOR: 0.6298449937159615
  PROSPER: 0.553570709523563
HEALTHABC:
  ASCOT: 0.7530559190277603
  FLEMENGHO: 0.8461498894089811
  HEALTHABC: null
  HULL_LIFELAB: 0.5318747223456242
  HVC: 0.6135277847623837
  PREDICTOR: 0.7058232090490155
  PROSPER: 0.6553500859159478
HULL_LIFELAB:
  ASCOT: 0.49771711224573467
  FLEMENGHO: 0.5747646725991461
  HEALTHABC: 0.5654675773674003
  HULL_LIFELAB: null
  HVC: 0.5295448830776968
  PREDICTOR: 0.5728529534981148
  PROSPER: 0.6004197595996998
HVC:
  ASCOT: 0.5143859924677523
  FLEMENGHO: 0.6582994701918625
  HEALTHABC: 0.4759364195969306
  HULL_LIFELAB: 0.5151636309788242
  HVC: null
  PREDICTOR: 0.5894428152492669
  PROSPER: 0.4939957448801738
PREDICTOR:
  ASCOT: 0.