In [None]:
!pip install autoprognosis
!pip install pycox
#!pip install scikit-survival

Collecting autoprognosis
  Downloading autoprognosis-0.1.21-py2.py3-none-any.whl (284 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.5/284.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting shap>=0.40.0 (from autoprognosis)
  Downloading shap-0.44.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (533 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m533.5/533.5 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting optuna==3.1.0 (from autoprognosis)
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hyperimpute>=0.1.16 (from autoprognosis)
  Downloading hyperimpute-0.1.17-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.9/92.9 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyod (fro

# AutoPrognosis

Recently, machine learning approaches have shown improvement over conventional modeling techniques by better capturing complex interactions between patient covariates in a data-driven manner. However, the use of machine learning introduces technical and practical challenges that have thus far restricted widespread adoption of such techniques in clinical settings. To address these challenges and empower healthcare professionals, we present an open-source machine learning framework, AutoPrognosis 2.0, to facilitate the development of diagnostic and prognostic models.


Documentation:
https://github.com/vanderschaarlab/autoprognosis/tree/main?tab=readme-ov-file#zap-plugins

## Survival analysis

In [None]:
# Notebook-wide imports, warning filter and logging setup.

# stdlib
import json
import sys
import warnings

# third party
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Ignore every Python warning from this point on. The filter is installed
# before the autoprognosis imports below, so warnings emitted while those
# modules are imported are hidden as well.
warnings.filterwarnings("ignore")

# autoprognosis absolute
import autoprognosis.logger as log
from autoprognosis.studies.risk_estimation import RiskEstimationStudy

# Register stderr as a sink for the autoprognosis logger at level "INFO".
log.add(sink=sys.stderr, level="INFO")

### Load toy data


In [None]:

from pycox import datasets

# Read the GBSG toy dataset and keep only rows with a strictly positive
# follow-up time.
df = datasets.gbsg.read_df()
positive_duration = df["duration"] > 0
df = df[positive_duration]

# Split into covariates (X), time-to-event (T) and event indicator (Y).
T = df["duration"]
Y = df["event"]
X = df.drop(columns=["duration", "event"])

# Evaluation horizons: quartiles of the durations among rows with event == 1,
# truncated to integers.
event_durations = T[Y == 1]
eval_time_horizons = [
    int(event_durations.quantile(q)) for q in (0.25, 0.50, 0.75)
]

X

ModuleNotFoundError: ignored

In [None]:
### Simulate missing data

import random

# Fixed seed so the same cells are blanked on every Restart & Run All.
MISSING_SEED = 0
# Number of values to blank out in each affected column.
N_MISSING_PER_COLUMN = 10

total_len = len(X)
rng = random.Random(MISSING_SEED)

for col in ["x3", "x4"]:
    # `.loc` is label-based, so sample from the actual index labels rather
    # than from range(total_len): X comes from a filtered frame and its labels
    # are not guaranteed to be the contiguous positions 0..total_len-1.
    # The sample size is capped so tiny frames do not raise ValueError.
    indices = rng.sample(list(X.index), min(N_MISSING_PER_COLUMN, total_len))
    X.loc[indices, col] = np.nan

# Per-column flag: does the column now contain at least one missing value?
X.isnull().any()

x0    False
x1    False
x2    False
x3     True
x4     True
x5    False
x6    False
dtype: bool

In [None]:
# Single frame handed to the study: a copy of the covariates plus the event
# indicator ("target") and the follow-up time ("time_to_event").
dataset = X.assign(target=Y, time_to_event=T)

### Build pipeline and find the best imputer

In [None]:
# stdlib
from pathlib import Path

study_name = "test_risk_estimation_studies"

# Working directory passed to the study; created if it does not exist yet.
workspace = Path("workspace")
workspace.mkdir(parents=True, exist_ok=True)

# Search space handed to the study: candidate imputers and risk estimators.
candidate_imputers = ["mean", "ice", "most_frequent"]
candidate_risk_estimators = ["cox_ph"]

study = RiskEstimationStudy(
    study_name=study_name,
    workspace=workspace,
    dataset=dataset,
    target="target",
    time_to_event="time_to_event",
    time_horizons=eval_time_horizons,
    imputers=candidate_imputers,
    risk_estimators=candidate_risk_estimators,
    num_iter=2,
    num_study_iter=1,
    score_threshold=0.4,
)

In [None]:
# Run the study; the value returned by run() is displayed as the cell output.
study.run()

<autoprognosis.plugins.ensemble.risk_estimation.RiskEnsemble at 0x7879814cca00>

In [None]:
from pprint import pprint
# autoprognosis absolute
from autoprognosis.utils.serialization import load_model_from_file
from autoprognosis.utils.tester import evaluate_survival_estimator

# Load the model file stored under <workspace>/<study_name>/model.p.
model_path = workspace / study_name / "model.p"
model = load_model_from_file(model_path)

# No need to impute the data here, the pipeline includes the imputer
metrics = evaluate_survival_estimator(
    model,
    X,
    T,
    Y,
    eval_time_horizons,
)

# Aggregate metrics, formatted as strings.
overall_summary = metrics["str"]
pprint(overall_summary)

{'NPV': '0.75 +/- 0.094',
 'PPV': '0.406 +/- 0.331',
 'aucroc': '0.717 +/- 0.002',
 'brier_score': '0.172 +/- 0.05',
 'c_index': '0.694 +/- 0.019',
 'predicted_cases': '94.0 +/- 117.898',
 'sensitivity': '0.209 +/- 0.252',
 'specificity': '0.919 +/- 0.101'}


In [None]:
# Same metrics, broken down per evaluation horizon.
per_horizon_metrics = metrics["horizons"]
pprint(per_horizon_metrics["str"])

{13.0: {'NPV': '0.863 +/- 0.001',
        'PPV': '0.0 +/- 0.0',
        'aucroc': '0.72 +/- 0.014',
        'brier_score': '0.112 +/- 0.004',
        'c_index': '0.716 +/- 0.016',
        'predicted_cases': '0.333 +/- 0.533',
        'sensitivity': '0.0 +/- 0.0',
        'specificity': '0.999 +/- 0.001'},
 24.0: {'NPV': '0.724 +/- 0.003',
        'PPV': '0.541 +/- 0.022',
        'aucroc': '0.716 +/- 0.022',
        'brier_score': '0.186 +/- 0.009',
        'c_index': '0.691 +/- 0.019',
        'predicted_cases': '42.333 +/- 6.49',
        'sensitivity': '0.109 +/- 0.02',
        'specificity': '0.962 +/- 0.005'},
 40.0: {'NPV': '0.665 +/- 0.018',
        'PPV': '0.678 +/- 0.015',
        'aucroc': '0.717 +/- 0.022',
        'brier_score': '0.218 +/- 0.006',
        'c_index': '0.674 +/- 0.018',
        'predicted_cases': '239.333 +/- 19.388',
        'sensitivity': '0.518 +/- 0.046',
        'specificity': '0.795 +/- 0.018'}}


# Task 1


a) Powyższy kod zmodyfikuj tak, aby rozpatrywać większą rodzinę modeli (`risk_estimators`).

![models](https://drive.google.com/uc?id=1rlL9N2cqQRTLYF5C2NGRUFO3tDLSp1Do)

b) Jaki algorytm osiągnął najlepszą jakość?

c) Jakie techniki imputacji są wybierane?

d) Rozpatrz także inne techniki imputacji.

![imputation](https://drive.google.com/uc?id=1GUOqy2mXWPoYDRSCQCgSRN5meCUeLMN1)

e) Jakie techniki preprocessingu są dostępne? Jak je ewentualnie ograniczyć?
Kod klasy `RiskEstimationStudy`:

https://github.com/vanderschaarlab/autoprognosis/blob/2b71de4cddb1fc63ed48c1df244e4afcdce802ce/src/autoprognosis/studies/risk_estimation.py#L35