In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Directory holding the raw CSV inputs read by the loading cells below.
# The path is relative, so it resolves against the kernel's working directory.
data_path = Path("../datasets/raw_UKReg/")

In [None]:
# Load the liver registration records from the raw data directory.
raw_regs = pd.read_csv(data_path / "liver_registrations.csv")

# Preview a handful of rows instead of rendering the whole frame, which keeps
# the saved notebook output small and limits how much raw data it embeds.
raw_regs.head()

In [None]:
# Load the liver transplant records from the raw data directory.
raw_trans = pd.read_csv(data_path / "liver_transplant.csv")

# Preview a handful of rows instead of rendering the whole frame, which keeps
# the saved notebook output small and limits how much raw data it embeds.
raw_trans.head()

In [None]:
# The registrations table names the recipient key `a_recip_id`; mirror it as
# `RECIPID` so both frames share the merge key. The left join keeps every
# registration row, with missing transplant columns where nothing matches.
raw_regs.loc[:, 'RECIPID'] = raw_regs['a_recip_id']
full_df = raw_regs.merge(raw_trans, how='left', on='RECIPID')

# Rows whose `rwtime` is discarded (and therefore ends up as 0) below.
# NOTE(review): the meaning of the 'A' / 'T' outcome codes is not stated in
# this notebook — confirm against the data dictionary.
is_a_or_t = full_df['outcome'].isin(['A', 'T'])

full_df = full_df.assign(
    # 99.9 and 99999 / 88888 are blanked out as "no measurement".
    # NOTE(review): presumably sentinel codes of the source registry — confirm.
    SERUM_POTASSIUM=full_df["SERUM_POTASSIUM"].replace([99.9], np.nan),
    AFP_LEVEL=full_df["AFP_LEVEL"].replace([99999, 88888], np.nan),
    # Missing durations count as 0 so that Y below is always defined.
    rwtime=full_df['rwtime'].mask(is_a_or_t).fillna(0),
    PSURV=full_df['PSURV'].fillna(0),
)

# Keep only rows that have an outcome, a sex code and every lab value used by
# the score functions. A single dropna replaces the chain of notna() filters
# (which checked INR twice).
required_cols = [
    'INR',
    'outcome',
    'SEX',
    'SERUM_BILIRUBIN',
    'SERUM_CREATININE',
    'SERUM_SODIUM',
]
full_df = full_df.dropna(subset=required_cols)
# NOTE(review): SEX == 8 looks like a "not known" code — confirm.
full_df = full_df[full_df['SEX'] != 8]

# Y is the sum of rwtime and PSURV; rows with a non-positive Y are dropped.
full_df = full_df.assign(Y=full_df['rwtime'] + full_df['PSURV'])
# .copy() makes the filtered frame own its data, so adding CENS below does not
# trigger SettingWithCopyWarning.
full_df = full_df[full_df['Y'] > 0].copy()

# CENS is 0 for rows whose outcome is 'T' and 1 for every other outcome.
full_df["CENS"] = (full_df.outcome != 'T').astype(int)
full_df["CENS"].value_counts()

In [None]:
# Partition the cleaned frame on the CENS flag (0 vs 1). Each cohort is copied
# so that columns added to it later do not write back into full_df.
cens_flag = full_df["CENS"]
with_transplant = full_df.loc[cens_flag.eq(0)].copy()
no_transplant = full_df.loc[cens_flag.eq(1)].copy()

In [None]:
# Preview the first rows of the CENS == 0 cohort rather than rendering the whole frame.
with_transplant.head()

In [None]:
# Helpers for data conversion


def meld_observed_mortality(score: float) -> float:
    """Look up the mortality figure for the band a MELD score falls into.

    Bands are tested from the highest threshold downwards and the first
    threshold that ``score`` reaches wins. A score below 10 (or one that
    compares false against every threshold, such as NaN) gets 1.9.

    NOTE(review): the figures look like percentages from a published MELD
    mortality table — confirm the source and time horizon.
    """
    bands = ((40, 71.3), (30, 52.6), (20, 19.6), (10, 6.0))
    for threshold, mortality in bands:
        if score >= threshold:
            return mortality
    return 1.9


def ukeld_eligibility(score: float) -> int:
    """Return 1 when ``score`` is at least 49, otherwise 0."""
    return 1 if score >= 49 else 0


def parse_bilirunbin_meld(val: float) -> float:
    """Divide ``val`` by 17.1 and raise the result to at least 1.

    NOTE(review): 17.1 is presumably the µmol/L -> mg/dL factor for
    bilirubin — confirm the unit of the input column.
    """
    scaled = val / 17.1
    if scaled < 1:
        return 1
    return scaled


def parse_bilirunbin_ukeld(val: float) -> float:
    """Return ``val`` unchanged unless it is below 1, in which case return 1."""
    return 1 if val < 1 else val


def parse_inr(val: float) -> float:
    """Apply a lower bound of 1 to an INR value; larger values pass through."""
    if val < 1:
        return 1
    return val


def parse_creatinine_meld(val: float) -> float:
    """Divide ``val`` by 88.42 and clamp the result to the range [1, 4].

    NOTE(review): 88.42 is presumably the µmol/L -> mg/dL factor for
    creatinine — confirm the unit of the input column.
    """
    converted = val / 88.42
    if converted < 1:
        return 1
    if converted > 4:
        return 4
    return converted


def parse_creatinine_ukeld(val: float) -> float:
    """Clamp ``val`` to the range [1, 400] without any unit conversion."""
    if val < 1:
        return 1
    if val > 400:
        return 400
    return val


def parse_sodium(val: float) -> float:
    """Clamp a serum sodium value to the range [125, 140]."""
    return 125 if val < 125 else (140 if val > 140 else val)



In [None]:
# MELD
from organsync.models.linear import MELD
import math

def meld_inference(data: pd.Series) -> int:
    """Compute the MELD score of one record, rounded up to an integer.

    Args:
        data: A single row (for example the Series that
            ``DataFrame.apply(..., axis=1)`` passes in) with scalar
            ``SERUM_BILIRUBIN``, ``INR`` and ``SERUM_CREATININE`` entries.
            A whole DataFrame is not supported: the parse helpers use the
            scalar built-ins ``max`` / ``min``.

    Returns:
        ``math.ceil`` of the value returned by ``MELD().score``.
    """
    score = MELD().score(
        serum_bilirubin=parse_bilirunbin_meld(data["SERUM_BILIRUBIN"]),
        inr=parse_inr(data["INR"]),
        serum_creatinine=parse_creatinine_meld(data["SERUM_CREATININE"]),
    )
    return math.ceil(score)


In [None]:
# MELD na

import math
from organsync.models.linear import MELD_na


def meld_na_inference(data: pd.Series) -> int:
    """Compute the MELD-Na score of one record, rounded up to an integer.

    Args:
        data: A single row (for example the Series that
            ``DataFrame.apply(..., axis=1)`` passes in) with scalar
            ``SERUM_BILIRUBIN``, ``INR``, ``SERUM_CREATININE`` and
            ``SERUM_SODIUM`` entries. A whole DataFrame is not supported:
            the parse helpers use the scalar built-ins ``max`` / ``min``.

    Returns:
        ``math.ceil`` of the value returned by ``MELD_na().score``.
    """
    score = MELD_na().score(
        serum_bilirubin=parse_bilirunbin_meld(data["SERUM_BILIRUBIN"]),
        inr=parse_inr(data["INR"]),
        serum_creatinine=parse_creatinine_meld(data["SERUM_CREATININE"]),
        serum_sodium=parse_sodium(data["SERUM_SODIUM"]),
    )
    return math.ceil(score)


In [None]:
#UKELD
from organsync.models.linear import UKELD

def ukeld_inference(data: pd.Series) -> int:
    """Compute the UKELD score of one record, rounded up to an integer.

    Args:
        data: A single row (for example the Series that
            ``DataFrame.apply(..., axis=1)`` passes in) with scalar
            ``SERUM_BILIRUBIN``, ``INR``, ``SERUM_CREATININE`` and
            ``SERUM_SODIUM`` entries. A whole DataFrame is not supported:
            the parse helpers use the scalar built-ins ``max`` / ``min``.

    Returns:
        ``math.ceil`` of the value returned by ``UKELD().score``.
    """
    score = UKELD().score(
        serum_bilirubin=parse_bilirunbin_ukeld(data["SERUM_BILIRUBIN"]),
        inr=parse_inr(data["INR"]),
        serum_creatinine=parse_creatinine_ukeld(data["SERUM_CREATININE"]),
        # Sodium is forwarded as recorded; parse_sodium is not applied here.
        serum_sodium=data["SERUM_SODIUM"],
    )
    return math.ceil(score)


In [None]:
# Add a row-wise MELD score column to each cohort.
for cohort in (with_transplant, no_transplant):
    cohort.loc[:, "MELD"] = cohort.apply(meld_inference, axis=1)

In [None]:
# Add a row-wise MELD-Na score column to each cohort.
for cohort in (with_transplant, no_transplant):
    cohort.loc[:, "MELD_na"] = cohort.apply(meld_na_inference, axis=1)

In [None]:
# Add a row-wise UKELD score column to each cohort.
for cohort in (with_transplant, no_transplant):
    cohort.loc[:, "UKELD"] = cohort.apply(ukeld_inference, axis=1)

In [None]:
# The three score columns plus the Y column they are compared against.
stats_cols = ["MELD", "MELD_na", "UKELD", "Y"]

# Preview the first rows only rather than rendering the whole cohort.
with_transplant[stats_cols].head()

In [None]:
import json


def generate_stats(data: pd.DataFrame):
    """Summarise ``Y`` for every distinct value of each score column.

    Args:
        data: Frame containing the score columns ``MELD``, ``MELD_na`` and
            ``UKELD`` together with a numeric ``Y`` column.

    Returns:
        A nested dict ``{score_name: {score_value: {"min", "max", "median"}}}``.
        Score values are built-in ints and the statistics built-in floats, so
        the result can be passed to ``json.dump`` whatever the dtype of ``Y``.
        Rows whose score is NaN are ignored.
    """
    metrics = {}

    for key in ["MELD", "MELD_na", "UKELD"]:
        # One grouped pass per score column instead of three boolean scans of
        # the whole frame for every unique score. groupby drops NaN keys by
        # default; sort=False keeps first-appearance order, like unique().
        summary = data.groupby(key, sort=False)["Y"].agg(["min", "max", "median"])

        metrics[key] = {
            int(score): {
                "min": float(y_min),
                "max": float(y_max),
                "median": float(y_med),
            }
            for score, y_min, y_max, y_med in summary.itertuples(name=None)
        }
    return metrics

# Summarise each cohort separately, in the same order as the original calls.
surv_stats_with_transplant, surv_stats_no_transplant = map(
    generate_stats, (with_transplant, no_transplant)
)

In [None]:
# Bundle the per-cohort summaries under one top-level key each.
metrics = {
    "with_transplant": surv_stats_with_transplant,
    "no_transplant": surv_stats_no_transplant,
}

# Create the output directory first so the dump does not fail with
# FileNotFoundError when `models/` does not exist yet.
stats_path = Path("models/model_stats.json")
stats_path.parent.mkdir(parents=True, exist_ok=True)

# JSON object keys are always strings, so the integer score keys are written
# as strings in the output file.
with stats_path.open("w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=4, sort_keys=True)