In [2]:
import pandas as pd
import scipy as sp
import numpy as np

import emcee
from multiprocessing import Pool

import matplotlib.pyplot as plt
import matplotlib.gridspec as gs
from matplotlib import font_manager
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.colors import ListedColormap
from cycler import cycler
import corner
from typing import Any, Dict, List, Optional, Tuple, Union

import lymph

In [6]:
filename = "../data/extended_system.hdf5"

extended_systm = lymph.utils.system_from_hdf(
    filename=filename,
    name="extended/model")

samples_HMM = emcee.backends.HDFBackend(filename=filename, name="extended/samples").get_chain(flat=True, discard=burnin)


Loading patients of late T-stage


 16%|█▌        | 2308/14336 [00:02<00:10, 1129.64it/s]

KeyboardInterrupt: 

## Compare Model Risk Prediction with Occurrence in Data


#### Method to determine the involvement of a layer in a single patient


In [4]:
import itertools

def layer_involvement_expectation(inv: Optional[int], spsns: List[tuple])-> int:
    """Determine which layers are involved given the different (conflicting) measurements:

        Args:
            inv:    For one single layer, one entry gives the measured involvement for the
                    respective modality
            spsns:  Each entry of the list is the spsn tuple of the respective modality

    """
    no_of_measurements = 0
    prob = 0
    for i , spsn in enumerate(spsns):
        if(inv[i] == True):
            no_of_measurements+=1
            prob += spsn[1]
        elif(inv[i] == False):
            no_of_measurements+=1
            prob += (1-spsn[0])
    if(no_of_measurements == 0):
        return 0
    mean_prob = prob / no_of_measurements
    if(mean_prob >= 0.5):
        return 1
    else:
        return 0


def layer_involvement_statistical(inv: Optional[int], spsns: List[tuple])-> int:
    """Determine which layers are involved given the different (conflicting) measurements:

        Args:
            inv:    For one single layer, one entry gives the measured involvement for the
                    respective modality
            spsns:  Each entry of the list is the spsn tuple of the respective modality

    """
    p_healthy = 1
    p_involved = 1
    for i , spsn in enumerate(spsns):
        if(inv[i] == True):
            p_healthy *= 1-spsn[1]
            p_involved *= spsn[1]
        elif(inv[i] == False):
            p_healthy *= spsn[0]
            p_involved *= 1-spsn[0]
    if(p_healthy < p_involved):
        return 1
    else:
        return 0


def layer_involvement_hierarchical(inv: Optional[int], spsns: List[tuple])-> int:
    """Determine which layers are involved given the different (conflicting) measurements:
       hierarchy: 1.Pathology,  2.Diagnostic consensus,  3. PET/CT,  4.MRI

        Args:
            inv:    For one single layer, one entry gives the measured involvement for the
                    respective modality. For this function the modalities have to be given in the hierachical
                    order -> pathology first etc.
            spsns:  Each entry of the list is the spsn tuple of the respective modality

    """
    spsns_with_index = []
    for i, spsn in enumerate(spsns):
        spsn_w_i = list(spsn)
        spsn_w_i.append(i)
        spsns_with_index.append(tuple(spsn_w_i))
    zipped_list = zip(spsns, inv)
    sorted_by_sp = sorted(zipped_list, key=lambda x: x[0], reverse=True)
    sorted_by_sn = sorted(zipped_list, key=lambda x: x[1], reverse=True)
    inv_by_sp = [element for _, element in sorted_by_sp]
    inv_by_sn = [element for _, element in sorted_by_sn]
    # for i, invol in inv_by_sp:
    #     if(inv_by_sp[i] is not None):
    #         if(inv_by_sp[i] == inv_by_sn[i]):
    #             return inv_by_sp[i]
    #     else:
            
    #         pass
    return 0

print(layer_involvement_hierarchical([0, 0, 1, 0], [(1.,1.), (0.63, 0.81), (0.86, 0.79), (0.63, 0.81)]))


all_possible_measurements = list(itertools.product([0,1,None], repeat=4))
for meas in all_possible_measurements:
    exp = layer_involvement_expectation(meas, [(1.,1.), (0.63, 0.81), (0.86, 0.79), (0.63, 0.81)])
    stat = layer_involvement_statistical(meas, [(1.,1.), (0.63, 0.81), (0.86, 0.79), (0.63, 0.81)])
    hier = layer_involvement_hierarchical(meas, [(1.,1.), (0.63, 0.81), (0.86, 0.79), (0.63, 0.81)])
    if(exp != stat):
        print(meas, f"\t\t\t\texp={exp}   !=   stat={stat}")
    if(hier != stat):
        print(meas, f"\t\t\t\thier={hier}   !=   stat={stat}")


0
(0, 1, 1, 1) 				exp=1   !=   stat=0
(0, 1, 1, None) 				exp=1   !=   stat=0
(0, 1, None, 1) 				exp=1   !=   stat=0
(0, None, 1, 1) 				exp=1   !=   stat=0
(1, 0, 0, 0) 				exp=0   !=   stat=1
(1, 0, 0, 0) 				hier=0   !=   stat=1
(1, 0, 0, 1) 				hier=0   !=   stat=1
(1, 0, 0, None) 				hier=0   !=   stat=1
(1, 0, 1, 0) 				hier=0   !=   stat=1
(1, 0, 1, 1) 				hier=0   !=   stat=1
(1, 0, 1, None) 				hier=0   !=   stat=1
(1, 0, None, 0) 				hier=0   !=   stat=1
(1, 0, None, 1) 				hier=0   !=   stat=1
(1, 0, None, None) 				hier=0   !=   stat=1
(1, 1, 0, 0) 				hier=0   !=   stat=1
(1, 1, 0, 1) 				hier=0   !=   stat=1
(1, 1, 0, None) 				hier=0   !=   stat=1
(1, 1, 1, 0) 				hier=0   !=   stat=1
(1, 1, 1, 1) 				hier=0   !=   stat=1
(1, 1, 1, None) 				hier=0   !=   stat=1
(1, 1, None, 0) 				hier=0   !=   stat=1
(1, 1, None, 1) 				hier=0   !=   stat=1
(1, 1, None, None) 				hier=0   !=   stat=1
(1, None, 0, 0) 				hier=0   !=   stat=1
(1, None, 0, 1) 				hier=0   !=   stat=1
(1

### Method to count the occurrences of involved layers in the real Dataset


In [5]:
def layer_occurence(df, t_stage=True):
    #All labels of the Columns
    layer_cols = df.columns.get_level_values(1)

    #Delete Duplicates and t-stage from list
    layers = list(dict.fromkeys(layer_cols))
    if(t_stage == True):
        layers = layers[:-1]
   
    #Create Empty DataFrame to store the occurrences
    occurr_table = pd.DataFrame(np.zeros((1, len(layers))), columns=layers)

    #Fill the occurrence table
    for layer in layers:
        select = layer_cols.isin([layer])
        level_data = df.loc[:, select]
        for index, row in level_data.iterrows():
            patient_occ = row.to_dict()
            patient_occ2 = {key[0]: val for (key, val) in patient_occ.items()}
            involved = layer_involvement_statistical(patient_occ2)
            if(involved):
                occurr_table[layer] += 1
    return occurr_table




#### Compare Dataset with Model

In [7]:
def comparison_risk_w_dataset(df, layers, time_dists):

    ##layers = ["I", "IV"]
    
    # time_dists={
    #     "early": lymph.utils.fast_binomial_pmf(t, max_t, early_p),
    #     "late" : lymph.utils.fast_binomial_pmf(t, max_t, mean_late_p)
    # }
    
    all_layer_cols = df.columns.get_level_values(1)
    all_layers = list(dict.fromkeys(all_layer_cols))[:-1]
    involvement = np.repeat(None, len(all_layers))
    for i, layer in enumerate(all_layers):
        if(layers.includes(layer)):
            involvement[i]= 1
    diagnose = {"PET": [None,None,None,None,None,None]}
    
    for key in time_dists.keys():
        risk =  extended_systm.risk(
                        diagnoses=diagnose, inv=involvement,
                        time_dist=time_dists[key], 
                        mode="HMM"
                    )


## Risk prediction

In [None]:
time_dists={
        "early": lymph.utils.fast_binomial_pmf(t, max_t, early_p),
        "late" : lymph.utils.fast_binomial_pmf(t, max_t, mean_late_p)
    }

layers = ["V", "VII"]

diagnoses_text = [["Layer V is negative", "Layer V is negative but II positive"], ["Layer VII is negative", "Layer VII is negative but II & III positive"]]

diagnoses = [[{"PET":  np.array([None,None,None,None,0,None])},
            {"PET": np.array([None,1,None,None,0,None])}],
            [{"PET": np.array([None,None,None,None,None,0])},
            {"PET": np.array([None,1,1,None,None,0])}]
            ]

involvements = [np.array([None,None,None,None,1,None]), np.array([None,None,None,None,None,1])]
thin = 50

print("Probability p for Binomial distribution of late T_stage:", round(mean_late_p,4))


for key in time_dists.keys():
    print("T_stage = ", key)
    for i, layer in enumerate(layers):
        for k, diagnose in enumerate(diagnoses[i]):
            risk =  extended_systm.risk(
                diagnoses=diagnose, inv=involvements[i],
                time_dist=time_dists[key], 
                mode="HMM"
            )
            print(f"Risk for Layer {layer} given {diagnoses_text[i][k]}:", round(risk,4))

   
