In [1]:
import pandas as pd
import scipy as sp
import numpy as np

import emcee
from multiprocessing import Pool

import matplotlib.pyplot as plt
import matplotlib.gridspec as gs
from matplotlib import font_manager
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.colors import ListedColormap
from cycler import cycler
import corner
from typing import Any, Dict, List, Optional, Tuple, Union
import itertools

import lymph

### Setup System


In [2]:
filename = "../data/extended_system.hdf5"

extended_systm = lymph.utils.system_from_hdf(
    filename=filename,
    name="extended/model")



samples_HMM = emcee.backends.HDFBackend(filename=filename, name="extended/samples").get_chain(flat=True, discard=7000)

spread_probs = np.array([sample[:-1] for sample in samples_HMM])
late_p = np.array([sample[-1] for sample in samples_HMM])

extended_systm.spread_probs = np.mean(spread_probs, axis=0)
mean_late_p = np.mean(np.array(late_p), axis=0)

max_t=10
t = np.arange(max_t + 1)
early_p=0.3
time_dists={
        "early": lymph.utils.fast_binomial_pmf(t, max_t, early_p),
        "late" : lymph.utils.fast_binomial_pmf(t, max_t, mean_late_p)
}
extended_systm.modalities = {"PET": [0.86, 0.79]}

print(samples_HMM.shape)


INFO:numexpr.utils:NumExpr defaulting to 4 threads.


Loading patients of late T-stage


100%|██████████| 14336/14336 [00:26<00:00, 540.27it/s]


Loading patients of early T-stage


100%|██████████| 20864/20864 [00:38<00:00, 546.87it/s]


(720000, 12)


In [3]:
print(extended_systm.spread_probs)

[0.0063414  0.40399067 0.07708457 0.00699349 0.01170888 0.00266986
 0.03679701 0.09388779 0.00973944 0.00318313 0.15112902]


## Analyse patients with p = 0

In [4]:
extended_systm.spread_probs = np.random.uniform(low=0., high=1., size=11)

max_t=10
t = np.arange(max_t + 1)
time_dists={
    "early": sp.stats.binom.pmf(t, max_t, 0.3),
    "late" : sp.stats.binom.pmf(t, max_t, 0.7),
}
t_stages =["early", "late"]

for stage in t_stages:
    state_probs = time_dists[stage] @ extended_systm._evolve(t_last=max_t)
    p = state_probs @ extended_systm.diagnose_matrices[stage]
    zero_patients_idx = np.where(p == 0)[0]


    data = extended_systm.patient_data
    table = data.loc[data[("info", "t_stage")]==stage]
    zero_patients = table.iloc[zero_patients_idx] 
    
    
    for j, (_, patient) in enumerate(zero_patients.iterrows()):
        print("Pathology measurement of patient:")
        path_state = []
        for val in patient["pathology"]:
            if val == True:
                path_state.append(1)
            elif val == False:
                path_state.append(0)
            else:
                path_state.append("NaN")
        print(path_state)
        print("\nPossible hidden states given by diag matrix")
        diag_matrix_reduced = extended_systm.diagnose_matrices[stage][:,zero_patients_idx]
        column_patient = diag_matrix_reduced[:,j]
        non_zero_states = np.where(column_patient != 0)[0]
        print(extended_systm.state_list[non_zero_states])
        print("\n")

    

### Find smallest possible graph

In [5]:
pd.set_option('display.max_rows', None)
pd.set_option('display.min_rows', None)
data = pd.read_csv("../data/ipsi_data.csv", header=[0,1] )
path = data["pathology"]

three_wo_two = path.loc[path["II"]==False]
three_wo_two = three_wo_two.loc[three_wo_two["III"]==True]


four_wo_three = path.loc[path["III"]==False]
four_wo_three = four_wo_three.loc[four_wo_three["IV"]==True]


four_wo_twoandthree = path.loc[path["III"]==False]
four_wo_twoandthree = four_wo_twoandthree.loc[four_wo_twoandthree["II"]==False]
four_wo_twoandthree = four_wo_twoandthree.loc[four_wo_twoandthree["IV"]==True]


seven = path.loc[path["VII"]==True]

seven.head(50)

Unnamed: 0,I,II,III,IV,V,VII
155,,False,True,False,False,True


## Compare Model Risk Prediction with Occurrence in Data


#### Lnl involvement one patient (Data)


In [6]:
import itertools

def layer_involvement_expectation(inv: Optional[int], spsns: List[tuple])-> int:
    """Determine which layers are involved given the different (conflicting) measurements:

        Args:
            inv:    For one single layer, one entry gives the measured involvement for the
                    respective modality
            spsns:  Each entry of the list is the spsn tuple of the respective modality

    """
    no_of_measurements = 0
    prob = 0
    for i , spsn in enumerate(spsns):
        if(inv[i] == True):
            no_of_measurements+=1
            prob += spsn[1]
        elif(inv[i] == False):
            no_of_measurements+=1
            prob += (1-spsn[0])
    if(no_of_measurements == 0):
        return 0
    mean_prob = prob / no_of_measurements
    if(mean_prob >= 0.5):
        return 1
    else:
        return 0


def layer_involvement_statistical(inv: Optional[int], spsns: List[tuple])-> int:
    """Determine which layers are involved given the different (conflicting) measurements:

        Args:
            inv:    For one single layer, one entry gives the measured involvement for the
                    respective modality
            spsns:  Each entry of the list is the spsn tuple of the respective modality

    """
    p_healthy = 1
    p_involved = 1
    for i , spsn in enumerate(spsns):
        if(inv[i] == True):
            p_healthy *= 1-spsn[0]
            p_involved *= spsn[1]
        elif(inv[i] == False):
            p_healthy *= spsn[0]
            p_involved *= 1-spsn[1]
    if(p_healthy < p_involved):
        return 1
    else:
        return 0


def layer_involvement_hierarchical(inv: Optional[int], spsns: List[tuple])-> int:
    """Determine which layers are involved given the different (conflicting) measurements:
       hierarchy: 1.Pathology,  2.Diagnostic consensus,  3. PET/CT,  4.MRI

        Args:
            inv:    For one single layer, one entry gives the measured involvement for the
                    respective modality. For this function the modalities have to be given in the hierachical
                    order -> pathology first etc.
            spsns:  Each entry of the list is the spsn tuple of the respective modality

    """
    spsns_with_index = []
    for i, spsn in enumerate(spsns):
        spsn_w_i = list(spsn)
        spsn_w_i.append(i)
        spsns_with_index.append(tuple(spsn_w_i))
    zipped_list = zip(spsns, inv)
    sorted_by_sp = sorted(zipped_list, key=lambda x: x[0], reverse=True)
    sorted_by_sn = sorted(zipped_list, key=lambda x: x[1], reverse=True)
    inv_by_sp = [element for _, element in sorted_by_sp]
    inv_by_sn = [element for _, element in sorted_by_sn]
    # for i, invol in inv_by_sp:
    #     if(inv_by_sp[i] is not None):
    #         if(inv_by_sp[i] == inv_by_sn[i]):
    #             return inv_by_sp[i]
    #     else:
            
    #         pass
    return 0

print(layer_involvement_hierarchical([0, 0, 1, 0], [(1.,1.), (0.63, 0.81), (0.86, 0.79), (0.63, 0.81)]))


all_possible_measurements = list(itertools.product([0,1,None], repeat=4))
for meas in all_possible_measurements:
    exp = layer_involvement_expectation(meas, [(1.,1.), (0.63, 0.81), (0.86, 0.79), (0.63, 0.81)])
    stat = layer_involvement_statistical(meas, [(1.,1.), (0.63, 0.81), (0.86, 0.79), (0.63, 0.81)])
    hier = layer_involvement_hierarchical(meas, [(1.,1.), (0.63, 0.81), (0.86, 0.79), (0.63, 0.81)])
    


0


### Method to count the occurrences of involved layers in the real Dataset


In [7]:
def prevalence_in_dataset(df, involvement, t_stage="early"):
    #All labels of the Columns
    layer_cols = df.columns.get_level_values(1)
    stage_df = df.loc[df[("info","t_stage")] == t_stage]

    #Delete Duplicates and t-stage from list
    layers = list(dict.fromkeys(layer_cols))[:-1]

   
    #Create Empty DataFrame to store the occurrences
    occurr_table = pd.DataFrame(np.zeros((len(stage_df.index), len(layers))), columns=layers)

    #Fill the occurrence table
    for layer in layers:
        select = layer_cols.isin([layer])
        level_data = stage_df.loc[:, select].reset_index(drop=True)
        for index, row in level_data.iterrows():
            involved = layer_involvement_statistical(row.values, [(0.63, 0.81), (0.86, 0.79), (0.63, 0.81), (1., 1.)])
            if(involved):
                occurr_table.loc[occurr_table.index[index], layer] = 1

    #Compare the occurences with the given involvement
    counter = 0
    for index, row in occurr_table.iterrows():
        for i, layer in enumerate(row.values):
            if(involvement[i]==None or layer == involvement[i]):
                if(i == len(row.values)-1):
                    counter+=1
                continue
            else:
                break

    return counter / len(stage_df.index)

#### Display Model Risk vs. Data Prevalence in Dataframe


In [8]:
extended_systm.modalities = {"PET": [0.86, 0.79]}
extended_systm.spread_probs = np.mean(spread_probs, axis=0)

def comparison_risk_w_dataset(df, time_dists):
    poss_measurement = list(itertools.product([None,1], repeat=6))
    diagnose = {"PET": [None,None,None,None,None,None]}
    
    risks = {val: [] for val in time_dists.keys()}
    probs_dataset = {val: [] for val in time_dists.keys()}
    for meas in poss_measurement: 
        for key in time_dists.keys():
            risk =  extended_systm.risk(
                            diagnoses=diagnose, inv=meas,
                            time_dist=time_dists[key], 
                            mode="HMM"
                        )
            risks[key].append(risk)

            p_dataset = prevalence_in_dataset(df, meas, key)
            probs_dataset[key].append(p_dataset)
    
    overview = pd.DataFrame()
    overview["State"] = poss_measurement
    overview["Risk early"] = risks["early"]
    overview["Risk late"] = risks["late"]
    overview["Dataset early"]= probs_dataset["early"]
    overview["Dataset late"]= probs_dataset["late"]
    
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 1000)
    pd.set_option('display.colheader_justify', 'center')
    pd.set_option('display.precision', 3)

    return overview
    
    
    

max_t=10
t = np.arange(max_t + 1)
early_p=0.3
time_dists={
        "early": lymph.utils.fast_binomial_pmf(t, max_t, early_p),
        "late" : lymph.utils.fast_binomial_pmf(t, max_t, mean_late_p)
}

data = pd.read_csv("../data/ipsi_data.csv", header=[0,1])
overview = comparison_risk_w_dataset(data, time_dists)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 3)


display(overview)


Unnamed: 0,State,Risk early,Risk late,Dataset early,Dataset late
0,"(None, None, None, None, None, None)",1.0,1.0,1.0,1.0
1,"(None, None, None, None, None, 1)",0.01176,0.01618,0.049,0.098
2,"(None, None, None, None, 1, None)",0.04565,0.06159,0.046,0.076
3,"(None, None, None, None, 1, 1)",0.000739,0.001253,0.003,0.013
4,"(None, None, None, 1, None, None)",0.06872,0.106,0.077,0.125
5,"(None, None, None, 1, None, 1)",0.001229,0.002323,0.009,0.027
6,"(None, None, None, 1, 1, None)",0.004643,0.008655,0.015,0.018
7,"(None, None, None, 1, 1, 1)",9.803e-05,0.000216,0.003,0.004
8,"(None, None, 1, None, None, None)",0.2838,0.3698,0.288,0.388
9,"(None, None, 1, None, None, 1)",0.004491,0.007337,0.021,0.058


In [9]:
consensus_systm = lymph.utils.system_from_hdf(
    filename="../data/consensus_system.hdf5",
    name="extended/model")



samples_consensus = emcee.backends.HDFBackend(filename=filename, name="extended/samples").get_chain(flat=True, discard=7000)

spread_probs_c = np.array([sample[:-1] for sample in samples_consensus])
late_p_c = np.array([sample[-1] for sample in samples_consensus])

consensus_systm.spread_probs = np.mean(spread_probs_c, axis=0)
mean_late_p_c = np.mean(np.array(late_p_c), axis=0)

Loading patients of late T-stage


100%|██████████| 14336/14336 [00:10<00:00, 1403.43it/s]


Loading patients of early T-stage


100%|██████████| 20864/20864 [00:10<00:00, 2036.77it/s]


In [10]:


def risk_plot(df, state, time_dists, plot_name):
    risks = {val: [] for val in time_dists.keys()}
    risks_c = {val: [] for val in time_dists.keys()}
    probs_dataset = {val: 0 for val in time_dists.keys()}
    diagnose = {"PET": [None,None,None,None,None,None]}
    np.random.seed(100)
    spread_probs_arr = np.asarray(spread_probs)
    choice = np.random.choice(spread_probs_arr.shape[0], size=1000, replace=False)
    sample_probs = spread_probs_arr[choice]
    sample_probs_arr_c = np.asarray(spread_probs_c)
    sample_probs_c = sample_probs_arr_c[choice]

    for key in time_dists.keys():
            for sample_prob in sample_probs:
                extended_systm.spread_probs = sample_prob
                risk =  extended_systm.risk(
                                diagnoses=diagnose, inv=state,
                                time_dist=time_dists[key], 
                                mode="HMM"
                            )
                risks[key].append(risk)

            for sample_prob in sample_probs_c:
                consensus_systm.spread_probs = sample_prob
                risk =  extended_systm.risk(
                                diagnoses=diagnose, inv=state,
                                time_dist=time_dists[key], 
                                mode="HMM"
                            )
                risks_c[key].append(risk)


            p_dataset = prevalence_in_dataset(df, state, key)
            probs_dataset[key]= p_dataset

    plt.figure(figsize = (10, 6)) 
    usz_blue = '#005ea8'
    usz_green = '#00afa5'       
    plt.hist(risks["early"], bins = 30, alpha = 0.7, label="Model risk early", color= usz_green)
    plt.hist(risks["late"], bins = 30, alpha = 0.7, label="Model risk late", color= usz_blue)
    plt.hist(risks_c["early"], bins = 30, alpha = 0.7, label="Model risk late", color= "orange")  
    plt.hist(risks_c["late"], bins = 30, alpha = 0.7, label="Model risk late", color= "red")    
    plt.vlines([probs_dataset["early"], probs_dataset["late"]],ymin=0, ymax=50, colors=[usz_green, usz_blue])
    plt.axvline(x = probs_dataset["early"], color = usz_green, label = 'Prevalence early', lw=2)
    plt.axvline(x = probs_dataset["late"], color = usz_blue, label = 'Prevalence late', lw=2)
    plt.legend()
    plt.title(f"Risk prediction Layer(s) {plot_name}")
    plt.savefig(f"../plots/riskplots_with_consensus/riskplot {plot_name}")
    plt.clf()

names = ["I", "II", "III", "IV", "V","VII","II and III", "II and III and IV", "III not II","IV not III", "IV not II", "II and V"]

interesting_states = [[1,None,None,None,None,None],
                     [None,1,None,None,None,None],
                     [None,None,1,None,None,None],
                     [None,None,None,1,None,None],
                     [None,None,None,None,1,None],
                     [None,None,None,None,None,1],
                     [None,1,1,None,None,None],
                     [None,1,1,1,None,None],
                     [None,0,1,None,None,None],
                     [None,None,0,1,None,None],
                     [None,0,None,1,None,None],
                     [None,1,None,None,1,None],]


# interesting_states = [[1,None,None,None,None,None]]

for i, state in enumerate(interesting_states):
    risk_plot(data, state, time_dists, names[i])


<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

<Figure size 720x432 with 0 Axes>

In [11]:
print(len(spread_probs[0]))
print(type(spread_probs))

11
<class 'numpy.ndarray'>


## Risk prediction

In [12]:

layers = ["V", "VII"]

diagnoses_text = [["Layer V is negative", "Layer V is negative but II positive"], ["Layer VII is negative", "Layer VII is negative but II & III positive"]]

diagnoses = [[{"PET":  np.array([None,None,None,None,0,None])},
            {"PET": np.array([None,1,None,None,0,None])}],
            [{"PET": np.array([None,None,None,None,None,0])},
            {"PET": np.array([None,1,1,None,None,0])}]
            ]

involvements = [np.array([None,None,None,None,1,None]), np.array([None,None,None,None,None,1])]
thin = 50

print("Probability p for Binomial distribution of late T_stage:", round(mean_late_p,4))


for key in time_dists.keys():
    print("T_stage = ", key)
    for i, layer in enumerate(layers):
        for k, diagnose in enumerate(diagnoses[i]):
            risk =  extended_systm.risk(
                diagnoses=diagnose, inv=involvements[i],
                time_dist=time_dists[key], 
                mode="HMM"
            )
            print(f"Risk for Layer {layer} given {diagnoses_text[i][k]}:", round(risk,4))

   


Probability p for Binomial distribution of late T_stage: 0.3892
T_stage =  early
Risk for Layer V given Layer V is negative: 0.0149
Risk for Layer V given Layer V is negative but II positive: 0.017
Risk for Layer VII given Layer VII is negative: 0.0021
Risk for Layer VII given Layer VII is negative but II & III positive: 0.0027
T_stage =  late
Risk for Layer V given Layer V is negative: 0.0203
Risk for Layer V given Layer V is negative but II positive: 0.0221
Risk for Layer VII given Layer VII is negative: 0.0028
Risk for Layer VII given Layer VII is negative but II & III positive: 0.0034
