In [None]:
# NOTE: switch to the parent directory
# Later cells use paths relative to that directory ('./data/...', 'notebooks/plots/...')
# and import from the `utils` package.
# Caution: `%cd ..` moves one level up on every execution, so run this cell only once per kernel.
%cd ..
# Create the output folder for figures; `-p` makes this a no-op if it already exists
!mkdir -p notebooks/plots

In [18]:
# Imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
from sklearn.impute import KNNImputer
from scipy.stats import ttest_ind
from scipy.stats import chi2_contingency
from scipy.stats import false_discovery_control
from statsmodels.stats import proportion as prop
from utils.plotting_utils import plotting_setup

In [19]:
# Constants
# Columns used in the analysis. The last three entries are the outcome columns;
# later cells drop them by name before imputation and before plotting missingness.
VARS_INCLUDED = ["Age", "BMI", "Sex", "Height", "Weight",
                 "AlvaradoScore", "PediatricAppendicitisScore",
                 "AppendixOnSono", "AppendixDiameter", "MigratoryPain", "LowerAbdominalPainRight",
                 "ReboundTenderness", "CoughingPain", "PsoasSign",
                 "Nausea", "AppetiteLoss", "BodyTemp", "WBCCount", "NeutrophilPerc",
                 "KetonesInUrine", "ErythrocytesInUrine", "WBCInUrine", "CRPEntry",
                 "Dysuria", "Stool", "Peritonitis", "FreeFluids",
                 "AppendixWallLayers", "Kokarde",
                 # NOTE(review): the original note said this variable "is available in external
                 # dataset", but the statistics cell special-cases it for the external data
                 # (fixed number of observations) - presumably it is NOT available there; confirm.
                 "TissuePerfusion",
                 "SurroundingTissueReaction", "PathLymphNodes",
                 "MesentricLymphadenitis", "BowelWallThick", "Ileus", "FecalImpaction",
                 "Meteorism", "Enteritis", "DiagnosisByCriteria", "TreatmentGroupBinar", "AppendicitisComplications"]

# Predictors compared between cohorts as proportions
VARS_CATEGORICAL = ["Sex", "AppendixOnSono", "MigratoryPain", "LowerAbdominalPainRight",
                    "ReboundTenderness", "CoughingPain", "PsoasSign", "Nausea", "AppetiteLoss", "KetonesInUrine", "ErythrocytesInUrine",
                    "WBCInUrine", "Dysuria",
                    "Stool", "Peritonitis", "FreeFluids", "AppendixWallLayers", "Kokarde", "TissuePerfusion",
                    "SurroundingTissueReaction", "PathLymphNodes",
                    "MesentricLymphadenitis", "BowelWallThick", "Ileus", "FecalImpaction",
                    "Meteorism", "Enteritis"]

# Predictors summarised by median/quartiles
VARS_NUMERICAL = ["Age", "BMI", "Height", "Weight", "AlvaradoScore", "PediatricAppendicitisScore",
                  "AppendixDiameter", "BodyTemp", "WBCCount", "NeutrophilPerc", "CRPEntry"]

# Human-readable axis labels for the predictors, in the same order as VARS_INCLUDED
# without its three outcome columns (the labels are applied positionally in the plot).
# Spelling of 'Nausea/vomiting' and 'Anorexia' corrected, since these strings appear in the figure.
VARIABLE_NAMES =    ['Age', 'BMI', 'Sex', 'Height', 'Weight', 'AS', 'PAS', 'Visibility of appendix',
                     'Appendix diameter', 'Migration of pain', 'Tenderness in RLQ', 'Rebound tenderness', 'Cough tenderness',
                     'Psoas sign', 'Nausea/vomiting', 'Anorexia', 'Body temperature', 'WBC count', 'Neutrophil percentage',
                     'Ketones in urine', 'Erythrocytes in urine', 'WBC in urine', 'CRP', 'Dysuria', 'Stool',
                     'Peritonitis', 'Free intraperitoneal fluid', 'Appendix layers', 'Target sign',
                     'Appendix perfusion', 'Surrounding tissue reaction', 'Path. lymph nodes', 'Mesenteric lymphadenitis',
                     'Bowel wall thickening', 'Ileus', 'Coprostasis', 'Meteorism', 'Enteritis']

# The labels are matched to columns by position, so a length mismatch would mislabel the plot
assert len(VARIABLE_NAMES) == len(VARS_INCLUDED) - 3, "VARIABLE_NAMES must label every predictor exactly once"

In [None]:
# Load Regensburg data
# The path is relative to the current working directory (set by the first cell)
app_data_regensburg = pd.read_csv('./data/app_data.csv')
# Display the first rows as the cell output
app_data_regensburg.head()

In [None]:
# Preprocess Regensburg data
# Restrict the frame to the analysed columns and discard every row without a diagnosis
app_data_regensburg_preproc = (
    app_data_regensburg
    .loc[:, VARS_INCLUDED]
    .dropna(subset=['DiagnosisByCriteria'])
)
# Summary statistics of the numeric columns as the cell output
app_data_regensburg_preproc.describe()

In [22]:
# Correct the wrong body temperature record
# The first row whose BodyTemp is below 30 is overwritten with 36.9.
# The lookup is guarded so the cell can be re-run: once the record has been corrected there is
# no value below 30 left, and indexing the empty result unconditionally would raise an IndexError.
low_temp_positions = np.argwhere((app_data_regensburg_preproc['BodyTemp'] < 30).to_numpy())
if len(low_temp_positions) > 0:
    # Positional (iloc) assignment, because the row labels are no longer contiguous after dropna
    app_data_regensburg_preproc.iloc[low_temp_positions[0, 0],
        app_data_regensburg_preproc.columns.get_loc('BodyTemp')] = 36.9

In [None]:
# Factorise categorical variables
# Each entry pairs a column name with the mapping from its raw string labels to integer codes.
# Ordinal urine findings ('+', '++', '+++') are coded 1-3, 'no' is coded 0.
categorical_val_map_ = [
    ("Sex", {'male':1, 'female':0}),
    ("AppendixOnSono", {'yes':1, 'no':0}),
    ("MigratoryPain", {'no':0, 'yes':1}),
    ("LowerAbdominalPainRight", {'yes':1, 'no':0}),
    ("ReboundTenderness", {'no':0, 'yes':1}),
    ("CoughingPain", {'no':0, 'yes':1}),
    ("PsoasSign", {'negative':0, 'positive':1}),
    ("Nausea", {'yes':1, 'no':0}),
    ("AppetiteLoss", {'no':0, 'yes':1}),
    ("KetonesInUrine", {'+':1, 'no':0, '+++':3, '++':2}),
    ("ErythrocytesInUrine", {'+':1, 'no':0, '++':2, '+++':3}),
    ("WBCInUrine", {'no':0, '+':1, '+++':3, '++':2}),
    ("Dysuria", {'no':0, 'yes':1}),
    ("Stool", {'normal':0, 'obstipation':1, 'diarrhea':2}),
    ("Peritonitis", {'no':0, 'local':1, 'generalised':2}),
    ("FreeFluids", {'no':0, 'yes':1}),
    ("AppendixWallLayers", {'aufgehoben':1, 'intakt':0}),
    ("Kokarde", {'no':0, 'yes':1}),
    ("TissuePerfusion", {'unremarkable':0, 'hypoperfused':1, 'hyperperfused':2}),
    ("SurroundingTissueReaction", {'yes':1, 'no':0}),
    ("PathLymphNodes", {'yes':1, 'no':0}),
    ("MesentricLymphadenitis", {'yes':1, 'no':0}),
    ("BowelWallThick", {'yes':1, 'no':0}),
    ("Ileus", {'no':0, 'yes':1}),
    ("FecalImpaction", {'yes':1, 'no':0}),
    ("Meteorism", {'no':0, 'yes':1}),
    ("Enteritis", {'no':0, 'yes':1})
]

for column, themap in categorical_val_map_:
    # Recode the labels, then turn any -1 code into a missing value.
    # The result is assigned back to the frame rather than calling
    # `replace(..., inplace=True)` on the selected column: that form mutates a temporary
    # object, is deprecated in pandas 2.x and silently does nothing under Copy-on-Write.
    app_data_regensburg_preproc[column] = (
        app_data_regensburg_preproc[column].replace(themap).replace(-1, np.nan)
    )

app_data_regensburg_preproc.head()

In [None]:
# Perform imputation
imputer = KNNImputer(n_neighbors=5)

# Only the predictors are imputed; the three outcome columns are set aside and re-attached below
outcome_columns = ['DiagnosisByCriteria', 'TreatmentGroupBinar', 'AppendicitisComplications']
imputed_values = imputer.fit_transform(app_data_regensburg_preproc.drop(columns=outcome_columns))

# The outcome columns are the last three columns of the frame, hence the [:-3] slice.
# The new frame gets a fresh default index, not the index of the preprocessed frame.
app_data_regensburg_preproc_imputed = pd.DataFrame(
    imputed_values,
    columns=app_data_regensburg_preproc.columns[:-3])

# pd.factorize returns (codes, uniques); only the integer codes are kept.
# NOTE: the codes follow the order of first appearance (missing values become -1), so they are
# not guaranteed to agree with the explicit 0/1 mappings applied to the external data.
for outcome in outcome_columns:
    app_data_regensburg_preproc_imputed[outcome] = pd.factorize(app_data_regensburg_preproc[outcome])[0]

app_data_regensburg_preproc_imputed.head()

In [25]:
# Use the imputed frame as the "final" Regensburg data for now.
# This only binds a second name to the same object (no copy); the statistics cell further down
# rebinds `app_data_regensburg_final` to the non-imputed data.
app_data_regensburg_final = app_data_regensburg_preproc_imputed

In [None]:
# Load external data
# The path is relative to the current working directory (set by the first cell)
app_data_dusseldorf = pd.read_csv('./data/app_data_ext.csv')
# Display the first rows as the cell output
app_data_dusseldorf.head()

In [None]:
# Now, map the categorical values
# Same label-to-code mappings as for the Regensburg data, plus explicit 0/1 codes for the
# three outcome columns.
categorical_val_map_ = [
    ("Sex", {'male':1, 'female':0}),
    ("AppendixOnSono", {'yes':1, 'no':0}),
    ("MigratoryPain", {'no':0, 'yes':1}),
    ("LowerAbdominalPainRight", {'yes':1, 'no':0}),
    ("ReboundTenderness", {'no':0, 'yes':1}),
    ("CoughingPain", {'no':0, 'yes':1}),
    ("PsoasSign", {'negative':0, 'positive':1}),
    ("Nausea", {'yes':1, 'no':0}),
    ("AppetiteLoss", {'no':0, 'yes':1}),
    ("KetonesInUrine", {'+':1, 'no':0, '+++':3, '++':2}),
    ("ErythrocytesInUrine", {'+':1, 'no':0, '++':2, '+++':3}),
    ("WBCInUrine", {'no':0, '+':1, '+++':3, '++':2}),
    ("Dysuria", {'no':0, 'yes':1}),
    ("Stool", {'normal':0, 'obstipation':1, 'diarrhea':2}),
    ("Peritonitis", {'no':0, 'local':1, 'generalised':2}),
    ("FreeFluids", {'no':0, 'yes':1}),
    ("AppendixWallLayers", {'aufgehoben':1, 'intakt':0}),
    ("Kokarde", {'no':0, 'yes':1}),
    ("TissuePerfusion", {'unremarkable':0, 'hypoperfused':1, 'hyperperfused':2}),
    ("SurroundingTissueReaction", {'yes':1, 'no':0}),
    ("PathLymphNodes", {'yes':1, 'no':0}),
    ("MesentricLymphadenitis", {'yes':1, 'no':0}),
    ("BowelWallThick", {'yes':1, 'no':0}),
    ("Ileus", {'no':0, 'yes':1}),
    ("FecalImpaction", {'yes':1, 'no':0}),
    ("Meteorism", {'no':0, 'yes':1}),
    ("Enteritis", {'no':0, 'yes':1}),
    ("DiagnosisByCriteria", {'noAppendicitis': 0, 'appendicitis': 1}),
    ("TreatmentGroupBinar", {'conservative': 0, 'surgical': 1}),
    ("AppendicitisComplications", {'no': 0, 'yes': 1})
]

for column, themap in categorical_val_map_:
    app_data_dusseldorf[column] = app_data_dusseldorf[column].replace(themap)

# Snapshot of the recoded (but otherwise untouched) external data
pre_external = app_data_dusseldorf.copy()

app_data_dusseldorf_final = deepcopy(app_data_dusseldorf)

# NOTE: this is somewhat ad hoc
# WBC counts above 1000 are presumably recorded on a different scale than the rest - confirm.
mask = app_data_dusseldorf_final['WBCCount'] > 1000
# Divide values greater than 1000 by 1000
app_data_dusseldorf_final.loc[mask, 'WBCCount'] /= 1000

# Impute the external predictors with the imputer fitted on the Regensburg data.
# This runs AFTER the WBC rescaling: feeding the un-rescaled counts to the KNN imputer would
# distort its neighbour distances and leave mixed-scale values in `external_X`.
# The column labels are taken from the frame actually passed to the imputer, instead of
# assuming that the outcome columns are the last three columns of the CSV.
external_features = app_data_dusseldorf_final.drop(
    columns=['DiagnosisByCriteria', 'TreatmentGroupBinar', 'AppendicitisComplications'])
external_X = pd.DataFrame(imputer.transform(external_features), columns=external_features.columns)

app_data_dusseldorf_final.head()

In [None]:
# Compute summary statistics and perform statistical tests
# NOTE: use non-imputed data to compute statistics
# Work on a copy: recoding the outcome columns below must not modify
# `app_data_regensburg_preproc` itself, which earlier cells read when they are re-run.
app_data_regensburg_final = app_data_regensburg_preproc.copy()
y1_mapping = {'appendicitis': 1, 'noAppendicitis': 0}
app_data_regensburg_final['DiagnosisByCriteria'] = app_data_regensburg_final['DiagnosisByCriteria'].replace(y1_mapping)
y2_mapping = {'conservative': 0, 'surgical': 1}
app_data_regensburg_final['TreatmentGroupBinar'] = app_data_regensburg_final['TreatmentGroupBinar'].replace(y2_mapping)
y3_mapping = {'no': 0, 'yes': 1}
app_data_regensburg_final['AppendicitisComplications'] = app_data_regensburg_final['AppendicitisComplications'].replace(y3_mapping)

# stats[col] holds, per cohort, either a proportion (categorical variables and outcomes)
# or [median, 1st quartile, 3rd quartile] (numerical variables).
# pvals[i] is the unadjusted p-value for VARS_INCLUDED[i].
stats = {}
pvals = np.zeros((len(VARS_INCLUDED), ))
for cnt, col in enumerate(VARS_INCLUDED):
    regensburg_col = app_data_regensburg_final[col]
    dusseldorf_col = app_data_dusseldorf_final[col]

    if col in VARS_CATEGORICAL or \
        col in ['DiagnosisByCriteria', 'TreatmentGroupBinar', 'AppendicitisComplications']:    # Categorical variables
        # NOTE: we binarise multilevel attributes - any non-zero level counts as a "success".
        # For 0/1 variables this equals counting the ones, so one rule serves both cases.
        # (The previous binary/multilevel switch used np.unique, which counts NaN as a level,
        # so the branch taken depended on whether a column had missing values.)
        counts = [np.sum(regensburg_col >= 1), np.sum(dusseldorf_col >= 1)]    # number of successes
        # Number of trials = non-missing entries. A variable with no external observations
        # (this was hard-coded for 'TissuePerfusion') gets a placeholder of 1, which avoids a
        # division by zero; its external proportion and p-value are then not meaningful.
        nobs = [regensburg_col.count(), max(dusseldorf_col.count(), 1)]
        # Perform a chi-squared test for proportions
        _, p_val, _ = prop.proportions_chisquare(count=counts, nobs=nobs)
        # Store the proportions
        stats[col] = {'regensburg': counts[0]/nobs[0], 'düsseldorf': counts[1]/nobs[1]}

    else:                          # Numerical variables
        # Medians and IQRs (missing values ignored)
        stats[col] = {'regensburg': [np.nanmedian(regensburg_col), np.nanquantile(regensburg_col, 0.25),
                                      np.nanquantile(regensburg_col, 0.75)],
                        'düsseldorf': [np.nanmedian(dusseldorf_col), np.nanquantile(dusseldorf_col, 0.25),
                                      np.nanquantile(dusseldorf_col, 0.75)]}

        # Perform an unpaired, two-sample t-test on the observed values
        t_stat, p_val = ttest_ind(regensburg_col.dropna(), dusseldorf_col.dropna())

    pvals[cnt] = p_val

In [29]:
# Adjust for multiple comparisons
# method='bh' selects the Benjamini-Hochberg procedure.
# NOTE: `pvals` is overwritten with the adjusted values, so running this cell a second time
# without re-running the statistics cell would adjust already-adjusted p-values.
pvals = false_discovery_control(pvals, method='bh')

In [None]:
# Print the results in the LaTeX-like format
# One table row per variable: name & Regensburg summary & Düsseldorf summary & p-value,
# with cells separated by ' & ' and the row terminated by a double backslash.
for cnt, col in enumerate(VARS_INCLUDED):
    row_cells = [
        col,
        str(stats[col]['regensburg']),
        str(stats[col]['düsseldorf']),
        str(pvals[cnt]),
    ]
    print(' & '.join(row_cells) + ' \\\\')

In [None]:
# Plot missingness statistics
plotting_setup(12)

# TODO (carried over from the original cell; no further detail was given)
# Missingness is shown for the predictors only, so the outcome columns are removed first
outcome_columns = ['DiagnosisByCriteria', 'TreatmentGroupBinar', 'AppendicitisComplications']
app_data_regensburg_final_ = app_data_regensburg_final.drop(columns=outcome_columns)
app_data_dusseldorf_final_ = app_data_dusseldorf_final.drop(columns=outcome_columns)

# Percentage of missing entries per column in each cohort
n_rows_regensburg = len(app_data_regensburg_final_)
n_rows_dusseldorf = len(app_data_dusseldorf_final_)
original_nan_percentage = (app_data_regensburg_final_.isna().sum() / n_rows_regensburg) * 100
external_nan_percentage = (app_data_dusseldorf_final_.isna().sum() / n_rows_dusseldorf) * 100

fig, ax = plt.subplots(figsize=(8, 8))

bar_width = 0.35
index = np.arange(len(original_nan_percentage))

# Two horizontal bars per variable; the external bar is shifted by one bar width.
# Values are drawn positionally, so both frames are expected to share the same column order.
original_bars = ax.barh(index, original_nan_percentage, bar_width, label='Regensburg')
external_bars = ax.barh(index + bar_width, external_nan_percentage, bar_width, label='Düsseldorf')

ax.set_xlabel('Missing Values, %')
# Place each tick midway between the two bars of a pair and label it with the readable name
ax.set_yticks(index + bar_width / 2)
ax.set_yticklabels(VARIABLE_NAMES, rotation=15, ha='right')
ax.legend(loc='lower right')

ax.grid(visible=True, axis='y')

fig.savefig('notebooks/plots/missing_value_rates.pdf', format='pdf', bbox_inches='tight')