In [1]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from utils import *

In [None]:
# Never truncate wide frames: render every column when a DataFrame is displayed.
pd.options.display.max_columns = None

# Data Mining for Early Detection of Cardiac Amyloidosis

### Data cleaning and preprocessing
In this section, we want to get an initial understanding of the data in its raw format. I will perform quality checking based on intuition/domain knowledge, and remove redundant and missing data to prepare it for an initial exploratory data analysis.

In the cell below we simply define a list of metrics we've learned to be red flags/indicators for Cardiac Amyloidosis. For the metrics starting with "PHMH", I have decided to include only the value without the code (PHMHxCD) since it doesn't create more value than the standard (PHMHx).

In [None]:
# Columns treated as red flags / indicators for cardiac amyloidosis.
# The order is exactly the originally listed one, because it determines the
# column order of every `raw_data[red_flags]` selection.
red_flags = [
    'SubjectSeq', 'DMAGE', 'DMSEX', 'PHRVSPYNCD', 'EKGVOLT',
    'ECHLVIDD', 'ECHLVIDS', 'ECHIVSD', 'ECHLVPW',
    'ECHLADIA', 'ECHLAAR', 'ECHLAVOL', 'ECHEJFR',
    'ECHEA', 'ECHESEPT', 'ECHELAT', 'ECHEAVG',
    'HEKRRE', 'HEGFRRE', 'HEBNPRE', 'HETNTRE', 'HEKAPRE', 'HELAMRE',
    'DUOTHBIYN',
    'ECHLVEDV', 'ECHLVESV',
    'HEBNPUN', 'HELAMUN', 'HETNTUN', 'HEKAPUN',
    'PHDIAGCD', 'PHFAMYNCD',
]

# Medical-history columns PHMH1 .. PHMH18 (value columns only, no *CD codes).
medical_history = ['PHMH%d' % idx for idx in range(1, 19)]


In [None]:
# Load the 'Data' sheet of the workbook; skiprows=1 skips the first row of the sheet.
raw_data = pd.read_excel('raw_data.xlsx', sheet_name='Data', skiprows=1)

# Remove every '1.' substring from the column labels. This is a plain substring
# replacement, so an occurrence in the middle of a label is removed as well.
raw_data.columns = [label.replace('1.', '') for label in raw_data.columns]

### Handle missing data
For the medical history columns 1 to 18, we can simply replace all NaN values with 0 (False) and all existing values with 1 (True). This still needs confirmation.

In [None]:
# Encode medical-history presence as integers: 1 where a value was recorded,
# 0 where the cell was NaN.
# NOTE: the columns are overwritten in place, so re-running this cell on the
# already-encoded frame yields all 1s; re-load raw_data before re-running.
raw_data[medical_history] = raw_data[medical_history].notna().astype('int')

In [None]:
# Number of columns in the raw sheet ...
print(raw_data.shape[1])

# ... and the number of percent_missing entries reported for the red-flag columns.
red_flag_summary = custom_summary(raw_data[red_flags])
print(len(red_flag_summary.percent_missing.values))

In [None]:
# Horizontal bar chart with one bar per selected column; bar length is the
# percent_missing value reported by custom_summary.
plt.figure(figsize=(4, 4), dpi=160)

selected = raw_data[red_flags + medical_history]
column_labels = selected.columns.values
# presumably custom_summary returns one entry per column, in column order — verify in utils
missing_pct = custom_summary(selected).percent_missing.values

plt.barh(column_labels, missing_pct)
plt.title("Features", fontsize=12)
plt.xlabel("Percent missing")

In [None]:
# Keep only the red-flag and medical-history columns by dropping every other one.
wanted_columns = red_flags + medical_history
unwanted_columns = [name for name in raw_data.columns if name not in wanted_columns]
rf_data = raw_data.drop(columns=unwanted_columns)

In [None]:
# Show the column labels that remain in the reduced frame.
rf_data.columns

In [None]:
# NOTE(review): the triple-quoted text below is a bare string expression, not a
# comment, so the cell shows it as its output. The drop it describes is
# disabled: the only statement implementing it is commented out, so no columns
# are removed here.
'''
The data set includes several columns with no values at all. 
We simply drop all columns with less than 5 "proper" values since this in a DM process only would cause bias.
'''
#df_drop_nan_threshold = raw_data.dropna(thresh=5, axis=1)

In [None]:
# Descriptive statistics of the reduced frame.
rf_data.describe()

In [None]:
# Summary of the reduced frame from the custom_summary helper.
# NOTE(review): custom_summary is not defined in this notebook; presumably it
# arrives through `from utils import *` — confirm.
custom_summary(rf_data)

### Grouping data based on Patient Id/ SubjectSeq
rf_first groups data by SubjectSeq, but keeps the first occurring value for each column/feature.

rf_last groups data by SubjectSeq, but keeps the last occurring value for each column/feature. The intention here is to have a pandas DataFrame containing one row for each patient, which also includes all available and latest data for each of them.

In [None]:
# One row per SubjectSeq: for every column, take the first entry within the
# patient's group (pandas' GroupBy.first skips missing values by default).
# NOTE(review): rf_first is not referenced again in the visible cells.
rf_first = rf_data.groupby('SubjectSeq').first()

In [None]:
# One row per SubjectSeq: for every column, take the last entry within the
# patient's group (pandas' GroupBy.last skips missing values by default).
rf_last = rf_data.groupby('SubjectSeq').last()
# Alternative approaches, left disabled:
#rf_last = rf_data.groupby('SubjectSeq').fillna(method='ffill')
#rf_last = rf_data.groupby('SubjectSeq').apply(get_last_valid)

For the medical history metrics we see high percentages of missing data (see below). We can also tell that those metrics only include one unique value, which I believe means that the patient either has been diagnosed with the specific disease (e.g. KOLS) or has not been diagnosed (i.e. a NaN value). Therefore, I simply replace NaN values with 'neg'.

In [None]:
# Summary of the per-patient frame, to inspect what is still missing after grouping.
custom_summary(rf_last)

In [None]:
#rf_last[['PHMH2', 'PHMH6', 'PHMH7','PHMH8', 'PHMH9', 'PHMH10']] = rf_last[['PHMH2', 'PHMH6', 'PHMH7','PHMH8', 'PHMH9', 'PHMH10']].fillna(value='neg')

For now, I simply want to remove all patients with too many missing metrics. In this case, I set the value to 10.

## Final data set
One row for each patient (149 in total), with 31 columns for each patient.

In [None]:
# Disabled: keep only patients with more than 10 non-null metrics.
#harsh_drop = rf_last[rf_last.notnull().sum(axis=1) > 10]
# Build the final data set for the red-flag and medical-history columns.
# NOTE(review): get_lapp is not defined in this notebook (presumably it comes
# from `from utils import *`) and it is not passed rf_data/rf_last, so how
# final_df relates to the frames built in the cells above cannot be told from
# here — confirm in utils.
final_df = get_lapp(red_flags+medical_history)

In [None]:
# Plot the share of missing values in the final data set (helper is not
# defined in this notebook; presumably provided by utils).
plot_missing_percentages(final_df)

In [None]:
# Split the final data set into two frames, unpacked as (male, female).
# NOTE(review): the return order is taken on trust from the helper's name —
# verify against get_male_female in utils.
male, female = get_male_female(final_df)

# Diagnostic Strategies for Diagnosis of CA

### Use of Biomarkers
The combination of low voltage at electrocardiography (EKGVOLT) (<=1 mV in all precordial leads or <=0.5 mV in all limb leads) and increased LV wall thickness (ECHLVIDD, ECHLVIDS, ECHLVPW, ECHLVEDV, ECHLVESV) is a hallmark of CA. The normal range for LVIDd is 3.5-5.6 cm, and the normal range for LVIDs is 2.0-4.0 cm. The normal range for LVPWd is 0.6-1.1 cm. Source for normal values: https://freelandsystems.com/echo-parameters-ventricular-dimensions/.

NT-proBNP (HEBNPYN; HEBNPRE is the value, HEBNPUN is the measurement) with value >1800 pg/mL (the same as ng/L), high-sensitivity cardiac troponin T with value >0.025 ng/mL, and an FLC difference of >18 mg/dL between kappa (HEKAPRE value, HEKAPUN measurement) and lambda (HELAMYN; HELAMRE value, HELAMUN measurement) FLC levels are shown to be useful for predicting poor prognosis in AL patients.

NT-proBNP with value >3000 ng/L and high sensitivity cardiac troponin T (HETNTYN, HETNTRE is value, HETNTUN is the measurement) with value >0.0 ng/mL can indicate poor prognosis for ATTRwt.

In the following exploratory data analysis, I compare the provided data set with the common metrics for indicating CA (stated above). One assumption I make is that the genders must be analysed separately, because heart metrics differ considerably between genders. For example, men typically have larger LV wall thickness than women, so it would be wrong to compare the two directly.

In [None]:
# Feature subsets for each diagnostic strategy. Every subset starts with DMAGE
# and ends with PHDIAGCD, the column used for grouping and as plot hue.
poor_prognosis_AL = [
    'DMAGE',
    'HEBNPRE', 'HEBNPUN',
    'HETNTRE', 'HETNTUN',
    'HEKAPRE', 'HEKAPUN',
    'HELAMRE', 'HELAMUN',
    'PHDIAGCD',
]
poor_prognosis_ATTRwt = [
    'DMAGE',
    'HEBNPRE', 'HEBNPUN',
    'HETNTRE', 'HETNTUN',
    'PHDIAGCD',
]
CA_hallmark = [
    'DMAGE',
    'EKGVOLT',
    'ECHLVIDD', 'ECHLVIDS', 'ECHLVPW', 'ECHLVEDV', 'ECHLVESV',
    'PHDIAGCD',
]

## Hallmark CA Patients

In [None]:
# Per-PHDIAGCD descriptive statistics (plain-text print), followed by the
# female hallmark table itself as the cell output.
female_hallmark = female[CA_hallmark]
print(female_hallmark.groupby('PHDIAGCD').describe())
female_hallmark

In [None]:
# Pairwise plots of the hallmark metrics for female patients, coloured by
# PHDIAGCD, with filled KDE curves on the diagonal.
# `height` and `fill` replace the deprecated `size` and `shade` keywords
# (needs seaborn >= 0.11); the deprecation warnings were previously hidden by
# the global warnings filter.
pp = sns.pairplot(
    female[CA_hallmark],
    hue="PHDIAGCD",
    height=1.8,
    aspect=1.8,
    diag_kind="kde",
    plot_kws=dict(edgecolor="k", linewidth=0.5),
    diag_kws=dict(fill=True),
)
fig = pp.fig
# Leave room above the grid for the figure title.
fig.subplots_adjust(top=0.93, wspace=0.3)
t = fig.suptitle('Female Hallmark Pairwise Plots', fontsize=14)

In [None]:
# Per-PHDIAGCD descriptive statistics (plain-text print), followed by the
# male hallmark table itself as the cell output.
male_hallmark = male[CA_hallmark]
print(male_hallmark.groupby('PHDIAGCD').describe())
male_hallmark

In [None]:
# Pairwise plots of the hallmark metrics for male patients, coloured by
# PHDIAGCD, with filled KDE curves on the diagonal.
# `height` and `fill` replace the deprecated `size` and `shade` keywords
# (needs seaborn >= 0.11); the deprecation warnings were previously hidden by
# the global warnings filter.
pp = sns.pairplot(
    male[CA_hallmark],
    hue="PHDIAGCD",
    height=1.8,
    aspect=1.8,
    diag_kind="kde",
    plot_kws=dict(edgecolor="k", linewidth=0.5),
    diag_kws=dict(fill=True),
)
fig = pp.fig
# Leave room above the grid for the figure title.
fig.subplots_adjust(top=0.93, wspace=0.3)
t = fig.suptitle('Male Hallmark Pairwise Plots', fontsize=14)

### Evaluation Hallmark biomarkers
Female plots don't seem to show distinct patterns (except perhaps that the values of other/non-CA patients tend to be more spread out).

However, male plots

## Poor Prognosis AL patients

In [None]:
# Per-PHDIAGCD descriptive statistics (plain-text print), followed by the
# female AL-prognosis table itself as the cell output.
female_al = female[poor_prognosis_AL]
print(female_al.groupby('PHDIAGCD').describe())
female_al

In [None]:
# Pairwise plots of the AL poor-prognosis metrics for female patients, coloured
# by PHDIAGCD, with filled KDE curves on the diagonal.
# `height` and `fill` replace the deprecated `size` and `shade` keywords
# (needs seaborn >= 0.11); the deprecation warnings were previously hidden by
# the global warnings filter.
pp = sns.pairplot(
    female[poor_prognosis_AL],
    hue="PHDIAGCD",
    height=1.8,
    aspect=1.8,
    diag_kind="kde",
    plot_kws=dict(edgecolor="k", linewidth=0.5),
    diag_kws=dict(fill=True),
)
fig = pp.fig
# Leave room above the grid for the figure title.
fig.subplots_adjust(top=0.93, wspace=0.3)
t = fig.suptitle('Female AL Pairwise Plots', fontsize=14)

In [None]:
# Per-PHDIAGCD descriptive statistics (plain-text print), followed by the
# male AL-prognosis table itself as the cell output.
male_al = male[poor_prognosis_AL]
print(male_al.groupby('PHDIAGCD').describe())
male_al

In [None]:
# Pairwise plots of the AL poor-prognosis metrics for male patients, coloured
# by PHDIAGCD, with filled KDE curves on the diagonal.
# `height` and `fill` replace the deprecated `size` and `shade` keywords
# (needs seaborn >= 0.11); the deprecation warnings were previously hidden by
# the global warnings filter.
pp = sns.pairplot(
    male[poor_prognosis_AL],
    hue="PHDIAGCD",
    height=1.8,
    aspect=1.8,
    diag_kind="kde",
    plot_kws=dict(edgecolor="k", linewidth=0.5),
    diag_kws=dict(fill=True),
)
fig = pp.fig
# Leave room above the grid for the figure title.
fig.subplots_adjust(top=0.93, wspace=0.3)
t = fig.suptitle('Male AL Pairwise Plots', fontsize=14)

## Poor Prognosis ATTRwt Patients

In [None]:
# Per-PHDIAGCD descriptive statistics (plain-text print), followed by the
# female ATTRwt-prognosis table itself as the cell output.
female_attrwt = female[poor_prognosis_ATTRwt]
print(female_attrwt.groupby('PHDIAGCD').describe())
female_attrwt

In [None]:
# Pairwise plots of the ATTRwt poor-prognosis metrics for female patients,
# coloured by PHDIAGCD, with filled KDE curves on the diagonal.
# `height` and `fill` replace the deprecated `size` and `shade` keywords
# (needs seaborn >= 0.11); the deprecation warnings were previously hidden by
# the global warnings filter.
pp = sns.pairplot(
    female[poor_prognosis_ATTRwt],
    hue="PHDIAGCD",
    height=1.8,
    aspect=1.8,
    diag_kind="kde",
    plot_kws=dict(edgecolor="k", linewidth=0.5),
    diag_kws=dict(fill=True),
)
fig = pp.fig
# Leave room above the grid for the figure title.
fig.subplots_adjust(top=0.93, wspace=0.3)
t = fig.suptitle('Female ATTRwt Pairwise Plots', fontsize=14)

In [None]:
# Per-PHDIAGCD descriptive statistics (plain-text print), followed by the
# male ATTRwt-prognosis table itself as the cell output.
male_attrwt = male[poor_prognosis_ATTRwt]
print(male_attrwt.groupby('PHDIAGCD').describe())
male_attrwt

In [None]:
# Pairwise plots of the ATTRwt poor-prognosis metrics for male patients,
# coloured by PHDIAGCD, with filled KDE curves on the diagonal.
# `height` and `fill` replace the deprecated `size` and `shade` keywords
# (needs seaborn >= 0.11); the deprecation warnings were previously hidden by
# the global warnings filter.
pp = sns.pairplot(
    male[poor_prognosis_ATTRwt],
    hue="PHDIAGCD",
    height=1.8,
    aspect=1.8,
    diag_kind="kde",
    plot_kws=dict(edgecolor="k", linewidth=0.5),
    diag_kws=dict(fill=True),
)
fig = pp.fig
# Leave room above the grid for the figure title.
fig.subplots_adjust(top=0.93, wspace=0.3)
t = fig.suptitle('Male ATTRwt Pairwise Plots', fontsize=14)