In [102]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [91]:
rawData = pd.read_csv('LAB orders Details for potential COVID19 Mar 01-24 2020.csv')

In [92]:
rawData.columns

Index(['COVID19_PATIENT', 'DX_NAME', 'ISOLATION', 'PATIENT_LIST', 'PAT_CLASS',
       'COMPONENT_BASE_NAME', 'PAT_MRN_ID', 'PAT_ID', 'HSP_ACCOUNT_ID',
       'PAT_ENC_CSN_ID', 'ORDER_ID', 'DESCRIPTION', 'COMPONENT_ID',
       'ORDER_INST', 'RESULT_TIME', 'RESULT_LINE', 'COMPONENT_NAME',
       'ORD_VALUE', 'REFERENCE_LOW', 'REFERENCE_HIGH', 'RESULT_FLAG',
       'ABNORMAL_YN', 'PAT_AGE_AT_ENC', 'ENC_TYPE', 'CONTACT_DATE',
       'APPT_TIME', 'HOSP_ADMSN_TIME', 'HOSP_DISCHRG_TIME', 'ADMIT_TYPE',
       'ORDER_TYPE_C', 'ORDER_STATUS_C', 'RESULTS_CMT'],
      dtype='object')

In [93]:
rawData = rawData[['PAT_MRN_ID', 'ADMIT_TYPE', 'COMPONENT_NAME', 'ORD_VALUE', 'PAT_AGE_AT_ENC', 'RESULT_TIME']]
rawData.replace('C-REACTIVE PROTEIN SCREEN', value='crp', inplace=True)
rawData.replace('FERRITIN', value='ferritin', inplace=True)
rawData.replace('D-DIMER (SOFT)', value='dDimer', inplace=True)
rawData.sort_values(by=['PAT_MRN_ID', 'COMPONENT_NAME', 'RESULT_TIME'], ascending=True, axis='index', inplace=True)
print(len(rawData))
rawData.drop_duplicates(subset=['PAT_MRN_ID','COMPONENT_NAME' ],keep='first', inplace=True)
print(len(rawData))

33790
15351


In [94]:
rawData = rawData.loc[rawData.COMPONENT_NAME.isin(['crp', 'ferritin', 'dDimer'])]


In [95]:
wide = rawData.pivot(index='PAT_MRN_ID', columns='COMPONENT_NAME', values='ORD_VALUE')

In [145]:
ages = rawData.groupby('PAT_MRN_ID').first().rename(columns={'PAT_AGE_AT_ENC':'age'})
cleanedData = pd.concat([wide,ages],1)
cleanedData.crp.replace('<0.2', 0.1, inplace=True)
cleanedData.dDimer.replace('<0.17', 0.1, inplace=True)
cleanedData.dDimer.replace('>35.00', 40, inplace=True)
cleanedData.ferritin.replace('>16500.0', 1650, inplace=True)
cleanedData.ferritin.replace('>1650.0', 1650, inplace=True)

cleanedData.crp = cleanedData.crp.astype('float')
cleanedData.dDimer = cleanedData.dDimer.astype('float')
cleanedData.ferritin = cleanedData.ferritin.astype('float')

In [146]:
cleanedData

Unnamed: 0_level_0,crp,dDimer,ferritin,ADMIT_TYPE,COMPONENT_NAME,ORD_VALUE,age,RESULT_TIME
PAT_MRN_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
250077,0.1,3.93,148.2,Emergent,crp,<0.2,92,2020-03-20 05:54
331259,1.9,,,Emergent,crp,1.9,83,2020-03-08 22:15
409157,3.3,,,Emergent,crp,3.3,71,2020-03-03 09:13
437190,9.6,,,Emergent,crp,9.6,66,2020-03-11 16:10
4426988,,0.91,,Emergent,dDimer,0.91,80,2020-03-03 06:16
...,...,...,...,...,...,...,...,...
101236288,3.6,,,Emergent,crp,3.6,14,2020-03-23 23:53
101236739,8.4,0.87,1222.7,Emergent,crp,8.4,37,2020-03-24 12:08
101236809,,0.62,86.4,Emergent,dDimer,0.62,78,2020-03-24 13:44
101236818,47.0,5.14,3285.7,Urgent,crp,47.0,33,2020-03-24 18:38


In [171]:
import scipy.special as scipySpecial
simData = cleanedData

# need to do some imputation here...will start with something simple...
simData['crp'].fillna(simData.crp.mean(), inplace=True)
simData['dDimer'].fillna(simData.dDimer.mean(), inplace=True)
simData['ferritin'].fillna(simData.ferritin.mean(), inplace=True)


simData['linPredictor'] = (cleanedData.crp-cleanedData.crp.mean())*1.2+(cleanedData.dDimer-cleanedData.dDimer.mean())*2+(cleanedData.age-cleanedData.age.mean())/10*4 + np.random.normal(10)
simData['linPredictorCentered'] = simData.linPredictor - simData.linPredictor.mean()

simData['probDead'] = scipySpecial.expit(simData.linPredictorCentered)
simData['dead'] = simData.probDead > 0.9
simData = simData[['crp', 'dDimer', 'ferritin', 'age', 'dead']]

In [172]:
simData.dead.value_counts()

False    590
True     444
Name: dead, dtype: int64

In [173]:
simData

Unnamed: 0_level_0,crp,dDimer,ferritin,age,dead
PAT_MRN_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
250077,0.100000,3.930000,148.200000,92,True
331259,1.900000,2.941451,937.886154,83,True
409157,3.300000,2.941451,937.886154,71,True
437190,9.600000,2.941451,937.886154,66,True
4426988,5.178652,0.910000,937.886154,80,True
...,...,...,...,...,...
101236288,3.600000,2.941451,937.886154,14,False
101236739,8.400000,0.870000,1222.700000,37,False
101236809,5.178652,0.620000,86.400000,78,True
101236818,47.000000,5.140000,3285.700000,33,True


In [174]:
import pymc3 as pm

dead = simData['dead']
crp = simData.crp - simData.crp.mean()
ferritin = simData.ferritin - simData.ferritin.mean()
dDimer = simData.dDimer - simData.dDimer.mean()
age = simData.age - simData.age.mean()


with pm.Model() as model_simple:
    alpha = pm.Normal('alpha', mu=0, sd=10)
    betaCRP = pm.Normal('betaCRP', mu=0, sd=10)
    betaFerritin = pm.Normal('betaFerritin', mu=0, sd=10)
    betaDDimer = pm.Normal('betaDDimer', mu=0, sd=10)
    betaAge = pm.Normal('betaAge', mu=0, sd=10)

    
    mu = alpha + betaCRP * crp + betaDDimer * dDimer + betaAge * age + betaFerritin*ferritin   
    θ = pm.Deterministic('θ', pm.math.sigmoid(mu))
    
    y_1 = pm.Bernoulli('y_1', p=θ, observed=dead)

    trace_simple = pm.sample(1000, tune=1000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [betaAge, betaDDimer, betaFerritin, betaCRP, alpha]
Sampling 4 chains, 9 divergences: 100%|██████████| 8000/8000 [00:18<00:00, 429.60draws/s]
There were 6 divergences after tuning. Increase `target_accept` or reparameterize.
There were 2 divergences after tuning. Increase `target_accept` or reparameterize.
There was 1 divergence after tuning. Increase `target_accept` or reparameterize.
The number of effective samples is smaller than 25% for some parameters.


In [175]:
pd.Series(trace_simple['betaCRP']).describe()

count    4000.000000
mean       10.431563
std         2.211953
min         4.934773
25%         8.768431
50%        10.245079
75%        11.862011
max        19.733901
dtype: float64

In [176]:
pd.Series(trace_simple['betaAge']).describe()

count    4000.000000
mean        3.483784
std         0.738420
min         1.671489
25%         2.931102
50%         3.417458
75%         3.955217
max         6.543090
dtype: float64

In [177]:
pd.Series(trace_simple['betaDDimer']).describe()

count    4000.000000
mean       17.956865
std         3.753537
min         8.797515
25%        15.168058
50%        17.586538
75%        20.380438
max        34.265024
dtype: float64

In [178]:
pd.Series(trace_simple['betaFerritin']).describe()

count    4000.000000
mean        0.000098
std         0.000583
min        -0.001266
25%        -0.000379
50%         0.000032
75%         0.000489
max         0.002235
dtype: float64