In [102]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [91]:
rawData = pd.read_csv('LAB orders Details for potential COVID19 Mar 01-24 2020.csv')

In [92]:
rawData.columns

Index(['COVID19_PATIENT', 'DX_NAME', 'ISOLATION', 'PATIENT_LIST', 'PAT_CLASS',
       'COMPONENT_BASE_NAME', 'PAT_MRN_ID', 'PAT_ID', 'HSP_ACCOUNT_ID',
       'PAT_ENC_CSN_ID', 'ORDER_ID', 'DESCRIPTION', 'COMPONENT_ID',
       'ORDER_INST', 'RESULT_TIME', 'RESULT_LINE', 'COMPONENT_NAME',
       'ORD_VALUE', 'REFERENCE_LOW', 'REFERENCE_HIGH', 'RESULT_FLAG',
       'ABNORMAL_YN', 'PAT_AGE_AT_ENC', 'ENC_TYPE', 'CONTACT_DATE',
       'APPT_TIME', 'HOSP_ADMSN_TIME', 'HOSP_DISCHRG_TIME', 'ADMIT_TYPE',
       'ORDER_TYPE_C', 'ORDER_STATUS_C', 'RESULTS_CMT'],
      dtype='object')

In [93]:
rawData = rawData[['PAT_MRN_ID', 'ADMIT_TYPE', 'COMPONENT_NAME', 'ORD_VALUE', 'PAT_AGE_AT_ENC', 'RESULT_TIME']]
rawData.replace('C-REACTIVE PROTEIN SCREEN', value='crp', inplace=True)
rawData.replace('FERRITIN', value='ferritin', inplace=True)
rawData.replace('D-DIMER (SOFT)', value='dDimer', inplace=True)
rawData.sort_values(by=['PAT_MRN_ID', 'COMPONENT_NAME', 'RESULT_TIME'], ascending=True, axis='index', inplace=True)
print(len(rawData))
rawData.drop_duplicates(subset=['PAT_MRN_ID','COMPONENT_NAME' ],keep='first', inplace=True)
print(len(rawData))

33790
15351


In [94]:
rawData = rawData.loc[rawData.COMPONENT_NAME.isin(['crp', 'ferritin', 'dDimer'])]


In [95]:
wide = rawData.pivot(index='PAT_MRN_ID', columns='COMPONENT_NAME', values='ORD_VALUE')

In [145]:
ages = rawData.groupby('PAT_MRN_ID').first().rename(columns={'PAT_AGE_AT_ENC':'age'})
cleanedData = pd.concat([wide,ages],1)
cleanedData.crp.replace('<0.2', 0.1, inplace=True)
cleanedData.dDimer.replace('<0.17', 0.1, inplace=True)
cleanedData.dDimer.replace('>35.00', 40, inplace=True)
cleanedData.ferritin.replace('>16500.0', 1650, inplace=True)
cleanedData.ferritin.replace('>1650.0', 1650, inplace=True)

cleanedData.crp = cleanedData.crp.astype('float')
cleanedData.dDimer = cleanedData.dDimer.astype('float')
cleanedData.ferritin = cleanedData.ferritin.astype('float')

In [146]:
cleanedData

Unnamed: 0_level_0,crp,dDimer,ferritin,ADMIT_TYPE,COMPONENT_NAME,ORD_VALUE,age,RESULT_TIME
PAT_MRN_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
250077,0.1,3.93,148.2,Emergent,crp,<0.2,92,2020-03-20 05:54
331259,1.9,,,Emergent,crp,1.9,83,2020-03-08 22:15
409157,3.3,,,Emergent,crp,3.3,71,2020-03-03 09:13
437190,9.6,,,Emergent,crp,9.6,66,2020-03-11 16:10
4426988,,0.91,,Emergent,dDimer,0.91,80,2020-03-03 06:16
...,...,...,...,...,...,...,...,...
101236288,3.6,,,Emergent,crp,3.6,14,2020-03-23 23:53
101236739,8.4,0.87,1222.7,Emergent,crp,8.4,37,2020-03-24 12:08
101236809,,0.62,86.4,Emergent,dDimer,0.62,78,2020-03-24 13:44
101236818,47.0,5.14,3285.7,Urgent,crp,47.0,33,2020-03-24 18:38


In [160]:
import scipy.special as scipySpecial
simData = cleanedData

# need to do some imputation here...will start with something simple...
simData['crp'].fillna(simData.crp.mean(), inplace=True)
simData['dDimer'].fillna(simData.dDimer.mean(), inplace=True)
simData['ferritin'].fillna(simData.dDimer.mean(), inplace=True)


simData['linPredictor'] = (cleanedData.crp-cleanedData.crp.mean())*1.2+(cleanedData.dDimer-cleanedData.dDimer.mean())*2+(cleanedData.age-cleanedData.age.mean())/10*4 + np.random.normal(10)
simData['linPredictorCentered'] = simData.linPredictor - simData.linPredictor.mean()

simData['probDead'] = scipySpecial.expit(simData.linPredictorCentered)
simData['dead'] = simData.probDead > 0.9
simData = simData[['crp', 'dDimer', 'ferritin', 'age', 'dead']]

In [161]:
simData.dead.value_counts()

False    590
True     444
Name: dead, dtype: int64

In [162]:
simData

Unnamed: 0_level_0,crp,dDimer,ferritin,age,dead
PAT_MRN_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
250077,0.100000,3.930000,148.2,92,True
331259,1.900000,2.941451,,83,True
409157,3.300000,2.941451,,71,True
437190,9.600000,2.941451,,66,True
4426988,5.178652,0.910000,,80,True
...,...,...,...,...,...
101236288,3.600000,2.941451,,14,False
101236739,8.400000,0.870000,1222.7,37,False
101236809,5.178652,0.620000,86.4,78,True
101236818,47.000000,5.140000,3285.7,33,True


In [170]:
import pymc3 as pm

dead = simData['dead']
crp = simData.crp - simData.crp.mean()
ferritin = simData.ferritin - simData.ferritin.mean()
dDimer = simData.dDimer - simData.dDimer.mean()
age = simData.age - simData.age.mean()


with pm.Model() as model_simple:
    alpha = pm.Normal('alpha', mu=0, sd=10)
    betaCRP = pm.Normal('betaCRP', mu=0, sd=10)
    betaFerritin = pm.Normal('betaFerritin', mu=0, sd=10)
    betaDDimer = pm.Normal('betaDDimer', mu=0, sd=10)
    betaAge = pm.Normal('betaAge', mu=0, sd=10)

    
    mu = alpha + betaCRP * crp + betaDDimer * dDimer + betaAge * age + betaFerritin*ferritin   
    θ = pm.Deterministic('θ', pm.math.sigmoid(mu))
    
    y_1 = pm.Bernoulli('y_1', p=θ, observed=dead)

    trace_simple = pm.sample(1000, tune=1000)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [betaAge, betaDDimer, betaFerritin, betaCRP, alpha]
  out=out, **kwargs)
INFO (theano.gof.compilelock): Waiting for existing lock by unknown process (I am process '81172')
INFO (theano.gof.compilelock): Waiting for existing lock by unknown process (I am process '81170')
INFO (theano.gof.compilelock): To manually release the lock, delete /Users/burke/.theano/compiledir_Darwin-19.4.0-x86_64-i386-64bit-i386-3.7.4-64/lock_dir
INFO (theano.gof.compilelock): To manually release the lock, delete /Users/burke/.theano/compiledir_Darwin-19.4.0-x86_64-i386-64bit-i386-3.7.4-64/lock_dir
  out=out, **kwargs)
Sampling 4 chains, 0 divergences:   0%|          | 0/8000 [00:14<?, ?draws/s]
Bad initial energy, check any log probabilities that are inf or -inf, nan or very small:
y_1   -inf


ParallelSamplingError: Bad initial energy

INFO (theano.gof.compilelock): Waiting for existing lock by unknown process (I am process '81172')
INFO (theano.gof.compilelock): To manually release the lock, delete /Users/burke/.theano/compiledir_Darwin-19.4.0-x86_64-i386-64bit-i386-3.7.4-64/lock_dir
  out=out, **kwargs)
  out=out, **kwargs)


In [164]:
pd.Series(trace_simple['betaCRP']).describe()

count    4000.000000
mean        9.858941
std         2.302312
min         5.177180
25%         8.263198
50%         9.715335
75%        11.236811
max        20.028671
dtype: float64

In [165]:
pd.Series(trace_simple['betaAge']).describe()

count    4000.000000
mean        3.290176
std         0.768743
min         1.743079
25%         2.759595
50%         3.242399
75%         3.743664
max         6.736967
dtype: float64

In [166]:
pd.Series(trace_simple['betaDDimer']).describe()

count    4000.000000
mean       16.988859
std         3.915815
min         8.875392
25%        14.249740
50%        16.677472
75%        19.358226
max        35.191479
dtype: float64