In [2]:
import snowflake.connector
from snowflake.snowpark.functions import count, when, col, corr, array_cat, sum as sum_
import os
from snowflake.snowpark import Session
import pandas as pd

# Snowflake connection settings. Every value can be overridden with an
# environment variable so the notebook is not tied to one user or account;
# the fallbacks are the values that were previously hard-coded here.
# Authentication goes through the external-browser (SSO) flow, so no password
# is stored in the notebook.
cnxn_params = {
    "user": os.environ.get("SNOWFLAKE_USER", 'hartsingh'),
    "authenticator": os.environ.get("SNOWFLAKE_AUTHENTICATOR", 'externalbrowser'),
    "account": os.environ.get("SNOWFLAKE_ACCOUNT", 'vaa16628'),
    "region": os.environ.get("SNOWFLAKE_REGION", 'us-east-1'),
    "warehouse": os.environ.get("SNOWFLAKE_WAREHOUSE", "ACORN_CDS_PROD_COMMERCIALANALYTICS"),
    "database": os.environ.get("SNOWFLAKE_DATABASE", "ACORN_CDS_PROD_BIOXCEL_ADHOC"),
    "role": os.environ.get("SNOWFLAKE_ROLE", 'ACORN_CDS_PRD_CA_APP'),
}

# Opens the Snowpark session used by every query below (triggers the SSO login).
session = Session.builder.configs(cnxn_params).create()

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://mdsol.okta.com/app/snowflake/exkugim5j5lsfZiBH0x7/sso/saml?SAMLRequest=lZJfb9owFMW%2FSuQ9J04CpMwCKlrUkYptqNBJ65ubOMHDf4KvQ2Cffk5SqvahlSb5wbLPuf75nju5PknhHZkBrtUURUGIPKYynXNVTtHj9s4fIw8sVTkVWrEpOjNA17MJUCkqMq%2FtTj2wQ83Aeq6QAtJdTFFtFNEUOBBFJQNiM7KZf1%2BROAhJZbTVmRbojeVzBwVgxjrCiyUH7vB21lYE46ZpgmYQaFPiOAxDHH7FTtVKvlz0J%2FenD%2FQRDoet3imcfP3CdsNV34LPsJ57EZDldrv21z83W%2BTNL6i3WkEtmdkwc%2BQZe3xY9QDgCI6URkkSj4MafEbB%2BlEASjeFoHuWaVnV1pUN3A4XLMdCl9z9PF1MUbXneZwM7%2Be7076459%2BK9KpZHrheJTT%2FW5VDKcej9HksfpuDrc%2FjDHm%2FLtHGbbQpQM1S1QZq3VEYD%2Fxw6NY2jsgoIdEgGA2jJ%2BQtXKBcUds5L9QyBy0Cvbe0Q6NVhV%2BpMTvt65LL0Z%2BRgOKJ3yzD0xUG0LhNF%2FUDQ7rnzey%2F2zDBb%2B0vw%2FfD5ZEu1lrw7OzdaSOp%2FTiuKIi6E577RSclTFIu5nluGICLTQjd3BpGrZtxa2qG8Kx%2F9f2Uz%2F4B&RelaySt

In [3]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Objective

In [None]:
# accurately estimating mmse can be helpful to fill in scores for patients we do not have a score for 
# this score helps us estimate the severity of their condition
# lasso can be good here since we also want to see the directionality of features

# Pull raw data from snowflake

In [4]:
# Build one long-format frame of cognitive scores plus claims for scored patients.
#
# Pipeline of CTEs in the query below:
#   scores            - distinct (patient_key, result_date, value) rows from
#                       dbo.TBLDRG_EHR_RESULTS for three LOINC codes, keeping only
#                       numeric values <= 30.
#   top10_ndc/cpt/dx  - the 10 most frequent drug NDCs, procedure arrays and
#                       single-element diagnosis arrays among those patients.
#   scores_dx/cpt/rx  - claims of those patients restricted to the top-10 codes.
#   scores_dob/gender - date of birth and gender per patient.
#   *_join            - successive FULL OUTER joins on (patient_key, date), so a row
#                       may carry a score, a claim code, or both.
#   final CTE         - adds age (year difference between dob and the row date) and
#                       gender_female (1 for 'F', 0 for 'M').
#
# NOTE(review): the ELSE branch of the gender CASE returns the raw patient_gender
# next to numeric 1/0 - confirm that is intended for values other than 'F'/'M'.
# NOTE(review): the LEFT joins to scores_dob / scores_gender would duplicate rows
# if a patient has more than one dob or gender value - confirm uniqueness.
#
# A plain triple-quoted string is used (not an f-string): nothing is interpolated,
# and an f-string would break on any literal brace added to the SQL later.
df_sql = session.sql("""
                     with scores as(
                            select distinct patient_key, result_date, value
                            from dbo.TBLDRG_EHR_RESULTS 
                            where LOINC in ('72107-6', '72133-2', '72172-0') and try_to_number(value) is not NULL and try_to_number(value) <= 30
                            order by patient_key, result_date
                         ),
                          top10_ndc as(
                            select top 10 drug_ndc, count(*) as freq
                            from processing_cns.stg_rx
                            where patient_key in (select distinct patient_key from scores)
                            group by drug_ndc
                            order by 2 desc
                         ),
                          top10_cpt as(
                            select top 10 procedure_arr, count(*) as freq
                            from processing_cns.stg_mx
                            where patient_key in (select distinct patient_key from scores) and PROCEDURE_ARR != []
                            group by procedure_arr
                            order by 2 desc
                         ),
                          top10_dx as(
                            select top 10 DIAGNOSIS_CODE_ARR, count(*) as freq
                            from processing_cns.stg_dx
                            where patient_key in (select distinct patient_key from scores) and array_size(DIAGNOSIS_CODE_ARR) = 1
                            group by DIAGNOSIS_CODE_ARR
                            order by 2 desc
                         ),
                          scores_dx as(
                            select patient_key,
                                   array_to_string(DIAGNOSIS_CODE_ARR,'') as icd_code,
                                   array_to_string(YEAR_OF_SERVICE_ARR,'') as date_dx
                            from processing_cns.stg_dx
                            where DIAGNOSIS_CODE_ARR in (select DIAGNOSIS_CODE_ARR from top10_dx) and patient_key in (select distinct patient_key from scores)
                            order by patient_key, date_dx
                         ),
                          scores_cpt as(
                            select patient_key,
                                   array_to_string(PROCEDURE_ARR,'') as cpt_code,
                                   array_to_string(PX_YEAR_OF_SERVICE_ARR,'') as date_cpt
                            from processing_cns.stg_mx
                            where PROCEDURE_ARR in (select PROCEDURE_ARR from top10_cpt) and patient_key in (select distinct patient_key from scores)
                            order by patient_key, date_cpt
                         ),
                          scores_rx as(
                            select patient_key,drug_ndc,date_of_service as date_rx
                            from processing_cns.stg_rx
                            where drug_ndc in (select drug_ndc from top10_ndc) and patient_key in (select distinct patient_key from scores)
                            order by patient_key, date_rx
                         ),
                          scores_dob as(
                            select *
                            from(
                              select distinct patient_key, patient_dob
                              from processing_cns.stg_mx 
                              where patient_key in (select distinct patient_key from scores) and patient_dob is not null and patient_dob < '2023-01-01'
                            ) a
                            union
                            select *
                            from(
                              select distinct patient_key, patient_dob
                              from processing_cns.stg_rx
                              where patient_key in (select distinct patient_key from scores) and patient_dob is not null and patient_dob < '2023-01-01'
                            ) b
                         ),
                          scores_gender as(
                            select distinct patient_key, patient_gender
                            from processing_cns.stg_mx 
                            where patient_key in (select distinct patient_key from scores) and patient_gender is not NULL
                         ),
                          scores_dx_join as(
                            select distinct
                                   coalesce(a.patient_key, b.patient_key) as patient_key, 
                                   coalesce(a.result_date, b.date_dx) as date,
                                   value, icd_code
                            from scores a
                            full outer join scores_dx b on a.patient_key = b.patient_key and a.result_date = b.date_dx
                            order by 1,2
                         ),
                          scores_dx_cpt_join as(
                            select distinct
                                   coalesce(a.patient_key, b.patient_key) as patient_key, 
                                   coalesce(a.date, b.date_cpt) as date,
                                   a.value, a.icd_code, b.cpt_code
                            from scores_dx_join a
                            full outer join scores_cpt b on a.patient_key = b.patient_key and a.date = b.date_cpt
                            order by 1,2
                         ),
                          scores_dx_cpt_rx_join as(
                            select distinct
                                   coalesce(a.patient_key, b.patient_key) as patient_key, 
                                   coalesce(a.date, b.date_rx) as date,
                                   a.value, a.icd_code, a.cpt_code, b.drug_ndc
                            from scores_dx_cpt_join a
                            full outer join scores_rx b on a.patient_key = b.patient_key and a.date = b.date_rx
                            order by 1,2
                         ),
                          scores_dx_cpt_rx_demo_join as(
                            select distinct
                                   a.*, datediff(year, b.patient_dob, a.date) as age,
                                   case when c.patient_gender = 'F' then 1 
                                        when c.patient_gender = 'M' then 0 else c.patient_gender end as gender_female
                            from scores_dx_cpt_rx_join a
                            left join scores_dob b on a.patient_key = b.patient_key
                            left join scores_gender c on a.patient_key = c.patient_key
                            order by a.patient_key, a.date
                         )
                     
                     select * from scores_dx_cpt_rx_demo_join;
                     """)
# Materialise the Snowpark result as a local pandas DataFrame and display it.
df = df_sql.to_pandas()
df

Unnamed: 0,PATIENT_KEY,DATE,VALUE,ICD_CODE,CPT_CODE,DRUG_NDC,AGE,GENDER_FEMALE
0,003f98c2-c495-581c-867d-1dd364120d29,2020-02-11,27,,,,,
1,004ce195-f105-5b1c-96f2-4860c44de5f8,2018-12-12,25,,,,,
2,00c98c5a-4705-578a-9bd9-069365a87d16,2018-09-17,23,,,,54.0,1.0
3,00c98c5a-4705-578a-9bd9-069365a87d16,2019-09-25,,F0151,,,55.0,1.0
4,00c98c5a-4705-578a-9bd9-069365a87d16,2019-10-11,,F0151,,,55.0,1.0
...,...,...,...,...,...,...,...,...
5686,ff792ca2-fb00-5dd1-9527-7372c189f450,2020-01-27,26,,,,,
5687,ffc343eb-a942-508b-84f8-537d4de1aae4,2019-01-16,23,,,,,
5688,ffc343eb-a942-508b-84f8-537d4de1aae4,2020-01-22,29,,,,,
5689,ffcc2e7b-e540-506d-84d4-e9ce769d1e50,2020-10-27,20,,,,,


In [33]:
# Number of distinct patients in the pulled data.
# Uses pandas directly: numpy is never imported in this notebook, so the previous
# np.unique(...) call raised NameError on a fresh kernel.
df['PATIENT_KEY'].nunique()

1401

In [31]:
# Rows without a score value (vectorised null count, returned as a plain int).
int(df['VALUE'].isna().sum())

3944

In [26]:
# Rows without a diagnosis code (vectorised null count, returned as a plain int).
int(df['ICD_CODE'].isna().sum())

2188

In [27]:
# Rows without a procedure code (vectorised null count, returned as a plain int).
int(df['CPT_CODE'].isna().sum())

4310

In [28]:
# Rows without a drug NDC (vectorised null count, returned as a plain int).
int(df['DRUG_NDC'].isna().sum())

5358

# Comments on data

In [None]:
# 1401 unique patients who have mmse or moca scores
# top 10 of each category codes were taken 
# a lot of patients did not have rx, mx, or cpt data
# claims could occur after score was delivered 

# Preprocessing

In [None]:
# we will back fill with the same score until it is changed 
# removed records where rx, dx, and cpt are unknown, or where age or gender is unknown
# one-hot encode claims and gender features

In [35]:
# Clean the raw frame with the project-local PreprocessingTimeSeries helper and
# display the result.
# NOTE(review): the table saved under this cell still shows the raw
# ICD_CODE / CPT_CODE / DRUG_NDC columns, while the later describe() output
# shows one-hot columns (cpt_*, ndc_*) and GENDER_FEMALE - the saved output
# looks stale; re-run the notebook top to bottom to confirm what clean() returns.
import preprocessing_time_series
preprocess = preprocessing_time_series.PreprocessingTimeSeries()
df_pp = preprocess.clean(df)
df_pp

Unnamed: 0,PATIENT_KEY,DATE,VALUE,ICD_CODE,CPT_CODE,DRUG_NDC,AGE
5632,fd56aea0-147f-55f9-bf73-b11e4119e59f,2019-04-10,12,F0151,99308,,79.0
5633,fd56aea0-147f-55f9-bf73-b11e4119e59f,2019-04-10,12,F0151,99213,,79.0
5497,f936ef7a-7c86-5633-8df4-ccf51e5b9d7d,2019-03-20,19,F0390,,,74.0
5439,f789ae55-7d44-5f33-b1a5-5e392dc84222,2019-02-26,12,,99213,,81.0
5343,f2b09eaf-19b3-554b-9cbe-396319ba597d,2020-09-01,1,F0280,,,80.0
...,...,...,...,...,...,...,...
567,1ca8da78-6164-5712-9a06-207017dc33e4,2019-12-16,10,F0280,,,81.0
411,1897bdbc-7bf6-51b1-a861-abd585028f6f,2018-10-15,13,F0280,99213,,80.0
342,14d84fd4-4ef8-51e0-be53-6bb1da4ac1c2,2019-04-08,6,,99214,,73.0
324,1417f3c9-6e6e-5de2-ae2c-eace375fb1ce,2019-02-07,15,F0280,,,75.0


# Some EDA

In [33]:
import plotly.express as px

# Distribution of the target score after preprocessing. The figure carries its
# own title and axis label so it can be read without the surrounding cells.
fig = px.histogram(
    df_pp,
    x="VALUE",
    title="Distribution of cognitive test scores (VALUE) after preprocessing",
    labels={"VALUE": "Cognitive test score"},
)
fig.show()

In [34]:
# Summary statistics of the numeric columns of the preprocessed frame.
# NOTE(review): in the saved output AGE has a 25th percentile and median of 0
# (mean ~24), which suggests missing ages are encoded as 0 - confirm in
# clean() before trusting AGE as a feature.
df_pp.describe()

Unnamed: 0,VALUE,AGE,GENDER_FEMALE,cpt_99213,cpt_99214,cpt_99232,cpt_99233,cpt_99285,cpt_99308,cpt_99309,...,ndc_13668010310,ndc_29300017205,ndc_29300017216,ndc_33342029709,ndc_33342029815,ndc_43547027503,ndc_43547027509,ndc_43547027603,ndc_43547027609,ndc_43547027611
count,1401.0,1401.0,1401.0,1401.0,1401.0,1401.0,1401.0,1401.0,1401.0,1401.0,...,1401.0,1401.0,1401.0,1401.0,1401.0,1401.0,1401.0,1401.0,1401.0,1401.0
mean,21.231263,24.326909,0.169165,0.095646,0.123483,0.025696,0.015703,0.038544,0.013562,0.017844,...,0.007138,0.009279,0.009279,0.006424,0.006424,0.009279,0.01142,0.007852,0.013562,0.017131
std,6.355148,37.104719,0.375031,0.29421,0.329109,0.158283,0.124368,0.192574,0.115704,0.132433,...,0.084213,0.095914,0.095914,0.07992,0.07992,0.095914,0.106292,0.088292,0.115704,0.129804
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,26.0,73.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,30.0,90.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Modeling - Lasso

In [56]:
import modeling

# Predictors are every column except the target and the patient identifier.
models = modeling.Modeling()
cols = df_pp.columns.drop(['VALUE', 'PATIENT_KEY']).tolist()

# Fit the lasso regression on those predictors with VALUE as the target.
model_lasso = models.lasso_regression(df_pp, cols, 'VALUE')

In [57]:
# Train/test metrics (r2, rmse, mape) returned by the lasso fit.
model_lasso['performance']

{'train': {'r2': 0.23096088610180243,
  'rmse': 5.609375847958335,
  'mape': 0.3517713063810486},
 'test': {'r2': 0.22601329795825364,
  'rmse': 5.413804389845,
  'mape': 0.3378396317407274}}

In [58]:
# Value stored under 'alpha' in the lasso result
# (presumably the selected regularisation strength - confirm in modeling.py).
model_lasso['alpha']

0.1113768861607143

In [59]:
# Fitted coefficient per feature.
# NOTE(review): in the saved output every visible coefficient other than AGE is
# shrunk to (negative) zero, so the model is effectively intercept + AGE.
model_lasso['coefficients']

Unnamed: 0,features,coefficients
0,Intercept,23.069418
1,AGE,-0.078369
2,GENDER_FEMALE,-0.0
3,cpt_99213,-0.0
4,cpt_99214,-0.0
5,cpt_99232,-0.0
6,cpt_99233,-0.0
7,cpt_99285,-0.0
8,cpt_99308,-0.0
9,cpt_99309,-0.0


# Random Forest

In [40]:
# Fit a random forest on the same predictors and target as the lasso model.
# NOTE(review): relies on `models` and `cols` created in the lasso cell; the saved
# execution counts (In [40] here vs In [56] there) show the cells were run out of
# order, so verify this still works with Restart Kernel -> Run All.
model_rf = models.random_forest_regression(df_pp, cols, 'VALUE')

In [41]:
# Metrics returned by the random-forest fit. The saved output is a single
# r2/rmse pair; whether it refers to train or test data is not shown here.
model_rf['performance']

{'r2': 0.19342389610778588, 'rmse': 5.683679385144531}

In [42]:
# Per-feature importance (with its std) returned by the random-forest helper.
model_rf['importance']

Unnamed: 0,feature,importance,std
0,AGE,0.563835,0.034167
1,GENDER_FEMALE,0.037377,0.012118
14,icd_F0280,0.032065,0.011596
20,icd_G309,0.030776,0.013989
2,cpt_99213,0.030048,0.010754
16,icd_F0390,0.029545,0.010691
3,cpt_99214,0.029108,0.010157
12,icd_F0150,0.023938,0.008826
17,icd_F0391,0.022711,0.008541
6,cpt_99285,0.021288,0.008225


# Next steps

In [None]:
# include age and gender as feature
# split dependent variable into classes - low, medium, high severity
# troubleshoot lasso - why are all coefs negative
# make features binary
# build a model based on rx, dx, cpt, mmse, demo data to predict a patient's likelihood of being prescribed an Alzheimer's treatment
# pca all features
# if we have strong prior information and we agree our data is insufficient we can build a bayesian model 
# setup data as time series and fit a NN on it