# Loading Medicare and Medicaid Claims data into i2b2

The focus is currently on carrier claims.

(Demographics were done in Oracle PL/SQL.)

So far, we can get data in chunks, map patients and encounters, pivot diagnoses, and insert the result into an `observation_fact` table (which is missing some constraints).


In [None]:
def _fix_password():
    """Prompt for a password and store it in the process environment.

    The environment variable is named after the current user (upper-cased)
    with the suffix `_SGROUSE`.
    """
    import getpass
    import os

    # Prompt first, then work out the variable name.
    secret = getpass.getpass()
    env_key = getpass.getuser().upper() + '_SGROUSE'
    os.environ[env_key] = secret
# Prompt for the password now and store it in the environment.
_fix_password()

## Python Data Science Tools

especially [pandas](http://pandas.pydata.org/pandas-docs/)

In [None]:
import pandas as pd
import numpy as np
import sqlalchemy as sqla
import cx_Oracle as cx
# Show which library versions this notebook is running against.
dict(pandas=pd.__version__, numpy=np.__version__, sqlalchemy=sqla.__version__, cx_Oracle=cx.__version__)

## Data Access via Luigi Config

[luigi docs](https://luigi.readthedocs.io/en/stable/)

In [None]:
import luigi

In [None]:
def the_config_file(name='luigi-sgrouse.cfg'):
    """Return the luigi config file location as a `pathlib.Path`.

    :param name: file name (or path) of the config file;
                 defaults to 'luigi-sgrouse.cfg'.
    :return: `pathlib.Path(name)`; the file is not opened or checked.
    """
    import pathlib
    # Honor the argument rather than a hard-coded file name.
    return pathlib.Path(name)

def explicit_config(configRd):
    """Make luigi's config parser read from `configRd` only.

    :param configRd: config file location; `str(configRd)` becomes the
                     sole entry of the parser class's `_config_paths`.
    :return: whatever `LuigiConfigParser.instance()` returns afterwards.
    """
    parser_cls = luigi.configuration.LuigiConfigParser
    # KLUDGE: clear the private `_instance` attribute; presumably this
    # makes `instance()` build a fresh parser -- confirm against luigi.
    parser_cls._instance = None
    parser_cls._config_paths = [str(configRd)]
    return parser_cls.instance()

# Point luigi at our config file, then display the paths the parser holds.
explicit_config(the_config_file())
luigi.configuration.LuigiConfigParser.instance()._config_paths

In [None]:
import importlib

import cms_pd
import cms_etl
import etl_tasks
# Reload the project modules (and cx_Oracle) so that re-running this cell
# picks up edits made since they were first imported.
importlib.reload(cms_pd);
importlib.reload(cms_etl);
importlib.reload(etl_tasks);
importlib.reload(cx);

from etl_tasks import log_plan 
#from etl_tasks import DBAccessTask, LoggedConnection, SqlScriptTask
from cms_etl import FromCMS, CMSExtract, BeneIdSurvey
from cms_pd import CarrierClaimUpload, dx_stack, fmt_dx_code
#from script_lib import Script

In [None]:
import logging
# log = logging.getLogger(__name__)

# Use the task's own logger so that what the task logs is shown here.
cms_rif_task = CMSExtract()
log = cms_rif_task._log

log.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
# %H, %M and %S are already zero-padded to two digits; width flags such as
# `%02H` are a platform-specific strftime extension and are not portable.
ch.setFormatter(
    logging.Formatter(fmt='%(asctime)s %(levelname)s %(name)s: %(message)s',
                      datefmt='%H:%M:%S'))
log.addHandler(ch)
log.info('We try to log non-trivial DB access.')

# Smoke test: fetch the smallest bene_id from the task's example table.
with cms_rif_task.connection() as lc:
    lc.log.info('first bene_id')
    x = pd.read_sql('select min(bene_id) bene_id_first from %s.%s' % (
        cms_rif_task.cms_rif, cms_rif_task.table_eg), lc._conn)
x


In [None]:
# Survey the bene_id ranges of the carrier claims table.
survey = BeneIdSurvey(source_table='bcarrier_claims')
bene_chunks = survey.results()
# Each result row exposes .keys(); the first row's keys become the columns.
bene_chunks = pd.DataFrame(bene_chunks, columns=bene_chunks[0].keys())
bene_chunks.head()

In [None]:
# Build an upload task for the bene_id range of the first surveyed chunk.
cc = CarrierClaimUpload(bene_id_first=bene_chunks.iloc[0].bene_id_first,
                        bene_id_last=bene_chunks.iloc[0].bene_id_last)
cc.account, cc.source.cms_rif, cc.project.star_schema

In [None]:
# Take only the first item produced by cc.chunks() to explore with.
with cc.connection() as lc:
    data = next(cc.chunks(lc, chunk_size=2000))
data.info()

In [None]:
# Peek at the first rows of the chunk.
data.head()

## Column Info: Level of Measurement

Assign [levels of measurement](https://en.wikipedia.org/wiki/Level_of_measurement) based on column info:

In [None]:
# Fetch per-column info, then show the columns not flagged as diagnoses.
with cc.connection() as lc:
    bcarrier_cols = cc.column_info(lc)
bcarrier_cols[~ bcarrier_cols.is_dx]

## Nominal Data

In [None]:
# Borrow `now` from the class of download_date (presumably a datetime
# class -- TODO confirm) instead of importing one; #@@ marks the kludge.
clock = cc.source.download_date.__class__.now  #@@

In [None]:
# Nominal columns: valtype_cd '@' and not flagged as a diagnosis column.
nominal_cols = bcarrier_cols[(bcarrier_cols.valtype_cd == '@') &
                             ~ bcarrier_cols.is_dx]
nominal_cols

In [None]:
# Keep the index columns plus the nominal columns, indexed by the former.
nominal_data = data[cc.ix_cols + list(nominal_cols.column_name)].set_index(
    cc.ix_cols)
nominal_data.head()

### Observation facts for nominals (WIP)

In [None]:
def pivot_valtype(data, col_info, key_cols, valtype):
    """Melt the columns of one value type into one row per (key, column).

    Rows whose melted value is missing are dropped.

    :param data: DataFrame holding `key_cols` plus the columns named in
                 `col_info.column_name`.
    :param col_info: DataFrame with `column_name` and `valtype_cd` columns;
                     only rows whose `valtype_cd` equals `valtype` are used.
    :param key_cols: list of column names kept as identifiers on every row.
    :param valtype: one of '@', 'n', 't', 'd'.
    :return: DataFrame with the key columns, `column`, `value`,
             `valtype_cd`, `concept_cd`, `medpar_id` (all NaN) and:

             - '@': `concept_cd` is `COLUMN:value`;
             - 'n': `concept_cd` is `COLUMN:` and `nval_num` is the value;
             - 't', 'd': `concept_cd` is `COLUMN:` and `tval_char` is the
               value.

             For 'd', `start_date` and `end_date` are both the value;
             otherwise `clm_from_dt` / `clm_thru_dt` (if present) are
             renamed to `start_date` / `end_date`.  `nch_wkly_proc_dt`
             (if present) is always renamed to `update_date`.
    :raises TypeError: if `valtype` is not one of the supported codes.
    """
    if valtype not in ('@', 'n', 't', 'd'):
        # Fail before doing any work, and say which code was rejected.
        raise TypeError('unsupported valtype: %r' % (valtype,))

    ty_cols = col_info[col_info.valtype_cd == valtype]
    ty_data = data[key_cols + list(ty_cols.column_name)]
    obs = ty_data.melt(id_vars=key_cols, var_name='column').dropna(subset=['value'])
    obs['valtype_cd'] = valtype
    if valtype == '@':
        # String concatenation: assumes nominal values are strings.
        obs['concept_cd'] = obs.column.str.upper() + ':' + obs.value
    else:
        obs['concept_cd'] = obs.column.str.upper() + ':'
        if valtype == 'n':
            obs['nval_num'] = obs.value
        elif valtype == 't':
            obs['tval_char'] = obs.value
        else:  # 'd'
            obs['tval_char'] = obs.value  # ISSUE: format yyyy-mm-dd...

    if valtype == 'd':
        # A date observation takes place on the date it records.
        obs['start_date'] = obs['end_date'] = obs.value
    else:
        obs = obs.rename(
            columns=dict(clm_from_dt='start_date',
                         clm_thru_dt='end_date'))
    # factor medpar_id, rename update_date out of dx_data()
    obs['medpar_id'] = np.nan
    return obs.rename(
            columns=dict(nch_wkly_proc_dt='update_date'))

# Pivot the nominal ('@') non-diagnosis columns.
obs_cd = pivot_valtype(data, bcarrier_cols[~ bcarrier_cols.is_dx], cc.ix_cols, '@')
obs_cd.sort_values(['clm_id', 'column']).head()

In [None]:
# Pivot the 'n' non-diagnosis columns (values land in nval_num).
obs_num = pivot_valtype(data, bcarrier_cols[~ bcarrier_cols.is_dx], cc.ix_cols, 'n')
obs_num.head()

In [None]:
# Pivot the 't' non-diagnosis columns (values land in tval_char).
obs_txt = pivot_valtype(data, bcarrier_cols[~ bcarrier_cols.is_dx], cc.ix_cols, 't')
obs_txt.head()

In [None]:
# Pivot the 'd' non-diagnosis columns (value becomes start and end date).
obs_dt = pivot_valtype(data, bcarrier_cols[~ bcarrier_cols.is_dx], cc.ix_cols, 'd')
obs_dt.head()

In [None]:
# Columns that are neither diagnoses, nor one of the four value types
# pivoted in the preceding cells, nor index columns.
bcarrier_cols[~ bcarrier_cols.is_dx &
              ~ bcarrier_cols.valtype_cd.isin(['n', 't', '@', 'd']) &
              ~ bcarrier_cols.column_name.isin(cc.ix_cols)]

In [None]:
# Stack the four kinds of observations and show a sample of the fact
# columns.  pd.concat replaces chained DataFrame.append, which was
# deprecated in pandas 1.4 and removed in 2.0.
pd.concat([obs_cd, obs_num, obs_txt, obs_dt]).sort_values(
    ['clm_id', 'valtype_cd', 'concept_cd']).head(30)[
    ['clm_id', 'start_date', 'concept_cd', 'valtype_cd', 'nval_num', 'tval_char', 'end_date', 'update_date']]

In [None]:
# Apply the task's mapping step (see "Patient, Encounter Mapping" later in
# this notebook) to the nominal observations.
with cc.connection() as lc:
    obs_cd_mapped = cc.with_mapping(lc, obs_cd)
obs_cd_mapped.head()

In [None]:
# Finish the mapped nominal observations; upload_id=100 is an arbitrary
# value for this exploration (presumably -- confirm).
fact1 = cc.finish_facts(obs_cd_mapped, import_date=clock(), upload_id=100)
fact1.head()

### Diagnoses from Carrier Claims

In [None]:
# Diagnosis column pairs derived from the column info.
dx_cols = cc.dx_pairs(bcarrier_cols)
dx_cols

In [None]:
# Diagnosis observations for the chunk, shown ordered by claim and `ix`.
dx_data = cc.dx_data(data, bcarrier_cols)
dx_data.sort_values(['clm_id', 'ix']).head(15)

## Patient, Encounter Mapping

In [None]:
# Summary statistics of the bene_id column before mapping.
dx_data.bene_id.describe()

In [None]:
# Apply the patient / encounter mapping to the diagnosis observations.
with cc.connection() as lc:
    mapped = cc.with_mapping(lc, dx_data)
mapped.sort_values('start_date').head(15)

In [None]:
# Number of mapped diagnosis rows.
len(mapped)

In [None]:
# Finish the mapped diagnosis observations, using the same upload_id (100)
# as for the nominal facts.
obs_fact = cc.finish_facts(mapped, upload_id=100, import_date=clock())

obs_fact.head()

In [None]:
# Trial insert: append only the first 100 facts to observation_fact_100
# (if_exists='append' adds to an existing table rather than replacing it).
with cc.connection() as lc:
    obs_fact.head(100).to_sql(name='observation_fact_100', con=lc._conn,
                   if_exists='append', index=False)

In [None]:
with cc.connection() as lc:
    # Take just the first chunk.  Unlike `for ...: break`, next() raises
    # StopIteration when there are no chunks, instead of silently leaving
    # `x` bound to the result of an earlier cell.
    x = next(iter(cc.obs_data(lc, 100, chunk_size=1000)))
x.head()

In [None]:
# Run the whole upload task for this bene_id range.
cc.run()