# Loading Medicare and Medicaid Claims data into i2b2

The focus is currently on carrier claims.

(Demographics was done in Oracle PL/SQL.)

So far, we can get data in chunks, map patients and encounters, pivot diagnoses, and insert the result into an `observation_fact` table (which is still missing some constraints).


## Python Data Science Tools

We rely especially on [pandas](http://pandas.pydata.org/pandas-docs/).

In [None]:
import pandas as pd
import numpy as np
import sqlalchemy as sqla
# Display the versions of the libraries imported above.
dict(pandas=pd.__version__, numpy=np.__version__, sqlalchemy=sqla.__version__)

## Data Access via Luigi Config

[luigi docs](https://luigi.readthedocs.io/en/stable/)

In [None]:
def the_config_file(name='luigi-sgrouse.cfg'):
    """Return the path of the luigi configuration file.

    :param name: file name (or path) of the configuration file.
    :return: a `pathlib.Path` built from `name`; the file is neither
             opened nor checked for existence here.
    """
    import pathlib
    # Build the path from the argument so that callers can actually
    # select a different configuration file.
    return pathlib.Path(name)

In [None]:
import luigi

In [None]:
def explicit_config(configRd):
    """Make luigi's config parser use `configRd` as its only config path.

    :param configRd: path of the config file; converted with `str()`.
    :return: whatever `LuigiConfigParser.instance()` returns after the
             class-level state has been overwritten.
    """
    parser_cls = luigi.configuration.LuigiConfigParser
    # KLUDGE: clear the cached instance by hand; presumably this makes
    # instance() construct a fresh parser -- confirm against luigi.
    parser_cls._instance = None
    parser_cls._config_paths = [str(configRd)]
    return parser_cls.instance()

# Install our config file, then display the parser's path list as a check.
explicit_config(the_config_file())
luigi.configuration.LuigiConfigParser.instance()._config_paths

In [None]:
import importlib

import cms_pd
import cms_etl
import etl_tasks
importlib.reload(cms_pd);
importlib.reload(cms_etl);
importlib.reload(etl_tasks);

from etl_tasks import log_plan 
#from etl_tasks import DBAccessTask, LoggedConnection, SqlScriptTask
from cms_etl import FromCMS, CMSExtract
from cms_pd import CarrierClaims, dx_stack, fmt_dx_code
#from script_lib import Script

In [None]:
def _fix_password():
    """Prompt for a password and store it in the process environment.

    The value is written to the `DCONNOLLY_SGROUSE` environment variable.
    NOTE(review): presumably the luigi config names this variable as the
    source of the DB password -- confirm.
    """
    import getpass
    import os
    os.environ['DCONNOLLY_SGROUSE'] = getpass.getpass()
_fix_password()

In [None]:
import logging
# log = logging.getLogger(__name__)

# Use the task's own logger so that messages from its DB access are shown.
cms_rif_task = CMSExtract()
log = cms_rif_task._log

log.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
# %H, %M and %S are zero-padded to two digits by definition; explicit width
# flags such as '%02H' are a platform-specific strftime extension.
ch.setFormatter(
    logging.Formatter(fmt='%(asctime)s %(levelname)s %(name)s: %(message)s',
                      datefmt='%H:%M:%S'))
log.addHandler(ch)
log.info('We try to log non-trivial DB access.')

# Connection check: query the smallest bene_id in the task's target table.
with cms_rif_task.connection() as lc:
    lc.log.info('first bene_id')
    # The schema and table names are attributes of the task; they are
    # interpolated because identifiers cannot be bound as SQL parameters.
    x = pd.read_sql('select min(bene_id) bene_id_first from %s.%s' % (
        cms_rif_task.cms_rif, cms_rif_task.target_table), lc._conn)
x


In [None]:
# Survey bene_id ranges of bcarrier_claims and tabulate them by chunk_num.
# NOTE(review): chunk_qty presumably is the number of chunks requested --
# confirm in cms_etl.
with cms_rif_task.connection() as conn:
    bene_chunks = cms_rif_task.id_survey('bcarrier_claims', conn, chunk_qty=10000)
    # Column names are taken from the first row, so this needs at least one row.
    bene_chunks = pd.DataFrame(bene_chunks, columns=bene_chunks[0].keys()).set_index('chunk_num')

bene_chunks.head()

In [None]:
# Build a CarrierClaims task limited to the bene_id range of the first chunk.
cc = CarrierClaims(bene_id_first=bene_chunks.iloc[0].bene_id_first,
                   bene_id_last=bene_chunks.iloc[0].bene_id_last)
# Display the account and schema settings the task picked up.
cc.account, cc.source.cms_rif, cc.project.star_schema

In [None]:
# Take only the first item yielded by cc.chunks() (requested with
# chunk_size=20000) and summarize its columns.
with cc.connection() as lc:
    data = next(cc.chunks(lc, chunk_size=20000))
data.info()

In [None]:
# Show the first few rows of the chunk.
data.head()

## Column Info: Level of Measurement

Assign [levels of measurement](https://en.wikipedia.org/wiki/Level_of_measurement) based on column info:

In [None]:
# Fetch per-column metadata for the carrier claims table.
with cc.connection() as lc:
    bcarrier_cols = cc.column_info(lc)
# Display the columns not flagged is_dx (presumably the non-diagnosis columns).
bcarrier_cols[~ bcarrier_cols.is_dx]

## Nominal Data

In [None]:
# Nominal columns: valtype_cd is '@' and the column is not flagged is_dx.
nominal_cols = bcarrier_cols[(bcarrier_cols.valtype_cd == '@') &
                             ~ bcarrier_cols.is_dx]
nominal_cols

In [None]:
# Select the key columns plus the nominal columns, indexed by the keys.
nominal_data = data[cc.key_cols + list(nominal_cols.column_name)].set_index(cc.key_cols)
nominal_data.head()

### carr_num - coded, not numeric

It looks like numeric info, but [carr_num docs](https://www.resdac.org/cms-data/variables/Carrier-Number) say it's a code.

In [None]:
### AMBIENT
def resdac_pg(path):
    """Fetch a page from the ResDAC site and return its text.

    :param path: location relative to
                 https://www.resdac.org/sites/resdac.umn.edu/ ; it is
                 appended verbatim, so it must already be URL-quoted.
    :return: the response body decoded as UTF-8.
    """
    import urllib.request
    addr = 'https://www.resdac.org/sites/resdac.umn.edu/' + path
    # Close the HTTP response as soon as the body has been read instead
    # of leaving the connection open until garbage collection.
    with urllib.request.urlopen(addr) as response:
        content = response.read()
    return content.decode('utf-8')

# Download the carrier number table and show its length in characters.
pg = resdac_pg('files/Carrier%20Number-MAC%20Table.txt')
len(pg)

In [None]:
def carr_num_table_line(txt):
    """Parse one `code = label (note)` line of the carrier number table.

    :param txt: a line containing ' = '; text before the first ' = ' is
                the code, the rest is the label.
    :return: dict with keys `code`, `label` and `note`. `note` is the
             parenthesized remainder starting at the first ' (' in the
             label (leading '(' included), or None when there is none.
    :raises ValueError: if `txt` does not contain ' = '.
    """
    code, label = txt.split(' = ', 1)
    # Split on the same ' (' delimiter that decides whether a note exists,
    # so a label such as 'Foo(bar)' is kept whole instead of failing.
    label, found, remainder = label.partition(' (')
    note = '(' + remainder if found else None
    return dict(code=code, label=label, note=note)

def carr_num_db(text):
    """Build a DataFrame of carrier numbers from the table's text.

    :param text: the whole table as one string; any line-ending
                 convention is accepted.
    :return: `pd.DataFrame` with one row per line containing ' = ',
             as parsed by `carr_num_table_line`.
    """
    # splitlines() handles '\r\n' as well as bare '\n', so the table
    # still parses if the line endings were normalized on the way in.
    return pd.DataFrame([carr_num_table_line(line)
                         for line in text.splitlines() if ' = ' in line])

# Parse the downloaded page and preview the resulting table.
carr_num_db(pg).head()

### Observation facts for nominals (WIP)

In [None]:
# Reshape wide to long: one row per key and source column, with the cell
# contents in 'value'; rows with a missing value are dropped.
obs_cd = (nominal_data.reset_index().melt(id_vars=cc.key_cols, var_name='column')
          .dropna(subset=['value'])
          .sort_values(['clm_id', 'column'])
          .set_index(cc.key_cols))

obs_cd['valtype_cd'] = '@'
# concept_cd is COLUMN_NAME:value.
# NOTE(review): the concatenation assumes 'value' holds strings -- confirm.
obs_cd['concept_cd'] = obs_cd.column.str.upper() + ':' + obs_cd.value
obs_cd.head(20)

### Diagnoses from Carrier Claims

In [None]:
# Derive the diagnosis column pairing from the column metadata and display it.
dx_cols = cc.dx_pairs(bcarrier_cols)
dx_cols

In [None]:
# Extract diagnosis data from the chunk; preview it ordered by claim and 'ix'.
dx_data = cc.dx_data(data, bcarrier_cols)
dx_data.sort_values(['clm_id', 'ix']).head(15)

## Patient, Encounter Mapping

In [None]:
# Summary statistics of the bene_id column before mapping.
dx_data.bene_id.describe()

In [None]:
# Apply the task's mapping to the diagnosis data (needs a DB connection);
# per the section heading this is the patient and encounter mapping.
with cc.connection() as lc:
    mapped = cc.with_mapping(lc, dx_data)
mapped.sort_values('start_date').head(15)

In [None]:
# Number of rows after mapping.
len(mapped)

In [None]:
# Borrow `now` from whatever class download_date is an instance of, instead
# of importing a datetime class here.
# NOTE(review): presumably a datetime-like class -- confirm.
clock = cc.source.download_date.__class__.now  #@@

In [None]:
# Finish the mapped rows into facts, stamped with upload_id 100 and the
# current time as import_date.
obs_fact = cc.finish_facts(mapped, upload_id=100, import_date=clock())

obs_fact.head()

In [None]:
# Trial insert: append only the first 100 facts to observation_fact_100.
# With if_exists='append', re-running this cell adds the rows again.
with cc.connection() as lc:
    obs_fact.head(100).to_sql(name='observation_fact_100', con=lc._conn,
                   if_exists='append', index=False)

In [None]:
# Exhaust cc.obs_data() and keep only the last item it yields for display.
# If nothing is yielded, `x` still holds its value from an earlier cell.
with cc.connection() as lc:
    for x in cc.obs_data(lc, 100, chunk_size=1000):
        pass
x

In [None]:
# Run the whole CarrierClaims task for this bene_id range.
cc.run()