<h1 style="text-align: center;">mandatory use</h1>

## import libraries, set constants, define helper functions

In [None]:
import polars as pl
from Levenshtein import ratio
from datetime import datetime, timedelta

constants

In [None]:
# --- giving searches credit for prescriptions ---
# minimum patient name similarity between rx and search for the search to get credit
RATIO = 0.7
# minimum patient name similarity between rx and search when the search was a partial search
PARTIAL_RATIO = 0.5
# a search only gets credit when made at most this many days before the rx was written
DAYS_BEFORE = 7
# when True, veterinarians are removed from the data
FILTER_VETS = True

# --- overlap, opioid naive and mme checks ---
# minimum patient name similarity between an opioid and a benzo rx to confirm an overlap
OVERLAP_RATIO = 0.8
# minimum patient name similarity between 2 opioid rx to confirm the patient is not opioid naive
NAIVE_RATIO = 0.7
# number of days without an opioid after which a patient counts as opioid naive
DAYS_NAIVE = 60
# mme threshold for a single rx
MME_THRESHOLD = 90

helper function

In [None]:
def add_days(n, d=None):
    """Return ``d`` shifted by ``n`` days.

    Args:
        n: Number of days to add; negative values move backwards.
        d: A ``date`` or ``datetime`` to shift. When omitted, the current
            local datetime at the time of the call is used.

    Returns:
        ``d + timedelta(days=n)``, with the same type as ``d``.
    """
    # Resolve "today" per call: a default of datetime.today() in the signature
    # would be evaluated once, when the function is defined, and then go stale
    # for as long as the kernel keeps running.
    if d is None:
        d = datetime.today()
    return d + timedelta(days=n)

## read in and prepare data for processing

In [None]:
# header in the raw user export -> column name used in this notebook
_user_columns = {
    'Associated DEA Number(s)': 'dea_number(s)',
    'True ID': 'true_id',
    'User Full Name': 'user_full_name',
    'User Specialty': 'specialty_grouping',
    'Specialty Level 1': 'specialty_1',
    'Specialty Level 2': 'specialty_2',
    'Specialty Level 3': 'specialty_3',
}

# registered users, kept lazy until they are joined onto something
users = pl.scan_csv('data/ID_data.csv', infer_schema_length=10000).rename(_user_columns)

# each user dea gets its own row so a prescriber gets credit for searches on prescriptions with any of their registered deas
users_explode = (
    users
    .with_columns(
        pl.col('dea_number(s)').str.to_uppercase().str.split(',').alias('dea_number')
    )
    .explode('dea_number')
    .with_columns(
        # strip each dea after splitting: stripping only the whole list would leave
        # a leading space on every dea after a ', ' separator, and that dea would
        # then never match a prescriber_dea in the join
        pl.col('dea_number').str.strip()
    )
    .select('true_id', 'dea_number')
)

In [None]:
# a prescriber dea is only kept when it is exactly 2 letters followed by 7 digits
pattern = r'^[A-Za-z]{2}\d{7}$'

# header in the raw dispensation export -> column name used in this notebook
_disp_columns = {
    'Month, Day, Year of Patient Birthdate': 'disp_dob',
    'Month, Day, Year of Written At': 'written_date',
    'Month, Day, Year of Filled At': 'filled_date',
    'Month, Day, Year of Dispensations Created At': 'disp_created_date',
    'Prescriber First Name': 'prescriber_first_name',
    'Prescriber Last Name': 'prescriber_last_name',
    'Patient First Name': 'patient_first_name',
    'Patient Last Name': 'patient_last_name',
    'Prescriber DEA': 'prescriber_dea',
    'Generic Name': 'generic_name',
    'Prescription Number': 'rx_number',
    'AHFS Description': 'ahfs',
    'Daily MME': 'mme',
    'Days Supply': 'days_supply',
    'Animal Name': 'animal_name',
}
# columns that arrive as text such as 'January 05, 2023'
_disp_date_columns = ['disp_dob', 'written_date', 'filled_date', 'disp_created_date']

dispensations = (
    pl.scan_csv('data/dispensations_data.csv', infer_schema_length=10000)
    .rename(_disp_columns)
    .with_columns(
        *[pl.col(name).str.to_date('%B %d, %Y') for name in _disp_date_columns],
        pl.col('prescriber_dea').str.to_uppercase().str.strip(),
        patient_name=(
            pl.col('patient_first_name') + ' ' + pl.col('patient_last_name')
        ).str.to_uppercase(),
        prescriber_name=(
            pl.col('prescriber_first_name') + ' ' + pl.col('prescriber_last_name')
        ).str.to_uppercase(),
    )
    .filter(pl.col('prescriber_dea').str.contains(pattern))
    # attach the true_id of the registered user owning the dea, when there is one
    .join(users_explode, left_on='prescriber_dea', right_on='dea_number', how='left')
    .collect()
    .with_columns(
        # window of search dates that can receive credit for this rx
        start_date=pl.col('written_date').dt.offset_by(f'-{DAYS_BEFORE}d'),
        # one extra day to account for bamboo's issues handling UTC
        end_date=pl.col('written_date').dt.offset_by('1d'),
    )
    .drop('patient_first_name', 'patient_last_name', 'prescriber_first_name', 'prescriber_last_name')
)

# veterinarians are removed by keeping only rows whose animal name is 'Unspecified';
# the animal_name column is not needed afterwards either way
if FILTER_VETS:
    dispensations = dispensations.filter(pl.col('animal_name') == 'Unspecified')
dispensations = dispensations.drop('animal_name')

In [None]:
# for filtering searches to only the days we could potentially need:
# from DAYS_BEFORE days before the earliest written date to 1 day after the latest
min_date = add_days(-DAYS_BEFORE, dispensations['written_date'].min())
max_date = add_days(1, dispensations['written_date'].max())

searches = (
    pl.scan_csv('data/searches_data.csv', infer_schema_length=10000)
    .rename({
        'Month, Day, Year of Search Creation Date': 'created_date',
        'Month, Day, Year of Searched DOB': 'search_dob',
        'Searched First Name': 'first_name',
        'Searched Last Name': 'last_name',
        'Partial First Name?': 'partial_first',
        'Partial Last Name?': 'partial_last',
        'True ID': 'true_id',
    })
    .with_columns(
        pl.col('search_dob').str.to_date('%B %d, %Y'),
        pl.col('created_date').str.to_date('%B %d, %Y'),
        (pl.col('first_name') + ' ' + pl.col('last_name')).str.to_uppercase().alias('full_name'),
        # a search is partial when either name was searched partially
        (pl.col('partial_first') | pl.col('partial_last')).alias('partial')
    )
    .filter(
        pl.col('created_date').is_between(min_date, max_date) &
        pl.col('true_id').is_in(dispensations['true_id'])
    )
    .collect()
    .with_columns(
        # similarity threshold each search has to reach: partial searches get the
        # looser PARTIAL_RATIO, full searches RATIO. done as a native expression
        # instead of a python lambda per row; a null partial flag still yields a
        # null threshold, so such a search never passes the ratio check
        pl.when(pl.col('partial')).then(pl.lit(PARTIAL_RATIO))
        .when(~pl.col('partial')).then(pl.lit(RATIO))
        .otherwise(pl.lit(None, dtype=pl.Float64))
        .alias('ratio_check')
    )
    .drop('first_name', 'last_name', 'partial_first', 'partial_last')
    .lazy()
)

## process dispensations for searches

In [None]:
# columns that together identify one prescription
_searched_rx_key = ['rx_number', 'prescriber_dea', 'written_date']

# one row (flagged search=True) per prescription that has at least one search by the
# same true_id, inside the rx's start_date..end_date window, for the same date of
# birth, and with a name similarity reaching the search's ratio_check
dispensations_with_searches = (
    dispensations
    .lazy()
    .join(searches, on='true_id', how='left')
    .filter(
        pl.col('created_date').is_between(pl.col('start_date'), pl.col('end_date')) &
        (pl.col('disp_dob') == pl.col('search_dob'))
    )
    .with_columns(
        # similarity between the searched name and the rx patient name
        ratio=pl.struct(['full_name', 'patient_name'])
        .apply(lambda names: ratio(names['full_name'], names['patient_name']))
    )
    .collect(streaming=True)
    .filter(pl.col('ratio') >= pl.col('ratio_check'))
    .unique(subset=_searched_rx_key)
    .select(_searched_rx_key)
    .with_columns(search=pl.lit(True))
)

In [None]:
# columns that together identify one prescription
_final_rx_key = ['rx_number', 'prescriber_dea', 'written_date']

# every prescription once, with its search flag and the id it is reported under
final_dispensations = (
    dispensations
    .join(dispensations_with_searches, on=_final_rx_key, how='left')
    # prescriptions without a credited search come out of the join with a null flag
    .fill_null(False)
    .unique(subset=_final_rx_key)
    .with_columns(
        # report under the true_id when there is one, otherwise under the dea itself
        final_id=pl.col('true_id').fill_null(pl.col('prescriber_dea'))
    )
)

# same dea shape as before (2 letters followed by 7 digits), with a capture group for str.extract
pattern_cap = r'^([A-Za-z]{2}\d{7})$'
# dea -> prescriber name as written on the dispensations, used to name unregistered prescribers
deas = dispensations.select('prescriber_dea', 'prescriber_name').lazy()

results = (
    final_dispensations
    .groupby(['final_id'])
    .agg([pl.count(), pl.col('search').sum()])
    .with_columns(
        # percent of dispensations with a credited search
        rate=(pl.col('search') / pl.col('count')) * 100,
        # final_id read back as a number (non-strict parse)
        true_id=pl.col('final_id').str.parse_int(radix=10, strict=False).cast(pl.Int64),
        # final_id when it has the shape of a dea
        unreg_dea=pl.col('final_id').str.extract(pattern_cap),
    )
    .rename({'count': 'dispensations', 'search': 'searches'})
    .lazy()
    .join(users, on='true_id', how='left')
    .join(deas, left_on='unreg_dea', right_on='prescriber_dea', how='left')
    .unique('final_id')
    .with_columns(
        # fall back to the name and dea from the dispensations when the user columns are empty
        pl.col('user_full_name').fill_null(pl.col('prescriber_name')),
        pl.col('dea_number(s)').fill_null(pl.col('unreg_dea')),
        pl.col('true_id').is_not_null().alias('registered'),
    )
    .select(
        'final_id',
        # the (filled) user name is the one reported as prescriber_name
        pl.col('user_full_name').alias('prescriber_name'),
        'dea_number(s)',
        'specialty_grouping',
        'specialty_1',
        'specialty_2',
        'specialty_3',
        'dispensations',
        'searches',
        'rate',
        'registered',
    )
    .collect()
)

write intermediate results to CSV files

In [None]:
# per final_id: dispensation count, credited search count and search rate
results.write_csv('results.csv')
# per prescription: the dispensation columns plus its search flag and final_id
final_dispensations.write_csv('dispensations_results.csv')

## overlap, over MME threshold, opioid naive

### overlap

In [None]:
# opioid prescriptions (ahfs contains 'OPIATE'), with opi_ prefixed date and name columns
opi_dispensations = (
    final_dispensations
    .lazy()
    .filter(pl.col('ahfs').str.contains('OPIATE'))
    .with_columns(
        # filled date plus days supply
        opi_end_date=pl.col('filled_date') + pl.duration(days='days_supply'),
        # day after the dispensation record was created
        opi_start_date=pl.col('disp_created_date') + pl.duration(days=1),
    )
    .rename({
        'written_date': 'opi_written_date',
        'filled_date': 'opi_filled_date',
        'patient_name': 'opi_patient_name',
        'disp_created_date': 'opi_disp_created_date',
    })
)

# number of opioid prescriptions per final_id
opi_count = opi_dispensations.collect().groupby('final_id').count().rename({'count': 'opi_rx'})

# benzo prescriptions (ahfs contains 'BENZO'), with benzo_ prefixed date and name columns
benzo_dispensations = (
    final_dispensations
    .lazy()
    .filter(pl.col('ahfs').str.contains('BENZO'))
    .with_columns(
        # filled date plus days supply
        benzo_end_date=pl.col('filled_date') + pl.duration(days='days_supply'),
        # day after the dispensation record was created
        benzo_start_date=pl.col('disp_created_date') + pl.duration(days=1),
    )
    .rename({
        'written_date': 'benzo_written_date',
        'filled_date': 'benzo_filled_date',
        'patient_name': 'benzo_patient_name',
        'disp_created_date': 'benzo_disp_created_date',
    })
)

# number of benzo prescriptions per final_id
benzo_count = benzo_dispensations.collect().groupby('final_id').count().rename({'count': 'benzo_rx'})

# add counts of opi and benzo disps; a prescriber without any gets 0 rather than null
results = (
    results
    .join(opi_count, on='final_id', how='left')
    .join(benzo_count, on='final_id', how='left')
    .with_columns(
        pl.col('opi_rx').fill_null(0),
        pl.col('benzo_rx').fill_null(0),
    )
)

In [None]:
# an rx overlaps when it was written while an rx of the other drug was running
# (between that rx's filled date and its end date)
# open questions:
#   i think we'll end up wanting to only count second rx
#   but i'll have to talk to the pmp director and compliance manager for their opinions
#   we also have a 1 day lag, so we may want to exclude same day overlaps (even for same prescriber?)
#   maybe something like written date between created+1 and end
_opi_written_during_benzo = pl.col('opi_written_date').is_between(
    pl.col('benzo_filled_date'), pl.col('benzo_end_date')
)
_benzo_written_during_opi = pl.col('benzo_written_date').is_between(
    pl.col('opi_filled_date'), pl.col('opi_end_date')
)

# benzo/opioid rx pairs with the same date of birth, overlapping dates and similar patient names
overlap_dispensations = (
    benzo_dispensations
    .join(opi_dispensations, on='disp_dob', how='left')
    .filter(_opi_written_during_benzo | _benzo_written_during_opi)
    .with_columns(
        # similarity between the patient names on the two prescriptions
        ratio=pl.struct(['opi_patient_name', 'benzo_patient_name'])
        .apply(lambda names: ratio(names['opi_patient_name'], names['benzo_patient_name']))
    )
    .filter(pl.col('ratio') >= OVERLAP_RATIO)
    .collect(streaming=True)
)

# overlapping pairs per benzo prescriber (final_id comes from the benzo side of the join)
benzo_dispensations_overlap = overlap_dispensations.groupby('final_id').count()

# overlapping pairs per opioid prescriber (final_id_right comes from the opioid side of the join)
opi_dispensations_overlap = (
    overlap_dispensations
    .select(pl.col('final_id_right').alias('final_id'))
    .groupby('final_id')
    .count()
)

# both sides added up per prescriber
all_overlaps = (
    pl.concat([benzo_dispensations_overlap, opi_dispensations_overlap])
    .groupby('final_id')
    .sum()
    .rename({'count': 'overlapping_rx'})
)

# for testing
overlap_dispensations.write_csv('overlap_both.csv')

# add overlapping rx count to results; prescribers without any overlap get 0
results = results.join(all_overlaps, on='final_id', how='left').with_columns(
    pl.col('overlapping_rx').fill_null(0)
)

### over MME threshold

In [None]:
# number of prescriptions per final_id whose mme is at or above the threshold
over_mme = (
    final_dispensations
    .filter(pl.col('mme') >= MME_THRESHOLD)
    .groupby('final_id')
    .count()
    .rename({'count': 'rx_over_mme_threshold'})
)

# add count of rx over the mme threshold to the results; prescribers without any get 0
results = results.join(over_mme, on='final_id', how='left').with_columns(
    pl.col('rx_over_mme_threshold').fill_null(0)
)

### opioid naive

In [None]:
# header in the raw export -> column name used in this notebook
_naive_columns = {
    'Month, Day, Year of Patient Birthdate': 'naive_dob',
    'Month, Day, Year of Filled At': 'naive_filled_date',
    'Patient First Name': 'patient_first_name',
    'Patient Last Name': 'patient_last_name',
    'Days Supply': 'naive_days_supply',
}

naive = (
    pl.scan_csv('data/naive_data.csv')
    .rename(_naive_columns)
    .with_columns(
        pl.col('naive_dob').str.to_date('%B %d, %Y'),
        pl.col('naive_filled_date').str.to_date('%B %d, %Y'),
        naive_patient_name=(
            pl.col('patient_first_name') + ' ' + pl.col('patient_last_name')
        ).str.to_uppercase(),
    )
    .with_columns(
        # filled date + days supply + DAYS_NAIVE: last day on which this rx still
        # keeps the patient from counting as opioid naive
        naive_end_date=(
            pl.col('naive_filled_date')
            + pl.duration(days='naive_days_supply')
            + pl.duration(days=DAYS_NAIVE)
        )
    )
    .drop('patient_first_name', 'patient_last_name')
)

# opioid prescriptions written inside the filled..end window of a naive_data rx with
# the same date of birth and a similar patient name; flagged opi_naive=False
naive_disps = (
    final_dispensations
    .lazy()
    .join(naive, left_on='disp_dob', right_on='naive_dob', how='left')
    .filter(
        pl.col('ahfs').str.contains('OPIATE') &
        pl.col('written_date').is_between(pl.col('naive_filled_date'), pl.col('naive_end_date'))
    )
    .with_columns(
        # similarity between the patient names on the two prescriptions
        ratio=pl.struct(['patient_name', 'naive_patient_name'])
        .apply(lambda names: ratio(names['patient_name'], names['naive_patient_name']))
    )
    .collect(streaming=True)
    .filter(pl.col('ratio') >= NAIVE_RATIO)
    .with_columns(opi_naive=pl.lit(False))
    .unique(subset=['final_id', 'rx_number'])
)

# number of opioid prescriptions per final_id that went to an opioid naive patient
naive_disps = (
    final_dispensations
    # only the join keys and the flag are needed from the matched prescriptions
    .join(
        naive_disps.select('final_id', 'rx_number', 'opi_naive'),
        how='left',
        on=['final_id', 'rx_number']
    )
    .select(
        'final_id',
        (
            # a missing ahfs is not an opioid; resolving the null here keeps such
            # rows from being counted, which a fill_null(True) on the combined
            # flag would do
            pl.col('ahfs').str.contains('OPIATE').fill_null(False) &
            # no matched prior opioid rx means the patient is opioid naive
            pl.col('opi_naive').fill_null(True)
        ).alias('opi_to_opi_naive')
    )
    .filter(
        pl.col('opi_to_opi_naive')
    )
    .groupby('final_id').count()
    .rename({'count':'opi_to_opi_naive'})
)

# add number of opioid dispensations to opioid naive patients to results
results = (
    results
    .join(naive_disps, how='left', on='final_id')
    .with_columns(
        pl.col('opi_to_opi_naive').fill_null(0)
    )
    # fewest searches first; within the same number of searches, most dispensations first
    .sort(['searches', 'dispensations'], descending=[False, True])
)

## results

In [None]:
# results including the opi/benzo, overlap, mme threshold and opioid naive counts joined above
results.write_csv('results_full.csv')

In [None]:
# totals over all prescribers, and the overall search rate in percent (2 decimals);
# the per-prescriber rate is dropped first since summing rates is meaningless
stats = (
    results
    .drop('rate')
    .sum()
    .select(
        'dispensations',
        'searches',
        ((pl.col('searches') / pl.col('dispensations')) * 100).round(2).alias('rate'),
    )
)

stats