# DenormalizedPrescriber MetricPerformance

In [2]:
import polars as pl
import pandas as pd
import gc
import json
import numpy as np

In [3]:
# Run configuration: every date / window parameter is read from vars_wk.json.
with open('vars_wk.json', 'r') as json_file:
    js = json.load(json_file)

# Unpack the settings in the same order as the key tuple below.
(
    data_date,
    num_weeks_rx,
    qtr_data,
    bucket,
    YTD,
    QTD_m,
    monthly_data_date,
) = (
    js[setting]
    for setting in (
        'data_date', 'num_weeks_rx', 'qtr_data', 'bucket',
        'YTD', 'QTD_m', 'monthly_data_date',
    )
)

# S3 prefixes derived from the configuration above.
# NOTE(review): 'quaterly' is part of the runtime path and is kept verbatim;
# confirm the S3 prefix really is spelled this way before "fixing" it.
dflib = f's3://{bucket}/BIT/dataframes/'
geo = f's3://{bucket}/PYADM/quaterly/{qtr_data}/geography/'
xpn = f's3://{bucket}/PYADM/weekly/archive/{data_date}/xponent/'
mxpn = f's3://{bucket}/PYADM/monthly/archive/{monthly_data_date}/xponent/'

In [4]:
# Utility Functions -
def load(df, lib=dflib):
    """Read ``<lib><df>.parquet`` and publish it as a notebook-level variable.

    Parameters
    ----------
    df : str
        Base name of the parquet file (without extension). The frame is also
        bound to a global variable of this name.
    lib : str, optional
        Path prefix the file name is appended to. The default is the value
        `dflib` had when this function was defined, not its value at call time.

    Returns
    -------
    pl.DataFrame
        The frame that was read, so callers can also bind it explicitly
        instead of relying on the global side effect.
    """
    frame = pl.read_parquet(f'{lib}{df}.parquet')
    # Kept for existing callers that expect the frame to appear in globals().
    globals()[df] = frame
    return frame

In [5]:
# Importing Dependencies
# load() binds each frame to a notebook-level variable named after the file,
# so this defines `mp_spec_seg_dec` and `MASTER_UNI`.
for _frame_name in ('mp_spec_seg_dec', 'MASTER_UNI'):
    load(_frame_name)

# Product code -> product_id lookup (pipe-delimited text file).
prod_mapping = pl.read_csv(
    f's3://{bucket}/BIT/docs/productmapping_pybit.txt',
    separator='|',
)

# PROD_CD values pulled from the LAX extract when building the TUF/NUF metrics.
fetch_products = [
    'LI1', 'LI2', 'LI3',
    'TRU', 'AMT', 'LAC', 'MOT', 'LUB', 'IRL',
]

Generator Functions 
---

In [6]:
# For Voucher Removal -
def get_lin_voucher():
    """Return voucher TUF window totals per IID for the three LIN strengths.

    Reads ``LIN_VOUCHER.parquet`` from `mxpn` and, for each of LIN1/LIN2/LIN3,
    sums the ``<prod>TUF<i>`` period columns into the same current (``c``) and
    prior (``p``) windows used for the main metrics. `QTD_m` and `YTD` from the
    notebook configuration position the quarter and year-to-date windows.

    Returns
    -------
    pl.DataFrame
        Columns ``IID``, ``vTUF_1c`` ... ``vTUF_all`` and ``PROD_CD`` (``LI1``,
        ``LI2`` or ``LI3``), the three strengths stacked vertically, with
        nulls replaced by 0.
    """
    vouchers = pl.read_parquet(f'{mxpn}LIN_VOUCHER.parquet')

    def window(prod, periods):
        # Row-wise sum of the voucher TUF columns of `prod` over `periods`.
        return pl.sum_horizontal([f'{prod}TUF{i}' for i in periods])

    per_strength = []
    for prod in ('LIN1', 'LIN2', 'LIN3'):  # LINV
        per_strength.append(
            vouchers.select(
                pl.col('IID'),
                # current windows
                pl.col(f'{prod}TUF1').alias('vTUF_1c'),
                window(prod, range(1, 4)).alias('vTUF_3c'),
                window(prod, range(1, 7)).alias('vTUF_6c'),
                window(prod, range(1, 13)).alias('vTUF_12c'),
                window(prod, range(QTD_m + 1, QTD_m + 4)).alias('vTUF_pqtrc'),
                window(prod, range(1, YTD + 1)).alias('vTUF_ytdc'),
                # prior windows
                pl.col(f'{prod}TUF2').alias('vTUF_1p'),
                window(prod, range(4, 7)).alias('vTUF_3p'),
                window(prod, range(7, 13)).alias('vTUF_6p'),
                window(prod, range(13, 25)).alias('vTUF_12p'),
                window(prod, range(QTD_m + 4, QTD_m + 7)).alias('vTUF_pqtrp'),
                window(prod, range(13, 13 + YTD)).alias('vTUF_ytdp'),
                # all 24 periods (added 105 week datacut)
                window(prod, range(1, 25)).alias('vTUF_all'),
            )
            # 'LIN1' -> 'LI1' etc., matching the PROD_CD values of the LAX data.
            .with_columns(pl.lit(f'LI{prod[-1]}').alias('PROD_CD'))
        )

    # voucher_mapping = {'LI1': 4, 'LI2': 5, 'LI3': 3, 'LIV': 2}
    return pl.concat(per_strength).fill_null(0)

In [7]:
def get_summed_period_iid_metric(metric,prod_cd):
    """Build current/prior window totals of one metric per (IID, PROD_CD).

    Reads the ``<metric>1`` .. ``<metric>24`` columns of ``LAX.parquet`` (under
    `mxpn`) for the requested product codes and collapses them into window
    totals: ``_1c/_3c/_6c/_12c/_pqtrc/_ytdc`` (current), the matching ``_..p``
    columns (prior) and ``_all`` (periods 1-24). `QTD_m` and `YTD` from the
    notebook configuration position the quarter and year-to-date windows.

    Parameters
    ----------
    metric : str
        Column prefix in LAX.parquet (the notebook uses 'TUF' and 'NUF').
    prod_cd : list[str]
        PROD_CD values to keep.

    Returns
    -------
    pl.DataFrame
        ``IID``, ``PROD_CD`` and the 13 ``<metric>_<window>`` columns, limited
        to IIDs that have a non-null ``geography_id`` in `mp_spec_seg_dec`.
        Columns of `mp_spec_seg_dec` other than the four dropped at the end
        (if any exist) are carried through.

    Notes
    -----
    For ``metric == 'TUF'`` the voucher totals from `get_lin_voucher()` are
    subtracted window by window; (IID, PROD_CD) pairs without voucher rows
    subtract 0.
    """
    period_cols = [metric+str(i) for i in range(1,25)]
    df = (
        pl.read_parquet(mxpn+'LAX.parquet',columns=['IID','PROD_CD'] + period_cols)
        .filter(pl.col('PROD_CD').is_in(prod_cd))
    )

    def span(periods):
        # Row-wise sum of the metric over the given period numbers.
        return pl.sum_horizontal([metric+str(i) for i in periods])

    # 1,3,6,12,pqtd,ytd for current and prior period for a given Metric
    df = df.select(
        pl.col('IID'),pl.col('PROD_CD'),
        pl.col(metric+'1').alias(metric+'_1c'),
        span(range(1,4)).alias(metric+'_3c'),
        span(range(1,7)).alias(metric+'_6c'),
        span(range(1,13)).alias(metric+'_12c'),
        span(range(QTD_m+1,QTD_m+4)).alias(metric+'_pqtrc'),
        span(range(1,YTD+1)).alias(metric+'_ytdc'),

        pl.col(metric+'2').alias(metric+'_1p'),
        span(range(4,7)).alias(metric+'_3p'),
        span(range(7,13)).alias(metric+'_6p'),
        span(range(13,25)).alias(metric+'_12p'),
        span(range(QTD_m+4,QTD_m+7)).alias(metric+'_pqtrp'),
        span(range(13,13+YTD)).alias(metric+'_ytdp'),

        span(range(1,25)).alias(metric+'_all'),
    )

    # For Voucher Removal -
    if metric == 'TUF':
        windows = ['1c','3c','6c','12c','pqtrc','ytdc',
                   '1p','3p','6p','12p','pqtrp','ytdp','all']
        voucher_cols = [f'v{metric}_{w}' for w in windows]
        dfv = get_lin_voucher()
        # Left join: rows without a voucher record get null voucher columns,
        # which fill_null(0) turns into "nothing to subtract".
        df = df.join(dfv,on=['IID','PROD_CD'],how='left').fill_null(0)
        df = df.with_columns(
            [
                # Parenthesised so the alias names the difference itself; the
                # unparenthesised form aliases only the right-hand operand.
                (pl.col(f'{metric}_{w}') - pl.col(f'v{metric}_{w}')).alias(f'{metric}_{w}')
                for w in windows
            ]
        ).drop(voucher_cols)

    # Adding MP related columns
    # The join is only used to keep IIDs that have a geography; the MP
    # attribute columns themselves are dropped again before returning.
    df = df.join(mp_spec_seg_dec,on='IID',how='left').filter(pl.col('geography_id').is_not_null())

    return(df.drop(['specialty_group','segment','decile','geography_id']))

In [8]:
# RAW DATA PREP -

# Per (IID, product) window totals for total (TUF) and new (NUF) volumes.
all_products_tuf = get_summed_period_iid_metric('TUF', fetch_products)
all_products_nuf = get_summed_period_iid_metric('NUF', fetch_products)

# Attach NUF to TUF, translate PROD_CD into product_id and keep only the
# 1-period and previous-quarter windows used downstream (PROD_CD is not
# selected, so it disappears here).
all_products_tuf_nuf = (
    all_products_tuf
    .join(all_products_nuf, on=['IID', 'PROD_CD'], how='left')
    .join(
        prod_mapping.select(['code', 'product_id']),
        left_on='PROD_CD', right_on='code', how='left',
    )
    .select(
        ['IID', 'product_id']
        + [
            f'{metric}_{window}'
            for metric in ('TUF', 'NUF')
            for window in ('1c', '1p', 'pqtrc', 'pqtrp')
        ]
    )
)

# Totals across every fetched product per IID; used as the share denominator.
lax_tuf_nuf = all_products_tuf_nuf.group_by('IID').agg(
    [
        pl.col(f'{metric}_{window}').sum().alias(f'{metric}_{window}_LAX')
        for metric in ('TUF', 'NUF')
        for window in ('1c', 'pqtrc', '1p', 'pqtrp')
    ]
)

Functions 
---

In [9]:
def get_linzess_strength_columns(df,prod_id_list,prod):
    """Compute volume, change, share and trend-indicator columns for a product group.

    For every IID in `df`, the TUF/NUF window totals of the products in
    `prod_id_list` (taken from `all_products_tuf_nuf`) are summed, and for each
    of the four period/metric combinations (previous quarter and 1 period,
    TRx and NRx) the following columns are produced:

    * ``<prod>_<period>_<rx>_cur_vol``    - current volume
    * ``<prod>_<period>_<rx>_pri_vol``    - prior volume
    * ``<prod>_<period>_<rx>_vol_change`` - current minus prior
    * ``<prod>_<period>_<rx>_share``      - current volume divided by the
      all-product total in `lax_tuf_nuf`; only kept when ``prod == 'LIN'``
    * ``<prod>_<period>_vol_change_ind_<rx>`` - 'P' when change/prior > 0.02,
      'Q' when change/prior < -0.02, otherwise the feed's null marker
      (backslash followed by N)

    Parameters
    ----------
    df : pl.DataFrame
        Frame with an ``IID`` column; the result has one row per distinct IID.
    prod_id_list : list[int]
        ``product_id`` values that make up the group.
    prod : str
        Prefix for the generated column names.

    Returns
    -------
    pl.DataFrame
        ``IID`` followed by the volume columns and then the four indicators.

    Notes
    -----
    The zero-change test is evaluated before the ratio tests. With 0 current
    and 0 prior volume the ratio is 0/0 = NaN, and Polars orders NaN above
    every number, so a ratio-first ordering would label "no activity" as 'P'.
    Shares can still be NaN when the denominator is 0; that is left to the
    caller.
    """
    metric_labels = (('TUF', 'trx'), ('NUF', 'nrx'))
    periods = ('pqtr', '1')
    include_share = (prod == 'LIN')  # shares are only reported for the whole family

    # Sum the group's products per IID. IIDs with no matching rows come out
    # of the left join as nulls, which sum() treats as 0.
    group_rows = all_products_tuf_nuf.filter(pl.col('product_id').is_in(prod_id_list))
    totals = (
        df.join(group_rows, on='IID', how='left')
        .group_by('IID')
        .agg(
            [
                pl.col(f'{metric}_{period}{side}').sum().alias(f'{metric}_{period}{side}_prod')
                for metric, _ in metric_labels
                for period in periods
                for side in ('c', 'p')
            ]
        )
        .join(lax_tuf_nuf, on='IID', how='left')
    )

    # Volume / change / share expressions, in feed order:
    # pqtr-trx, pqtr-nrx, 1-trx, 1-nrx.
    expn_dict = {}
    for period in periods:
        for metric, rx in metric_labels:
            cur = pl.col(f'{metric}_{period}c_prod')
            pri = pl.col(f'{metric}_{period}p_prod')
            stem = f'{prod}_{period}_{rx}'
            expn_dict[f'{stem}_cur_vol'] = cur
            expn_dict[f'{stem}_pri_vol'] = pri
            expn_dict[f'{stem}_vol_change'] = cur - pri
            if include_share:
                expn_dict[f'{stem}_share'] = cur / pl.col(f'{metric}_{period}c_LAX')

    df2 = totals.select(pl.col('IID'), **expn_dict)

    # Trend indicators, same period/metric order as above.
    indicators = []
    for period in periods:
        for _, rx in metric_labels:
            change = pl.col(f'{prod}_{period}_{rx}_vol_change')
            pct_change = change / pl.col(f'{prod}_{period}_{rx}_pri_vol')
            indicators.append(
                pl.when(change == 0).then(pl.lit('\\N'))  # must precede the ratio tests (NaN)
                .when(pct_change > 0.02).then(pl.lit('P'))
                .when(pct_change < -0.02).then(pl.lit('Q'))
                .otherwise(pl.lit('\\N'))
                .alias(f'{prod}_{period}_vol_change_ind_{rx}')
            )

    return df2.with_columns(indicators)

Processing 
---

In [10]:
# Processing
# Spine of the feed: the IID / geography_id columns of mp_spec_seg_dec.
# NOTE(review): the sums below assume one row per IID here; duplicate IIDs
# would multiply rows in the joins inside get_linzess_strength_columns - confirm.
temp1 = mp_spec_seg_dec.select(['IID','geography_id'])

# NOTE(review): product ids 3/4/5 are taken to be LI1/LI2/LI3 as coded in
# productmapping_pybit.txt; the commented-out voucher_mapping in
# get_lin_voucher lists a different numbering - confirm.
LIN_cols = get_linzess_strength_columns(temp1, prod_id_list=[3, 4, 5], prod='LIN')
LI1_cols = get_linzess_strength_columns(temp1, prod_id_list=[3], prod='LI1')
LI2_cols = get_linzess_strength_columns(temp1, prod_id_list=[4], prod='LI2')
LI3_cols = get_linzess_strength_columns(temp1, prod_id_list=[5], prod='LI3')

# join all of them back to temp1 (family first, then each strength)
for _strength_cols in (LIN_cols, LI1_cols, LI2_cols, LI3_cols):
    temp1 = temp1.join(_strength_cols, on='IID', how='left')

# Every row of this feed is reported under Product_id 1.
temp1 = temp1.with_columns(Product_id=pl.lit(1))

In [11]:
# for filtering extra obs
# Volume columns of the LIN family: names starting with 'LIN' that are not
# indicator, share or change columns (i.e. the cur_vol / pri_vol columns).
check_cols = [
    col for col in temp1.columns
    if col.startswith('LIN')
    and not any(tag in col for tag in ('ind', 'share', 'change'))
]

# Keep only prescribers whose LIN volumes do not sum to zero.
temp1 = temp1.filter(pl.sum_horizontal(check_cols) != 0)

# convert NaN to 0.0
# A share is a ratio and is NaN when numerator and denominator are both 0.
# fill_nan is the dedicated API for this and does not depend on NaN being
# matched by value the way replace() does.
temp1 = temp1.with_columns(
    pl.col(
        ['LIN_pqtr_trx_share', 'LIN_pqtr_nrx_share', 'LIN_1_trx_share', 'LIN_1_nrx_share']
    ).fill_nan(0.0)
)

In [12]:
# PDRP Overrides -
# Prescribers flagged PDRPOptOutFlag == 'Y' in MASTER_UNI have every metric
# column replaced by the feed's null marker ('\N').
temp1 = temp1.join(MASTER_UNI.select(['IID','PDRPOptOutFlag']),on='IID',how='left')

# Select the metric columns by name rather than by position, so the override
# keeps working if the identifier columns are ever reordered or extended.
override_columns = [
    c for c in temp1.columns
    if c not in ('IID', 'geography_id', 'Product_id', 'PDRPOptOutFlag')
]
# NOTE(review): numeric metric columns are mixed with a string literal here, so
# the result relies on Polars coercing both branches to a common (string) type - confirm
# the rendered number format is what the feed consumer expects.
expression_list = [
    pl.when(pl.col('PDRPOptOutFlag')=='Y').then(pl.lit('\\N')).otherwise(pl.col(c)).alias(c)
    for c in override_columns
]
temp1 = temp1.with_columns(expression_list).drop('PDRPOptOutFlag')

In [13]:
# Making Data Feed Ready-

# Maps each computed column to its slot name in the feed layout.
# Naming of the source columns: <product>_<period>_<trx|nrx>_<measure>, where
# period is 'pqtr' or '1'. Only the LIN family carries share columns.
col_mapping = {
    # identifiers
    'IID':'Physician_ID',
    'geography_id':'Geography_id',
    # LIN family - NumberMetric1-19 (slots 5, 10, 15 are not mapped here)
    'LIN_pqtr_trx_cur_vol':'NumberMetric1',
    'LIN_pqtr_trx_pri_vol':'NumberMetric2',
    'LIN_pqtr_trx_vol_change':'NumberMetric3',
    'LIN_pqtr_trx_share':'NumberMetric4',
    'LIN_pqtr_nrx_cur_vol':'NumberMetric6',
    'LIN_pqtr_nrx_pri_vol':'NumberMetric7',
    'LIN_pqtr_nrx_vol_change':'NumberMetric8',
    'LIN_pqtr_nrx_share':'NumberMetric9',
    'LIN_1_trx_cur_vol':'NumberMetric11',
    'LIN_1_trx_pri_vol':'NumberMetric12',
    'LIN_1_trx_vol_change':'NumberMetric13',
    'LIN_1_trx_share':'NumberMetric14',
    'LIN_1_nrx_cur_vol':'NumberMetric16',
    'LIN_1_nrx_pri_vol':'NumberMetric17',
    'LIN_1_nrx_vol_change':'NumberMetric18',
    'LIN_1_nrx_share':'NumberMetric19',
    # LI1 - NumberMetric21-32
    'LI1_pqtr_trx_cur_vol':'NumberMetric21',
    'LI1_pqtr_trx_pri_vol':'NumberMetric22',
    'LI1_pqtr_trx_vol_change':'NumberMetric23',
    'LI1_pqtr_nrx_cur_vol':'NumberMetric24',
    'LI1_pqtr_nrx_pri_vol':'NumberMetric25',
    'LI1_pqtr_nrx_vol_change':'NumberMetric26',
    'LI1_1_trx_cur_vol':'NumberMetric27',
    'LI1_1_trx_pri_vol':'NumberMetric28',
    'LI1_1_trx_vol_change':'NumberMetric29',
    'LI1_1_nrx_cur_vol':'NumberMetric30',
    'LI1_1_nrx_pri_vol':'NumberMetric31',
    'LI1_1_nrx_vol_change':'NumberMetric32',
    # LI2 - NumberMetric81-92 (note: numbered after LI3 in the feed)
    'LI2_pqtr_trx_cur_vol':'NumberMetric81',
    'LI2_pqtr_trx_pri_vol':'NumberMetric82',
    'LI2_pqtr_trx_vol_change':'NumberMetric83',
    'LI2_pqtr_nrx_cur_vol':'NumberMetric84',
    'LI2_pqtr_nrx_pri_vol':'NumberMetric85',
    'LI2_pqtr_nrx_vol_change':'NumberMetric86',
    'LI2_1_trx_cur_vol':'NumberMetric87',
    'LI2_1_trx_pri_vol':'NumberMetric88',
    'LI2_1_trx_vol_change':'NumberMetric89',
    'LI2_1_nrx_cur_vol':'NumberMetric90',
    'LI2_1_nrx_pri_vol':'NumberMetric91',
    'LI2_1_nrx_vol_change':'NumberMetric92',
    # LI3 - NumberMetric69-80
    'LI3_pqtr_trx_cur_vol':'NumberMetric69',
    'LI3_pqtr_trx_pri_vol':'NumberMetric70',
    'LI3_pqtr_trx_vol_change':'NumberMetric71',
    'LI3_pqtr_nrx_cur_vol':'NumberMetric72',
    'LI3_pqtr_nrx_pri_vol':'NumberMetric73',
    'LI3_pqtr_nrx_vol_change':'NumberMetric74',
    'LI3_1_trx_cur_vol':'NumberMetric75',
    'LI3_1_trx_pri_vol':'NumberMetric76',
    'LI3_1_trx_vol_change':'NumberMetric77',
    'LI3_1_nrx_cur_vol':'NumberMetric78',
    'LI3_1_nrx_pri_vol':'NumberMetric79',
    'LI3_1_nrx_vol_change':'NumberMetric80',
    # change indicators - StringMetric1-8 (LIN, LI1), 25-28 (LI2), 21-24 (LI3)
    'LIN_pqtr_vol_change_ind_trx':'StringMetric1',
    'LIN_pqtr_vol_change_ind_nrx':'StringMetric2',
    'LIN_1_vol_change_ind_trx':'StringMetric3',
    'LIN_1_vol_change_ind_nrx':'StringMetric4',
    'LI1_pqtr_vol_change_ind_trx':'StringMetric5',
    'LI1_pqtr_vol_change_ind_nrx':'StringMetric6',
    'LI1_1_vol_change_ind_trx':'StringMetric7',
    'LI1_1_vol_change_ind_nrx':'StringMetric8',
    'LI2_pqtr_vol_change_ind_trx':'StringMetric25',
    'LI2_pqtr_vol_change_ind_nrx':'StringMetric26',
    'LI2_1_vol_change_ind_trx':'StringMetric27',
    'LI2_1_vol_change_ind_nrx':'StringMetric28',
    'LI3_pqtr_vol_change_ind_trx':'StringMetric21',
    'LI3_pqtr_vol_change_ind_nrx':'StringMetric22',
    'LI3_1_vol_change_ind_trx':'StringMetric23',
    'LI3_1_vol_change_ind_nrx':'StringMetric24'
}
# Rename the computed columns to their feed slot names.
final_feed = temp1.rename(col_mapping)

# required new columns for feed
# Slots that have no metric behind them are still part of the layout and are
# sent as the null marker: NumberMetric 5/10/15/20 and 33-68, StringMetric 9-20.
cols_to_addna = (
    [f'NumberMetric{i}' for i in (5, 10, 15, 20, *range(33, 69))]
    + [f'StringMetric{i}' for i in range(9, 21)]
)
final_feed = final_feed.with_columns(
    [pl.lit('\\N').alias(my_col) for my_col in cols_to_addna]
)

# rearranging columns according to feed.
req_cols = variable_names = (
    ['Physician_ID', 'Geography_id', 'Product_id']
    + [f'NumberMetric{i}' for i in range(1, 93)]
    + [f'StringMetric{i}' for i in range(1, 29)]
)
final_feed = final_feed.select(req_cols)  # final_dataset

In [13]:
#Exporting Feeds-
# NOTE(review): the output bucket is hard-coded here while every input path is
# built from the `bucket` setting in vars_wk.json - confirm this is intentional.
OUT = 's3://vortex-staging-a65ced90/BIT/output/DenormalizedPrescriber/Monthly/'

# Pipe-delimited text with CRLF line endings and no index column.
final_feed.to_pandas().to_csv(
    f'{OUT}Monthly_DenormalizedPrescriber_MetricPerformance_Feed.txt',
    sep='|',
    lineterminator='\r\n',
    index=False,
)
print('Denorm Presc Metric Feed Exported !')

Denorm Presc Metric Feed Exported !


---