# DenormalizedPrescriber MetricPerformance

In [1]:
import polars as pl
import pandas as pd
import gc
import json

In [2]:
# Run parameters for this refresh come from the local weekly-variables JSON.
with open('vars_wk.json', 'r') as json_file:
    js = json.load(json_file)

num_weeks_rx, data_date, bucket = (
    js[key] for key in ('num_weeks_rx', 'data_date', 'bucket')
)

# S3 prefixes built from the config values:
#   dflib - shared parquet dataframe library
#   xpn   - xponent extract archived under `data_date`
dflib = f's3://{bucket}/BIT/dataframes/'
xpn = f's3://{bucket}/PYADM/weekly/archive/{data_date}/xponent/'

In [3]:
# Utility Functions -
def load(df, lib=dflib):
    """Read `{lib}{df}.parquet` with polars and bind it as a module-level global.

    Parameters
    ----------
    df : str
        Base name of the parquet file; also the name of the global variable
        the resulting DataFrame is assigned to.
    lib : str
        Path prefix the file name is appended to. The default is the value
        `dflib` had when this function was defined.

    Returns
    -------
    None
        The frame is only reachable through the injected global `df` name.
    """
    parquet_path = f'{lib}{df}.parquet'
    frame = pl.read_parquet(parquet_path)
    # Side effect by design: callers refer to the frame by its file name.
    globals()[df] = frame

In [4]:
# Importing Dependencies
# `load` injects the frame as the global `mp_spec_seg_dec`; later cells read its
# IID, specialty_group, segment, decile and geography_id columns.
load('mp_spec_seg_dec')

# Pipe-delimited product mapping; the `code` and `product_id` columns are used below.
prod_mapping = pl.read_csv(
    f's3://{bucket}/BIT/docs/productmapping_pybit.txt',
    separator='|',
)

# PROD_CD values pulled from the LAX extract (they also make up the *_LAX totals).
fetch_products = [
    'LI1', 'LI2', 'LI3',
    'TRU', 'AMT', 'LAC',
    'MOT', 'LUB', 'IRL',
]

### Generator Functions

In [5]:
def get_summed_period_iid(prod_cd):
    """Sum the TUF/NUF buckets of the LAX extract into current/prior windows.

    Reads `xpn + 'LAX.parquet'` (IID, PROD_CD, TUF1..TUF26, NUF1..NUF26), keeps
    rows whose PROD_CD is in `prod_cd`, and collapses the 26 buckets of each
    metric into four totals:

        *_4c  : buckets 1-4      (current 4)
        *_13c : buckets 1-13     (current 13)
        *_4p  : buckets 5-8      (prior 4)
        *_13p : buckets 14-26    (prior 13)

    Rows are then restricted to IIDs that pick up a non-null `geography_id`
    from the global `mp_spec_seg_dec`.

    Parameters
    ----------
    prod_cd : collection of str
        PROD_CD values to keep.

    Returns
    -------
    pl.DataFrame
        IID, PROD_CD and the eight window totals (TUF_* then NUF_*), one row
        per matching source row.
        NOTE(review): if `mp_spec_seg_dec` can hold several rows per IID the
        join fans rows out — confirm IID is unique there.
    """
    metrics = ('TUF', 'NUF')
    bucket_cols = [f'{metric}{bucket}' for metric in metrics for bucket in range(1, 27)]

    weekly = pl.read_parquet(xpn + 'LAX.parquet', columns=['IID', 'PROD_CD'] + bucket_cols)
    weekly = weekly.filter(pl.col('PROD_CD').is_in(prod_cd))

    # Window suffix -> bucket numbers that are summed into it.
    windows = {
        '_4c': range(1, 5),
        '_13c': range(1, 14),
        '_4p': range(5, 9),
        '_13p': range(14, 27),
    }
    window_totals = [
        pl.sum_horizontal([f'{metric}{bucket}' for bucket in buckets]).alias(metric + suffix)
        for metric in metrics
        for suffix, buckets in windows.items()
    ]
    summed = weekly.select(pl.col('IID'), pl.col('PROD_CD'), *window_totals)

    # Left join + non-null geography keeps only prescribers that have a geography
    # in mp_spec_seg_dec; its columns are only needed for that check.
    with_mp = summed.join(mp_spec_seg_dec, on='IID', how='left')
    with_mp = with_mp.filter(pl.col('geography_id').is_not_null())

    return with_mp.drop(['specialty_group', 'segment', 'decile', 'geography_id'])

In [6]:
# Raw Data Prep -
# Window totals per (IID, product), with PROD_CD swapped for the numeric
# product_id taken from prod_mapping.
product_lookup = prod_mapping[['code', 'product_id']]
all_products_tuf_nuf = (
    get_summed_period_iid(fetch_products)
    .join(product_lookup, left_on='PROD_CD', right_on='code', how='left')
    .drop('PROD_CD')
)

# Per-IID totals across every fetched product; these are the share denominators.
lax_tuf_nuf = all_products_tuf_nuf.group_by('IID').agg(
    **{
        f'{metric}_{window}_LAX': pl.col(f'{metric}_{window}').sum()
        for metric in ('TUF', 'NUF')
        for window in ('4c', '13c', '4p', '13p')
    }
)

#### Functions

In [7]:
def _vol_change_indicator(prod, weeks, rx):
    """Build the volume-change indicator expression for one `{prod}_{weeks}_{rx}` block.

    'P' when change / prior volume is above +2%, 'Q' when it is below -2%,
    and the flat marker otherwise.
    """
    # NOTE(review): '//N' is kept byte-for-byte from the original, but the feed
    # cell fills missing metrics with '\\N' — confirm which marker is intended.
    flat_marker = '//N'
    threshold = 0.02

    change = pl.col(f'{prod}_{weeks}_{rx}_vol_change')
    prior = pl.col(f'{prod}_{weeks}_{rx}_pri_vol')
    pct_change = change / prior

    return (
        # Zero change is tested first: with 0 prior and 0 current volume the
        # ratio is 0/0 = NaN, and Polars orders NaN above every number, so a
        # ratio test run first could label an inactive prescriber 'P'.
        pl.when(change == 0).then(pl.lit(flat_marker))
        .when(pct_change > threshold).then(pl.lit('P'))
        .when(pct_change < -threshold).then(pl.lit('Q'))
        .otherwise(pl.lit(flat_marker))
        .alias(f'{prod}_{weeks}_vol_change_ind_{rx}')
    )


def get_linzess_strength_columns(df,prod_id_list,prod):
    """Compute 13- and 4-bucket volume metrics per IID for a set of products.

    Parameters
    ----------
    df : pl.DataFrame
        Frame with an `IID` column; every IID in it appears once in the result.
        Other columns are discarded by the group-by.
    prod_id_list : list
        `product_id` values (from the global `all_products_tuf_nuf`) whose
        volumes are summed together.
    prod : str
        Prefix for the output column names. Only `'LIN'` keeps the share columns.

    Returns
    -------
    pl.DataFrame
        `IID`, then for each of (13, trx), (13, nrx), (4, trx), (4, nrx):
        `{prod}_{weeks}_{rx}_cur_vol`, `_pri_vol`, `_vol_change` and (LIN only)
        `_share`, followed by the four `{prod}_{weeks}_vol_change_ind_{rx}`
        indicator columns. trx is sourced from TUF and nrx from NUF. Shares
        divide by the per-IID totals in the global `lax_tuf_nuf`.
    """
    period_cols = [
        f'{metric}_{window}'
        for metric in ('TUF', 'NUF')
        for window in ('4c', '13c', '4p', '13p')
    ]

    # Volumes for the requested products, summed to one row per IID. The left
    # join keeps IIDs that have no rows for these products.
    prod_volumes = all_products_tuf_nuf.filter(pl.col('product_id').is_in(prod_id_list))
    df = df.join(prod_volumes, on='IID', how='left')
    df = df.group_by('IID').agg(
        **{f'{col}_prod': pl.col(col).sum() for col in period_cols}
    )
    df = df.join(lax_tuf_nuf, on='IID', how='left')

    # Output block order: 13 trx, 13 nrx, 4 trx, 4 nrx.
    blocks = [(weeks, rx, src) for weeks in (13, 4) for rx, src in (('trx', 'TUF'), ('nrx', 'NUF'))]

    expn_dict = {}
    for weeks, rx, src in blocks:
        current = pl.col(f'{src}_{weeks}c_prod')
        prior = pl.col(f'{src}_{weeks}p_prod')
        market = pl.col(f'{src}_{weeks}c_LAX')
        stem = f'{prod}_{weeks}_{rx}'
        expn_dict[f'{stem}_cur_vol'] = current
        expn_dict[f'{stem}_pri_vol'] = prior
        expn_dict[f'{stem}_vol_change'] = current - prior
        expn_dict[f'{stem}_share'] = current / market

    # adding columns now :
    df2 = df.with_columns(**expn_dict).select(['IID']+list(expn_dict.keys()))

    df2 = df2.with_columns(
        [_vol_change_indicator(prod, weeks, rx) for weeks, rx, _ in blocks]
    )

    #dropping shr columns if not whole family
    if prod != 'LIN':
        df2 = df2.drop([col for col in df2.columns if 'share' in col])

    return df2

---

In [8]:
# Processing
# Prescriber/geography spine that the per-product metric blocks are attached to.
temp1 = mp_spec_seg_dec.select(['IID','geography_id'])

# One metric frame per product grouping; 'LIN' combines product_ids 3, 4 and 5
# and is the only one that keeps share columns.
LIN_cols = get_linzess_strength_columns(temp1, [3, 4, 5], 'LIN')
LI1_cols = get_linzess_strength_columns(temp1, [3], 'LI1')
LI2_cols = get_linzess_strength_columns(temp1, [4], 'LI2')
LI3_cols = get_linzess_strength_columns(temp1, [5], 'LI3')

# join all of them back to temp1
for strength_cols in (LIN_cols, LI1_cols, LI2_cols, LI3_cols):
    temp1 = temp1.join(strength_cols, on='IID', how='left')

# Every row of this feed carries the constant Product_id 1.
temp1 = temp1.with_columns(Product_id = pl.lit(1))

In [9]:
# for filtering extra obs
# Keep a prescriber only if at least one LIN volume / volume-change column is
# non-zero (indicator and share columns are not part of the check).
check_cols = [
    col for col in temp1.columns
    if col.startswith('LIN') and 'ind' not in col and 'share' not in col
]

# Each column is tested on its own instead of summing them: vol_change is
# cur - pri, so a row-wise sum collapses to 2 * cur per block and would drop
# prescribers whose only activity is in the prior period.
temp1 = temp1.filter(
    pl.any_horizontal([pl.col(col).fill_null(0) != 0 for col in check_cols])
)

In [10]:
# Making Data Feed Ready-

# (weeks, rx) blocks in the order the feed lays them out: 13 trx, 13 nrx, 4 trx, 4 nrx.
period_rx = [(weeks, rx) for weeks in (13, 4) for rx in ('trx', 'nrx')]

col_mapping = {
    'IID': 'Physician_ID',
    'geography_id': 'Geography_id',
    # LIN family: cur/pri/change/share per block, blocks start 5 slots apart
    # -> NumberMetric 1-4, 6-9, 11-14, 16-19.
    **{
        f'LIN_{weeks}_{rx}_{measure}': f'NumberMetric{5 * block + offset + 1}'
        for block, (weeks, rx) in enumerate(period_rx)
        for offset, measure in enumerate(('cur_vol', 'pri_vol', 'vol_change', 'share'))
    },
    # Individual strengths: cur/pri/change per block, 12 consecutive slots each
    # -> LI1: NumberMetric 21-32, LI2: 81-92, LI3: 69-80.
    **{
        f'{prod}_{weeks}_{rx}_{measure}': f'NumberMetric{first + 3 * block + offset}'
        for prod, first in (('LI1', 21), ('LI2', 81), ('LI3', 69))
        for block, (weeks, rx) in enumerate(period_rx)
        for offset, measure in enumerate(('cur_vol', 'pri_vol', 'vol_change'))
    },
    # Change indicators, one per block
    # -> LIN: StringMetric 1-4, LI1: 5-8, LI2: 25-28, LI3: 21-24.
    **{
        f'{prod}_{weeks}_vol_change_ind_{rx}': f'StringMetric{first + offset}'
        for prod, first in (('LIN', 1), ('LI1', 5), ('LI2', 25), ('LI3', 21))
        for offset, (weeks, rx) in enumerate(period_rx)
    },
}

#required new columns for feed
# Slots with no source metric: NumberMetric 5/10/15/20 and 33-68, StringMetric 9-20.
cols_to_addna = (
    [f'NumberMetric{slot}' for slot in (5, 10, 15, 20)]
    + [f'NumberMetric{slot}' for slot in range(33, 69)]
    + [f'StringMetric{slot}' for slot in range(9, 21)]
)

# Rename to feed names and fill every unused slot with the literal \N.
final_feed = temp1.rename(col_mapping).with_columns(
    [pl.lit('\\N').alias(slot_name) for slot_name in cols_to_addna]
)

# rearranging columns according to feed.
req_cols = variable_names = (
    ['Physician_ID', 'Geography_id', 'Product_id']
    + [f'NumberMetric{slot}' for slot in range(1, 93)]
    + [f'StringMetric{slot}' for slot in range(1, 29)]
)
final_feed = final_feed.select(req_cols)#final_dataset

In [11]:
#Exporting Feeds-
# NOTE(review): this output bucket is hard-coded while every other path in the
# notebook is built from the configured `bucket` — confirm this is intentional.
OUT = 's3://vortex-staging-a65ced90/BIT/output/DenormalizedPrescriber/Weekly/'

# index=False: the feed layout starts at Physician_ID; without it pandas writes
# its row index as an extra unnamed leading column.
final_feed.to_pandas().to_csv(
    f'{OUT}Weekly_DenormalizedPrescriber_MetricPerformance_Feed.txt',
    sep='|',
    index=False,
)
print('Denorm Presc Metric Feed Exported !')

Denorm Presc Metric Feed Exported !
