### LinzessSnapshot Sales Performance

##### pending :
- Rounding off values
- Figure out a way to convert "x.0" values into just "x" -> differing round-off is observed in SAS

In [1]:
import polars as pl
import pandas as pd
import gc
import json

In [2]:
# Load the run parameters from the JSON config file
with open('vars_wk.json', 'r') as json_file:
    js = json.load(json_file)

# The three keys are read in the same order as before: data_date, num_weeks_rx, bucket
data_date, num_weeks_rx, bucket = (
    js[_key] for _key in ('data_date', 'num_weeks_rx', 'bucket')
)

# S3 prefixes derived from the config values
dflib = f's3://{bucket}/BIT/dataframes/'
xpn = f's3://{bucket}/PYADM/weekly/archive/{data_date}/xponent/'

In [3]:
# Utility Functions -
def load(df, lib=dflib):
    """Read ``<lib><df>.parquet`` with polars and bind it as a module global.

    Args:
        df: Base name of the parquet file; also the name of the global
            variable that will hold the resulting DataFrame.
        lib: Path prefix the file name is appended to. The default is the
            value ``dflib`` had when this function was defined.

    Returns:
        None. The DataFrame is stored in ``globals()[df]``.
    """
    parquet_path = f'{lib}{df}.parquet'
    globals()[df] = pl.read_parquet(parquet_path)

In [4]:
# Importing dependencies
# Pipe-separated product mapping file
prod_mapping = pl.read_csv(
    f's3://{bucket}/BIT/docs/productmapping_pybit.txt',
    separator='|',
)
# Excel workbook read through pandas, then converted to polars
geo_code_mapper = pl.from_pandas(
    pd.read_excel(f's3://{bucket}/BIT/docs/geo_id_full.xlsx')
)
# load() binds each parquet file as a module-level DataFrame of the same name
for _frame_name in ('mp_spec_seg_dec', 'MASTER_UNI'):
    load(_frame_name)
# PROD_CD values pulled from LAX.parquet
fetch_products = ['LI1','LI2','LI3','TRU','AMT','LAC','MOT','LUB','IRL']

### Generator Functions -

In [7]:
# For Voucher Removal - 
def get_lin_voucher():
    """Build voucher TUF window totals per IID for the LIN1/LIN2/LIN3 prefixes.

    Reads ``LIN_VOUCHER.parquet`` from ``xpn``. For each prefix it derives the
    columns ``vTUF_1c``, ``vTUF_4c``, ``vTUF_13c``, ``vTUF_26c``,
    ``vTUF_qtdc`` (current windows), ``vTUF_1p``, ``vTUF_4p``, ``vTUF_13p``,
    ``vTUF_26p``, ``vTUF_qtdp`` (prior windows) and ``vTUF_all`` (weeks
    1-105), then tags the rows with ``PROD_CD`` = ``LI1``/``LI2``/``LI3``.

    Returns:
        One polars DataFrame with the three per-prefix frames stacked
        vertically, column order ``IID``, the eleven ``vTUF_*`` columns,
        ``PROD_CD``, and every null replaced by 0.
    """
    voucher = pl.read_parquet(f'{xpn}LIN_VOUCHER.parquet') # n_rows=500

    def window(prefix, first_week, last_week, out_name):
        # Row-wise total of <prefix>TUF<first_week> .. <prefix>TUF<last_week>, inclusive
        week_cols = [f'{prefix}TUF{wk}' for wk in range(first_week, last_week + 1)]
        return pl.sum_horizontal(week_cols).alias(out_name)

    per_product = []
    for prefix in ('LIN1', 'LIN2', 'LIN3'):  # LINV is not included
        per_product.append(
            voucher.select(
                pl.col('IID'),
                # current windows
                pl.col(f'{prefix}TUF1').alias('vTUF_1c'),
                window(prefix, 1, 4, 'vTUF_4c'),
                window(prefix, 1, 13, 'vTUF_13c'),
                window(prefix, 1, 26, 'vTUF_26c'),
                window(prefix, 1, num_weeks_rx, 'vTUF_qtdc'),
                # prior windows
                pl.col(f'{prefix}TUF2').alias('vTUF_1p'),
                window(prefix, 5, 8, 'vTUF_4p'),
                window(prefix, 14, 26, 'vTUF_13p'),
                window(prefix, 27, 52, 'vTUF_26p'),
                window(prefix, 14, 13 + num_weeks_rx, 'vTUF_qtdp'),
                # every available week
                window(prefix, 1, 105, 'vTUF_all'),
            )
            # LIN1 -> LI1, LIN2 -> LI2, LIN3 -> LI3
            .with_columns(pl.lit(f'LI{prefix[-1]}').alias('PROD_CD'))
        )

    # (A PROD_CD -> product_id mapping, LI1:4 / LI2:5 / LI3:3 / LIV:2, was
    # previously sketched here but is not applied.)
    return pl.concat(per_product).fill_null(0)

In [8]:
def get_summed_period_iid_metric(metric,prod_cd):
    """Return per-IID/PROD_CD window totals of a weekly metric from LAX.parquet.

    Reads ``IID``, ``PROD_CD`` and the columns ``<metric>1`` .. ``<metric>105``
    from ``LAX.parquet`` under ``xpn``, keeps the rows whose ``PROD_CD`` is in
    ``prod_cd`` and collapses the weekly columns into:

    * current windows ``<metric>_1c``, ``_4c``, ``_13c``, ``_26c``, ``_qtdc``
      (weeks 1, 1-4, 1-13, 1-26, 1-``num_weeks_rx``),
    * prior windows ``<metric>_1p``, ``_4p``, ``_13p``, ``_26p``, ``_qtdp``
      (weeks 2, 5-8, 14-26, 27-52, 14-(13+``num_weeks_rx``)),
    * ``<metric>_all`` (weeks 1-105).

    When ``metric`` is ``'TUF'`` the matching ``vTUF_*`` totals returned by
    ``get_lin_voucher()`` are subtracted from each window (left join on
    ``IID``/``PROD_CD``, nulls filled with 0 before subtracting).

    Finally only rows whose ``IID`` has a non-null ``geography_id`` in
    ``mp_spec_seg_dec`` are kept.

    Args:
        metric: Column prefix of the weekly metric, e.g. ``'TUF'``.
        prod_cd: Collection of ``PROD_CD`` values to keep.

    Returns:
        polars DataFrame with ``IID``, ``PROD_CD`` and the eleven window
        columns.
    """
    def week_cols(first_week, last_week):
        # Names of <metric><first_week> .. <metric><last_week>, inclusive
        return [f'{metric}{wk}' for wk in range(first_week, last_week + 1)]

    columns = ['IID','PROD_CD'] + week_cols(1, 105)
    df = pl.read_parquet(xpn+'LAX.parquet',columns=columns).filter(pl.col('PROD_CD').is_in(prod_cd))

    # 1,4,13,26 for current and prior period for a given Metric
    df = df.select(
        pl.col('IID'),pl.col('PROD_CD'),
        pl.col(f'{metric}1').alias(f'{metric}_1c'),
        pl.sum_horizontal(week_cols(1, 4)).alias(f'{metric}_4c'),
        pl.sum_horizontal(week_cols(1, 13)).alias(f'{metric}_13c'),
        pl.sum_horizontal(week_cols(1, 26)).alias(f'{metric}_26c'),
        pl.sum_horizontal(week_cols(1, num_weeks_rx)).alias(f'{metric}_qtdc'),

        pl.col(f'{metric}2').alias(f'{metric}_1p'),
        pl.sum_horizontal(week_cols(5, 8)).alias(f'{metric}_4p'),
        pl.sum_horizontal(week_cols(14, 26)).alias(f'{metric}_13p'),
        pl.sum_horizontal(week_cols(27, 52)).alias(f'{metric}_26p'),
        pl.sum_horizontal(week_cols(14, 13 + num_weeks_rx)).alias(f'{metric}_qtdp'),

        pl.sum_horizontal(week_cols(1, 105)).alias(f'{metric}_all')
    )

    # For Voucher Removal -
    if metric == 'TUF':
        window_suffixes = ('1c','4c','13c','26c','qtdc','1p','4p','13p','26p','qtdp','all')
        dfv = get_lin_voucher()
        # Everything between the leading IID and trailing PROD_CD is a voucher column
        cols_to_remove = dfv.columns[1:-1]
        df = df.join(dfv,on=['IID','PROD_CD'],how='left').fill_null(0)
        # The alias must wrap the whole difference. Previously it was attached
        # only to the right-hand operand, so the output name was correct solely
        # because polars names a binary expression after its left operand.
        df = df.with_columns(
            [
                (pl.col(f'{metric}_{sfx}') - pl.col(f'v{metric}_{sfx}')).alias(f'{metric}_{sfx}')
                for sfx in window_suffixes
            ]
        ).drop(cols_to_remove)

    # Adding MP related columns (used only to filter on geography_id, then dropped)
    df = df.join(mp_spec_seg_dec,on='IID',how='left').filter(pl.col('geography_id').is_not_null())

    return df.drop(['specialty_group','segment','decile','geography_id'])



In [9]:
def add_parent_product_rows(df):
    """Append aggregated parent-product and all-product rows to a metric frame.

    ``df`` is expected to have ``IID`` and ``PROD_CD`` as its first two
    columns; every later column is summed when aggregating.

    Steps:
        1. Map ``PROD_CD`` to ``product_id`` / ``parent_product_id`` through
           ``prod_mapping`` (left join on ``code``).
        2. For rows whose ``parent_product_id`` is 2 or 35, sum per
           ``IID``/parent and use the parent id as ``product_id``.
        3. Sum every row per ``IID`` and label the result ``product_id`` 1.
        4. Stack the original rows (without ``PROD_CD`` and
           ``parent_product_id``), the parent rows and the total rows.

    Returns:
        polars DataFrame keyed by ``IID`` and ``product_id``.
    """
    # One sum() aggregation per value column
    metric_sums = {name: pl.col(name).sum() for name in df.columns[2:]}

    #join_cols = ['geography_id','plan_type','PlanID','IID']

    mapped = df.join(
        prod_mapping[['code','product_id','parent_product_id']],
        left_on = 'PROD_CD',
        right_on = 'code',
        how = 'left',
    )

    parent_rows = (
        mapped.filter(pl.col('parent_product_id').is_in([2,35]))
        .group_by(['IID','parent_product_id'])
        .agg(**metric_sums)
        .rename({'parent_product_id':'product_id'})
    )

    total_rows = (
        mapped.group_by('IID')
        .agg(**metric_sums)
        .with_columns(product_id = pl.lit(1))
        .with_columns(pl.col('product_id').cast(pl.Int64))
    )

    # Drop the mapping helpers so all three frames share one layout before stacking
    base_rows = mapped.drop(['PROD_CD','parent_product_id'])
    layout = base_rows.columns
    stacked = base_rows.vstack(parent_rows.select(layout))
    return stacked.vstack(total_rows.select(layout))

In [10]:
# Raw data prep (ETA ~21 seconds)
# Step 1: window totals per IID / PROD_CD for each of the six metrics
(
    all_products_tuf, all_products_nuf, all_products_trx,
    all_products_nrx, all_products_tun, all_products_nun,
) = (
    get_summed_period_iid_metric(_metric, fetch_products)
    for _metric in ('TUF', 'NUF', 'TRX', 'NRX', 'TUN', 'NUN')
)

# Step 2: switch to product_id and append the parent / all-product rows
(
    all_products_tuf, all_products_nuf, all_products_trx,
    all_products_nrx, all_products_tun, all_products_nun,
) = (
    add_parent_product_rows(_frame)
    for _frame in (
        all_products_tuf, all_products_nuf, all_products_trx,
        all_products_nrx, all_products_tun, all_products_nun,
    )
)

---

### Functions - 

In [25]:
# Cur Vol, Vol Change
def process_1(df,metric,name):
    """Join current volume, prior volume and their difference onto ``df``.

    Uses the module-level frame ``all_products_<metric lower-cased>`` filtered
    to ``product_id == 2`` and the module-level ``period`` suffix (e.g.
    ``'_4'``) to pick the ``<metric><period>c`` / ``<metric><period>p``
    columns.

    Args:
        df: Frame with an ``IID`` column to join onto.
        metric: Metric prefix, e.g. ``'TUF'``.
        name: Label used in the output column names.

    Returns:
        ``df`` left-joined on ``IID`` with ``cur_<name>_vol``,
        ``<name>_vol_change`` and ``pri_<name>_vol``.
    """
    cur_col = f'cur_{name}_vol'
    pri_col = f'pri_{name}_vol'
    change_col = f'{name}_vol_change'

    source = globals()[f'all_products_{metric.lower()}'].filter(pl.col('product_id')==2)
    source = source.rename({f'{metric}{period}c' : cur_col, f'{metric}{period}p' : pri_col})
    source = source.with_columns((pl.col(cur_col)-pl.col(pri_col)).alias(change_col))
    source = source.select(['IID', cur_col, change_col, pri_col])

    return df.join(source,on='IID',how='left')

In [52]:
#run_rate
def process_2(df,metric,name):
    """Join the 4-week vs 13-week run rate onto ``df``.

    Uses the module-level frame ``all_products_<metric lower-cased>`` filtered
    to ``product_id == 2``. The run rate is 0 when both ``<metric>_4c`` and
    ``<metric>_4p`` are 0, otherwise ``<metric>_4c * 13 / 4 - <metric>_13c``.

    Only ``IID`` and the run-rate column are joined. Previously every window
    column of the source frame (and its ``product_id``) was carried into
    ``df``, which bloated the frame and produced a suffixed duplicate
    ``product_id`` on the second call.

    Args:
        df: Frame with an ``IID`` column to join onto.
        metric: Metric prefix, e.g. ``'TUF'``.
        name: Label used in the output column name.

    Returns:
        ``df`` left-joined on ``IID`` with ``run_rate_<name>_4v13``.
    """
    run_rate_col = f'run_rate_{name}_4v13'
    cur_4wk = pl.col(f'{metric}_4c')
    pri_4wk = pl.col(f'{metric}_4p')
    cur_13wk = pl.col(f'{metric}_13c')

    source_df = (
        globals()[f'all_products_{metric.lower()}'].filter(pl.col('product_id')==2)
        .with_columns(
            pl.when((cur_4wk==0)&(pri_4wk==0)).then(pl.lit(0))
            .otherwise(
                # 4-week volume scaled to 13 weeks, minus the actual 13-week volume
                ((cur_4wk * 13) / 4) - cur_13wk
            )
            .alias(run_rate_col)
        )
        # Keep the join narrow, matching the other process_* helpers
        .select(['IID', run_rate_col])
    )
    return df.join(source_df,on='IID',how='left')

In [27]:
# Refill Rate : TUF/NUF
def process_3(df):
    """Join ``refill_rate`` (current TUF divided by current NUF) onto ``df``.

    ``all_products_tuf`` is left-joined with ``all_products_nuf`` on ``IID``
    and the column named by the module-level ``p``, restricted to
    ``product_id == 2``; the ratio uses the ``TUF<period>c`` / ``NUF<period>c``
    columns selected by the module-level ``period`` suffix.

    Returns:
        ``df`` left-joined on ``IID`` with ``refill_rate``.
    """
    combined = all_products_tuf.join(all_products_nuf,on=['IID',p],how='left')
    combined = combined.filter(pl.col('product_id')==2)

    ratio = (pl.col(f'TUF{period}c')/pl.col(f'NUF{period}c')).alias('refill_rate')
    source = combined.with_columns(ratio).select(['IID','refill_rate'])

    return df.join(source,on='IID',how='left')


In [28]:
# percentage of Trx of Linzess Strengths
def process_4(df):
    """Add the share of product ids 3, 4 and 5 in current and prior TRx volume.

    The ``TUF<period>c`` and ``TUF<period>p`` values of ``all_products_tuf``
    rows with ``product_id`` 3, 4 or 5 are pivoted to one column per product
    id and divided by ``cur_trx_vol`` / ``pri_trx_vol``, which must already be
    present in ``df``.

    Output columns: ``prc_trx_lin72`` (id 3), ``prc_trx_lin145`` (id 4),
    ``prc_trx_lin290`` (id 5) and their ``pri_``-prefixed prior counterparts.
    NaN values across the frame are replaced with 0.
    """
    strengths = all_products_tuf.filter(pl.col('product_id').is_in([3,4,5]))
    # output label -> pivoted product-id column name
    label_to_id = {'lin72': '3', 'lin145': '4', 'lin290': '5'}

    def pivot_by_product(value_col, tag):
        # One row per IID, one column per product id, each suffixed with _<tag>
        wide = (
            strengths.select(['IID', value_col, p])
            .pivot(index = 'IID', values = value_col, columns = p)
        )
        return wide.rename({c: f'{c}_{tag}' for c in wide.columns[1:]})

    cur_wide = pivot_by_product(f'TUF{period}c', 'cur')
    pri_wide = pivot_by_product(f'TUF{period}p', 'pri')

    cur_shares = [
        (pl.col(f'{pid}_cur')/pl.col('cur_trx_vol')).alias(f'prc_trx_{label}')
        for label, pid in label_to_id.items()
    ]
    pri_shares = [
        (pl.col(f'{pid}_pri')/pl.col('pri_trx_vol')).alias(f'pri_prc_trx_{label}')
        for label, pid in label_to_id.items()
    ]
    helper_cols = [f'{pid}_cur' for pid in label_to_id.values()] + [f'{pid}_pri' for pid in label_to_id.values()]

    result = df.join(cur_wide,on='IID',how='left').join(pri_wide,on='IID',how='left')
    result = result.with_columns(cur_shares).with_columns(pri_shares)
    return result.drop(helper_cols).fill_nan(0)

In [29]:
# Avg Trx Size , Size Change
def process_5(df):
    """Join average TRx size and its change versus the prior period onto ``df``.

    Average size is ``TUN<period>x / TRX<period>x`` for the current (``c``)
    and prior (``p``) windows, taken from ``all_products_tun`` left-joined
    with ``all_products_trx`` on ``IID`` and ``p``, for product id 2.

    Returns:
        ``df`` left-joined on ``IID`` with ``avg_trx_size`` and
        ``avg_trx_size_ch`` (current minus prior average size).
    """
    merged = all_products_tun.join(all_products_trx,on=['IID',p],how='left')
    merged = merged.filter(pl.col(p)==2)

    cur_size = (pl.col(f'TUN{period}c')/pl.col(f'TRX{period}c')).alias('avg_trx_size')
    pri_size = (pl.col(f'TUN{period}p')/pl.col(f'TRX{period}p')).alias('prior_avg_trx_size')
    size_change = (pl.col('avg_trx_size')-pl.col('prior_avg_trx_size')).alias('avg_trx_size_ch')

    source = (
        merged.with_columns(cur_size, pri_size)
        # The change needs the two averages materialised first
        .with_columns(size_change)
        .select(['IID','avg_trx_size','avg_trx_size_ch'])
    )
    return df.join(source,on='IID',how='left')



In [30]:
#prc 90 day trx
def process_6(df):
    """Join ``trx_90day_pct`` onto ``df``.

    Computed for product id 2 as ``-0.5 + TUN<period>c / (60 * TRX<period>c)``
    from ``all_products_tun`` left-joined with ``all_products_trx`` on
    ``IID`` and ``p``.

    NOTE(review): the formula equals solving ``TUN/TRX = 30*(1-x) + 90*x`` for
    ``x``, i.e. it looks like it assumes every script is either 30 or 90
    units -- confirm with the metric definition.

    Returns:
        ``df`` left-joined on ``IID`` with ``trx_90day_pct``.
    """
    merged = all_products_tun.join(all_products_trx,on=['IID',p],how='left')
    merged = merged.filter(pl.col(p)==2)

    units_per_60_trx = pl.col(f'TUN{period}c')/(60*pl.col(f'TRX{period}c'))
    pct_90day = (-(1/2) + units_per_60_trx).alias('trx_90day_pct')

    source = merged.with_columns(pct_90day).select(['IID','trx_90day_pct'])
    return df.join(source,on='IID',how='left')



In [31]:
#IBSC VOL, Vol Change, %Share
def process_7(df):
    """Add IBSC volume, its change and the share of ``df``'s TRx volume in it.

    The IBSC figures are the ``TUF<period>c`` / ``TUF<period>p`` values of the
    ``all_products_tuf`` rows with product id 1. ``df`` must already contain
    ``cur_trx_vol`` and ``pri_trx_vol``.

    Output columns: ``ibsc_vol``, ``ibsc_vol_change``, ``pri_ibsc_vol``,
    ``prc_ibsc_share`` (``cur_trx_vol / ibsc_vol``) and ``pri_prc_ibsc_share``
    (``pri_trx_vol / pri_ibsc_vol``). NaN values across the frame are replaced
    with 0.
    """
    cur_col = f'TUF{period}c'
    pri_col = f'TUF{period}p'

    market = all_products_tuf.filter(pl.col(p)==1)
    market = market.with_columns((pl.col(cur_col)-pl.col(pri_col)).alias('ibsc_vol_change'))
    market = market.rename({cur_col:'ibsc_vol', pri_col:'pri_ibsc_vol'})
    market = market.select(['IID','ibsc_vol','ibsc_vol_change','pri_ibsc_vol'])

    joined = df.join(market,on='IID',how='left')
    joined = joined.with_columns(
        (pl.col('cur_trx_vol')/pl.col('ibsc_vol')).alias('prc_ibsc_share'),
        (pl.col('pri_trx_vol')/pl.col('pri_ibsc_vol')).alias('pri_prc_ibsc_share'),
    )
    return joined.fill_nan(0)

In [32]:
#TRU : 6 , TAMT : 35, IRL : 37 | AMT alone is 7
def process_8(df):
    """Add the share of product ids 6, 35 and 37 in current and prior IBSC volume.

    The ``TUF<period>c`` and ``TUF<period>p`` values of the
    ``all_products_tuf`` rows with product id 6, 35 or 37 are pivoted to one
    column per product id and divided by ``ibsc_vol`` / ``pri_ibsc_vol``,
    which must already be present in ``df``.

    Output columns: ``prc_tru_share`` (id 6), ``prc_amt_share`` (id 35),
    ``prc_irl_share`` (id 37) and their ``pri_``-prefixed prior counterparts.
    NaN values across the frame are replaced with 0.
    """
    # output label -> product id
    label_to_id = {'tru': 6, 'amt': 35, 'irl': 37}
    product_ids = list(label_to_id.values())
    competitors = all_products_tuf.filter(pl.col(p).is_in(product_ids))

    def pivot_by_product(value_col):
        # One row per IID, one column per product id
        return (
            competitors.select(['IID', value_col, p])
            .pivot(index = 'IID', values = value_col, columns = p)
        )

    def add_suffix(frame, tag):
        return frame.rename({c: f'{c}_{tag}' for c in frame.columns[1:]})

    # Only the current-period pivot has its missing products filled with 0.0;
    # the prior-period pivot keeps nulls.
    cur_wide = pivot_by_product(f'TUF{period}c').with_columns(
        pl.col('6').fill_null(0.0),pl.col('37').fill_null(0.0),pl.col('35').fill_null(0.0)
    )
    cur_wide = add_suffix(cur_wide, 'cur')
    pri_wide = add_suffix(pivot_by_product(f'TUF{period}p'), 'pri')

    cur_shares = [
        (pl.col(f'{pid}_cur')/pl.col('ibsc_vol')).alias(f'prc_{label}_share')
        for label, pid in label_to_id.items()
    ]
    pri_shares = [
        (pl.col(f'{pid}_pri')/pl.col('pri_ibsc_vol')).alias(f'pri_prc_{label}_share')
        for label, pid in label_to_id.items()
    ]
    helper_cols = [f'{pid}_cur' for pid in product_ids] + [f'{pid}_pri' for pid in product_ids]

    result = df.join(cur_wide,on='IID',how='left').join(pri_wide,on='IID',how='left')
    result = result.with_columns(cur_shares).with_columns(pri_shares)
    return result.drop(helper_cols).fill_nan(0)

In [33]:
# #indicator metrics
def vol_change_ind(df,col,pri_col):
    """Add a ``<col>_ind`` flag marking large volume changes.

    The threshold is 1% of ``pri_col`` when that exceeds 2, otherwise 2. The
    flag is ``'P'`` when ``col`` is above the threshold, ``'Q'`` when it is
    below the negated threshold, and ``''`` otherwise (including when ``col``
    is null).

    Args:
        df: Frame containing ``col`` and ``pri_col``.
        col: Name of the change column to classify.
        pri_col: Name of the prior-volume column; it is dropped from the result.

    Returns:
        ``df`` with ``<col>_ind`` added and ``pri_col`` removed.
    """
    one_percent = pl.col(pri_col) * 0.01
    threshold = pl.when(one_percent > 2).then(one_percent).otherwise(2).alias('threshold')

    change = pl.col(col)
    limit = pl.col('threshold')
    indicator = (
        pl.when(change.is_null()).then(pl.lit(''))
        .when(change > limit).then(pl.lit('P'))
        .when(change < -1 * limit).then(pl.lit('Q'))
        .otherwise(pl.lit(''))
        .alias(f'{col}_ind')
    )

    flagged = df.with_columns(threshold).with_columns(indicator)
    return flagged.drop([pri_col,'threshold'])

In [34]:
def shr_change_ind(df,col,pri_col):
    """Add a ``<col>_ind`` flag marking share changes larger than 0.05.

    The change is ``col - pri_col``. The flag is ``''`` when the change is
    null or when either ``pri_col`` or ``col`` equals 0; otherwise ``'P'``
    for a change above 0.05, ``'Q'`` for a change below -0.05, and ``''`` in
    between.

    Args:
        df: Frame containing ``col`` and ``pri_col``.
        col: Name of the current share column.
        pri_col: Name of the prior share column; it is dropped from the result.

    Returns:
        ``df`` with ``<col>_ind`` added and ``pri_col`` removed.
    """
    delta = pl.col('change_col')
    limit = pl.col('threshold')
    blank = pl.lit('')

    indicator = (
        pl.when(delta.is_null()).then(blank)
        .when(pl.col(pri_col)==0).then(blank)
        .when(pl.col(col)==0).then(blank)
        .when(delta > limit).then(pl.lit('P'))
        .when(delta < -1 * limit).then(pl.lit('Q'))
        .otherwise(blank)
        .alias(f'{col}_ind')
    )

    staged = df.with_columns(pl.lit(0.05).alias('threshold'))
    staged = staged.with_columns(change_col = pl.col(col)-pl.col(pri_col))
    staged = staged.with_columns(indicator)
    return staged.drop([pri_col,'change_col','threshold'])

In [35]:
# To make feed Ready- 
def get_feed(df):
    """Rename the metric columns to the feed layout and order them.

    ``IID`` becomes ``Physician_ID`` and ``geography_id`` becomes
    ``Geography_id``; the numeric metrics become ``NumberMetric1`` ..
    ``NumberMetric19`` and the indicator columns ``StringMetric1`` ..
    ``StringMetric10``, following the positions in the lists below.

    Returns:
        polars DataFrame with columns ``Physician_ID``, ``Geography_id``,
        ``NumberMetric1..19``, ``StringMetric1..10`` in that order.
    """
    # Position in each list (1-based) is the metric number in the feed
    number_sources = [
        'cur_trx_vol',        # NumberMetric1
        'trx_vol_change',     # NumberMetric2
        'run_rate_trx_4v13',  # NumberMetric3
        'cur_nrx_vol',        # NumberMetric4
        'nrx_vol_change',     # NumberMetric5
        'run_rate_nrx_4v13',  # NumberMetric6
        'refill_rate',        # NumberMetric7
        'prc_trx_lin72',      # NumberMetric8
        'prc_trx_lin145',     # NumberMetric9
        'prc_trx_lin290',     # NumberMetric10
        'avg_trx_size',       # NumberMetric11
        'avg_trx_size_ch',    # NumberMetric12
        'trx_90day_pct',      # NumberMetric13
        'ibsc_vol',           # NumberMetric14
        'ibsc_vol_change',    # NumberMetric15
        'prc_ibsc_share',     # NumberMetric16
        'prc_tru_share',      # NumberMetric17
        'prc_amt_share',      # NumberMetric18
        'prc_irl_share',      # NumberMetric19
    ]
    string_sources = [
        'prc_trx_lin72_ind',    # StringMetric1
        'prc_trx_lin145_ind',   # StringMetric2
        'prc_trx_lin290_ind',   # StringMetric3
        'prc_ibsc_share_ind',   # StringMetric4
        'prc_tru_share_ind',    # StringMetric5
        'prc_amt_share_ind',    # StringMetric6
        'prc_irl_share_ind',    # StringMetric7
        'trx_vol_change_ind',   # StringMetric8
        'nrx_vol_change_ind',   # StringMetric9
        'ibsc_vol_change_ind',  # StringMetric10
    ]

    col_mapping = {'IID':'Physician_ID', 'geography_id':'Geography_id'}
    for position, source in enumerate(number_sources, start=1):
        col_mapping[source] = f'NumberMetric{position}'
    for position, source in enumerate(string_sources, start=1):
        col_mapping[source] = f'StringMetric{position}'

    # Insertion order of the mapping is exactly the feed's column order
    feed_columns = list(col_mapping.values())
    #.with_columns([pl.lit('\\N').alias(c) for c in [f'StringMetric{i}' for i in range(1,11)]])
    return df.rename(col_mapping).select(feed_columns)

### Period Loop -

In [53]:
# Short aliases that keep the formulas in the process_* functions compact
p = 'product_id'
sg = 'segment'
spc = 'specialty_group'
d = 'decile'
# Geography level columns
levels = [
    'geography_id',
    'region_geography_id',
    'area_geography_id',
    'nation_geography_id',
]
# Output prefix for the weekly LinzessSnapshot feeds
OUT = 's3://vortex-staging-a65ced90/BIT/output/LinzessSnapshot/Weekly/'

In [54]:
# One feed per period: 4-week window -> P2, 13-week window -> P3
for period_num,PN in zip([4,13],[2,3]):
    # `period` is read as a module-level global by the process_* functions
    period = f'_{period_num}'
    temp1 = mp_spec_seg_dec.select(['IID','geography_id'])

    # Volume and run-rate metrics: process_1 for TUF then NUF, then process_2 likewise
    for _step in (process_1, process_2):
        for _metric, _name in (('TUF','trx'), ('NUF','nrx')):
            temp1 = _step(temp1, _metric, _name)
    # Remaining metrics, in their original order
    for _step in (process_3, process_4, process_5, process_6, process_7, process_8):
        temp1 = _step(temp1)

    # Filter step - keep rows with a non-null, non-zero current TRx volume or TRx volume change
    # (alternative kept for reference: the same test on 'ibsc_vol' instead of 'cur_trx_vol')
    _cur_trx = pl.col('cur_trx_vol')
    _trx_change = pl.col('trx_vol_change')
    temp1 = temp1.filter(
        (_cur_trx.is_not_null() & (_cur_trx != 0)) | (_trx_change.is_not_null() & (_trx_change != 0))
    )

    # String metrics - volume change indicators
    for _name in ('trx', 'nrx', 'ibsc'):
        temp1 = vol_change_ind(temp1, f'{_name}_vol_change', f'pri_{_name}_vol')
    # String metrics - share change indicators
    for _share_col in (
        'prc_trx_lin72', 'prc_trx_lin145', 'prc_trx_lin290',
        'prc_ibsc_share', 'prc_tru_share', 'prc_amt_share', 'prc_irl_share',
    ):
        temp1 = shr_change_ind(temp1, _share_col, f'pri_{_share_col}')

    # Rounding fixes go here, before the PDRP override mixes '\N' strings into the columns
    temp1 = temp1.with_columns(pl.col('ibsc_vol_change').round(1))

    # PDRP overrides - every column after the first two becomes '\N' for opted-out IIDs
    temp1 = temp1.join(MASTER_UNI.select(['IID','PDRPOptOutFlag']),on='IID',how='left')
    override_columns = temp1.columns[2:]
    _opted_out = pl.col('PDRPOptOutFlag')=='Y'
    expression_list = [
        pl.when(_opted_out).then(pl.lit('\\N')).otherwise(pl.col(c)).alias(c)
        for c in override_columns
    ]
    temp1 = temp1.with_columns(expression_list).drop('PDRPOptOutFlag')

    feed_dataset = get_feed(temp1)

    # Exporting feeds - pipe-separated text with CRLF line endings
    OUT = 's3://vortex-staging-a65ced90/BIT/output/LinzessSnapshot/Weekly/'
    feed_dataset.to_pandas().to_csv(
        f'{OUT}Weekly_LinzessSnapshot_MetricPerformance_P{PN}_Feed.txt',
        sep='|',
        lineterminator='\r\n',
        index=False,
    )
    print(f'LS Sales Perf feed {PN} done')

LS Sales Perf feed 2 done
LS Sales Perf feed 3 done


---