#### Prescriber View - Sales Performance pt1

In [1]:
import polars as pl
import pandas as pd
import gc
import json

In [2]:
# Read the run configuration from vars_wk.json (resolved against the working directory).
with open('vars_wk.json', 'r') as json_file:
    js = json.load(json_file)

# Pull out the three settings used by this notebook; a missing key raises KeyError.
# num_weeks_rx is the number of weekly columns summed for the `_qtd` period columns.
data_date, num_weeks_rx, bucket = (
    js[key] for key in ('data_date', 'num_weeks_rx', 'bucket')
)

# S3 prefixes: parquet dataframes read by load(), and the xponent archive for data_date.
dflib = f's3://{bucket}/BIT/dataframes/'
xpn = f's3://{bucket}/PYADM/weekly/archive/{data_date}/xponent/'

In [3]:
# Utility Functions -
def load(df, lib=dflib):
    """Read ``<lib><df>.parquet`` with Polars and bind it to a global named ``df``.

    Parameters
    ----------
    df : str
        Base file name without the ``.parquet`` extension. The same string is used
        as the name of the module-level global that is created or overwritten.
    lib : str
        Path prefix to read from. The default is the value ``dflib`` held when this
        function was defined.

    Returns
    -------
    None
        The frame is only reachable through the global this call binds.
    """
    parquet_path = f'{lib}{df}.parquet'
    frame = pl.read_parquet(parquet_path)
    globals()[df] = frame

In [4]:
# Importing dependencies: lookup tables and the product list used by the cells below.

# PROD_CD values kept when reading the LAX file.
fetch_products = ['LI1','LI2','LI3','TRU','AMT','LAC','MOT','LUB','IRL']

# Pipe-delimited product mapping (later cells use code, product_id, parent_product_id).
prod_mapping = pl.read_csv(
    f's3://{bucket}/BIT/docs/productmapping_pybit.txt',
    separator='|',
)

# Geography lookup: read from Excel through pandas, then converted to Polars.
geo_code_mapper = pl.from_pandas(
    pd.read_excel(f's3://{bucket}/BIT/docs/geo_id_full.xlsx')
)

# Binds the global `mp_spec_seg_dec` from the dataframe library.
load('mp_spec_seg_dec')

### Generator Functions -

In [5]:
def get_summed_period_iid_metric(metric,prod_cd):
    """Build per-(IID, PROD_CD) period totals for one weekly metric of the LAX file.

    Reads ``IID``, ``PROD_CD`` and the weekly columns ``<metric>1`` .. ``<metric>105``
    from ``xpn + 'LAX.parquet'``, keeps the rows whose ``PROD_CD`` is in ``prod_cd``
    and returns, in this order:

    * ``IID``, ``PROD_CD``
    * current period: ``_1c`` (week 1), ``_4c`` (1-4), ``_13c`` (1-13), ``_26c`` (1-26),
      ``_qtdc`` (1..num_weeks_rx)
    * prior period: ``_1p`` (week 2), ``_4p`` (5-8), ``_13p`` (14-26), ``_26p`` (27-52),
      ``_qtdp`` (14..13+num_weeks_rx)
    * ``_all`` (1-105)

    Every name above is prefixed with ``metric``. Only IIDs that match a row of the
    global ``mp_spec_seg_dec`` with a non-null ``geography_id`` are kept; the columns
    brought in by that join are dropped again before returning.

    NOTE(review): presumably week column 1 is the most recent week - confirm against
    the LAX layout.

    Parameters
    ----------
    metric : str
        Weekly column prefix, e.g. ``'TUF'``.
    prod_cd : list
        ``PROD_CD`` values to keep.
    """
    def weeks(first, last):
        # Weekly column names metric<first> .. metric<last>, both ends included.
        return [metric + str(week) for week in range(first, last + 1)]

    def total(first, last, suffix):
        # Row-wise sum across an inclusive week range, named metric + suffix.
        return pl.sum_horizontal(weeks(first, last)).alias(metric + suffix)

    weekly = pl.read_parquet(xpn+'LAX.parquet', columns=['IID','PROD_CD'] + weeks(1, 105))
    weekly = weekly.filter(pl.col('PROD_CD').is_in(prod_cd))

    # 1, 4, 13, 26 and QTD windows for the current period ...
    current_period = [
        pl.col(metric+'1').alias(metric+'_1c'),
        total(1, 4, '_4c'),
        total(1, 13, '_13c'),
        total(1, 26, '_26c'),
        total(1, num_weeks_rx, '_qtdc'),
    ]
    # ... and the matching windows for the prior period.
    prior_period = [
        pl.col(metric+'2').alias(metric+'_1p'),
        total(5, 8, '_4p'),
        total(14, 26, '_13p'),
        total(27, 52, '_26p'),
        total(14, 13 + num_weeks_rx, '_qtdp'),
    ]

    summed = weekly.select(
        pl.col('IID'), pl.col('PROD_CD'),
        *current_period,
        *prior_period,
        total(1, 105, '_all'),
    )

    # The join is used only as a filter: keep IIDs with a non-null geography_id.
    matched = summed.join(mp_spec_seg_dec, on='IID', how='left')
    matched = matched.filter(pl.col('geography_id').is_not_null())

    return matched.drop(['specialty_group','segment','decile','geography_id'])



In [6]:
def add_parent_product_rows(df):
    """Replace PROD_CD with product_id and append roll-up rows per IID.

    ``df`` is expected to have ``IID`` and ``PROD_CD`` as its first two columns and
    metric columns after them (every column from position 2 onward is summed).

    The result stacks three sets of rows, all with columns ``IID``, the metric
    columns, then ``product_id``:

    1. the input rows, with ``product_id`` looked up from ``prod_mapping`` by
       ``PROD_CD`` = ``code``;
    2. one row per (IID, parent_product_id) for parent ids 2 and 35, with the parent
       id stored as ``product_id``;
    3. one row per IID summing every input row, with ``product_id`` = 1.
    """
    metric_cols = df.columns[2:]
    summed_metrics = [pl.col(name).sum() for name in metric_cols]

    mapped = df.join(
        prod_mapping.select(['code','product_id','parent_product_id']),
        left_on='PROD_CD',
        right_on='code',
        how='left',
    )

    # Roll-ups for the two parent products.
    parent_rows = (
        mapped
        .filter(pl.col('parent_product_id').is_in([2,35]))
        .group_by(['IID','parent_product_id'])
        .agg(summed_metrics)
        .rename({'parent_product_id':'product_id'})
    )

    # Grand total per IID, stored under product_id 1 (Int64, matching the cast the stack needs).
    total_rows = (
        mapped
        .group_by('IID')
        .agg(summed_metrics)
        .with_columns(pl.lit(1).cast(pl.Int64).alias('product_id'))
    )

    # Drop the product-code columns so all three frames share one layout, then stack.
    product_rows = mapped.drop(['PROD_CD','parent_product_id'])
    layout = product_rows.columns

    return product_rows.vstack(parent_rows.select(layout)).vstack(total_rows.select(layout))

In [7]:
# Raw data prep (ETA ~21 seconds).
# For each metric: build the per-IID period sums, then append the parent-product and
# total rows. The six frames are independent of one another.
all_products_tuf = add_parent_product_rows(get_summed_period_iid_metric('TUF',fetch_products))
all_products_nuf = add_parent_product_rows(get_summed_period_iid_metric('NUF',fetch_products))
all_products_trx = add_parent_product_rows(get_summed_period_iid_metric('TRX',fetch_products))
all_products_nrx = add_parent_product_rows(get_summed_period_iid_metric('NRX',fetch_products))
all_products_tun = add_parent_product_rows(get_summed_period_iid_metric('TUN',fetch_products))
all_products_nun = add_parent_product_rows(get_summed_period_iid_metric('NUN',fetch_products))

### Functions ->

In [8]:
def process_1(df):
    """Attach current/prior TUF and NUF volumes for the active period and derive changes.

    Reads the globals ``p`` (product id column name), ``period`` (e.g. ``'_4'``),
    ``all_products_tuf`` and ``all_products_nuf``.

    Adds, in this order: ``product_id``, ``cur_vol_trx``, ``pri_vol_trx``,
    ``cur_vol_nrx``, ``pri_vol_nrx``, ``vol_change_*`` (current - prior) and
    ``prc_vol_growth_*`` (current / prior - 1). Rows without a product match are
    removed, as are rows where neither current volume is non-zero.
    """
    keys = ['IID',p]
    tuf_cur, tuf_pri = f'TUF{period}c', f'TUF{period}p'
    nuf_cur, nuf_pri = f'NUF{period}c', f'NUF{period}p'

    volumes = all_products_tuf.select(keys + [tuf_cur, tuf_pri]).join(
        all_products_nuf.select(keys + [nuf_cur, nuf_pri]),
        on=keys,
        how='left',
    )

    joined = df.join(volumes, on='IID', how='left')
    # Removes people with no rx data (no product row matched the IID).
    joined = joined.filter(pl.col(p).is_not_null())
    joined = joined.rename({
        tuf_cur: 'cur_vol_trx', tuf_pri: 'pri_vol_trx',
        nuf_cur: 'cur_vol_nrx', nuf_pri: 'pri_vol_nrx',
    })

    joined = joined.with_columns(
        vol_change_trx = pl.col('cur_vol_trx')-pl.col('pri_vol_trx'),
        vol_change_nrx = pl.col('cur_vol_nrx')-pl.col('pri_vol_nrx'),
        prc_vol_growth_trx = (pl.col('cur_vol_trx')/pl.col('pri_vol_trx'))-1,
        prc_vol_growth_nrx = (pl.col('cur_vol_nrx')/pl.col('pri_vol_nrx'))-1,
    )

    # Keep a row when at least one of the current volumes is non-zero.
    has_current_volume = (pl.col('cur_vol_trx')!=0) | (pl.col('cur_vol_nrx')!=0)
    return joined.filter(has_current_volume)

# Grower or Decliner (TYPE)	
def process_2(df):
    """Tag rows as top growers ('GROW') or top decliners ('DECL') on TRx volume change.

    Only rows with a non-zero, non-null prior TRx volume and a non-zero TRx volume
    change take part; everything else ends up with a null type.

    For each (region_geography_id, product_id) group the 10th and 90th percentiles
    of ``vol_change_trx`` are computed (linear interpolation). Then:

    * DECL - volume declined (change < 0) and the change is at or below the group's
      10th percentile;
    * GROW - volume grew (change > 0) and the change is at or above the group's
      90th percentile.

    The previous implementation had the two labels swapped relative to this
    definition (declines were tagged 'GROW', increases 'DECL').

    Returns ``df`` with ``TYPE_trx`` added and ``TYPE_nrx`` set to a copy of it.
    """
    id_cols = ['IID','geography_id','product_id']
    group_cols = ['region_geography_id','product_id']
    change = pl.col('vol_change_trx')

    candidates = (
        df.select(id_cols + ['cur_vol_trx','pri_vol_trx','vol_change_trx'])
        .join(geo_code_mapper.select(['geography_id','region_geography_id']),on='geography_id',how='left')
        # If prior volume is 0 (or missing) the IID does not contribute to grower/decliner.
        .filter((pl.col('pri_vol_trx')!=0) & (pl.col('pri_vol_trx').is_not_null()))
        .filter(change!=0)
    )

    # 10th and 90th percentile of the volume change for each region/product group.
    cutoffs = candidates.group_by(group_cols).agg(
        ten_perc = change.quantile(0.1,interpolation='linear'),
        nin_perc = change.quantile(0.9,interpolation='linear')
    )

    # QC dataset: inspect `labelled` when checking for inconsistencies.
    labelled = candidates.join(cutoffs,on=group_cols,how='left').with_columns(
        pl.when((change<0) & (change <= pl.col('ten_perc'))).then(pl.lit('DECL'))
        .when((change>0) & (change >= pl.col('nin_perc'))).then(pl.lit('GROW'))
        .otherwise(pl.lit(None))
        .alias('TYPE_trx')
    )

    return (
        df.join(labelled.select(id_cols + ['TYPE_trx']),on=id_cols,how='left')
        .with_columns(TYPE_nrx = pl.col('TYPE_trx'))
    )

# New Prescriber | PDRP | NC
def process_3(df):
    """Override TYPE_trx / TYPE_nrx with 'NP', 'NC' and 'PDRP' where they apply.

    Reads the globals ``p``, ``all_products_tuf``, ``all_products_nuf`` and binds the
    global ``MASTER_UNI`` through ``load`` on every call.

    Rules are applied in this order, so a later rule wins over an earlier one:

    1. 'NP'   - the all-weeks total equals the current volume (``TUF_all`` for TRx,
                ``NUF_all`` for NRx), i.e. no volume outside the current period.
    2. 'NC'   - ``vol_change_trx`` is 0.
    3. 'PDRP' - ``PDRPOptOutFlag`` is 'Y' in ``MASTER_UNI``.

    NOTE(review): rule 2 tests ``vol_change_trx`` for TYPE_nrx as well, whereas rule 1
    uses the NRx column for TYPE_nrx - confirm whether ``vol_change_nrx`` was intended.

    The helper columns joined in for these checks are dropped before returning, so the
    output has the same columns as ``df``.
    """
    load('MASTER_UNI')

    keys = ['IID',p]
    all_time = all_products_tuf.select(keys + ['TUF_all']).join(
        all_products_nuf.select(keys + ['NUF_all']),
        on=keys,
        how='left',
    )

    def override(type_col, condition, label):
        # Replace type_col with `label` where `condition` holds; otherwise keep its value.
        return (
            pl.when(condition)
            .then(pl.lit(label))
            .otherwise(pl.col(type_col))
            .alias(type_col)
        )

    no_older_trx = (pl.col('TUF_all')-pl.col('cur_vol_trx')) == 0
    no_older_nrx = (pl.col('NUF_all')-pl.col('cur_vol_nrx')) == 0
    unchanged = pl.col('vol_change_trx') == 0
    opted_out = pl.col('PDRPOptOutFlag')=='Y'

    enriched = (
        df.join(all_time,on=keys,how='left')
        .join(MASTER_UNI.select(['IID','PDRPOptOutFlag']),on='IID',how='left')
    )

    # (TRx condition, NRx condition, label), applied one stage at a time.
    stages = [
        (no_older_trx, no_older_nrx, 'NP'),
        (unchanged, unchanged, 'NC'),
        (opted_out, opted_out, 'PDRP'),
    ]
    for trx_condition, nrx_condition, label in stages:
        enriched = enriched.with_columns(
            override('TYPE_trx', trx_condition, label),
            override('TYPE_nrx', nrx_condition, label),
        )

    return enriched.drop(['TUF_all','NUF_all','PDRPOptOutFlag'])
	
#volume change indicator
def process_4(df):
    """Add the volume-change indicators ``vol_change_ind_trx`` / ``vol_change_ind_nrx``.

    For each metric the indicator is:

    * null - when the volume change is exactly 0;
    * 'P'  - when change / prior volume is greater than 0.02;
    * 'Q'  - when change / prior volume is less than -0.02;
    * null - otherwise (including a null change or ratio).

    The zero-change test runs first. Previously it came after the ratio tests, where
    it could never alter the result, and a row with change 0 and prior volume 0 was
    decided by comparing the 0/0 ratio (NaN) against the thresholds - Polars orders
    NaN above every number, so such a row could be flagged 'P'.
    """
    def indicator(metric):
        change = pl.col(f'vol_change_{metric}')
        growth = change / pl.col(f'pri_vol_{metric}')
        return (
            pl.when(change == 0).then(pl.lit(None, dtype=pl.String))
            .when(growth > 0.02).then(pl.lit('P'))
            .when(growth < -0.02).then(pl.lit('Q'))
            .otherwise(None)
            .alias(f'vol_change_ind_{metric}')
        )

    return df.with_columns(indicator('trx'), indicator('nrx'))
	
#current prior and share change
def process_5(df):
    """Add current/prior share, share change and share growth for TRx and NRx.

    The denominator is the per-IID sum of the corresponding volume column over every
    row of ``df`` for that IID. Columns are appended in this order:
    ``cur_shr_trx``, ``cur_shr_nrx``, ``pri_shr_trx``, ``pri_shr_nrx``,
    ``shr_change_trx``, ``shr_change_nrx``, ``prc_shr_growth_trx``,
    ``prc_shr_growth_nrx``. The intermediate market totals are not returned.

    NOTE(review): the frames built by add_parent_product_rows also contain roll-up
    rows (product_id 1 and parent ids 2/35); those rows are part of the per-IID sum
    used as the denominator here - confirm this is the intended market definition.
    """
    market_cols = ['mkt_TUF_c','mkt_TUF_p','mkt_NUF_c','mkt_NUF_p']

    market = df.group_by('IID').agg(
        pl.col('cur_vol_trx').sum().alias('mkt_TUF_c'),
        pl.col('pri_vol_trx').sum().alias('mkt_TUF_p'),
        pl.col('cur_vol_nrx').sum().alias('mkt_NUF_c'),
        pl.col('pri_vol_nrx').sum().alias('mkt_NUF_p'),
    )

    shares = df.join(market,on='IID',how='left').with_columns(
        (pl.col('cur_vol_trx')/pl.col('mkt_TUF_c')).alias('cur_shr_trx'),
        (pl.col('cur_vol_nrx')/pl.col('mkt_NUF_c')).alias('cur_shr_nrx'),
        (pl.col('pri_vol_trx')/pl.col('mkt_TUF_p')).alias('pri_shr_trx'),
        (pl.col('pri_vol_nrx')/pl.col('mkt_NUF_p')).alias('pri_shr_nrx'),
    )

    shares = shares.with_columns(
        (pl.col('cur_shr_trx')-pl.col('pri_shr_trx')).alias('shr_change_trx'),
        (pl.col('cur_shr_nrx')-pl.col('pri_shr_nrx')).alias('shr_change_nrx'),
    )

    # Market totals are only needed for the share ratios above.
    shares = shares.drop(market_cols)

    return shares.with_columns(
        ((pl.col('cur_shr_trx')/pl.col('pri_shr_trx'))-1).alias('prc_shr_growth_trx'),
        ((pl.col('cur_shr_nrx')/pl.col('pri_shr_nrx'))-1).alias('prc_shr_growth_nrx'),
    )
	
#share change indicator
def process_6(df):
    """Add the share-change indicators ``shr_change_ind_trx`` / ``shr_change_ind_nrx``.

    'P' when the share change is above 0.005, 'Q' when it is below -0.005, and null
    for everything in between (a change of exactly 0 included).
    """
    def indicator(metric):
        delta = pl.col(f'shr_change_{metric}')
        return (
            pl.when(delta > 0.005).then(pl.lit('P'))
            .when(delta < -0.005).then(pl.lit('Q'))
            .otherwise(None)
            .alias(f'shr_change_ind_{metric}')
        )

    return df.with_columns(indicator('trx'), indicator('nrx'))
	
#Trx Size Metrics (copied values for nrx)
def process_7(df):
    """Add the TRx size metrics for the active period (NRx gets copied/placeholder values).

    Reads the globals ``period``, ``all_products_tun`` and ``all_products_trx``.

    Columns appended, in order:

    * ``avg_trx_size_unit`` - current-period TUN
    * ``avg_trx_size_trx``  - current-period TRX
    * ``avg_trx_size``      - current TUN / current TRX
    * ``avg_trx_size_ch``   - ``avg_trx_size`` minus (prior TUN / prior TRX)
    * ``avg_nrx_size``      - copy of ``avg_trx_size``
    * ``avg_nrx_size_ch``   - the literal string ``\\N`` (the NRx change is not calculated)
    """
    keys = ['IID','product_id']
    tun_cur, tun_pri = f'TUN{period}c', f'TUN{period}p'
    trx_cur, trx_pri = f'TRX{period}c', f'TRX{period}p'

    sizes = all_products_tun.select(keys + [tun_cur, tun_pri]).join(
        all_products_trx.select(keys + [trx_cur, trx_pri]),
        on=keys,
        how='left',
    )

    cur_size = pl.col(tun_cur)/pl.col(trx_cur)
    pri_size = pl.col(tun_pri)/pl.col(trx_pri)

    with_sizes = df.join(sizes,on=keys,how='left').with_columns(
        cur_size.alias('avg_trx_size'),
        (cur_size - pri_size).alias('avg_trx_size_ch'),
        cur_size.alias('avg_nrx_size'),
        # Raw data columns are not copied for NRx; the NRx change is not to be calculated.
        pl.lit('\\N').alias('avg_nrx_size_ch'),
    )

    # Current-period raw values stay under feed-style names; prior-period ones are dropped.
    # The select restores the original column order (raw current values first).
    original_cols = df.columns
    return (
        with_sizes
        .rename({trx_cur: 'avg_trx_size_trx', tun_cur: 'avg_trx_size_unit'})
        .drop([tun_pri, trx_pri])
        .select(
            original_cols
            + ['avg_trx_size_unit','avg_trx_size_trx','avg_trx_size','avg_trx_size_ch',
               'avg_nrx_size','avg_nrx_size_ch']
        )
    )
	
#FIX - x should not be CUR_VOL; it should be CUR_TRX
#90-day TRx percentage (no values for NRx)
# trx_90day_pct   =  ((tuf_rx_&ce. - ((tuf_units_&ce. - 90*tuf_rx_&ce.) / -60)) / tuf_rx_&ce.)
# simplify z = (x-((y-90x)/-60))/x, where x = cur_vol, y = cur_tun
# you get z = -(1/2) + (y/60x)
def process_8(df):
    """Add ``trx_90day_pct`` (TRx only) and an all-null ``trx_90day_pct_nrx``.

    Reads the globals ``period`` and ``all_products_tun``.

    ``trx_90day_pct`` = -(1/2) + tuf_units / (60 * cur_vol_trx), where ``tuf_units``
    is the current-period TUN value for the (IID, product_id) pair. The helper column
    is removed again before returning.
    """
    keys = ['IID','product_id']

    units = all_products_tun.select(
        keys + [pl.col(f'TUN{period}c').alias('tuf_units')]
    )

    ninety_day_share = -(1/2) + (pl.col('tuf_units')/(60*pl.col('cur_vol_trx')))

    with_pct = df.join(units,on=keys,how='left').with_columns(
        ninety_day_share.alias('trx_90day_pct'),
        pl.lit(None).alias('trx_90day_pct_nrx'),
    )

    return with_pct.drop('tuf_units')

def get_benchmark_cols(df,metric,b_metric,b_name):
    """Attach a territory benchmark for ``b_metric`` and flag rows that exceed it.

    Reads the globals ``mp_spec_seg_dec``, ``terr_growths`` and the column-name
    aliases ``p``, ``spc``, ``sg``, ``d``.

    Parameters
    ----------
    df : polars.DataFrame
        Must contain ``IID``, ``geography_id``, the product id column and ``b_metric``.
    metric : str
        Suffix used in the output column names (the caller passes 'trx' or 'nrx').
    b_metric : str
        Column compared against the benchmark; the same column name is read from
        ``terr_growths``.
    b_name : str
        Benchmark label used in the output column names.

    Returns
    -------
    polars.DataFrame
        ``df`` plus ``Prc_Benchmark_<b_name>_<metric>`` (the ``terr_growths`` value for
        the row's geography, specialty group, segment and product) and
        ``<b_name>_Ind_<metric>``, which is 'L' when ``b_metric`` is greater than the
        benchmark and null otherwise.
    """
    benchmark_col = f'Prc_Benchmark_{b_name}_{metric}'
    indicator_col = f'{b_name}_Ind_{metric}'

    # Specialty group and segment are needed only as join keys for the benchmark lookup.
    attributes = mp_spec_seg_dec.drop('geography_id',d)
    benchmarks = terr_growths.select(
        'geography_id', p, spc, sg,
        pl.col(b_metric).alias(benchmark_col),
    )

    enriched = df.join(attributes,on='IID',how='left')
    enriched = enriched.join(benchmarks,on=['geography_id',spc,sg,p],how='left')

    enriched = enriched.with_columns(
        pl.when(pl.col(b_metric)>pl.col(benchmark_col))
        .then(pl.lit('L'))
        .otherwise(pl.lit(None))
        .alias(indicator_col)
    )

    return enriched.drop(spc,sg)

In [9]:
# For converting to Feed Ready Data -
def get_feed(temp1):
    """Reshape the wide per-prescriber frame into the long, feed-ready layout.

    ``temp1`` carries one row per (IID, geography_id, product_id) with paired
    ``*_trx`` / ``*_nrx`` columns. The output has two rows per input row - one with
    ``Metric`` = 'TRX' and one with ``Metric`` = 'NRX' - using the feed column names
    and order listed in ``req_cols`` below.

    Details:

    * the NRX rows reuse the TRX values for ``Avg_TRx_Size_TRx`` and
      ``Avg_TRx_Size_Unit``, take ``Avg_TRx_Size`` / ``Avg_TRx_Size_Change`` from the
      ``avg_nrx_size*`` columns, and carry the literal ``\\N`` in ``Ninty_Day_TRx_Prc``;
    * ``Avg_TRx_Size_Change`` and ``Ninty_Day_TRx_Prc`` are strings in the output so
      both halves share a schema;
    * ``ReportType`` is 'WEEKLY', ``Period`` is ``f'{period_num}-WEEK'`` (``period_num``
      is read from the module globals), and the redemption / frozen-competitor / DS
      columns are filled with the literal ``\\N``.

    Compared with the earlier version, the NRX half no longer asks ``drop`` to remove
    ``avg_trx_size_trx`` (the suffix filter has already removed it, and ``drop``
    raises on unknown columns in Polars releases where it is strict by default), and
    the TRX half is put into the shared column order explicitly instead of relying on
    the order produced upstream.
    """
    # Column order shared by both halves before they are stacked and renamed.
    stack_order = [
        'IID', 'geography_id', 'product_id',
        'cur_vol', 'pri_vol', 'vol_change', 'prc_vol_growth',
        'TYPE', 'vol_change_ind',
        'cur_shr', 'pri_shr', 'shr_change', 'prc_shr_growth', 'shr_change_ind',
        'avg_trx_size_unit', 'avg_trx_size_trx', 'avg_trx_size', 'avg_trx_size_ch',
        'trx_90day_pct',
        'Prc_Benchmark_Vol_Growth', 'Vol_Growth_Ind',
        'Prc_Benchmark_Shr_Growth', 'Shr_Growth_Ind',
        'Metric',
    ]

    def without_suffix(df, metric):
        # Keep every column whose name does not end with `metric`.
        return df.select([col for col in df.columns if not col.endswith(metric)])

    def strip_suffix(df, metric):
        # Remove the trailing '_trx' / '_nrx' (4 characters); 'avg_trx_size_trx' is a
        # real column name, not a suffixed one, so it is left alone.
        return df.rename({
            col: col[:-4]
            for col in df.columns
            if col.endswith(metric) and col != 'avg_trx_size_trx'
        })

    def addcol(df, columns_to_add, wtl):
        # Add each named column filled with the constant `wtl`.
        return df.with_columns([pl.lit(wtl).alias(my_col) for my_col in columns_to_add])

    # Give the NRX half its own copies of the two raw TRx-size columns.
    wide = temp1.with_columns(
        pl.col('avg_trx_size_trx').alias('avg_trx_size_trx_nrx'),
        pl.col('avg_trx_size_unit').alias('avg_trx_size_unit_nrx'),
    )

    # TRX half: drop everything NRx-specific, strip the suffix, align dtypes with NRX.
    final_feed_trx = without_suffix(wide, 'nrx').drop(['avg_nrx_size','avg_nrx_size_ch'])
    final_feed_trx = (
        strip_suffix(final_feed_trx, 'trx')
        .with_columns(
            pl.lit('TRX').alias('Metric'),
            pl.col('avg_trx_size_ch').cast(pl.String),
            pl.col('trx_90day_pct').cast(pl.String),
        )
        .select(stack_order)
    )

    # NRX half: drop the TRx-only columns that survive the suffix filter, strip the
    # suffix, then map the avg_nrx_* columns onto the shared names.
    final_feed_nrx = without_suffix(wide, 'trx').drop(
        ['avg_trx_size','avg_trx_size_ch','trx_90day_pct','avg_trx_size_unit']
    )
    final_feed_nrx = (
        strip_suffix(final_feed_nrx, 'nrx')
        .rename({
            'avg_nrx_size':'avg_trx_size',
            'avg_nrx_size_ch':'avg_trx_size_ch'
        })
        .with_columns(
            pl.lit('NRX').alias('Metric'),
            pl.lit('\\N').alias('trx_90day_pct'),
        )
        .select(stack_order)
    )

    final_feed = final_feed_trx.vstack(final_feed_nrx)

    # Rename existing columns to the feed's names.
    rnm_cols = {
        'IID':'Physician_ID',
        'geography_id':'Geography_id',
        'product_id':'Product_id',
        'cur_vol':'Current_Vol',
        'pri_vol':'Prior_Vol',
        'vol_change':'Vol_Change',
        'prc_vol_growth':'Prc_Vol_Growth',
        'TYPE':'Type',
        'vol_change_ind':'Vol_Change_Ind',
        'cur_shr':'Current_Shr',
        'pri_shr':'Prior_Shr',
        'shr_change':'Shr_Change',
        'prc_shr_growth':'Prc_Shr_Growth',
        'shr_change_ind':'Shr_Change_Ind',
        'avg_trx_size_unit':'Avg_TRx_Size_Unit',
        'avg_trx_size_trx':'Avg_TRx_Size_TRx',
        'avg_trx_size':'Avg_TRx_Size',
        'avg_trx_size_ch':'Avg_TRx_Size_Change',
        'trx_90day_pct':'Ninty_Day_TRx_Prc',
    }
    final_feed = final_feed.rename(rnm_cols)

    # Constant columns required by the feed.
    final_feed = addcol(final_feed, ['ReportType'], 'WEEKLY')
    final_feed = addcol(final_feed, ['Period'], f'{period_num}-WEEK')
    final_feed = addcol(
        final_feed,
        ["Total_Num_Of_Redemptions", "Frozen_Competitor_Vol", "DS1_Current_Vol",
         "DS1_Prior_Vol", "DS2_Current_Vol", "DS2_Prior_Vol"],
        '\\N',
    )

    # Final column order of the feed.
    req_cols = [
    "Physician_ID", "Geography_id", "Product_id", "Metric", "ReportType", "Period", "Type", "Current_Vol", "Prior_Vol", "Vol_Change", 
    "Vol_Change_Ind", "Prc_Vol_Growth", "Prc_Benchmark_Vol_Growth", "Vol_Growth_Ind", "Current_Shr", "Prior_Shr", "Shr_Change", 
    "Shr_Change_Ind", "Prc_Shr_Growth", "Prc_Benchmark_Shr_Growth", "Shr_Growth_Ind", "Avg_TRx_Size", "Avg_TRx_Size_TRx", 
    "Avg_TRx_Size_Unit", "Total_Num_Of_Redemptions", "Frozen_Competitor_Vol", "DS1_Current_Vol", "DS1_Prior_Vol", "DS2_Current_Vol", 
    "DS2_Prior_Vol", "Avg_TRx_Size_Change", "Ninty_Day_TRx_Prc"]

    return final_feed.select(req_cols)

### Period Loop -

In [10]:
# Short aliases for frequently used column names; the process_* functions and
# get_benchmark_cols read these as globals.
p = 'product_id'
sg = 'segment'
spc = 'specialty_group'
d = 'decile'

# Geography id columns, from the lowest level up to nation.
levels = ['geography_id','region_geography_id','area_geography_id','nation_geography_id']

# Output prefix for the weekly prescriber feeds.
# NOTE(review): the bucket name is hard-coded here while every other path in this
# notebook is built from `bucket` (vars_wk.json) - confirm this is intentional.
OUT = 's3://vortex-staging-a65ced90/BIT/output/Prescriber/Weekly/'

In [11]:
# One feed per period. PN is the 1-based feed number that goes with each period label
# (1, 4, 13, 26 weeks and quarter-to-date). `period`, `period_num` and `terr_growths`
# are read as globals by the process_* functions, get_benchmark_cols and get_feed.
for PN, period_num in enumerate([1,4,13,26,'qtd'], start=1):
    period = f'_{period_num}'

    # Start from the prescriber/geography pairs and run the enrichment steps in order.
    temp1 = mp_spec_seg_dec.select(['IID','geography_id'])
    for step in (process_1, process_2, process_3, process_4,
                 process_5, process_6, process_7, process_8):
        temp1 = step(temp1)

    # Territory growth benchmarks for this period.
    load(f'terr_growths_{PN}')
    terr_growths = globals()[f'terr_growths_{PN}']

    # Volume-growth benchmarks first, then share-growth, each for TRx and then NRx.
    for b_name, b_prefix in (('Vol_Growth', 'prc_vol_growth'), ('Shr_Growth', 'prc_shr_growth')):
        for metric in ('trx', 'nrx'):
            temp1 = get_benchmark_cols(temp1, metric, f'{b_prefix}_{metric}', b_name)

    feed_dataset = get_feed(temp1)

    # NOTE(review): pandas writes the DataFrame index as an unnamed leading column
    # unless index=False is passed - confirm the feed consumer expects that column.
    feed_dataset.to_pandas().to_csv(f'{OUT}Weekly_Prescriber_SalesPerformance_P{PN}_Feed.txt', sep='|')
    print(f'Exported Feed {PN}!')

Exported Feed 1!


Exported Feed 2!


Exported Feed 3!


Exported Feed 4!


Exported Feed 5!
