### GS Sales KPI pt1

In [1]:
import polars as pl
import pandas as pd
import gc
from datetime import datetime, timedelta,date
import json

In [2]:
# Run configuration: weekly parameters are read from vars_wk.json (relative to the working directory).
with open('vars_wk.json', 'r') as json_file:
    js = json.load(json_file)

# Settings used throughout the notebook; a missing key raises KeyError.
data_date, num_weeks_rx, bucket = (js[setting] for setting in ('data_date', 'num_weeks_rx', 'bucket'))

# S3 prefixes built from the settings above:
#   dflib -> shared dataframe library, xpn -> xponent folder of the archive for data_date.
dflib = f's3://{bucket}/BIT/dataframes/'
xpn = f's3://{bucket}/PYADM/weekly/archive/{data_date}/xponent/'

In [3]:
# Utility Functions -
def load(df, lib=dflib):
    """Read ``<lib><df>.parquet`` with polars and publish it as a notebook global.

    df  : str -> base name of the parquet file; also the name of the global that is created.
    lib : str -> path prefix to read from (defaults to dflib).
    Returns None; the frame is only reachable through the global named ``df``.
    """
    parquet_path = f'{lib}{df}.parquet'
    frame = pl.read_parquet(parquet_path)
    globals()[df] = frame

In [4]:
# Importing Dependencies
# Product mapping: pipe-separated text file; its 'code', 'product_id' and 'parent_product_id'
# columns are used below to add parent product rows.
prod_mapping = pl.read_csv(f's3://{bucket}/BIT/docs/productmapping_pybit.txt',separator='|')
# Geography hierarchy: Excel file read through pandas, then converted to a polars frame.
geo_code_mapper = pl.from_pandas(pd.read_excel(f's3://{bucket}/BIT/docs/geo_id_full.xlsx'))
# Creates the global `mp_spec_seg_dec` from the dataframe library (joined on 'IID' by the generators below).
load('mp_spec_seg_dec')
fetch_products = ['LI1','LI2','LI3','TRU','AMT','LAC','MOT','LUB','IRL'] # only these products are to be read from lax

### Generator Functions

In [5]:
# get_summed_metric_period -> Sums one weekly metric into the 5 periods - 1,4,13,26,qtd (for both current and prior)
#                             and aggregates the result at every geography level.
# Input 1 : metric -> str -> prefix of the weekly LAX columns to read (TUF,NUF,TRX etc.)
# Input 2 : prod_cd -> list -> product codes to keep while reading the data.
# Output : tuple of 4 dataframes, in this order : Territory, Region, Area, Nation.

def get_summed_metric_period(metric,prod_cd):
    def week_cols(start, stop):
        # weekly column names metric<start> .. metric<stop-1>
        return [metric+str(wk) for wk in range(start, stop)]

    def window_sums(windows):
        # one horizontal (row-wise) sum per (suffix, start, stop) window
        return [pl.sum_horizontal(week_cols(lo, hi)).alias(metric+sfx) for sfx, lo, hi in windows]

    lax = pl.read_parquet(xpn+'LAX.parquet',columns=['IID','PROD_CD'] + week_cols(1,53))
    lax = lax.filter(pl.col('PROD_CD').is_in(prod_cd))

    # (output suffix, first week, one-past-last week); the qtd windows depend on num_weeks_rx
    current_windows = [('_4c',1,5),('_13c',1,14),('_26c',1,27),('_qtdc',1,num_weeks_rx+1)]
    prior_windows = [('_4p',5,9),('_13p',14,27),('_26p',27,53),('_qtdp',14,14+num_weeks_rx)]

    period_df = lax.select(
        pl.col('IID'),pl.col('PROD_CD'),
        pl.col(metric+'1').alias(metric+'_1c'),   # 1-week current is week 1 taken as-is
        *window_sums(current_windows),
        pl.col(metric+'2').alias(metric+'_1p'),   # 1-week prior is week 2 taken as-is
        *window_sums(prior_windows)
    )

    # Attach the MP columns and keep only rows that resolved to a geography
    period_df = period_df.join(mp_spec_seg_dec,on='IID',how='left').filter(pl.col('geography_id').is_not_null())

    suffixes = ['_1c','_4c','_13c','_26c','_qtdc','_1p','_4p','_13p','_26p','_qtdp']
    totals = [pl.col(metric+sfx).sum().alias(metric+sfx) for sfx in suffixes]
    dims = ['specialty_group','segment','decile','PROD_CD']

    def rolled_to(level_col):
        # map each territory to its parent at `level_col`, then total per parent
        parent_map = geo_code_mapper[['geography_id',level_col]]
        return period_df.join(parent_map,on='geography_id',how='left').group_by([level_col] + dims).agg(totals)

    df_terr = period_df.group_by(['geography_id'] + dims).agg(totals)
    df_reg = rolled_to('region_geography_id')
    df_area = rolled_to('area_geography_id')
    df_nation = rolled_to('nation_geography_id')

    return (df_terr, df_reg, df_area, df_nation)

In [6]:
def add_parent_product_rows(all_prod_df):
    """Append parent-product rows to each of the 4 geography-level frames.

    all_prod_df : tuple of 4 frames laid out as [geo, 3 dimension columns, 'PROD_CD', metric columns...].
    For every frame this maps PROD_CD to product_id / parent_product_id via prod_mapping, then stacks:
      - the mapped child rows,
      - sums per parent_product_id for parents 2 and 35 (stored under product_id),
      - sums over all rows, stored under product_id = 1.
    Returns a tuple of 4 frames in which 'PROD_CD' is replaced by a trailing 'product_id' column.
    """
    product_lookup = prod_mapping[['code','product_id','parent_product_id']]
    # work on a list because tuple items cannot be reassigned
    frames = list(all_prod_df)

    for idx in range(4):
        base = frames[idx]
        dims = base.columns[0:4]
        sums = {name: pl.col(name).sum() for name in base.columns[5:]}

        mapped = base.join(product_lookup, left_on = 'PROD_CD',right_on = 'code', how = 'left')

        parent_rows = (
            mapped
            .filter(pl.col('parent_product_id').is_in([2,35]))
            .group_by(dims + ['parent_product_id'])
            .agg(**sums)
            .rename({'parent_product_id':'product_id'})
        )
        overall_rows = (
            mapped
            .group_by(dims)
            .agg(**sums)
            .with_columns(pl.lit(1).cast(pl.Int64).alias('product_id'))
        )

        # drop the code columns so all three pieces share one layout before stacking
        child_rows = mapped.drop(['PROD_CD','parent_product_id'])
        layout = child_rows.columns
        frames[idx] = child_rows.vstack(parent_rows.select(layout)).vstack(overall_rows.select(layout))

    return(tuple(frames))


In [7]:
def add_parent_product_rows_iid(df):
    """IID-level variant of add_parent_product_rows.

    df : frame laid out as ['IID', 'PROD_CD', metric columns...].
    Stacks the mapped child rows, sums per IID for parent products 2 and 35, and sums per IID
    over all rows (product_id = 1). 'PROD_CD' is replaced by a trailing 'product_id' column.
    """
    sums = {name: pl.col(name).sum() for name in df.columns[2:]}
    product_lookup = prod_mapping[['code','product_id','parent_product_id']]

    mapped = df.join(product_lookup, left_on = 'PROD_CD',right_on = 'code', how = 'left')

    parent_rows = (
        mapped
        .filter(pl.col('parent_product_id').is_in([2,35]))
        .group_by(['IID','parent_product_id'])
        .agg(**sums)
        .rename({'parent_product_id':'product_id'})
    )
    overall_rows = (
        mapped
        .group_by('IID')
        .agg(**sums)
        .with_columns(pl.lit(1).cast(pl.Int64).alias('product_id'))
    )

    # same layout for all three pieces, then stack: children, parents 2/35, overall
    child_rows = mapped.drop(['PROD_CD','parent_product_id'])
    layout = child_rows.columns
    stacked = child_rows.vstack(parent_rows.select(layout)).vstack(overall_rows.select(layout))

    return(stacked)

In [8]:
def add_full_rollups(all_prod_df):
    """Append every segment / decile / specialty roll-up to each of the 4 geography-level frames.

    all_prod_df : tuple of 4 frames whose first column is the geography id, whose last column is
                  'product_id' and whose columns [4:-1] are the metric columns.
    A rolled-up dimension is labelled 'UNI' (segment), '0-10' (decile) or 'ALL SPEC' (specialty_group);
    metrics are summed. Returns a tuple of 4 frames ordered [geo, product, segment, decile, specialty, metrics...].
    """
    frames = list(all_prod_df)
    p,sg,d,spc = 'product_id','segment','decile','specialty_group'
    rollup_label = {sg: pl.lit('UNI'), d: pl.lit('0-10'), spc: pl.lit('ALL SPEC')}

    # (dimensions kept as group keys, dimensions collapsed) - listed in stacking order:
    # single roll-ups first, then pairs, then all three together.
    rollup_plan = [
        ([d,spc], [sg]),
        ([sg,spc], [d]),
        ([d,sg], [spc]),
        ([spc], [sg,d]),
        ([d], [sg,spc]),
        ([sg], [d,spc]),
        ([], [sg,d,spc]),
    ]

    for idx in range(4):
        base = frames[idx]
        g = base.columns[0]                  # geography column of this level
        metric_cols = base.columns[4:-1]     # everything between the dimensions and product_id
        layout = [g,p,sg,d,spc] + metric_cols
        sums = {name: pl.col(name).sum() for name in metric_cols}

        stacked = base.select(layout)
        for kept, collapsed in rollup_plan:
            rolled = (
                base.group_by([g,p] + kept)
                .agg(**sums)
                .with_columns([rollup_label[dim].alias(dim) for dim in collapsed])
                .select(layout)
            )
            stacked = stacked.vstack(rolled)

        frames[idx] = stacked

    return(tuple(frames))

In [9]:
def get_period_prec_count_metric(metric,prod_cd):
    """Count distinct IIDs with a period value >= 1, per period and per geography level.

    metric  : str  -> prefix of the weekly LAX columns (e.g. 'TUF').
    prod_cd : list -> product codes to keep.
    Returns (terr, region, area, nation); each frame is keyed by
    [level, specialty_group, segment, decile, PROD_CD] with one 'cp_<metric>_<period>' column per period.
    """
    def week_cols(start, stop):
        # weekly column names metric<start> .. metric<stop-1>
        return [metric+str(wk) for wk in range(start, stop)]

    def window_sums(windows):
        return [pl.sum_horizontal(week_cols(lo, hi)).alias(metric+sfx) for sfx, lo, hi in windows]

    lax = pl.read_parquet(xpn+'LAX.parquet',columns=['IID','PROD_CD'] + week_cols(1,53))
    lax = lax.filter(pl.col('PROD_CD').is_in(prod_cd))

    # 1,4,13,26,qtd for current and prior; the 1-week periods are single weekly columns
    current_windows = [('_4c',1,5),('_13c',1,14),('_26c',1,27),('_qtdc',1,num_weeks_rx+1)]
    prior_windows = [('_4p',5,9),('_13p',14,27),('_26p',27,53),('_qtdp',14,14+num_weeks_rx)]
    period_df = lax.select(
        pl.col('IID'),pl.col('PROD_CD'),
        pl.col(metric+'1').alias(metric+'_1c'),
        *window_sums(current_windows),
        pl.col(metric+'2').alias(metric+'_1p'),
        *window_sums(prior_windows)
    )

    # MP columns (rows without a geography are dropped), then the geography hierarchy
    period_df = period_df.join(mp_spec_seg_dec,on='IID',how='left').filter(pl.col('geography_id').is_not_null())
    period_df = period_df.join(geo_code_mapper,on='geography_id')

    time_periods = [f'{metric}_1c',f'{metric}_4c',f'{metric}_13c',f'{metric}_26c',f'{metric}_qtdc'
                    ,f'{metric}_1p',f'{metric}_4p',f'{metric}_13p',f'{metric}_26p',f'{metric}_qtdp']
    group_levels = ['geography_id','region_geography_id', 'area_geography_id', 'nation_geography_id']

    # one accumulating frame per level; the first period seeds it, later periods are outer-joined on
    by_level = {level: None for level in group_levels}

    for period in time_periods:
        active = period_df.filter(pl.col(period) >= 1)
        for level in group_levels:
            keys = [level, 'specialty_group', 'segment', 'decile', 'PROD_CD']
            counts = active.group_by(keys).agg([pl.col('IID').n_unique().alias(f'cp_{period}')])
            so_far = by_level[level]
            if so_far is None:
                by_level[level] = counts
            else:
                by_level[level] = so_far.join(counts, on=keys, how='outer_coalesce')

    return (
        by_level['geography_id'],
        by_level['region_geography_id'],
        by_level['area_geography_id'],
        by_level['nation_geography_id'],
    )
#Change log : added outer_coalesce 

In [10]:
def get_summed_period_iid_metric(metric,prod_cd):
    """Sum one weekly metric into the reporting periods at IID x product level.

    metric  : str  -> prefix of the weekly LAX columns (weeks 1..105 are read).
    prod_cd : list -> product codes to keep.
    Returns a frame with IID, PROD_CD, the 1/4/13/26/qtd current and prior sums and '<metric>_all'
    (sum of all 105 weeks). Only IIDs that resolve to a geography in mp_spec_seg_dec are kept.
    """
    def week_cols(start, stop):
        # weekly column names metric<start> .. metric<stop-1>
        return [metric+str(wk) for wk in range(start, stop)]

    def window_sums(windows):
        return [pl.sum_horizontal(week_cols(lo, hi)).alias(metric+sfx) for sfx, lo, hi in windows]

    lax = pl.read_parquet(xpn+'LAX.parquet',columns=['IID','PROD_CD'] + week_cols(1,106))
    lax = lax.filter(pl.col('PROD_CD').is_in(prod_cd))

    current_windows = [('_4c',1,5),('_13c',1,14),('_26c',1,27),('_qtdc',1,num_weeks_rx+1)]
    prior_windows = [('_4p',5,9),('_13p',14,27),('_26p',27,53),('_qtdp',14,14+num_weeks_rx)]

    period_df = lax.select(
        pl.col('IID'),pl.col('PROD_CD'),
        pl.col(metric+'1').alias(metric+'_1c'),
        *window_sums(current_windows),
        pl.col(metric+'2').alias(metric+'_1p'),
        *window_sums(prior_windows),
        *window_sums([('_all',1,106)])
    )

    # The MP join is only used to drop IIDs without a geography; its columns are removed again
    matched = period_df.join(mp_spec_seg_dec,on='IID',how='left').filter(pl.col('geography_id').is_not_null())

    return(matched.drop(['specialty_group','segment','decile','geography_id']))

In [11]:
# Raw Data Prep : ETA 20 Seconds
# Each summed metric goes through the same three steps, composed per metric:
#   get_summed_metric_period -> add_parent_product_rows -> add_full_rollups
# Every result is a tuple containing all 4 rollups (terr, region, area, nation).
all_products_tuf = add_full_rollups(add_parent_product_rows(get_summed_metric_period('TUF',fetch_products)))
all_products_nuf = add_full_rollups(add_parent_product_rows(get_summed_metric_period('NUF',fetch_products)))
all_products_trx = add_full_rollups(add_parent_product_rows(get_summed_metric_period('TRX',fetch_products)))
all_products_nrx = add_full_rollups(add_parent_product_rows(get_summed_metric_period('NRX',fetch_products)))
all_products_tun = add_full_rollups(add_parent_product_rows(get_summed_metric_period('TUN',fetch_products)))
all_products_nun = add_full_rollups(add_parent_product_rows(get_summed_metric_period('NUN',fetch_products)))

# Count of Prec ('metric_period' >= 1 only), followed by the same parent-product and roll-up steps
all_products_tuf_hcp = add_full_rollups(add_parent_product_rows(get_period_prec_count_metric('TUF',fetch_products)))

#IID level data for grower and decliner-
all_products_tuf_iid = add_parent_product_rows_iid(get_summed_period_iid_metric('TUF',fetch_products))

### Functions ->

In [12]:
def process_1(df):
    """Build the base frame for each of the 4 levels from the current/prior columns of `period`.

    df : list with (at least) 4 slots; slot i is overwritten with the level-i frame.
    Starts from the TUN frame and left-joins TRX, TUF, NUN, NRX and NUF on
    [level, specialty, segment, decile, product]. Returns the same list.
    """
    # join order matters for the column order of the result: TUN first, NUF last
    sources = [
        (all_products_tun, [f'TUN{period}c', f'TUN{period}p']),
        (all_products_trx, [f'TRX{period}c', f'TRX{period}p']),
        (all_products_tuf, [f'TUF{period}c', f'TUF{period}p']),
        (all_products_nun, [f'NUN{period}c', f'NUN{period}p']),
        (all_products_nrx, [f'NRX{period}c', f'NRX{period}p']),
        (all_products_nuf, [f'NUF{period}c', f'NUF{period}p']),
    ]
    for i in range(4):
        keys = [levels[i],spc,sg,d,p]
        merged = None
        for frames, value_cols in sources:
            piece = frames[i][keys + value_cols]
            merged = piece if merged is None else merged.join(piece,on = keys,how = 'left')
        df[i] = merged
    return(df)

# Trx, Nrx : Size and Size Change ,  Average Trx
def process_2(df,period_num):
    """Add average Rx size, its change versus the prior period, and the average TRx per period unit.

    df         : list of 4 level frames (updated in place and returned).
    period_num : number the current TUF total is divided by to get avg_trx.
    """
    tun_c, tun_p = pl.col(f'TUN{period}c'), pl.col(f'TUN{period}p')
    trx_c, trx_p = pl.col(f'TRX{period}c'), pl.col(f'TRX{period}p')
    nun_c, nun_p = pl.col(f'NUN{period}c'), pl.col(f'NUN{period}p')
    nrx_c, nrx_p = pl.col(f'NRX{period}c'), pl.col(f'NRX{period}p')

    for i in range(4):
        sized = df[i].with_columns(
            avg_trx_size = tun_c/trx_c,
            avg_nrx_size = nun_c/nrx_c,
            prior_avg_trx_size = tun_p/trx_p,
            prior_avg_nrx_size = nun_p/nrx_p
        )
        changed = sized.with_columns(
            avg_trx_size_ch = pl.col('avg_trx_size')-pl.col('prior_avg_trx_size'),
            avg_nrx_size_ch = pl.col('avg_nrx_size')-pl.col('prior_avg_nrx_size')
        )
        # only the TRx average is produced here (no avg_nrx ?)
        df[i] = changed.with_columns(
            avg_trx = pl.col(f'TUF{period}c')/period_num
        )
    return(df)
	
#prc_shr_bus
def process_3(df):
    """Add share-of-business columns: a row's current TUF / NUF divided by its parent level's value.

    df : list of 4 level frames (updated in place and returned).
    Levels 0-2 are compared with the matching row of the next level up; level 3 has no parent,
    so both share columns are added as nulls.
    """
    tuf_c, nuf_c = f'TUF{period}c', f'NUF{period}c'
    # names the parent columns receive when the join meets the child's same-named columns
    tuf_parent, nuf_parent = f'TUF{period}c_right', f'NUF{period}c_right'

    for i in range(3):
        child, parent = levels[i], levels[i+1]
        child_to_parent = geo_code_mapper[[child,parent]].unique()
        parent_vol = df[i+1].select(parent,spc,sg,d,p,tuf_c,nuf_c)
        current = df[i]

        shares = (
            current
            .select(child,spc,sg,d,p,tuf_c,nuf_c)
            .join(child_to_parent,on = child,how ='left')
            .join(parent_vol,on = [parent,spc,sg,d,p],how='left')
            .with_columns(
                prc_shr_bus_trx = pl.col(tuf_c)/pl.col(tuf_parent),
                prc_shr_bus_nrx = pl.col(nuf_c)/pl.col(nuf_parent)
            )
            # keep only the keys and the two share columns
            .drop([parent,tuf_c,nuf_c,tuf_parent,nuf_parent])
        )
        df[i] = current.join(shares,on = [child,spc,sg,d,p],how = 'left')

    df[3] = df[3].with_columns(
        pl.lit(None).alias('prc_shr_bus_trx'),pl.lit(None).alias('prc_shr_bus_nrx')
    )

    return(df)

#rr_4v13
def process_4(df):
    """Add the 4-vs-13 week run-rate change: rr_4v13 = TUF_4c * (13/4) - TUF_13c.

    df : list of 4 level frames (updated in place and returned).
    The 4 and 13 week totals are always taken from all_products_tuf, whatever `period` is.
    They are joined under private temporary names: when `period` is the 4 or 13 week one, the
    frame already carries 'TUF_4c' / 'TUF_13c', and joining the same name again would create a
    '<name>_right' duplicate while the clean-up drop removed the frame's own column.
    """
    tmp_4, tmp_13 = '_rr_src_tuf_4c', '_rr_src_tuf_13c'
    for i in range(4):
        keys = [levels[i],p,sg,d,spc]
        source_df = all_products_tuf[i].select(
            *keys,
            pl.col('TUF_4c').alias(tmp_4),
            pl.col('TUF_13c').alias(tmp_13)
        )
        df[i] = (
            df[i]
            .join(source_df,on = keys,how='left')
            .with_columns(rr_4v13 = (pl.col(tmp_4)*(13/4)) - pl.col(tmp_13))
            .drop([tmp_4,tmp_13])   # only the helper columns go; the frame's own columns stay
        )
    return(df)

#count of prec , ind
def process_5(df):
    """Add the prescriber count (num_hcp) and its trend indicator (num_hcp_ind).

    df : list of 4 level frames (updated in place and returned).
    Counts come from all_products_tuf_hcp ('cp_TUF<period>c' / 'cp_TUF<period>p'); missing counts
    become 0. The indicator is 'P' when the relative change versus prior is above 2%, 'Q' when it
    is below -2%, otherwise null.
    """
    # The counts originate from n_unique(), i.e. an unsigned integer type. Subtracting them as
    # integers can wrap around when the current count is below the prior one, which would turn a
    # decline into a huge positive ratio. Do the arithmetic in Float64; num_hcp keeps its dtype.
    cur_cnt = pl.col('num_hcp').cast(pl.Float64)
    pri_cnt = pl.col('pri_num_hcp').cast(pl.Float64)
    formula = ((cur_cnt-pri_cnt)/pri_cnt).alias('ind_metric')
    # NOTE(review): a prior count of 0 still divides by zero (inf / NaN), as before - confirm the
    # indicator wanted for that case.

    for i in range(4):
        f = df[i]

        source_df = all_products_tuf_hcp[i][[levels[i],spc,sg,d,p,f'cp_TUF{period}c',f'cp_TUF{period}p']]
        f = f.join(source_df,[levels[i],spc,sg,d,p],how='left').rename({f'cp_TUF{period}c':'num_hcp',f'cp_TUF{period}p':'pri_num_hcp'}
        ).with_columns(pl.col('num_hcp').fill_null(0),pl.col('pri_num_hcp').fill_null(0)
        ).with_columns(formula
        ).with_columns(
            pl.when(pl.col('ind_metric') > 0.02).then(pl.lit('P'))
            .when(pl.col('ind_metric') < -0.02).then(pl.lit('Q'))
            .otherwise(None).alias('num_hcp_ind')
        ).drop(['pri_num_hcp','ind_metric'])

        df[i] = f
    return(df)
	
#Num_Of_Prescribers_BnchMrk_Ind
def process_bnch_presc(df,metric,ind_col_name):
    """Add three benchmark indicator columns for `metric` to each of the 4 level frames.

    df           : list of 4 level frames (updated in place and returned).
    metric       : str -> column that is benchmarked.
    ind_col_name : str -> prefix of the output columns '<prefix>_BnchMrk_Ind1/2/3'.

    A benchmark band is median +/- 0.5 * std of `metric` over a peer group sharing product,
    segment, specialty and decile. Ind1 uses all rows of the level as peers, Ind2 the rows of the
    same area, Ind3 the rows of the same region. Values: 'A' above the band, 'B' below it,
    'E' strictly inside it, null otherwise (including a value exactly on a bound).
    Indicators that have no peer group at a level (Ind3 for region, Ind2/Ind3 for area, all three
    for nation) are added as nulls.
    """
    terr, region, area, nation = levels[0], levels[1], levels[2], levels[3]
    dims = [p, sg, spc, d]
    ind1 = f'{ind_col_name}_BnchMrk_Ind1'
    ind2 = f'{ind_col_name}_BnchMrk_Ind2'
    ind3 = f'{ind_col_name}_BnchMrk_Ind3'

    def add_indicator(df, ind_name, col1, col2, col3):
        # col1 = value, col2 = upper limit, col3 = lower limit
        return df.with_columns(
            pl.when(pl.col(col1) > pl.col(col2))
            .then(pl.lit('A'))
            .when(pl.col(col1) < pl.col(col3))
            .then(pl.lit('B'))
            .when((pl.col(col3) < pl.col(col1)) & (pl.col(col1) < pl.col(col2)))
            .then(pl.lit('E'))
            .otherwise(None)
            .alias(ind_name)
        )

    def band(frame, keys, upper, lower):
        # upper / lower limits (median +/- half a standard deviation) per peer group
        centre = pl.col(metric).median()
        half_std = 0.5*pl.col(metric).std()
        return frame.group_by(keys).agg(**{upper: centre + half_std, lower: centre - half_std})

    def null_col(name):
        return pl.lit(None).alias(name)

    #Terr : nation, area and region bands
    f = df[0]
    base = f.select([terr] + dims + [metric]).join(geo_code_mapper,on = terr,how = 'left')
    nf = (
        base
        .join(band(base, dims, 'nul', 'nll'), on=dims, how='left')
        .join(band(base, [area] + dims, 'aul', 'all'), on=[area] + dims, how='left')
        .join(band(base, [region] + dims, 'rul', 'rll'), on=[region] + dims, how='left')
    ).drop(region, area, nation)
    nf = add_indicator(nf, ind1, metric, 'nul', 'nll')
    nf = add_indicator(nf, ind2, metric, 'aul', 'all')
    nf = add_indicator(nf, ind3, metric, 'rul', 'rll').drop(['nul','nll','aul','all','rul','rll',metric])
    df[0] = f.join(nf,on=[terr] + dims,how = 'left')

    #Region : nation and area bands
    f = df[1]
    base = f.select([region] + dims + [metric]).join(
        geo_code_mapper[['region_geography_id','area_geography_id']].unique(),on = region,how = 'left'
    )
    nf = (
        base
        .join(band(base, dims, 'nul', 'nll'), on=dims, how='left')
        .join(band(base, [area] + dims, 'aul', 'all'), on=[area] + dims, how='left')
    # Only the area column was joined in above. The nation column was never part of this frame,
    # so it is no longer listed here (dropping a missing column can raise on strict polars versions).
    ).drop(area)
    nf = add_indicator(nf, ind1, metric, 'nul', 'nll')
    nf = add_indicator(nf, ind2, metric, 'aul', 'all')
    nf = nf.with_columns(null_col(ind3)).drop(['nul','nll','aul','all',metric])
    df[1] = f.join(nf,on=[region] + dims,how = 'left')

    #Area : nation band only
    f = df[2]
    base = f.select([area] + dims + [metric])
    nf = base.join(band(base, dims, 'nul', 'nll'), on=dims, how='left')
    nf = add_indicator(nf, ind1, metric, 'nul', 'nll')
    nf = nf.with_columns(null_col(ind2),null_col(ind3)).drop(['nul','nll',metric])
    df[2] = f.join(nf,on=[area] + dims,how = 'left')

    #Nation : nothing to benchmark against
    df[3] = df[3].with_columns(null_col(ind1),null_col(ind2),null_col(ind3))
    return(df)
	
def get_terr_cgd():    #count of growers and decliners -> requires all_products_tuf_iid to be set up before hand
    """Count growers and decliners per territory for the current `period`, with all roll-ups.

    An IID (with non-zero, non-null prior volume and a non-zero change) is a DECLINER when its
    TUF change is negative and at or below the 10th percentile of changes for its region x
    product, and a GROWER when the change is positive and at or above the 90th percentile.
    Returns a frame [territory, product_id, segment, decile, specialty_group, num_growers, num_decliners].

    NOTE(review): the roll-up labels used here ('Universe', 'all_spec') differ from the ones in
    add_full_rollups ('UNI', 'ALL SPEC'); confirm that joins against those frames still match.
    """
    terr_level, pct_level = levels[0], levels[1]   # base data is only on iid-terr level
    cur_col, pri_col = f'TUF{period}c', f'TUF{period}p'

    changes = (
        all_products_tuf_iid[['IID','product_id',cur_col,pri_col]]
        .rename({cur_col:'cur_vol',pri_col:'pri_vol'})
        .with_columns(vol_change = pl.col('cur_vol')-pl.col('pri_vol'))
        .join(mp_spec_seg_dec,on = 'IID',how = 'left')
        .join(geo_code_mapper,on = terr_level,how ='left')
        .filter((pl.col('pri_vol')!=0) & (pl.col('pri_vol').is_not_null()))
        .filter((pl.col('vol_change')!=0))
    )

    cutoffs = changes.group_by(pct_level,p).agg(
        ten_perc = pl.col('vol_change').quantile(0.1,interpolation='linear'),
        nin_perc = pl.col('vol_change').quantile(0.9,interpolation='linear')
    )

    change = pl.col('vol_change')
    labelled = changes.join(cutoffs,on=[pct_level,p],how = 'left').with_columns(
        pl.when((change<0) & (change <= pl.col('ten_perc'))).then(pl.lit('DECLINER'))
        .when((change>0) & (change >= pl.col('nin_perc'))).then(pl.lit('GROWER'))
        .otherwise(pl.lit(None))
        .alias('TYPE')
    )

    keys = [terr_level,spc,sg,d,p]
    growers = labelled.filter(pl.col('TYPE')=='GROWER').group_by(keys).agg(num_growers = pl.col('IID').n_unique())
    decliners = labelled.filter(pl.col('TYPE')=='DECLINER').group_by(keys).agg(num_decliners = pl.col('IID').n_unique())
    counts = growers.join(decliners,on = keys,how='outer_coalesce').with_columns(
        pl.col('num_growers').fill_null(0),pl.col('num_decliners').fill_null(0)
    )

    def add_all_roll_up_cgd(df):
        # stacks the 7 segment / decile / specialty roll-ups under the detail rows
        g = terr_level
        p,sg,d,spc = 'product_id','segment','decile','specialty_group'
        rollup_label = {sg: pl.lit('Universe'), d: pl.lit('0-10'), spc: pl.lit('all_spec')}
        metric_cols = ['num_growers','num_decliners']
        layout = [g,p,sg,d,spc] + metric_cols
        sums = {name: pl.col(name).sum() for name in metric_cols}
        # (dimensions kept as group keys, dimensions collapsed), in stacking order
        rollup_plan = [
            ([d,spc], [sg]), ([sg,spc], [d]), ([d,sg], [spc]),
            ([spc], [sg,d]), ([d], [sg,spc]), ([sg], [d,spc]),
            ([], [sg,d,spc]),
        ]
        stacked = df.select(layout)
        for kept, collapsed in rollup_plan:
            rolled = (
                df.group_by([g,p] + kept)
                .agg(**sums)
                .with_columns([rollup_label[dim].alias(dim) for dim in collapsed])
                .select(layout)
            )
            stacked = stacked.vstack(rolled)
        return(stacked)

    return(add_all_roll_up_cgd(counts))
	
def get_terr_cnp(cop): # count of new prescribers
    """Count 'new' prescribers per territory for the current `period`, with all roll-ups.

    cop : str -> column suffix selecting the period column 'TUF<period><cop>'
                 (the visible columns use 'c' for current and 'p' for prior).
    An IID counts as new when TUF_all minus the selected period volume is 0, i.e. it has no
    volume outside that period.
    Returns a frame [territory, product_id, segment, decile, specialty_group, num_new_prec].

    NOTE(review): the roll-up labels used here ('Universe', 'all_spec') differ from the ones in
    add_full_rollups ('UNI', 'ALL SPEC'); confirm that joins against those frames still match.
    NOTE(review): an IID whose TUF_all is 0 also passes the old_volume == 0 test - confirm intended.
    """
    terr_level = levels[0]
    period_col = f'TUF{period}{cop}'

    candidates = (
        all_products_tuf_iid[['IID','product_id',period_col,'TUF_all']]
        .rename({period_col:'cur_vol'})
        .with_columns(old_volume = pl.col('TUF_all')-pl.col('cur_vol'))
        .join(mp_spec_seg_dec,on = 'IID',how = 'left')
    )
    new_counts = (
        candidates
        .filter(pl.col('old_volume')==0)
        .with_columns(TYPE = pl.lit('NEW'))
        .group_by([terr_level,spc,sg,d,p])
        .agg(num_new_prec = pl.col('IID').n_unique())
    )

    def add_all_roll_up_cnp(df):
        # stacks the 7 segment / decile / specialty roll-ups under the detail rows
        g = terr_level
        p,sg,d,spc = 'product_id','segment','decile','specialty_group'
        rollup_label = {sg: pl.lit('Universe'), d: pl.lit('0-10'), spc: pl.lit('all_spec')}
        metric_cols = ['num_new_prec']
        layout = [g,p,sg,d,spc] + metric_cols
        sums = {name: pl.col(name).sum() for name in metric_cols}
        # (dimensions kept as group keys, dimensions collapsed), in stacking order
        rollup_plan = [
            ([d,spc], [sg]), ([sg,spc], [d]), ([d,sg], [spc]),
            ([spc], [sg,d]), ([d], [sg,spc]), ([sg], [d,spc]),
            ([], [sg,d,spc]),
        ]
        stacked = df.select(layout)
        for kept, collapsed in rollup_plan:
            rolled = (
                df.group_by([g,p] + kept)
                .agg(**sums)
                .with_columns([rollup_label[dim].alias(dim) for dim in collapsed])
                .select(layout)
            )
            stacked = stacked.vstack(rolled)
        return(stacked)

    return(add_all_roll_up_cnp(new_counts))
	
# num_new_prec ind_metric num_growers num_decliners
def process_6(df):
    """Add growers / decliners / new-prescriber counts and the new-prescriber trend indicator.

    df : list of 4 level frames (updated in place and returned).
    Uses the notebook globals `cgd` (num_growers, num_decliners) and `cnp` (num_new_prec,
    pri_num_new_prec), both at territory level. Level 0 joins them as they are; higher levels
    first sum them up to that level through geo_code_mapper.
    new_prec_ind is 'P' when the relative change versus prior is above 2%, 'Q' when it is below
    -2%, otherwise null.
    """
    # The counts are presumably unsigned integers (get_terr_cnp builds them with n_unique()).
    # Subtracting unsigned integers can wrap around when the current count is below the prior
    # one, so the ratio is computed in Float64. The stored count columns keep their dtype.
    cur_new = pl.col('num_new_prec').cast(pl.Float64)
    pri_new = pl.col('pri_num_new_prec').cast(pl.Float64)
    formula = ((cur_new-pri_new)/pri_new).alias('ind_metric')
    indicator = (
        pl.when(pl.col('ind_metric') > 0.02).then(pl.lit('P'))
        .when(pl.col('ind_metric') < -0.02).then(pl.lit('Q'))
        .otherwise(None).alias('new_prec_ind')
    )

    for i in range(4):
        keys = [levels[i],spc,sg,d,p]
        if i==0:
            # territory level: the source frames are already at the right grain
            source_df_1, source_df_2 = cgd, cnp
        else:
            source_df_1 = cgd.join(geo_code_mapper,on = levels[0],how='left').group_by(keys).agg(
                num_growers = pl.col('num_growers').sum(),num_decliners = pl.col('num_decliners').sum()
            )
            source_df_2 = cnp.join(geo_code_mapper,on = levels[0],how='left').group_by(keys).agg(
                num_new_prec = pl.col('num_new_prec').sum(),pri_num_new_prec = pl.col('pri_num_new_prec').sum()
            )
        df[i] = (
            df[i]
            .join(source_df_1,on=keys,how='left')
            .join(source_df_2,on=keys,how='left')
            .with_columns(formula)
            .with_columns(indicator)
            .drop(['pri_num_new_prec','ind_metric'])
        )
    return(df)
	
#trx size change , and nrx size change
def process_7(df):
    """Add the direction indicators for the average TRx / NRx size change.

    df : list of 4 level frames (updated in place and returned).
    Each indicator is 'P' when the change is above 0.005, 'Q' when it is below -0.005 and null
    in every other case (a change of exactly 0 included).
    """
    def change_flag(change_col, flag_col):
        change = pl.col(change_col)
        return (
            pl.when(change > 0.005).then(pl.lit('P'))
            .when(change < -0.005).then(pl.lit('Q'))
            .otherwise(None)
            .alias(flag_col)
        )

    flags = [
        change_flag('avg_trx_size_ch', 'avg_trx_size_ch_ind'),
        change_flag('avg_nrx_size_ch', 'avg_nrx_size_ch_ind'),
    ]
    for i in range(4):
        df[i] = df[i].with_columns(flags)
    return(df)

In [13]:
# For Convert To Feed Ready data
def get_feed(temp1):
    """Build the feed-ready frame from the four geography-level KPI frames.

    Steps, in order:
      1. drop the raw metric columns for the current period and the helper
         columns that are not part of the feed,
      2. rename each level's geography id column to ``Geography_id`` and
         stack the four frames vertically,
      3. rename the KPI columns to their feed names,
      4. add the constant columns (report type, period label, zero-filled
         and ``\\N``-filled placeholders),
      5. recode ``Segment`` to the short codes and select the feed columns
         in their required order.

    This function reads the module-level names ``period`` and
    ``period_num``; both must be set (the period loop does this) before
    it is called.

    Parameters
    ----------
    temp1 : list of pl.DataFrame
        Four frames indexed 0-3 whose id columns are ``geography_id``,
        ``region_geography_id``, ``area_geography_id`` and
        ``nation_geography_id`` respectively. The list and its frames are
        left untouched, so the call can be repeated on the same input.

    Returns
    -------
    pl.DataFrame
        The stacked feed with exactly the columns listed in ``req_cols``.
    """
    drop_cols = [f'{m}{period}{suffix}' for m in ['TUN','TRX','TUF','NUN','NRX','NUF'] for suffix in ['c', 'p']]
    drop_cols += ['prior_avg_trx_size','prior_avg_nrx_size','prc_shr_bus_nrx']

    # Id column carried by each entry of temp1, by list index.
    level_id_cols = ['geography_id', 'region_geography_id', 'area_geography_id', 'nation_geography_id']

    # Build new frames instead of writing back into temp1: mutating the
    # caller's list made a second call fail on already dropped/renamed columns.
    level_frames = [
        temp1[i].drop(drop_cols).rename({id_col: 'Geography_id'})
        for i, id_col in enumerate(level_id_cols)
    ]
    final_feed = level_frames[0]
    for level_frame in level_frames[1:]:
        final_feed = final_feed.vstack(level_frame)

    # Renaming columns
    new_col_mapping = {
        'product_id': 'Product_id',
        'segment': 'Segment',
        'specialty_group': 'Specialty',
        'decile': 'Decile',
        'prc_shr_bus_trx': 'Share_of_Business_Prc',
        'rr_4v13': 'Run_Rate_Change_4v13',
        'num_hcp': 'Num_Of_Prescribers',
        'num_hcp_ind': 'Num_Of_Prescribers_Ind',
        'num_new_prec': 'Num_Of_New_Prescribers',
        'new_prec_ind': 'Num_Of_New_Prescribers_Ind',
        'num_growers': 'Num_Of_Growers',
        'num_decliners': 'Num_Of_Decliners',
        'avg_trx_size': 'Avg_TRx_Size',
        'avg_trx': 'Avg_TRx',
        'avg_trx_size_ch': 'Avg_TRx_Size_Change',
        'avg_trx_size_ch_ind': 'Avg_TRx_Size_Change_Ind',
        'avg_nrx_size': 'Avg_NRx_Size',
        'avg_nrx_size_ch': 'Avg_NRx_Size_Change',
        'avg_nrx_size_ch_ind': 'Avg_NRx_Size_Change_Ind'
    }
    final_feed = final_feed.rename(new_col_mapping)

    # Columns the feed requires that are filled with the literal string \N.
    # NOTE(review): with_columns replaces a column of the same name, so if an
    # upstream step (e.g. process_bnch_presc) already produced any of these
    # *_BnchMrk_Ind* columns, its values are overwritten here — confirm.
    col_to_addna = ['Num_Of_Prescribers_BnchMrk_Ind1','Num_Of_Prescribers_BnchMrk_Ind2','Num_Of_Prescribers_BnchMrk_Ind3',
                    'Num_Of_Growers_Ind','Num_Of_Decliners_Ind','Avg_TRx_Size_BnchMrk_Ind1','Avg_TRx_Size_BnchMrk_Ind2','Avg_TRx_Size_BnchMrk_Ind3',
                    'Num_Of_New_To_Brand_Ind','TRx_Goal','Prc_TRx_Attainment','Avg_TRx_Size_Change_BnchMrk_Ind1',
                    'Avg_TRx_Size_Change_BnchMrk_Ind2','Avg_TRx_Size_Change_BnchMrk_Ind3','Avg_NRx_Size_BnchMrk_Ind1','Avg_NRx_Size_BnchMrk_Ind2','Avg_NRx_Size_BnchMrk_Ind3',
                    'Avg_NRx_Size_Change_BnchMrk_Ind1','Avg_NRx_Size_Change_BnchMrk_Ind2','Avg_NRx_Size_Change_BnchMrk_Ind3']

    # All constant columns are added in one with_columns call rather than
    # one call per column.
    final_feed = final_feed.with_columns(
        pl.lit('WEEKLY').alias('ReportType'),
        pl.lit(f'{period_num}-WEEK').alias('Period'),
        pl.lit(0).alias('Num_Of_New_To_Brand'),
        *[pl.lit('\\N').alias(na_col) for na_col in col_to_addna],
    )

    # changing values according to feed of SAS. - 06/20
    # Unlisted segment values pass through unchanged.
    final_feed = final_feed.with_columns(
        pl.when(pl.col('Segment')=='ALG-ONLY-TARGET')
        .then(pl.lit('AGNT'))
        .when(pl.col('Segment')=='Non-Target')
        .then(pl.lit('NT'))
        .when(pl.col('Segment')=='Target')
        .then(pl.lit('T'))
        .otherwise(pl.col('Segment'))
        .alias('Segment'))

    # arranging columns according to feed
    req_cols = ['Geography_id', 'Product_id', 'Segment', 'Specialty', 'ReportType', 'Period', 'Decile', 'Share_of_Business_Prc',
                'Run_Rate_Change_4v13', 'Num_Of_Prescribers', 'Num_Of_Prescribers_Ind', 'Num_Of_Prescribers_BnchMrk_Ind1',
                'Num_Of_Prescribers_BnchMrk_Ind2', 'Num_Of_Prescribers_BnchMrk_Ind3', 'Num_Of_New_Prescribers', 'Num_Of_New_Prescribers_Ind',
                'Num_Of_Growers', 'Num_Of_Growers_Ind', 'Num_Of_Decliners', 'Num_Of_Decliners_Ind', 'Avg_TRx_Size', 'Avg_TRx_Size_BnchMrk_Ind1',
                'Avg_TRx_Size_BnchMrk_Ind2', 'Avg_TRx_Size_BnchMrk_Ind3', 'Avg_TRx', 'Num_Of_New_To_Brand', 'Num_Of_New_To_Brand_Ind',
                'TRx_Goal', 'Prc_TRx_Attainment', 'Avg_TRx_Size_Change', 'Avg_TRx_Size_Change_Ind', 'Avg_TRx_Size_Change_BnchMrk_Ind1',
                'Avg_TRx_Size_Change_BnchMrk_Ind2', 'Avg_TRx_Size_Change_BnchMrk_Ind3', 'Avg_NRx_Size', 'Avg_NRx_Size_BnchMrk_Ind1',
                'Avg_NRx_Size_BnchMrk_Ind2', 'Avg_NRx_Size_BnchMrk_Ind3', 'Avg_NRx_Size_Change', 'Avg_NRx_Size_Change_Ind',
                'Avg_NRx_Size_Change_BnchMrk_Ind1', 'Avg_NRx_Size_Change_BnchMrk_Ind2', 'Avg_NRx_Size_Change_BnchMrk_Ind3']
    final_feed = final_feed.select(req_cols)

    return(final_feed)


### Period Loop-

In [14]:
# Short aliases for the key column names, so the join / group-by formulas stay compact.
p = 'product_id'
sg = 'segment'
spc = 'specialty_group'
d = 'decile'

# Geography id column for each of the four summary levels, by list index 0-3.
levels = [
    'geography_id',
    'region_geography_id',
    'area_geography_id',
    'nation_geography_id',
]

# Destination prefix for the exported weekly feeds.
# NOTE(review): the bucket name is hard-coded here, while the input paths are
# built from `bucket` loaded from vars_wk.json — confirm this is intentional.
OUT = 's3://vortex-staging-a65ced90/BIT/output/GeoSummary/Weekly/'

In [15]:
# Run the full KPI pipeline once per reporting period and export one feed each.
# Each pair is (period_num, PN): the period label and the feed number used in
# the output file name. To run a single period, keep only its pair in the list.
for period_num, PN in [(1, 1), (4, 2), (13, 3), (26, 4), ('qtd', 5)]:
    # Column suffix for this period; get_feed reads `period` and `period_num` as globals.
    period = f'_{period_num}'

    # 'qtd' is not a week count, so that period passes num_weeks_rx to process_2.
    weeks_for_process_2 = num_weeks_rx if period_num == 'qtd' else period_num

    temp1 = [pl.DataFrame() for _ in range(4)]  # one empty holder frame per geography level
    temp1 = process_1(temp1)
    temp1 = process_2(temp1, weeks_for_process_2)
    for kpi_step in (process_3, process_4, process_5):
        temp1 = kpi_step(temp1)
    temp1 = process_bnch_presc(temp1, 'num_hcp', 'Num_Of_Prescribers')

    # process_6 reads `cgd` and `cnp` as globals, so both must be assigned before it runs.
    cgd = get_terr_cgd()
    current_new_prec = get_terr_cnp('c')
    prior_new_prec = get_terr_cnp('p').rename({'num_new_prec':'pri_num_new_prec'})
    cnp = (
        current_new_prec
        .join(prior_new_prec, on=[levels[0], p, sg, d, spc], how='outer_coalesce')
        # a key present on only one side leaves a null count on the other; treat it as 0
        .with_columns(
            pl.col('num_new_prec').fill_null(0),
            pl.col('pri_num_new_prec').fill_null(0),
        )
    )
    temp1 = process_6(temp1)
    temp1 = process_7(temp1)

    # The HCP benchmark function is reused for the size metrics, on the
    # assumption that the same upper and lower limit logic applies to them.
    size_benchmarks = [
        ('avg_trx_size', 'Avg_TRx_Size'),               # Avg_TRx_Size_BnchMrk_Ind
        ('avg_trx_size_ch', 'Avg_TRx_Size_Change'),     # Avg_TRx_Size_Change_BnchMrk_Ind
        ('avg_nrx_size', 'Avg_NRx_Size'),               # Avg_NRx_Size_BnchMrk_Ind
        ('avg_nrx_size_ch', 'Avg_NRx_Size_Change'),     # Avg_NRx_Size_Change_BnchMrk_Ind
    ]
    for bnch_source_col, bnch_feed_name in size_benchmarks:
        temp1 = process_bnch_presc(temp1, bnch_source_col, bnch_feed_name)

    feed_dataset = get_feed(temp1)
    # NOTE(review): pandas to_csv writes the row index as a leading unnamed
    # column by default — confirm the feed consumer expects that column.
    feed_dataset.to_pandas().to_csv(f'{OUT}Weekly_GeoSummary_SalesKPIs_P{PN}_Feed.txt', sep='|')
    print(f'Exported Feed {PN}!')

Exported Feed 1!


Exported Feed 2!


Exported Feed 3!


Exported Feed 4!


Exported Feed 5!
