### GS Sales Activity v2

In [5]:
import polars as pl
import pandas as pd
import gc
from datetime import datetime, timedelta,date
from dateutil.relativedelta import relativedelta
import numpy as np
import json

In [13]:
# Run configuration: all dates, window sizes and the S3 bucket name are read from vars_wk.json.
with open('vars_wk.json', 'r') as json_file:
    js = json.load(json_file)

# The three date settings are stored as 'YYYY-MM-DD' strings; convert each to datetime.date.
curr_date, quarter_start, quarter_end = (
    datetime.strptime(js[date_key], '%Y-%m-%d').date()
    for date_key in ('curr_date', 'quarter_start', 'quarter_end')
)

# Remaining settings are used exactly as they appear in the JSON file.
qtr_data, data_date, monthly_data_date = js['qtr_data'], js['data_date'], js['monthly_data_date']
num_weeks_calls, num_weeks_rx, num_of_months = js['num_weeks_calls'], js['num_weeks_rx'], js['num_of_months']
YTD, bucket = js['YTD'], js['bucket']

# S3 prefixes derived from the settings above.
# NOTE(review): 'quaterly' (sic) is the spelling used in the path literal; presumably it matches
# the real S3 prefix, so it is deliberately left unchanged - confirm before "correcting" it.
dflib = f's3://{bucket}/BIT/dataframes/'
geo = f's3://{bucket}/PYADM/quaterly/{qtr_data}/geography/'
xpn = f's3://{bucket}/PYADM/weekly/archive/{data_date}/xponent/'
mxpn = f's3://{bucket}/PYADM/monthly/archive/{monthly_data_date}/xponent/'

In [7]:
# Utility Functions -
def load(df, lib=dflib):
    """Read ``<lib><df>.parquet`` with polars and publish it as a module-level variable.

    Parameters
    ----------
    df : str
        File stem of the parquet file; also the name of the global variable that
        will hold the resulting polars DataFrame.
    lib : str
        Path prefix the file name is appended to. The default is whatever
        ``dflib`` was when this function was defined (Python binds defaults at
        definition time), so re-run this cell after changing ``dflib``.

    Returns
    -------
    None
        The frame is stored in ``globals()[df]``; nothing is returned.
    """
    parquet_path = f'{lib}{df}.parquet'
    frame = pl.read_parquet(parquet_path)
    globals()[df] = frame

def intck(interval, start_date, end_date):
    """Count whole intervals between two dates.

    Parameters
    ----------
    interval : str
        One of ``'DAY'``, ``'MONTH'`` or ``'WEEK'`` (exact, upper-case match).
    start_date, end_date : datetime.date or datetime.datetime
        Interval endpoints; the result is measured from ``start_date`` to
        ``end_date`` and is negative when ``end_date`` is earlier.

    Returns
    -------
    int or None
        * ``'DAY'``   - difference in days.
        * ``'WEEK'``  - difference in days floor-divided by 7.
        * ``'MONTH'`` - ``years * 12 + months`` of the ``relativedelta``.
        * anything else - ``None`` (no error is raised).
    """
    if interval == 'MONTH':
        delta = relativedelta(end_date, start_date)
        return 12 * delta.years + delta.months

    if interval in ('DAY', 'WEEK'):
        elapsed_days = (end_date - start_date).days
        # Floor division: a negative partial week rounds away from zero (e.g. -6 days -> -1).
        return elapsed_days if interval == 'DAY' else elapsed_days // 7

    # Unknown interval keyword: callers receive None.
    return None

In [8]:
# Imporing Dependencies
load('temp_calls')
load('temp_samples')
load('temp_abbv')
load('mp_spec_seg_dec')
load('hierarchy',geo)
load('wd_raw')
load('lirwd_call_plan')
load('MASTER_UNI')
load('roster')

geo_mapping = pl.read_csv(f's3://{bucket}/BIT/docs/GeographyMapping.txt',separator='|')
geo_mapping = geo_mapping.with_columns(
    Code = pl.when(pl.col('Code')!= 'NATION').then(pl.lit('1111-')+pl.col('Code')).otherwise(pl.col('Code'))
)
prod_mapping = pl.read_csv(f's3://{bucket}/BIT/docs/productmapping_pybit.txt',separator='|')

geo_id_full = pl.from_pandas(pd.read_excel(f's3://{bucket}/BIT/docs/geo_id_full.xlsx'))


#fixes for vortex import -> Probably caused by Polars Upgrades
temp_calls = temp_calls.with_columns(pl.col('SalesRepIID').cast(pl.Int64))
temp_samples = temp_samples.with_columns(pl.col('SalesRepIID').cast(pl.Int64))
temp_abbv = temp_abbv.with_columns(pl.col('SalesRepIID').cast(pl.Int64))
wd_raw = wd_raw.with_columns(pl.col('SalesRepIID').cast(pl.Int64))
#laxdn_geoid_sum = laxdn_geoid_sum.with_columns(pl.col('geography_id').cast(pl.Int64))

In [9]:
# Processing 1. temp calls  2. temp samples  3. temp abbv datasets
# Open question (from the original author): is the physician territory id the same as
# the sales-rep territory id for every record?
def _attach_attendee_context(activity):
    """Enrich one activity frame with attendee, geography, working-day and call-plan data.

    The same four-step pipeline is applied to calls, samples and abbv, so it lives
    in one place to keep the three outputs consistent.

    Parameters
    ----------
    activity : pl.DataFrame
        Frame with at least ``AttendeeIID`` and ``SalesRepIID`` columns (both are
        used as join keys below).

    Returns
    -------
    pl.DataFrame
        ``activity`` left-joined with the module-level frames ``mp_spec_seg_dec``,
        ``geo_id_full``, ``wd_raw`` and ``lirwd_call_plan``; rows whose
        ``geography_id`` is null after the first join are dropped.
    """
    return (
        activity
        # Combine MP attributes and drop attendees without a geography.
        .join(mp_spec_seg_dec, left_on='AttendeeIID', right_on='IID', how='left')
        .filter(pl.col('geography_id').is_not_null())
        # Add area and region codes.
        .join(geo_id_full, on='geography_id', how='left')
        # Add working days.
        # NOTE(review): assumes one wd_raw row per SalesRepIID, otherwise rows fan out - confirm.
        .join(wd_raw[['SalesRepIID', 'days_in_field']], on='SalesRepIID', how='left')
        # Add call-plan columns (call_freq_quarter, per the original note).
        .join(lirwd_call_plan, left_on='AttendeeIID', right_on='IID', how='left')
    )


temp_calls_mp_spec = _attach_attendee_context(temp_calls)

# For supporting calculations ->
#geo_code_mapper = temp_calls_mp_spec[['geography_id','region_geography_id','area_geography_id','nation_geography_id']].unique()
geo_code_mapper = geo_id_full
geo_code_mapper.to_pandas().to_parquet(dflib+'geo_code_mapper.parquet') # exported for use by other code

temp_samples_mp_spec = _attach_attendee_context(temp_samples)
temp_abbv_mp_spec = _attach_attendee_context(temp_abbv)

### RX Utility Functions

In [45]:
# For Voucher Removal - 
def get_lin_voucher():
    """Build per-product voucher TUF totals for the same period windows used for LAX.

    Reads ``LIN_VOUCHER.parquet`` from the ``mxpn`` prefix. For each of the column
    prefixes ``LIN1``, ``LIN2`` and ``LIN3`` it derives twelve ``vTUF_*`` columns
    from the numbered ``<prefix>TUF<i>`` columns:

    * current:  ``1c`` (period 1), ``3c`` (1-3), ``6c`` (1-6), ``12c`` (1-12),
      ``pqtrc`` (4-6), ``ytdc`` (1..YTD)
    * prior:    ``1p`` (period 2), ``3p`` (4-6), ``6p`` (7-12), ``12p`` (13-24),
      ``pqtrp`` (7-9), ``ytdp`` (13..12+YTD)

    Returns
    -------
    pl.DataFrame
        Columns ``IID``, the twelve ``vTUF_*`` columns, then ``PROD_CD``
        (``'LI1'``/``'LI2'``/``'LI3'``); the three product slices are stacked
        vertically and nulls are replaced with 0.
    """
    voucher_raw = pl.read_parquet(f'{mxpn}LIN_VOUCHER.parquet')

    def period_total(prefix, first, stop, suffix):
        # Row-wise sum of <prefix>TUF<first> .. <prefix>TUF<stop-1>.
        names = [f'{prefix}TUF{i}' for i in range(first, stop)]
        return pl.sum_horizontal(names).alias(f'vTUF_{suffix}')

    per_product = []
    for prod in ['LIN1', 'LIN2', 'LIN3']:  # LINV
        windows = voucher_raw.select(
            pl.col('IID'),
            pl.col(f'{prod}TUF1').alias('vTUF_1c'),
            period_total(prod, 1, 4, '3c'),
            period_total(prod, 1, 7, '6c'),
            period_total(prod, 1, 13, '12c'),
            period_total(prod, 4, 7, 'pqtrc'),
            period_total(prod, 1, YTD + 1, 'ytdc'),
            pl.col(f'{prod}TUF2').alias('vTUF_1p'),
            period_total(prod, 4, 7, '3p'),
            period_total(prod, 7, 13, '6p'),
            period_total(prod, 13, 25, '12p'),
            period_total(prod, 7, 10, 'pqtrp'),
            period_total(prod, 13, 13 + YTD, 'ytdp'),
        )
        # 'LIN1' -> 'LI1' etc., so the code lines up with PROD_CD in the LAX data.
        per_product.append(windows.with_columns(pl.lit(f'LI{prod[-1]}').alias('PROD_CD')))

    # voucher_mapping = {'LI1': 4, 'LI2': 5, 'LI3': 3, 'LIV': 2}
    return pl.concat(per_product).fill_null(0)

In [51]:
def get_summed_metric_period(metric,prod_cd):
    """Summarise one LAX metric into period windows and aggregate it at four geography levels.

    Steps:

    1. Read ``IID``, ``PROD_CD`` and ``<metric>1`` .. ``<metric>24`` from
       ``LAX.parquet`` under the ``mxpn`` prefix, keeping only ``prod_cd`` products.
    2. Collapse the 24 numbered columns into twelve window columns
       ``<metric>_<window>``:

       * current: ``1c`` (period 1), ``3c`` (1-3), ``6c`` (1-6), ``12c`` (1-12),
         ``pqtrc`` (4-6), ``ytdc`` (1..YTD)
       * prior:   ``1p`` (period 2), ``3p`` (4-6), ``6p`` (7-12), ``12p`` (13-24),
         ``pqtrp`` (7-9), ``ytdp`` (13..12+YTD)

       (``3p`` and ``pqtrc`` cover the same columns by construction.)
    3. For ``metric == 'TUF'`` only: subtract the matching ``vTUF_*`` voucher
       totals from ``get_lin_voucher()``; rows without a voucher match subtract 0.
    4. Left-join ``mp_spec_seg_dec`` on ``IID`` and drop rows with a null
       ``geography_id``.
    5. Sum every window column per geography level x specialty_group x segment x
       decile x PROD_CD.

    Parameters
    ----------
    metric : str
        Column prefix in the LAX file (e.g. ``'TUF'``).
    prod_cd : list of str
        ``PROD_CD`` values to keep.

    Returns
    -------
    tuple of pl.DataFrame
        ``(df_terr, df_reg, df_area, df_nation)``; the first column of each is
        ``geography_id``, ``region_geography_id``, ``area_geography_id`` and
        ``nation_geography_id`` respectively. Row order is not guaranteed.

    Notes
    -----
    Relies on the module-level names ``mxpn``, ``YTD``, ``mp_spec_seg_dec`` and
    ``geo_code_mapper``.
    """
    def period_total(first, stop, suffix):
        # Row-wise sum of <metric><first> .. <metric><stop-1>.
        names = [f'{metric}{i}' for i in range(first, stop)]
        return pl.sum_horizontal(names).alias(f'{metric}_{suffix}')

    columns = ['IID','PROD_CD'] + [f'{metric}{i}' for i in range(1,25)]
    df = pl.read_parquet(mxpn+'LAX.parquet',columns=columns).filter(pl.col('PROD_CD').is_in(prod_cd))

    # Current and prior windows for the metric (single-period windows stay plain
    # column references so their nulls are preserved, exactly as before).
    df = df.select(
        pl.col('IID'), pl.col('PROD_CD'),
        pl.col(f'{metric}1').alias(f'{metric}_1c'),
        period_total(1, 4, '3c'),
        period_total(1, 7, '6c'),
        period_total(1, 13, '12c'),
        period_total(4, 7, 'pqtrc'),
        period_total(1, YTD + 1, 'ytdc'),

        pl.col(f'{metric}2').alias(f'{metric}_1p'),
        period_total(4, 7, '3p'),
        period_total(7, 13, '6p'),
        period_total(13, 25, '12p'),
        period_total(7, 10, 'pqtrp'),
        period_total(13, 13 + YTD, 'ytdp'),
    )

    # The twelve window columns, in the same order as the select above.
    period_cols = [f'{metric}_{p}{f}' for f in ['c','p'] for p in [1,3,6,12,'pqtr','ytd']]

    # Voucher removal (TUF only).
    if metric == 'TUF':
        dfv = get_lin_voucher()
        # Everything in the voucher frame except the join keys is a helper column.
        voucher_cols = [c for c in dfv.columns if c not in ('IID','PROD_CD')]
        df = (
            df.join(dfv,on=['IID','PROD_CD'],how='left')
            .fill_null(0)  # no voucher row -> subtract 0
            .with_columns(
                # Parenthesised so the alias names the difference itself; previously
                # `.alias()` was attached to the right operand only and the result
                # name was correct merely because polars names a binary expression
                # after its left operand.
                [(pl.col(col) - pl.col(f'v{col}')).alias(col) for col in period_cols]
            )
            .drop(voucher_cols)
        )

    # Adding MP related columns
    df = df.join(mp_spec_seg_dec,on='IID',how='left').filter(pl.col('geography_id').is_not_null())

    metrics_to_calc = {col: pl.col(col).sum() for col in period_cols}
    dims = ['specialty_group','segment','decile','PROD_CD']

    def rollup(level_col):
        # Territory rows already carry geography_id; higher levels are looked up first.
        if level_col == 'geography_id':
            frame = df
        else:
            frame = df.join(geo_code_mapper[['geography_id', level_col]],on='geography_id',how='left')
        return frame.group_by([level_col] + dims).agg(**metrics_to_calc)

    df_terr = rollup('geography_id')
    df_reg = rollup('region_geography_id')
    df_area = rollup('area_geography_id')
    df_nation = rollup('nation_geography_id')

    return(
        df_terr,df_reg,df_area,df_nation
    )

In [47]:
def add_parent_product_rows(all_prod_df):
    """Append parent-product and all-product total rows to each geography-level frame.

    Each of the first four frames in ``all_prod_df`` is expected to be laid out as
    four grouping columns, then ``PROD_CD``, then the metric columns (the code
    slices by position: ``columns[0:4]`` and ``columns[5:]``).

    For every frame:

    * ``PROD_CD`` is mapped to ``product_id`` / ``parent_product_id`` through
      ``prod_mapping`` (left join on ``code``);
    * rows whose parent is 2 or 35 are summed per grouping columns + parent and
      emitted with ``product_id`` set to that parent id;
    * all rows are summed per grouping columns and emitted with ``product_id = 1``;
    * the original rows (now keyed by ``product_id`` instead of ``PROD_CD``) are
      stacked with both sets of summary rows.

    Parameters
    ----------
    all_prod_df : tuple of pl.DataFrame
        Geography-level frames (terr, reg, area, nation).

    Returns
    -------
    tuple of pl.DataFrame
        Same length as the input; columns are the four grouping columns, the
        metric columns, then ``product_id``.
    """
    def with_parent_rows(level_df):
        group_cols = level_df.columns[0:4]
        sums = {name: pl.col(name).sum() for name in level_df.columns[5:]}

        # NOTE(review): assumes prod_mapping has one row per code; duplicates would fan rows out - confirm.
        mapped = level_df.join(
            prod_mapping[['code','product_id','parent_product_id']],
            left_on = 'PROD_CD', right_on = 'code', how = 'left',
        )

        parent_rows = (
            mapped.filter(pl.col('parent_product_id').is_in([2,35]))
            .group_by(group_cols + ['parent_product_id'])
            .agg(**sums)
            .rename({'parent_product_id':'product_id'})
        )
        total_rows = (
            mapped.group_by(group_cols)
            .agg(**sums)
            .with_columns(product_id = pl.lit(1))
            .with_columns(pl.col('product_id').cast(pl.Int64))
        )

        # Drop the columns the summary frames do not have, then align column order for vstack.
        child_rows = mapped.drop(['PROD_CD','parent_product_id'])
        layout = child_rows.columns
        return child_rows.vstack(parent_rows.select(layout)).vstack(total_rows.select(layout))

    # Tuples are immutable, so work on a list copy and convert back at the end.
    frames = list(all_prod_df)
    for level_idx in range(4):
        frames[level_idx] = with_parent_rows(frames[level_idx])
    return tuple(frames)

In [48]:
def add_full_rollups(all_prod_df):
    """Append roll-up rows over segment, decile and specialty group to each level frame.

    For each of the first four frames, the metric columns are re-summed with one,
    two or all three of ``segment``, ``decile`` and ``specialty_group`` collapsed.
    A collapsed dimension is filled with a fixed label: ``'UNI'`` for segment,
    ``'0-10'`` for decile and ``'ALL SPEC'`` for specialty_group. The seven
    roll-up frames are stacked under the original rows.

    The frame layout is read by position: ``columns[0]`` is the geography column
    and ``columns[4:-1]`` are the metric columns (the last column is expected to
    be ``product_id``).

    Parameters
    ----------
    all_prod_df : tuple of pl.DataFrame
        Geography-level frames (terr, reg, area, nation).

    Returns
    -------
    tuple of pl.DataFrame
        Same length as the input; each processed frame has columns
        ``[geo, product_id, segment, decile, specialty_group, *metrics]``.
    """
    p, sg, d, spc = 'product_id', 'segment', 'decile', 'specialty_group'
    rollup_label = {sg: pl.lit('UNI'), d: pl.lit('0-10'), spc: pl.lit('ALL SPEC')}

    # (dimensions kept as group keys, dimensions collapsed into their roll-up label)
    rollup_plan = [
        # one dimension collapsed
        ([d, spc], [sg]),
        ([sg, spc], [d]),
        ([d, sg], [spc]),
        # two dimensions collapsed
        ([spc], [sg, d]),
        ([d], [sg, spc]),
        ([sg], [d, spc]),
        # all three collapsed
        ([], [sg, d, spc]),
    ]

    frames = list(all_prod_df)
    # Four levels: terr, reg, area, nation.
    for level_idx in range(4):
        base = frames[level_idx]
        g = base.columns[0]                  # geography column of this level
        metric_cols = base.columns[4:-1]     # tuf / nuf style metric columns
        main_seq = [g, p, sg, d, spc] + metric_cols   # shared column order for vstack
        agg_dict = {name: pl.col(name).sum() for name in metric_cols}

        stacked = base.select(main_seq)
        for kept, collapsed in rollup_plan:
            piece = (
                base.group_by([g, p] + kept)
                .agg(**agg_dict)
                .with_columns(*[rollup_label[dim].alias(dim) for dim in collapsed])
                .select(main_seq)
            )
            stacked = stacked.vstack(piece)

        frames[level_idx] = stacked

    return tuple(frames)

In [54]:
def rx_util():
    """Build one wide TUF frame per geography level with a column per product.

    Runs ``get_summed_metric_period`` for TUF on LI1/LI2/LI3, adds parent-product
    rows and the segment/decile/specialty roll-ups, then for every level keeps
    product ids 2, 3, 4 and 5 and pivots the ``TUF_qtdc`` value into the columns
    ``wk_qtd_LIN`` (2), ``wk_qtd_LI1`` (4), ``wk_qtd_LI2`` (5) and ``wk_qtd_LI3`` (3)
    by outer-joining the per-product slices on level/segment/specialty_group/decile.

    Returns
    -------
    list of pl.DataFrame
        Four frames in the order territory, region, area, nation.

    NOTE(review): ``get_summed_metric_period`` as defined in this notebook only
    emits the windows 1/3/6/12/pqtr/ytd (``TUF_1c`` ... ``TUF_ytdp``); no
    ``TUF_qtdc`` column is produced, so the select below looks like it will fail
    with a column-not-found error. Confirm which window is intended here.
    """
    levels = ['geography_id','region_geography_id','area_geography_id','nation_geography_id']
    # product_id -> output column; dict order is the join order (LIN, LI1, LI2, LI3).
    output_names = {2: 'wk_qtd_LIN', 4: 'wk_qtd_LI1', 5: 'wk_qtd_LI2', 3: 'wk_qtd_LI3'}

    level_frames = add_full_rollups(
        add_parent_product_rows(
            get_summed_metric_period('TUF',['LI1','LI2','LI3'])
        )
    )

    res = []
    for level, frame in zip(levels, level_frames):
        keys = [level,'segment','specialty_group','decile']
        slim = (
            frame.filter(pl.col('product_id').is_in({2,3,4,5}))
            .select(keys + ['product_id','TUF_qtdc'])
        )

        wide = None
        for pid, out_name in output_names.items():
            piece = slim.filter(product_id = pid).rename({'TUF_qtdc': out_name}).drop('product_id')
            # First product seeds the frame; the rest are outer-joined with coalesced keys.
            wide = piece if wide is None else wide.join(piece, on = keys, how = 'outer_coalesce')
        res.append(wide)

    return(res)
laxdn_geoid_sum = rx_util()

geography_id,specialty_group,segment,decile,PROD_CD,TUF_1c,TUF_3c,TUF_6c,TUF_12c,TUF_pqtrc,TUF_ytdc,TUF_1p,TUF_3p,TUF_6p,TUF_12p,TUF_pqtrp,TUF_ytdp
i64,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
98,"""A/O""","""Target""","""5-7""","""LI1""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62,"""GE""","""Target""","""8-10""","""LI3""",62.486,133.145,329.908,737.561,196.763,397.581,60.59,196.763,407.653,847.983,206.25,488.44
41,"""A/O""","""Non-Target""","""0-2""","""LI3""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.081,0.0,0.0
108,"""PED""","""ALG-ONLY-TARGET""","""0-2""","""LI3""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102,"""A/O""","""Non-Target""","""8-10""","""LI2""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
96,"""GE""","""Target""","""8-10""","""LI3""",14.381,111.597,191.475,562.635,79.878,297.601,60.167,79.878,371.16,739.856,184.634,422.638
97,"""GE""","""Non-Target""","""3-4""","""LI2""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
94,"""PED""","""Non-Target""","""3-4""","""LI3""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75,"""GE""","""Target""","""8-10""","""LI3""",0.0,81.028,167.767,284.308,86.739,193.466,38.374,86.739,116.541,189.603,56.757,112.691
