### Weekly_LinzessSnapshot_SalesActivity_Feed

In [1]:
import polars as pl
import pandas as pd
import gc
import numpy as np
import json
from datetime import datetime, timedelta,date

In [2]:
# Load run parameters from the JSON config that sits next to the notebook.
# Every name bound in this cell is read as a module-level global by later cells.
with open('vars_wk.json', 'r') as json_file:
    js = json.load(json_file)

# S3 bucket that hosts the input dataframes, docs and the xponent archive.
bucket = js['bucket']
# Number of weekly Rx buckets summed for the quarter-to-date (qtd) windows.
num_weeks_rx = js['num_weeks_rx']
# Upper bound applied to call_week / sample_week in the QTD call and sample filters.
num_weeks_calls = js['num_weeks_calls']
# Used as a path segment of the weekly xponent archive folder below.
data_date = js['data_date']
# Parsed to a datetime.date; compared against CallDate in the QTD filters.
quarter_start = datetime.strptime(js['quarter_start'], '%Y-%m-%d').date()

# Base locations: shared parquet dataframes, and this data_date's xponent archive.
dflib = f's3://{bucket}/BIT/dataframes/'
xpn = f's3://{bucket}/PYADM/weekly/archive/{data_date}/xponent/'

In [3]:
# Utility functions
def load(df, lib=dflib):
    """Read `<lib><df>.parquet` and publish it as a module-level variable named `df`.

    Parameters
    ----------
    df : str
        Base name of the parquet file; also the global name the frame is bound to.
    lib : str
        Path prefix of the parquet file. The default is the value `dflib` had
        when this function was defined.

    Returns
    -------
    None
        The frame is stored in `globals()` rather than returned.
    """
    parquet_path = f'{lib}{df}.parquet'
    frame = pl.read_parquet(parquet_path)
    globals()[df] = frame

In [4]:
# Importing dependencies: reference mappings plus the pre-built parquet frames.
# Product mapping is a pipe-delimited text file; add_parent_product_rows reads its
# 'code', 'product_id' and 'parent_product_id' columns.
prod_mapping = pl.read_csv(f's3://{bucket}/BIT/docs/productmapping_pybit.txt',separator='|')
# Geography mapping is read via pandas (Excel) and converted to Polars.
# NOTE(review): geo_code_mapper is not referenced in the cells visible here.
geo_code_mapper = pl.from_pandas(pd.read_excel(f's3://{bucket}/BIT/docs/geo_id_full.xlsx'))
# Each load() call binds a module-level DataFrame with the same name as its argument.
load('mp_spec_seg_dec')
load('temp_calls')
load('temp_samples')
load('temp_abbv')
load('MASTER_UNI')
load('roster')
# Product codes passed to get_summed_period_iid_metric in the data-prep cell.
fetch_products = ['LI1','LI2','LI3','TRU','AMT','LAC','MOT','LUB','IRL']

# Fixes for the vortex import (probably caused by Polars upgrades): force
# SalesRepIID to Int64 on the three activity frames. Two of them are later joined
# to roster on SalesRepIID.
temp_calls = temp_calls.with_columns(pl.col('SalesRepIID').cast(pl.Int64))
temp_samples = temp_samples.with_columns(pl.col('SalesRepIID').cast(pl.Int64))
temp_abbv = temp_abbv.with_columns(pl.col('SalesRepIID').cast(pl.Int64))

### Generator Functions

In [5]:
# Voucher volumes, used to net vouchers out of TUF
def get_lin_voucher():
    """Build voucher TUF period totals per IID for the LIN1/LIN2/LIN3 column groups.

    Reads `LIN_VOUCHER.parquet` from the xponent archive (`xpn`). For each prefix it
    collapses the weekly `<prefix>TUF<n>` columns into the same current ('c') and
    prior ('p') windows used for the Rx metrics, with a `vTUF_` prefix, and tags the
    rows with PROD_CD 'LI1' / 'LI2' / 'LI3'.

    Returns
    -------
    pl.DataFrame
        Columns: IID, vTUF_1c, vTUF_4c, vTUF_13c, vTUF_26c, vTUF_qtdc, vTUF_1p,
        vTUF_4p, vTUF_13p, vTUF_26p, vTUF_qtdp, vTUF_all, PROD_CD. The three
        per-product frames are stacked vertically and nulls are filled with 0.
    """
    voucher_raw = pl.read_parquet(f'{xpn}LIN_VOUCHER.parquet')

    def _period_frame(prefix):
        # Sum the weekly buckets <prefix>TUF<first> .. <prefix>TUF<stop-1>.
        def _total(first, stop):
            return pl.sum_horizontal([f'{prefix}TUF{wk}' for wk in range(first, stop)])

        return (
            voucher_raw.select(
                pl.col('IID'),
                pl.col(f'{prefix}TUF1').alias('vTUF_1c'),
                _total(1, 5).alias('vTUF_4c'),
                _total(1, 14).alias('vTUF_13c'),
                _total(1, 27).alias('vTUF_26c'),
                _total(1, num_weeks_rx + 1).alias('vTUF_qtdc'),
                pl.col(f'{prefix}TUF2').alias('vTUF_1p'),
                _total(5, 9).alias('vTUF_4p'),
                _total(14, 27).alias('vTUF_13p'),
                _total(27, 53).alias('vTUF_26p'),
                _total(14, 14 + num_weeks_rx).alias('vTUF_qtdp'),
                _total(1, 106).alias('vTUF_all'),
            )
            # 'LIN1' -> 'LI1', etc., so the rows can be joined on PROD_CD.
            .with_columns(pl.lit(f'LI{prefix[-1]}').alias('PROD_CD'))
        )

    stacked = pl.concat([_period_frame(prefix) for prefix in ('LIN1', 'LIN2', 'LIN3')])
    return stacked.fill_null(0)

In [6]:
def get_summed_period_iid_metric(metric,prod_cd):
    """Collapse the weekly `<metric><n>` columns of LAX.parquet into period totals.

    Parameters
    ----------
    metric : str
        Prefix of the weekly bucket columns (e.g. 'TUF'). Columns `<metric>1`
        through `<metric>105` are read.
    prod_cd : list[str]
        PROD_CD values to keep.

    Returns
    -------
    pl.DataFrame
        One row per retained source row with IID, PROD_CD and the totals
        `<metric>_1c/_4c/_13c/_26c/_qtdc` (current), `<metric>_1p/_4p/_13p/_26p/_qtdp`
        (prior) and `<metric>_all`. Only IIDs that match `mp_spec_seg_dec` with a
        non-null geography_id are kept. When `metric == 'TUF'` the voucher totals
        from `get_lin_voucher()` are subtracted from every period column.
    """
    columns = ['IID','PROD_CD'] + [metric+str(i) for i in range(1,106)]
    df = pl.read_parquet(xpn+'LAX.parquet',columns=columns).filter(pl.col('PROD_CD').is_in(prod_cd))

    def _total(suffix, first, stop):
        # Sum of weekly buckets <metric><first> .. <metric><stop-1>, named <metric>_<suffix>.
        return pl.sum_horizontal([metric+str(i) for i in range(first, stop)]).alias(f'{metric}_{suffix}')

    # 1, 4, 13, 26 weeks and QTD for the current and prior period, plus the full span.
    df = df.select(
        pl.col('IID'),pl.col('PROD_CD'),
        pl.col(metric+'1').alias(metric+'_1c'),
        _total('4c', 1, 5),
        _total('13c', 1, 14),
        _total('26c', 1, 27),
        _total('qtdc', 1, num_weeks_rx+1),

        pl.col(metric+'2').alias(metric+'_1p'),
        _total('4p', 5, 9),
        _total('13p', 14, 27),
        _total('26p', 27, 53),
        _total('qtdp', 14, 14+num_weeks_rx),

        _total('all', 1, 106),
    )

    # Voucher removal: net the voucher volume out of every period total.
    if metric == 'TUF':
        dfv = get_lin_voucher()
        # Select voucher columns by name rather than by position in dfv.
        voucher_cols = [c for c in dfv.columns if c not in ('IID','PROD_CD')]
        # Products with no voucher row get nulls from the left join; treat them as 0.
        df = df.join(dfv,on=['IID','PROD_CD'],how='left').fill_null(0)
        period_suffixes = ['1c','4c','13c','26c','qtdc','1p','4p','13p','26p','qtdp','all']
        df = df.with_columns([
            # Parenthesised so the alias names the difference, not just the subtrahend.
            (pl.col(f'{metric}_{sfx}') - pl.col(f'v{metric}_{sfx}')).alias(f'{metric}_{sfx}')
            for sfx in period_suffixes
        ]).drop(voucher_cols)

    # Keep only IIDs that resolve to a geography in the MP frame; the MP attributes
    # themselves are not needed downstream, so they are dropped again.
    df = df.join(mp_spec_seg_dec,on='IID',how='left').filter(pl.col('geography_id').is_not_null())

    return df.drop(['specialty_group','segment','decile','geography_id'])



In [7]:
def add_parent_product_rows(df):
    """Append parent-product and all-product roll-up rows to a per-product frame.

    Parameters
    ----------
    df : pl.DataFrame
        Frame whose first two columns are the keys (the callers pass IID, PROD_CD)
        and whose remaining columns are summable metrics.

    Returns
    -------
    pl.DataFrame
        The input rows keyed by `product_id` (looked up in `prod_mapping` via
        PROD_CD -> code), followed by one row per (IID, parent product) for parent
        ids 2 and 35, followed by one row per IID with product_id 1 that sums every
        input row of that IID. PROD_CD and parent_product_id are not returned.
    """
    # Every column after the two key columns is summed in the roll-ups.
    metric_sums = {name: pl.col(name).sum() for name in df.columns[2:]}

    mapped = df.join(
        prod_mapping[['code','product_id','parent_product_id']],
        left_on = 'PROD_CD', right_on = 'code', how = 'left',
    )

    # Roll-up to the two parent products of interest.
    parent_rows = (
        mapped
        .filter(pl.col('parent_product_id').is_in([2,35]))
        .group_by(['IID','parent_product_id'])
        .agg(**metric_sums)
        .rename({'parent_product_id':'product_id'})
    )

    # Roll-up across every product of the IID, published as product_id 1.
    total_rows = (
        mapped
        .group_by('IID')
        .agg(**metric_sums)
        .with_columns(pl.lit(1).cast(pl.Int64).alias('product_id'))
    )

    # Drop the lookup helpers so all three pieces share one column layout.
    product_rows = mapped.drop(['PROD_CD','parent_product_id'])
    layout = product_rows.columns

    return product_rows.vstack(parent_rows.select(layout)).vstack(total_rows.select(layout))

In [8]:
# Data prep: TUF period totals per IID/product, then parent and all-product roll-ups.
all_products_tuf = add_parent_product_rows(
    get_summed_period_iid_metric('TUF', fetch_products)
)

### Functions

In [9]:
# iw_calls_13wks and QTD IW calls
def process_1(df,nw,col):
    """Left-join a distinct-call count per attendee onto `df`.

    Parameters
    ----------
    df : pl.DataFrame
        Frame with an IID column.
    nw : int
        13 selects the trailing-13-week count (call_week <= 13, no other filter).
        Any other value selects the quarter-to-date count, which is bounded by the
        module-level `num_weeks_calls` and `quarter_start`, and restricted to calls
        where the attendee's Territory equals GEO.
    col : str
        Name of the count column added to `df`.

    Returns
    -------
    pl.DataFrame
        `df` plus `col`, the number of unique CallID per AttendeeIID. It is null for
        IIDs without qualifying calls.

    NOTE(review): the branch is chosen only by `nw == 13` and `nw` is otherwise
    unused. A QTD request made while `num_weeks_calls` is 13 therefore takes the
    13-week branch and skips the quarter and territory filters. Confirm this is
    acceptable.
    """
    if nw == 13:
        calls = temp_calls.filter(pl.col('call_week') <= 13)
    else:
        in_quarter = (pl.col('call_week') <= num_weeks_calls) & (pl.col('CallDate') >= quarter_start)
        attendee_territory = MASTER_UNI.select(['IID','Territory'])
        calls = (
            temp_calls
            .filter(in_quarter)
            .join(attendee_territory, left_on = 'AttendeeIID', right_on = 'IID')
            .join(roster, on = 'SalesRepIID', how = 'left')
            # GEO is expected to come from roster - TODO confirm.
            .filter(pl.col('Territory') == pl.col('GEO'))
        )

    calls_per_attendee = calls.group_by(['AttendeeIID']).agg(
        pl.col('CallID').n_unique().alias(col)
    )

    return df.join(calls_per_attendee, left_on='IID', right_on='AttendeeIID', how='left')

In [10]:
# abbv_visit
def process_2(df):
    """Left-join the quarter-to-date count of distinct `temp_abbv` calls per attendee.

    Rows of `temp_abbv` are kept when call_week <= `num_weeks_calls` and
    CallDate >= `quarter_start`. The number of unique CallID per AttendeeIID is
    added to `df` as `abbv_visit`, null where the IID has no qualifying rows.
    """
    in_quarter = (pl.col('call_week') <= num_weeks_calls) & (pl.col('CallDate') >= quarter_start)
    visits_per_attendee = (
        temp_abbv
        .filter(in_quarter)
        .group_by('AttendeeIID')
        .agg(abbv_visit=pl.col('CallID').n_unique())
    )
    return df.join(visits_per_attendee, left_on='IID', right_on='AttendeeIID', how='left')

In [11]:
# Total IW samples, and total IW + ABBV samples (no samples for ABBV, so the value is the same)
# NOTE: quantities are summed over every qualifying sample row, i.e. not split by dosage
def process_3(df):
    """Left-join quarter-to-date sample totals per attendee onto `df`.

    Rows of `temp_samples` are kept when sample_week <= `num_weeks_calls`,
    CallDate >= `quarter_start`, and the attendee's Territory (from MASTER_UNI)
    equals GEO after joining `roster` on SalesRepIID. Adds two columns:
    `total_iw_samples`, the sum of CallProductQuantity per AttendeeIID, and
    `total_iw_abbv_samples`, a copy of the same value. Both are null where the IID
    has no qualifying rows.
    """
    in_quarter = (pl.col('sample_week') <= num_weeks_calls) & (pl.col('CallDate') >= quarter_start)
    attendee_territory = MASTER_UNI.select(['IID','Territory'])

    samples_per_attendee = (
        temp_samples
        .filter(in_quarter)
        .join(attendee_territory, left_on = 'AttendeeIID', right_on = 'IID')
        .join(roster, on = 'SalesRepIID', how = 'left')
        # GEO is expected to come from roster - TODO confirm.
        .filter(pl.col('Territory') == pl.col('GEO'))
        .group_by('AttendeeIID')
        .agg(total_iw_samples = pl.col('CallProductQuantity').sum())
        .with_columns(total_iw_abbv_samples = pl.col('total_iw_samples'))
    )

    return df.join(samples_per_attendee, left_on='IID', right_on='AttendeeIID', how='left')

In [12]:
# TRx per sample (IW + ABBV)
def process_4(df):
    """Add `trx_per_sample` = QTD TUF of product_id 2 / `total_iw_abbv_samples`.

    Parameters
    ----------
    df : pl.DataFrame
        Frame with IID and `total_iw_abbv_samples` columns.

    Returns
    -------
    pl.DataFrame
        `df` plus `trx_per_sample`. The ratio is null when either input is null or
        when the sample total is 0. A zero total would otherwise produce inf/NaN,
        which the feed's null handling does not catch.
    """
    # Rx data: current QTD TUF for the product_id == 2 roll-up rows.
    lin_iid = all_products_tuf.filter(pl.col('product_id')==2).select(['IID','TUF_qtdc'])
    samples = pl.col('total_iw_abbv_samples')
    df = (
        df.join(lin_iid,on='IID',how='left')
        .with_columns(
            # A null sample total makes the condition null, so it also falls through to null.
            trx_per_sample = pl.when(samples != 0).then(pl.col('TUF_qtdc')/samples).otherwise(None)
        )
        .drop('TUF_qtdc')
    )
    return df

In [13]:
def get_feed(temp1):
    """Shape the per-IID activity frame into the SalesActivity feed layout.

    Parameters
    ----------
    temp1 : pl.DataFrame
        Frame with IID, geography_id and the six metric columns iw_calls_13wks,
        qtd_iw_calls, abbv_visit, total_iw_samples, total_iw_abbv_samples and
        trx_per_sample.

    Returns
    -------
    pl.DataFrame
        Columns Physician_ID, Geography_id, SA_NumberMetric1..6. The metric columns
        are strings and every missing value is written as the literal `\\N`.
        `trx_per_sample` (SA_NumberMetric5) is also `\\N` for IIDs whose
        PDRPOptOutFlag in MASTER_UNI is 'Y'.

    Works only on the `temp1` argument; no module-level state is read or modified
    for the null handling.
    """
    metric_cols = [
        'iw_calls_13wks','qtd_iw_calls','abbv_visit',
        'total_iw_samples','total_iw_abbv_samples','trx_per_sample',
    ]
    # Renaming columns according to the feed specification.
    rnm_cols = {
        'IID':'Physician_ID',
        'geography_id':'Geography_id',
        'iw_calls_13wks':'SA_NumberMetric6',
        'qtd_iw_calls':'SA_NumberMetric1',
        'abbv_visit':'SA_NumberMetric2',
        'total_iw_samples':'SA_NumberMetric3',
        'total_iw_abbv_samples':'SA_NumberMetric4',
        'trx_per_sample':'SA_NumberMetric5'
    }
    pdrp = MASTER_UNI.select(['IID','PDRPOptOutFlag'])

    final_feed = (
        temp1
        .join(pdrp, on='IID',how='left')
        # PDRP override: blank the Rx-derived ratio for opted-out prescribers.
        .with_columns(
            pl.when(pl.col('PDRPOptOutFlag')=='Y')
            .then(None)
            .otherwise(pl.col('trx_per_sample'))
            .alias('trx_per_sample')
        )
        # Null correction: cast explicitly to string first so the '\N' marker never
        # depends on implicit numeric/string supertype rules.
        # NOTE(review): only nulls are replaced; a float NaN would be written as text.
        .with_columns([pl.col(c).cast(pl.Utf8).fill_null('\\N') for c in metric_cols])
        .rename(rnm_cols)
        .select(['Physician_ID','Geography_id'] + [f'SA_NumberMetric{i}' for i in range(1,7)])
    )

    return final_feed

---

In [14]:
# Processing: start from one row per IID/geography and attach each metric in turn.
temp1 = (
    mp_spec_seg_dec
    .select(['IID','geography_id'])
    .pipe(process_1, 13, 'iw_calls_13wks')
    .pipe(process_1, num_weeks_calls, 'qtd_iw_calls')
    .pipe(process_2)
    .pipe(process_3)
    .pipe(process_4)
)

In [15]:
# Filtering: keep an IID when its segment is 'Target', or when it has both a
# 13-week and a QTD IW call count.
temp1 = (
    temp1
    .join(mp_spec_seg_dec.select(['IID','segment']), on='IID', how='left')
    .filter(
        (pl.col('iw_calls_13wks').is_not_null() | (pl.col('segment') == 'Target'))
        & (pl.col('qtd_iw_calls').is_not_null() | (pl.col('segment') == 'Target'))
    )
    # segment was only needed for the filter.
    .drop('segment')
)

In [16]:
# Exporting the feed as a pipe-delimited text file with CRLF line endings.
# NOTE(review): the output bucket is hard-coded here, while every input path uses
# the `bucket` value from vars_wk.json - confirm this is intentional.
OUT = 's3://vortex-staging-a65ced90/BIT/output/LinzessSnapshot/Weekly/'
feed_dataset = get_feed(temp1)
(
    feed_dataset
    .to_pandas()
    .to_csv(
        f'{OUT}Weekly_LinzessSnapshot_SalesActivity_Feed.txt',
        sep='|',
        lineterminator='\r\n',
        index=False,
    )
)
print('LS Sales Actvity Exported !')

LS Sales Actvity Exported !


---