### GS Sales Activity v2

In [1]:
import polars as pl
import pandas as pd
import gc
from datetime import datetime, timedelta,date
from dateutil.relativedelta import relativedelta
import numpy as np
import json

In [2]:
# Load the run parameters from the weekly JSON config file (vars_wk.json).
with open('vars_wk.json', 'r') as json_file:
    js = json.load(json_file)

# Date parameters are stored as 'YYYY-MM-DD' strings and are converted to datetime.date objects.
curr_date = datetime.strptime(js['curr_date'], '%Y-%m-%d').date()
quarter_start = datetime.strptime(js['quarter_start'], '%Y-%m-%d').date()
quarter_end = datetime.strptime(js['quarter_end'], '%Y-%m-%d').date()
# The remaining parameters are used exactly as stored in the JSON file.
qtr_data = js['qtr_data']
num_weeks_calls = js['num_weeks_calls']
num_weeks_rx = js['num_weeks_rx']
num_of_months = js['num_of_months']
bucket = js['bucket']

# S3 prefixes derived from the bucket: the shared dataframe library and the
# geography folder of the quarter named by qtr_data.
dflib = f's3://{bucket}/BIT/dataframes/'
geo = f's3://{bucket}/PYADM/quaterly/{qtr_data}/geography/'

In [3]:
# Utility Functions -
def load(df, lib=dflib):
    """Read ``{lib}{df}.parquet`` with polars and bind it to a module global named ``df``.

    Args:
        df: Base name of the parquet file; also the name of the global that is created.
        lib: Path prefix the file name is appended to (defaults to ``dflib`` as it
            was when this function was defined).
    """
    parquet_path = f'{lib}{df}.parquet'
    frame = pl.read_parquet(parquet_path)
    # The frame is published through globals() rather than returned.
    globals()[df] = frame

def intck(interval, start_date, end_date):
    """Return the number of whole intervals between ``start_date`` and ``end_date``.

    Args:
        interval: 'DAY', 'MONTH' or 'WEEK'.
        start_date: Start of the span.
        end_date: End of the span.

    Returns:
        For 'DAY' the day difference, for 'WEEK' the day difference floor-divided
        by 7, for 'MONTH' ``years * 12 + months`` of the relativedelta between the
        two dates. Any other ``interval`` yields ``None``.
    """
    if interval == 'MONTH':
        span = relativedelta(end_date, start_date)
        return 12 * span.years + span.months
    if interval in ('DAY', 'WEEK'):
        elapsed_days = (end_date - start_date).days
        return elapsed_days if interval == 'DAY' else elapsed_days // 7
    # Unknown interval names fall through without raising.
    return None

In [4]:
# Importing dependencies: every load() call reads '<name>.parquet' and binds the
# resulting frame to a module global of the same name.
load('temp_calls')
load('temp_samples')
load('temp_abbv')
load('mp_spec_seg_dec')
load('hierarchy',geo)  # read from the geography prefix instead of the default dataframe library
load('wd_raw')
load('lirwd_call_plan')
load('laxdn_geoid_sum')
load('MASTER_UNI')
load('roster')

# Pipe-delimited geography mapping; every Code except 'NATION' gets the '1111-' prefix.
geo_mapping = pl.read_csv(f's3://{bucket}/BIT/docs/GeographyMapping.txt',separator='|')
geo_mapping = geo_mapping.with_columns(
    Code = pl.when(pl.col('Code')!= 'NATION').then(pl.lit('1111-')+pl.col('Code')).otherwise(pl.col('Code'))
)
# Pipe-delimited product mapping.
prod_mapping = pl.read_csv(f's3://{bucket}/BIT/docs/productmapping_pybit.txt',separator='|')

# Geography lookup read from Excel via pandas and converted to polars.
geo_id_full = pl.from_pandas(pd.read_excel(f's3://{bucket}/BIT/docs/geo_id_full.xlsx'))

# Fixes for the vortex import (probably caused by Polars upgrades): force the id
# columns to Int64 — presumably so the join keys below share one dtype; TODO confirm.
temp_calls = temp_calls.with_columns(pl.col('SalesRepIID').cast(pl.Int64))
temp_samples = temp_samples.with_columns(pl.col('SalesRepIID').cast(pl.Int64))
temp_abbv = temp_abbv.with_columns(pl.col('SalesRepIID').cast(pl.Int64))
wd_raw = wd_raw.with_columns(pl.col('SalesRepIID').cast(pl.Int64))
laxdn_geoid_sum = laxdn_geoid_sum.with_columns(pl.col('geography_id').cast(pl.Int64))

In [5]:
# Processing the three activity datasets: 1. temp_calls  2. temp_samples  3. temp_abbv
# Each one goes through the same four joins (see the NOTE below).
# - Open question: is the physician territory id the same as the sales rep
#   territory id for every record?
temp_calls_mp_spec = (
    temp_calls
    .join(mp_spec_seg_dec,left_on = 'AttendeeIID',right_on = 'IID', how = 'left').filter(pl.col('geography_id').is_not_null())
    .join(geo_id_full,on = 'geography_id',how = 'left')
    .join(wd_raw[['SalesRepIID','days_in_field']],on = 'SalesRepIID', how = 'left')
    .join(lirwd_call_plan,left_on = 'AttendeeIID', right_on = 'IID', how = 'left')
)
# NOTE - what the joins above do, in order:
# Combining MP (mp_spec_seg_dec) and dropping rows whose geography_id is null
# Adding Area and Region Code (from geo_id_full)
# Adding Working Day (days_in_field from wd_raw)
# Adding call_freq_quarter (from lirwd_call_plan)

# For supporting calculations ->
# Earlier approach, kept for reference: derive the mapper from the joined calls frame.
#geo_code_mapper = temp_calls_mp_spec[['geography_id','region_geography_id','area_geography_id','nation_geography_id']].unique()
geo_code_mapper = geo_id_full
geo_code_mapper.to_pandas().to_parquet(dflib+'geo_code_mapper.parquet') # exported so other code can use it

### Same joins applied to the samples dataset
temp_samples_mp_spec = (
    temp_samples
    .join(mp_spec_seg_dec,left_on = 'AttendeeIID',right_on = 'IID', how = 'left').filter(pl.col('geography_id').is_not_null())
    .join(geo_id_full,on = 'geography_id',how = 'left')
    .join(wd_raw[['SalesRepIID','days_in_field']],on = 'SalesRepIID', how = 'left')
    .join(lirwd_call_plan,left_on = 'AttendeeIID', right_on = 'IID', how = 'left')
)
### Same joins applied to the abbv dataset
temp_abbv_mp_spec = (
    temp_abbv
    .join(mp_spec_seg_dec,left_on = 'AttendeeIID',right_on = 'IID', how = 'left').filter(pl.col('geography_id').is_not_null())
    .join(geo_id_full,on = 'geography_id',how = 'left')
    .join(wd_raw[['SalesRepIID','days_in_field']],on = 'SalesRepIID', how = 'left')
    .join(lirwd_call_plan,left_on = 'AttendeeIID', right_on = 'IID', how = 'left')
)

### Generator Functions

In [6]:
# Gets Count of HCP For All Rollups (Just for prod LIN)
def get_num_hcp_counts():
    """Count distinct Target-segment attendees (``num_hcps``) for every rollup of each geography level.

    Source rows are ``temp_calls_mp_spec`` calls with ``call_week <= num_weeks_calls``
    and ``CallDate >= quarter_start``, inner-joined to ``MASTER_UNI`` on the attendee,
    left-joined to ``roster`` on ``SalesRepIID`` and kept only where
    ``Territory == GEO`` and ``segment == 'Target'``.

    Returns:
        list of four DataFrames in the order geography_id, region_geography_id,
        area_geography_id, nation_geography_id. Each has the columns
        [level, product, segment, decile, specialty_group, num_hcps] and stacks the
        detailed grouping followed by the seven rollups; a rolled-up dimension
        carries 'UNI' (segment), '0-10' (decile) or 'ALL SPEC' (specialty_group).
    """
    prod_col, seg_col, spec_col, dec_col = 'product', 'segment', 'specialty_group', 'decile'
    geo_levels = ['geography_id', 'region_geography_id', 'area_geography_id', 'nation_geography_id']
    seg_all = pl.lit('UNI').alias(seg_col)
    dec_all = pl.lit('0-10').alias(dec_col)
    spec_all = pl.lit('ALL SPEC').alias(spec_col)

    calls_in_window = (
        temp_calls_mp_spec
        .filter(pl.col('call_week') <= num_weeks_calls)
        .filter(pl.col('CallDate') >= quarter_start)
    )
    with_territory = calls_in_window.join(
        MASTER_UNI.select(['IID', 'Territory']), left_on='AttendeeIID', right_on='IID'
    )
    with_roster = with_territory.join(roster, on='SalesRepIID', how='left')
    valid_calls = with_roster.filter(pl.col('Territory') == pl.col('GEO'))
    # Only Target-segment attendees are counted.
    target_calls = valid_calls.filter(pl.col('segment') == 'Target')

    hcp_count = pl.col('AttendeeIID').n_unique().alias('num_hcps')

    per_level = []
    for geo_col in geo_levels:
        column_order = [geo_col, prod_col, seg_col, dec_col, spec_col, 'num_hcps']
        # (group-by keys, literal columns for the rolled-up dimensions), in stacking order.
        rollup_plan = [
            ([geo_col, seg_col, prod_col, spec_col, dec_col], []),
            ([geo_col, prod_col, dec_col, spec_col], [seg_all]),
            ([geo_col, prod_col, seg_col, spec_col], [dec_all]),
            ([geo_col, prod_col, dec_col, seg_col], [spec_all]),
            ([geo_col, prod_col, spec_col], [seg_all, dec_all]),
            ([geo_col, prod_col, dec_col], [seg_all, spec_all]),
            ([geo_col, prod_col, seg_col], [dec_all, spec_all]),
            ([geo_col, prod_col], [seg_all, dec_all, spec_all]),
        ]
        stacked = None
        for group_keys, fill_exprs in rollup_plan:
            part = target_calls.group_by(group_keys).agg(hcp_count)
            if fill_exprs:
                part = part.with_columns(*fill_exprs)
            part = part.select(column_order)
            stacked = part if stacked is None else stacked.vstack(part)
        per_level.append(stacked)

    return per_level


In [7]:
# Gets Call_Freq_Quarter for all rollups -
def get_call_freq_quarter_vals():
    """Sum ``call_freq_quarter`` for every rollup of each geography level.

    Source rows are ``temp_calls_mp_spec`` calls with ``call_week <= num_weeks_calls``
    and ``CallDate >= quarter_start``, inner-joined to ``MASTER_UNI`` on the attendee,
    left-joined to ``roster`` on ``SalesRepIID`` and kept only where
    ``Territory == GEO``.

    Returns:
        list of four DataFrames in the order geography_id, region_geography_id,
        area_geography_id, nation_geography_id, each with the columns
        [level, product, segment, decile, specialty_group, call_freq_quarter]:
        the detailed grouping followed by the seven rollups ('UNI', '0-10',
        'ALL SPEC' mark rolled-up segment, decile and specialty_group).
    """
    prod_col, seg_col, spec_col, dec_col = 'product', 'segment', 'specialty_group', 'decile'
    geo_levels = ['geography_id', 'region_geography_id', 'area_geography_id', 'nation_geography_id']
    seg_all = pl.lit('UNI').alias(seg_col)
    dec_all = pl.lit('0-10').alias(dec_col)
    spec_all = pl.lit('ALL SPEC').alias(spec_col)

    calls_in_window = (
        temp_calls_mp_spec
        .filter(pl.col('call_week') <= num_weeks_calls)
        .filter(pl.col('CallDate') >= quarter_start)
    )
    with_territory = calls_in_window.join(
        MASTER_UNI.select(['IID', 'Territory']), left_on='AttendeeIID', right_on='IID'
    )
    valid_calls = (
        with_territory
        .join(roster, on='SalesRepIID', how='left')
        .filter(pl.col('Territory') == pl.col('GEO'))
    )

    # NOTE(review): the original author flagged this sum as a doubt — it adds the
    # value once per call row.
    freq_total = pl.col('call_freq_quarter').sum().alias('call_freq_quarter')

    per_level = []
    for geo_col in geo_levels:
        column_order = [geo_col, prod_col, seg_col, dec_col, spec_col, 'call_freq_quarter']
        # (group-by keys, literal columns for the rolled-up dimensions), in stacking order.
        rollup_plan = [
            ([geo_col, seg_col, prod_col, spec_col, dec_col], []),
            ([geo_col, prod_col, dec_col, spec_col], [seg_all]),
            ([geo_col, prod_col, seg_col, spec_col], [dec_all]),
            ([geo_col, prod_col, dec_col, seg_col], [spec_all]),
            ([geo_col, prod_col, spec_col], [seg_all, dec_all]),
            ([geo_col, prod_col, dec_col], [seg_all, spec_all]),
            ([geo_col, prod_col, seg_col], [dec_all, spec_all]),
            ([geo_col, prod_col], [seg_all, dec_all, spec_all]),
        ]
        stacked = None
        for group_keys, fill_exprs in rollup_plan:
            part = valid_calls.group_by(group_keys).agg(freq_total)
            if fill_exprs:
                part = part.with_columns(*fill_exprs)
            part = part.select(column_order)
            stacked = part if stacked is None else stacked.vstack(part)
        per_level.append(stacked)

    return per_level


In [8]:
# Gets days_in_field counts for MAX rollup row only - 
def get_days_in_field_counts():
    """Average ``days_in_field`` for every rollup of each geography level.

    Source rows are ``temp_calls_mp_spec`` calls with ``call_week <= num_weeks_calls``
    and ``CallDate >= quarter_start``, inner-joined to ``MASTER_UNI`` on the attendee,
    left-joined to ``roster`` on ``SalesRepIID`` and kept only where
    ``Territory == GEO``.

    Returns:
        list of four DataFrames in the order geography_id, region_geography_id,
        area_geography_id, nation_geography_id, each with the columns
        [level, product, segment, decile, specialty_group, days_in_field]:
        the detailed grouping followed by the seven rollups ('UNI', '0-10',
        'ALL SPEC' mark rolled-up segment, decile and specialty_group).
    """
    prod_col, seg_col, spec_col, dec_col = 'product', 'segment', 'specialty_group', 'decile'
    geo_levels = ['geography_id', 'region_geography_id', 'area_geography_id', 'nation_geography_id']
    seg_all = pl.lit('UNI').alias(seg_col)
    dec_all = pl.lit('0-10').alias(dec_col)
    spec_all = pl.lit('ALL SPEC').alias(spec_col)

    calls_in_window = (
        temp_calls_mp_spec
        .filter(pl.col('call_week') <= num_weeks_calls)
        .filter(pl.col('CallDate') >= quarter_start)
    )
    with_territory = calls_in_window.join(
        MASTER_UNI.select(['IID', 'Territory']), left_on='AttendeeIID', right_on='IID'
    )
    valid_calls = (
        with_territory
        .join(roster, on='SalesRepIID', how='left')
        .filter(pl.col('Territory') == pl.col('GEO'))
    )

    # Working days: the original author uses the mean on the stated basis that all
    # values within a group are the same.
    mean_days = pl.col('days_in_field').mean().alias('days_in_field')

    per_level = []
    for geo_col in geo_levels:
        column_order = [geo_col, prod_col, seg_col, dec_col, spec_col, 'days_in_field']
        # (group-by keys, literal columns for the rolled-up dimensions), in stacking order.
        rollup_plan = [
            ([geo_col, seg_col, prod_col, spec_col, dec_col], []),
            ([geo_col, prod_col, dec_col, spec_col], [seg_all]),
            ([geo_col, prod_col, seg_col, spec_col], [dec_all]),
            ([geo_col, prod_col, dec_col, seg_col], [spec_all]),
            ([geo_col, prod_col, spec_col], [seg_all, dec_all]),
            ([geo_col, prod_col, dec_col], [seg_all, spec_all]),
            ([geo_col, prod_col, seg_col], [dec_all, spec_all]),
            ([geo_col, prod_col], [seg_all, dec_all, spec_all]),
        ]
        stacked = None
        for group_keys, fill_exprs in rollup_plan:
            part = valid_calls.group_by(group_keys).agg(mean_days)
            if fill_exprs:
                part = part.with_columns(*fill_exprs)
            part = part.select(column_order)
            stacked = part if stacked is None else stacked.vstack(part)
        per_level.append(stacked)

    return per_level

In [9]:
# Gets count of calls for all rollups for a given source frame (e.g. the temp_calls one)
def get_num_calls_counts(source,agg_var,fl=None):
    """Count distinct ``CallID`` values for every rollup of each geography level.

    Args:
        source: Frame of calls; it is filtered to ``call_week <= num_weeks_calls``
            and ``CallDate >= quarter_start``.
        agg_var: Name given to the count column in the output.
        fl: When equal to 1, calls are additionally inner-joined to ``MASTER_UNI``
            on the attendee, left-joined to ``roster`` on ``SalesRepIID`` and kept
            only where ``Territory == GEO`` (meant for temp_calls, not temp_abbv).

    Returns:
        list of four DataFrames in the order geography_id, region_geography_id,
        area_geography_id, nation_geography_id, each with the columns
        [level, product, segment, decile, specialty_group, agg_var]: the detailed
        grouping followed by the seven rollups ('UNI', '0-10', 'ALL SPEC' mark
        rolled-up segment, decile and specialty_group).
    """
    prod_col, seg_col, spec_col, dec_col = 'product', 'segment', 'specialty_group', 'decile'
    geo_levels = ['geography_id', 'region_geography_id', 'area_geography_id', 'nation_geography_id']
    seg_all = pl.lit('UNI').alias(seg_col)
    dec_all = pl.lit('0-10').alias(dec_col)
    spec_all = pl.lit('ALL SPEC').alias(spec_col)

    calls = (
        source
        .filter(pl.col('call_week') <= num_weeks_calls)
        .filter(pl.col('CallDate') >= quarter_start)
    )
    if fl == 1:
        # Drop invalid calls: only rows whose attendee territory equals the rep's GEO survive.
        with_territory = calls.join(
            MASTER_UNI.select(['IID', 'Territory']), left_on='AttendeeIID', right_on='IID'
        )
        with_roster = with_territory.join(roster, on='SalesRepIID', how='left')
        calls = with_roster.filter(pl.col('Territory') == pl.col('GEO'))

    call_count = pl.col('CallID').n_unique().alias(agg_var)

    per_level = []
    for geo_col in geo_levels:
        column_order = [geo_col, prod_col, seg_col, dec_col, spec_col, agg_var]
        # (group-by keys, literal columns for the rolled-up dimensions), in stacking order.
        rollup_plan = [
            ([geo_col, seg_col, prod_col, spec_col, dec_col], []),
            ([geo_col, prod_col, dec_col, spec_col], [seg_all]),
            ([geo_col, prod_col, seg_col, spec_col], [dec_all]),
            ([geo_col, prod_col, dec_col, seg_col], [spec_all]),
            ([geo_col, prod_col, spec_col], [seg_all, dec_all]),
            ([geo_col, prod_col, dec_col], [seg_all, spec_all]),
            ([geo_col, prod_col, seg_col], [dec_all, spec_all]),
            ([geo_col, prod_col], [seg_all, dec_all, spec_all]),
        ]
        stacked = None
        for group_keys, fill_exprs in rollup_plan:
            part = calls.group_by(group_keys).agg(call_count)
            if fill_exprs:
                part = part.with_columns(*fill_exprs)
            part = part.select(column_order)
            stacked = part if stacked is None else stacked.vstack(part)
        per_level.append(stacked)

    return per_level

In [10]:
# Gets count of abbv visits for all rollups
def get_num_abbv_calls_counts(agg_var):
    """Sum per-attendee abbv call counts for every rollup of each geography level.

    Per-attendee distinct ``CallID`` counts are computed from ``temp_abbv_mp_spec``
    (``ab_calls``) and from the territory-validated ``temp_calls_mp_spec`` rows
    (``iw_calls``), both limited to ``call_week <= num_weeks_calls`` and
    ``CallDate >= quarter_start``. The validated-call attendees are outer-joined
    (coalescing keys) with all Target rows of ``mp_spec_seg_dec``, then the abbv
    counts are left-joined on, geography codes are added from ``geo_code_mapper``
    and the product column is set to 'LIN'.

    Args:
        agg_var: Name given to the summed ``ab_calls`` column in the output.

    Returns:
        list of four DataFrames in the order geography_id, region_geography_id,
        area_geography_id, nation_geography_id, each with the columns
        [level, product, segment, decile, specialty_group, agg_var]: the detailed
        grouping followed by the seven rollups ('UNI', '0-10', 'ALL SPEC' mark
        rolled-up segment, decile and specialty_group).
    """
    prod_col, seg_col, spec_col, dec_col = 'product', 'segment', 'specialty_group', 'decile'
    geo_levels = ['geography_id', 'region_geography_id', 'area_geography_id', 'nation_geography_id']
    seg_all = pl.lit('UNI').alias(seg_col)
    dec_all = pl.lit('0-10').alias(dec_col)
    spec_all = pl.lit('ALL SPEC').alias(spec_col)
    hcp_keys = ['IID', 'geography_id', 'specialty_group', 'segment', 'decile']

    # Abbv calls per attendee, re-attached to the attendee's attributes.
    abbv_window = (
        temp_abbv_mp_spec
        .filter(pl.col('call_week') <= num_weeks_calls)
        .filter(pl.col('CallDate') >= quarter_start)
    )
    abbv_per_hcp = (
        abbv_window
        .group_by('AttendeeIID')
        .agg(ab_calls=pl.col('CallID').n_unique())
        .rename({'AttendeeIID': 'IID'})
        .join(mp_spec_seg_dec, on='IID', how='left')
    )

    # Territory-validated calls per attendee.
    calls_window = (
        temp_calls_mp_spec
        .filter(pl.col('call_week') <= num_weeks_calls)
        .filter(pl.col('CallDate') >= quarter_start)
    )
    valid_calls = (
        calls_window
        .join(MASTER_UNI.select(['IID', 'Territory']), left_on='AttendeeIID', right_on='IID')
        .join(roster, on='SalesRepIID', how='left')
        .filter(pl.col('Territory') == pl.col('GEO'))
    )
    calls_per_hcp = (
        valid_calls
        .group_by('AttendeeIID')
        .agg(iw_calls=pl.col('CallID').n_unique())
        .rename({'AttendeeIID': 'IID'})
        .join(mp_spec_seg_dec, on='IID', how='left')
    )

    target_roster = mp_spec_seg_dec.filter(segment='Target')
    combined = (
        calls_per_hcp
        # Outer join so Target rows without any validated call are still present.
        .join(target_roster, on=hcp_keys, how='outer_coalesce')
        .join(abbv_per_hcp, on=hcp_keys, how='left')
        .join(geo_code_mapper, on='geography_id', how='left')
        .with_columns(pl.lit('LIN').alias(prod_col))
    )

    abbv_total = pl.col('ab_calls').sum().alias(agg_var)

    per_level = []
    for geo_col in geo_levels:
        column_order = [geo_col, prod_col, seg_col, dec_col, spec_col, agg_var]
        # (group-by keys, literal columns for the rolled-up dimensions), in stacking order.
        rollup_plan = [
            ([geo_col, seg_col, prod_col, spec_col, dec_col], []),
            ([geo_col, prod_col, dec_col, spec_col], [seg_all]),
            ([geo_col, prod_col, seg_col, spec_col], [dec_all]),
            ([geo_col, prod_col, dec_col, seg_col], [spec_all]),
            ([geo_col, prod_col, spec_col], [seg_all, dec_all]),
            ([geo_col, prod_col, dec_col], [seg_all, spec_all]),
            ([geo_col, prod_col, seg_col], [dec_all, spec_all]),
            ([geo_col, prod_col], [seg_all, dec_all, spec_all]),
        ]
        stacked = None
        for group_keys, fill_exprs in rollup_plan:
            part = combined.group_by(group_keys).agg(abbv_total)
            if fill_exprs:
                part = part.with_columns(*fill_exprs)
            part = part.select(column_order)
            stacked = part if stacked is None else stacked.vstack(part)
        per_level.append(stacked)

    return per_level

In [11]:
# Gets Count of Calls for just geo rollups.
def get_total_calls_counts(): #Total Calls Per Geo -
    """Count distinct ``CallID`` values (``total_calls``) per geography at each level.

    Source rows are ``temp_calls_mp_spec`` calls with ``call_week <= num_weeks_calls``
    and ``CallDate >= quarter_start``, inner-joined to ``MASTER_UNI`` on the attendee,
    left-joined to ``roster`` on ``SalesRepIID`` and kept only where
    ``Territory == GEO``.

    Returns:
        list of four DataFrames with the columns [level, total_calls], in the order
        geography_id, region_geography_id, area_geography_id, nation_geography_id.
    """
    geo_levels = ['geography_id', 'region_geography_id', 'area_geography_id', 'nation_geography_id']

    calls_in_window = (
        temp_calls_mp_spec
        .filter(pl.col('call_week') <= num_weeks_calls)
        .filter(pl.col('CallDate') >= quarter_start)
    )
    with_territory = calls_in_window.join(
        MASTER_UNI.select(['IID', 'Territory']), left_on='AttendeeIID', right_on='IID'
    )
    valid_calls = (
        with_territory
        .join(roster, on='SalesRepIID', how='left')
        .filter(pl.col('Territory') == pl.col('GEO'))
    )

    return [
        valid_calls.group_by([geo_col]).agg(total_calls=pl.col('CallID').n_unique())
        for geo_col in geo_levels
    ]

In [12]:
# Gets Count of targets for all rollups-
def get_target_hcps_counts():
    """Count distinct Target ``IID`` values (``target_hcps``) for every rollup of each geography level.

    Source rows are the ``segment == 'Target'`` rows of ``mp_spec_seg_dec``,
    left-joined to ``geo_code_mapper`` on ``geography_id``. No product dimension
    is involved.

    Returns:
        list of four DataFrames in the order geography_id, region_geography_id,
        area_geography_id, nation_geography_id, each with the columns
        [level, segment, decile, specialty_group, target_hcps]: the detailed
        grouping followed by the seven rollups ('UNI', '0-10', 'ALL SPEC' mark
        rolled-up segment, decile and specialty_group).
    """
    seg_col, spec_col, dec_col = 'segment', 'specialty_group', 'decile'
    geo_levels = ['geography_id', 'region_geography_id', 'area_geography_id', 'nation_geography_id']
    seg_all = pl.lit('UNI').alias(seg_col)
    dec_all = pl.lit('0-10').alias(dec_col)
    spec_all = pl.lit('ALL SPEC').alias(spec_col)

    targets_only = mp_spec_seg_dec.filter(pl.col('segment') == 'Target')
    targets = targets_only.join(geo_code_mapper, on=geo_levels[0], how='left')

    target_count = pl.col('IID').n_unique().alias('target_hcps')

    per_level = []
    for geo_col in geo_levels:
        column_order = [geo_col, seg_col, dec_col, spec_col, 'target_hcps']
        # (group-by keys, literal columns for the rolled-up dimensions), in stacking order.
        rollup_plan = [
            ([geo_col, seg_col, dec_col, spec_col], []),
            ([geo_col, dec_col, spec_col], [seg_all]),
            ([geo_col, seg_col, spec_col], [dec_all]),
            ([geo_col, dec_col, seg_col], [spec_all]),
            ([geo_col, spec_col], [seg_all, dec_all]),
            ([geo_col, dec_col], [seg_all, spec_all]),
            ([geo_col, seg_col], [dec_all, spec_all]),
            ([geo_col], [seg_all, dec_all, spec_all]),
        ]
        stacked = None
        for group_keys, fill_exprs in rollup_plan:
            part = targets.group_by(group_keys).agg(target_count)
            if fill_exprs:
                part = part.with_columns(*fill_exprs)
            part = part.select(column_order)
            stacked = part if stacked is None else stacked.vstack(part)
        per_level.append(stacked)

    return per_level

In [13]:
# Gets count of 13 week targets for all rollups -
def get_tgts_13wks_counts():
    """Count distinct Target attendees called within ``call_week <= 13`` (``tgts_13wks``).

    Source rows are ``temp_calls_mp_spec`` rows with ``call_week <= 13`` and
    ``segment == 'Target'``; unlike the other call counters, no quarter-start or
    territory filter is applied here.

    Returns:
        list of four DataFrames in the order geography_id, region_geography_id,
        area_geography_id, nation_geography_id, each with the columns
        [level, segment, product, decile, specialty_group, tgts_13wks]: the
        detailed grouping followed by the seven rollups ('UNI', '0-10',
        'ALL SPEC' mark rolled-up segment, decile and specialty_group).
    """
    prod_col, seg_col, spec_col, dec_col = 'product', 'segment', 'specialty_group', 'decile'
    geo_levels = ['geography_id', 'region_geography_id', 'area_geography_id', 'nation_geography_id']
    seg_all = pl.lit('UNI').alias(seg_col)
    dec_all = pl.lit('0-10').alias(dec_col)
    spec_all = pl.lit('ALL SPEC').alias(spec_col)

    recent_target_calls = temp_calls_mp_spec.filter(
        (pl.col('call_week') <= 13) & (pl.col('segment') == 'Target')
    )

    target_count = pl.col('AttendeeIID').n_unique().alias('tgts_13wks')

    per_level = []
    for geo_col in geo_levels:
        # Column order here puts segment before product.
        column_order = [geo_col, seg_col, prod_col, dec_col, spec_col, 'tgts_13wks']
        # (group-by keys, literal columns for the rolled-up dimensions), in stacking order.
        rollup_plan = [
            ([geo_col, seg_col, prod_col, spec_col, dec_col], []),
            ([geo_col, prod_col, dec_col, spec_col], [seg_all]),
            ([geo_col, prod_col, seg_col, spec_col], [dec_all]),
            ([geo_col, prod_col, dec_col, seg_col], [spec_all]),
            ([geo_col, prod_col, spec_col], [seg_all, dec_all]),
            ([geo_col, prod_col, dec_col], [seg_all, spec_all]),
            ([geo_col, prod_col, seg_col], [dec_all, spec_all]),
            ([geo_col, prod_col], [seg_all, dec_all, spec_all]),
        ]
        stacked = None
        for group_keys, fill_exprs in rollup_plan:
            part = recent_target_calls.group_by(group_keys).agg(target_count)
            if fill_exprs:
                part = part.with_columns(*fill_exprs)
            part = part.select(column_order)
            stacked = part if stacked is None else stacked.vstack(part)
        per_level.append(stacked)

    return per_level

In [14]:
# Gets count of sampled HCPs for all rollups-
def get_num_sample_hcp_counts():
    """Count distinct sampled Target attendees per rollup, overall and per sub-product.

    Source rows are ``temp_samples_mp_spec`` rows with
    ``sample_week <= num_weeks_calls`` and ``CallDate >= quarter_start``,
    inner-joined to ``MASTER_UNI`` on the attendee, left-joined to ``roster`` on
    ``SalesRepIID``, kept only where ``Territory == GEO`` and
    ``segment == 'Target'``. ``CallProductDescription`` values "145 mcg",
    "290 mcg" and "72 mcg" are mapped to sub-products 'LI1', 'LI2' and 'LI3'
    (anything else becomes null).

    Returns:
        list of four DataFrames in the order geography_id, region_geography_id,
        area_geography_id, nation_geography_id. Each is the product-level rollup
        frame (columns [level, product, segment, decile, specialty_group,
        num_sample_hcps]) outer-joined, with coalesced keys, to the pivoted
        sub-product counts ``num_sample_hcps_LI1`` / ``_LI2`` / ``_LI3``.
    """
    # Defined in the outer scope: the merge loop at the bottom needs it as well as
    # the nested helper. It used to live only inside the helper, so the outer loop
    # depended on an unrelated global named `levels` (NameError when absent).
    levels = ['geography_id','region_geography_id','area_geography_id','nation_geography_id']

    def samples_hcp_utl(product_input):
        """Build the eight-way rollup per level, grouping on ``product_input`` as the product column."""
        p,sg,spc,d = product_input,'segment','specialty_group','decile'
        sg_roll_up = pl.lit('UNI').alias(sg)
        d_roll_up = pl.lit('0-10').alias(d)
        spc_roll_up = pl.lit('ALL SPEC').alias(spc)
        res = []
        source_df = (
            temp_samples_mp_spec
            .filter(pl.col('sample_week')<=num_weeks_calls)
            .filter(pl.col('CallDate')>= quarter_start)
            .join(MASTER_UNI.select(['IID','Territory']),left_on = 'AttendeeIID', right_on = 'IID')
            .join(roster, on = 'SalesRepIID' , how = 'left')
            .filter(pl.col('Territory')==pl.col('GEO'))
            # Derive the sub-product code from the dosage description.
            .with_columns(
                pl.when(pl.col('CallProductDescription')=="145 mcg").then(pl.lit('LI1'))
                .when(pl.col('CallProductDescription')=="290 mcg").then(pl.lit('LI2'))
                .when(pl.col('CallProductDescription')=="72 mcg").then(pl.lit('LI3'))
                .otherwise(None)
                .alias('sub_product')
            )
            .filter(segment = 'Target')
        )
        hcp_count = pl.col('AttendeeIID').n_unique().alias('num_sample_hcps')
        for g in levels:
            main_seq = [g,p,sg,d,spc] + ['num_sample_hcps']
            # (group-by keys, literal columns for the rolled-up dimensions), in stacking order.
            rollup_plan = [
                ([g,sg,p,spc,d], []),
                ([g,p,d,spc], [sg_roll_up]),
                ([g,p,sg,spc], [d_roll_up]),
                ([g,p,d,sg], [spc_roll_up]),
                ([g,p,spc], [sg_roll_up,d_roll_up]),
                ([g,p,d], [sg_roll_up,spc_roll_up]),
                ([g,p,sg], [d_roll_up,spc_roll_up]),
                ([g,p], [sg_roll_up,d_roll_up,spc_roll_up]),
            ]
            df = None
            for group_keys, fill_exprs in rollup_plan:
                part = source_df.group_by(group_keys).agg(hcp_count)
                if fill_exprs:
                    part = part.with_columns(*fill_exprs)
                part = part.select(main_seq)
                df = part if df is None else df.vstack(part)
            if product_input != 'product':
                # Present sub-product results under the common 'product' column name.
                df = df.rename({'sub_product':'product'})
            res.append(df)
        return (res)

    res_LINF = samples_hcp_utl('product')      # counts at product level
    res_LINP = samples_hcp_utl('sub_product')  # counts per sub-product (LI1/LI2/LI3)

    res = []
    for g, frame_parent, frame in zip(levels, res_LINF, res_LINP):
        join_keys = [g, "segment", "decile", "specialty_group"]
        # One column per sub-product; combinations with no rows become 0.
        frame_pivoted = (
            frame
            .pivot(values="num_sample_hcps", index=join_keys, columns="product")
            .rename({"LI1": "num_sample_hcps_LI1", "LI2": "num_sample_hcps_LI2", "LI3": "num_sample_hcps_LI3"})
        ).fill_null(0)

        frame_final = frame_parent.join(frame_pivoted, on = join_keys ,how = 'outer_coalesce')
        res.append(frame_final)

    return(res)
        



In [15]:
# Gets count of samples for all rollups-
def get_num_samples_count():
    """Sum ``CallProductQuantity`` per rollup, overall and per sub-product.

    Source rows are ``temp_samples_mp_spec`` rows with
    ``sample_week <= num_weeks_calls`` and ``CallDate >= quarter_start``,
    inner-joined to ``MASTER_UNI`` on the attendee, left-joined to ``roster`` on
    ``SalesRepIID`` and kept only where ``Territory == GEO``.
    ``CallProductDescription`` values "145 mcg", "290 mcg" and "72 mcg" are mapped
    to sub-products 'LI1', 'LI2' and 'LI3' (anything else becomes null).

    Returns:
        list of four DataFrames in the order geography_id, region_geography_id,
        area_geography_id, nation_geography_id. Each is the product-level rollup
        frame (columns [level, product, segment, decile, specialty_group,
        num_samples]) outer-joined, with coalesced keys, to the pivoted
        sub-product sums ``num_samples_LI1`` / ``_LI2`` / ``_LI3``.
    """
    # Defined in the outer scope: the merge loop at the bottom needs it as well as
    # the nested helper. It used to live only inside the helper, so the outer loop
    # depended on an unrelated global named `levels` (NameError when absent).
    levels = ['geography_id','region_geography_id','area_geography_id','nation_geography_id']

    def samples_util(product_input):
        """Build the eight-way rollup per level, grouping on ``product_input`` as the product column."""
        p,sg,spc,d = product_input,'segment','specialty_group','decile'
        sg_roll_up = pl.lit('UNI').alias(sg)
        d_roll_up = pl.lit('0-10').alias(d)
        spc_roll_up = pl.lit('ALL SPEC').alias(spc)
        res = []
        source_df = (
            temp_samples_mp_spec
            .filter(pl.col('sample_week')<=num_weeks_calls)
            .filter(pl.col('CallDate')>= quarter_start)
            .join(MASTER_UNI.select(['IID','Territory']),left_on = 'AttendeeIID', right_on = 'IID')
            .join(roster, on = 'SalesRepIID' , how = 'left')
            .filter(pl.col('Territory')==pl.col('GEO'))
            # Derive the sub-product code from the dosage description.
            .with_columns(
                pl.when(pl.col('CallProductDescription')=="145 mcg").then(pl.lit('LI1'))
                .when(pl.col('CallProductDescription')=="290 mcg").then(pl.lit('LI2'))
                .when(pl.col('CallProductDescription')=="72 mcg").then(pl.lit('LI3'))
                .otherwise(None)
                .alias('sub_product')
            )
        )
        sample_total = pl.col('CallProductQuantity').sum().alias('num_samples')
        for g in levels:
            main_seq = [g,p,sg,d,spc] + ['num_samples']
            # (group-by keys, literal columns for the rolled-up dimensions), in stacking order.
            rollup_plan = [
                ([g,sg,p,spc,d], []),
                ([g,p,d,spc], [sg_roll_up]),
                ([g,p,sg,spc], [d_roll_up]),
                ([g,p,d,sg], [spc_roll_up]),
                ([g,p,spc], [sg_roll_up,d_roll_up]),
                ([g,p,d], [sg_roll_up,spc_roll_up]),
                ([g,p,sg], [d_roll_up,spc_roll_up]),
                ([g,p], [sg_roll_up,d_roll_up,spc_roll_up]),
            ]
            df = None
            for group_keys, fill_exprs in rollup_plan:
                part = source_df.group_by(group_keys).agg(sample_total)
                if fill_exprs:
                    part = part.with_columns(*fill_exprs)
                part = part.select(main_seq)
                df = part if df is None else df.vstack(part)
            if product_input != 'product':
                # Present sub-product results under the common 'product' column name.
                df = df.rename({'sub_product':'product'})
            res.append(df)
        return(res)

    res_LINF = samples_util('product')      # sums at product level
    res_LINP = samples_util('sub_product')  # sums per sub-product (LI1/LI2/LI3)

    res = []
    for g, frame_parent, frame in zip(levels, res_LINF, res_LINP):
        join_keys = [g, "segment", "decile", "specialty_group"]
        # One column per sub-product; combinations with no rows become 0.
        frame_pivoted = (
            frame
            .pivot(values="num_samples", index=join_keys, columns="product")
            .rename({"LI1": "num_samples_LI1", "LI2": "num_samples_LI2", "LI3": "num_samples_LI3"})
        ).fill_null(0)

        frame_final = frame_parent.join(frame_pivoted, on = join_keys ,how = 'outer_coalesce')
        res.append(frame_final)

    return(res)

In [16]:
# Gets Count of samples for just geo rollups.
def get_total_samples_counts(): #Total Samples Per Geo -
    """Sum ``CallProductQuantity`` per geography, split by sub-product, at each level.

    Source rows are ``temp_samples_mp_spec`` rows with
    ``sample_week <= num_weeks_calls`` and ``CallDate >= quarter_start``,
    inner-joined to ``MASTER_UNI`` on the attendee, left-joined to ``roster`` on
    ``SalesRepIID`` and kept only where ``Territory == GEO``.
    ``CallProductDescription`` values "145 mcg", "290 mcg" and "72 mcg" are mapped
    to sub-products 'LI1', 'LI2' and 'LI3' (anything else becomes null).

    Returns:
        list of four DataFrames in the order geography_id, region_geography_id,
        area_geography_id, nation_geography_id, each with the columns
        [level, total_samples_LIN, total_samples_LI1, total_samples_LI2,
        total_samples_LI3], where ``total_samples_LIN`` is the row-wise sum of the
        three sub-product columns.
    """
    sub_col = 'sub_product'
    geo_levels = ['geography_id', 'region_geography_id', 'area_geography_id', 'nation_geography_id']
    renamed = {"LI1": "total_samples_LI1", "LI2": "total_samples_LI2", "LI3": "total_samples_LI3"}

    # Sub-product code derived from the dosage description.
    strength_code = (
        pl.when(pl.col('CallProductDescription') == "145 mcg").then(pl.lit('LI1'))
        .when(pl.col('CallProductDescription') == "290 mcg").then(pl.lit('LI2'))
        .when(pl.col('CallProductDescription') == "72 mcg").then(pl.lit('LI3'))
        .otherwise(None)
        .alias('sub_product')
    )

    samples_in_window = (
        temp_samples_mp_spec
        .filter(pl.col('sample_week') <= num_weeks_calls)
        .filter(pl.col('CallDate') >= quarter_start)
    )
    valid_samples = (
        samples_in_window
        .join(MASTER_UNI.select(['IID', 'Territory']), left_on='AttendeeIID', right_on='IID')
        .join(roster, on='SalesRepIID', how='left')
        .filter(pl.col('Territory') == pl.col('GEO'))
        .with_columns(strength_code)
    )

    # Pass 1: quantity per (geography, sub-product) for every level.
    long_frames = [
        valid_samples.group_by([geo_col, sub_col]).agg(total_samples=pl.col('CallProductQuantity').sum())
        for geo_col in geo_levels
    ]

    # Pass 2: one column per sub-product plus their horizontal total.
    wide_frames = []
    for geo_col, long_frame in zip(geo_levels, long_frames):
        pivoted = long_frame.pivot(values="total_samples", index=[geo_col], columns="sub_product")
        pivoted = pivoted.rename(renamed).fill_null(0)
        pivoted = pivoted.with_columns(
            product=pl.lit('LIN'),
            total_samples_LIN=pl.sum_horizontal(['total_samples_LI1', 'total_samples_LI2', 'total_samples_LI3']),
        )
        # The final select keeps only the geography key and the four totals.
        wide_frames.append(
            pivoted.select([geo_col, 'total_samples_LIN', 'total_samples_LI1', 'total_samples_LI2', 'total_samples_LI3'])
        )

    return wide_frames
    

In [17]:
# Gets count of targets with at least 3 calls (calls >= 3) at all rollups -
def get_tgts3_counts(): #Num_Of_QTD_Tgts_3Plus_Calls
    """Count Target attendees with at least three distinct calls (``tgts3``) for every rollup.

    Calls come from ``temp_calls_mp_spec`` with ``call_week <= num_weeks_calls``
    and ``CallDate >= quarter_start``, inner-joined to ``MASTER_UNI`` on the
    attendee, left-joined to ``roster`` on ``SalesRepIID`` and kept only where
    ``Territory == GEO`` and ``segment == 'Target'``. Attendees whose distinct
    ``CallID`` count is >= 3 are then inner-joined back to ``mp_spec_seg_dec`` and
    left-joined to ``geo_code_mapper``. No product dimension is involved.

    Returns:
        list of four DataFrames in the order geography_id, region_geography_id,
        area_geography_id, nation_geography_id, each with the columns
        [level, segment, decile, specialty_group, tgts3]: the detailed grouping
        followed by the seven rollups ('UNI', '0-10', 'ALL SPEC' mark rolled-up
        segment, decile and specialty_group).
    """
    seg_col, spec_col, dec_col = 'segment', 'specialty_group', 'decile'
    geo_levels = ['geography_id', 'region_geography_id', 'area_geography_id', 'nation_geography_id']
    seg_all = pl.lit('UNI').alias(seg_col)
    dec_all = pl.lit('0-10').alias(dec_col)
    spec_all = pl.lit('ALL SPEC').alias(spec_col)

    calls_in_window = (
        temp_calls_mp_spec
        .filter(pl.col('call_week') <= num_weeks_calls)
        .filter(pl.col('CallDate') >= quarter_start)
    )
    valid_target_calls = (
        calls_in_window
        .join(MASTER_UNI.select(['IID', 'Territory']), left_on='AttendeeIID', right_on='IID')
        .join(roster, on='SalesRepIID', how='left')
        .filter(pl.col('Territory') == pl.col('GEO'))
        .filter(segment='Target')
    )

    # Attendees reaching the three-call threshold, with their attributes and geography codes.
    calls_per_hcp = valid_target_calls.group_by('AttendeeIID').agg(pl.col('CallID').n_unique().alias('calls'))
    frequent_hcps = (
        calls_per_hcp
        .filter(pl.col('calls') >= 3)
        .join(mp_spec_seg_dec, left_on='AttendeeIID', right_on='IID')
        .join(geo_code_mapper, on='geography_id', how='left')
        .drop('calls')
    )

    hcp_count = pl.col('AttendeeIID').n_unique().alias('tgts3')

    per_level = []
    for geo_col in geo_levels:
        column_order = [geo_col, seg_col, dec_col, spec_col, 'tgts3']
        # (group-by keys, literal columns for the rolled-up dimensions), in stacking order.
        rollup_plan = [
            ([geo_col, seg_col, dec_col, spec_col], []),
            ([geo_col, dec_col, spec_col], [seg_all]),
            ([geo_col, seg_col, spec_col], [dec_all]),
            ([geo_col, dec_col, seg_col], [spec_all]),
            ([geo_col, spec_col], [seg_all, dec_all]),
            ([geo_col, dec_col], [seg_all, spec_all]),
            ([geo_col, seg_col], [dec_all, spec_all]),
            ([geo_col], [seg_all, dec_all, spec_all]),
        ]
        stacked = None
        for group_keys, fill_exprs in rollup_plan:
            part = frequent_hcps.group_by(group_keys).agg(hcp_count)
            if fill_exprs:
                part = part.with_columns(*fill_exprs)
            part = part.select(column_order)
            stacked = part if stacked is None else stacked.vstack(part)
        per_level.append(stacked)

    return per_level

In [18]:
# Gets count of HCPs who are considered Optimal , Bellow for all rollups 
def get_optimal_below_counts():
    """Count 'optimal' and 'below' HCPs for every geography level and roll-up.

    An HCP counts as optimal when it is in the 'Target' segment and was called
    in exactly ``num_of_months`` distinct ``call_month`` values inside the
    window ``call_week <= num_weeks_calls`` and ``CallDate >= quarter_start``,
    restricted to calls where the HCP's ``Territory`` equals ``GEO``.
    ``below`` is ``target_hcps - optimal`` for the 'UNI' and 'Target' segments
    and 0 for the other two segments.

    Reads the module-level ``target_hcps_counts`` list (one frame per level),
    so that list has to exist before this function is called.

    Returns:
        list[pl.DataFrame]: one frame per geography level, in the order
        territory, region, area, nation.
    """
    col = 'optimal'
    sg,spc,d = 'segment','specialty_group','decile'
    levels = ['geography_id','region_geography_id','area_geography_id','nation_geography_id']
    d_roll_up,spc_roll_up = pl.lit('0-10'),pl.lit('ALL SPEC')
    qtd_calls = (
        temp_calls_mp_spec.filter((pl.col('call_week')<=num_weeks_calls)).filter(pl.col('CallDate')>= quarter_start)
        .join(MASTER_UNI.select(['IID','Territory']),left_on = 'AttendeeIID', right_on = 'IID')
        .join(roster, on = 'SalesRepIID' , how = 'left')
        .filter(pl.col('Territory')==pl.col('GEO'))
    )
    # The previous version ended this chain with .drop('calls'); no 'calls'
    # column exists here (the aggregate is 'month_call_count'), and strict
    # polars versions raise on dropping a missing column.
    hcp_months = (
        qtd_calls
        .group_by('AttendeeIID').agg(month_call_count = pl.col('call_month').n_unique())
        .join(mp_spec_seg_dec,left_on='AttendeeIID',right_on='IID')
        .join(geo_code_mapper,on='geography_id',how='left')
    )
    source_df_o = hcp_months.filter(pl.col('month_call_count')==num_of_months).filter(segment = 'Target')

    res = []
    for level_idx, g in enumerate(levels):
        main_seq = [g,d,spc] + [col]
        n_hcps = pl.col('AttendeeIID').n_unique().alias(col)
        # Detail rows plus the decile, specialty and decile+specialty roll-ups.
        df = source_df_o.group_by([g,d,spc]).agg(n_hcps)
        d_df = source_df_o.group_by([g,spc]).agg(n_hcps).with_columns(d_roll_up.alias(d)).select(main_seq)
        spc_df = source_df_o.group_by([g,d]).agg(n_hcps).with_columns(spc_roll_up.alias(spc)).select(main_seq)
        d_spc_df = source_df_o.group_by([g]).agg(n_hcps).with_columns(spc_roll_up.alias(spc),d_roll_up.alias(d)).select(main_seq)
        df = (df.select(main_seq).vstack(d_df).vstack(spc_df).vstack(d_spc_df))

        # The source only holds Target HCPs, so 'UNI' and 'Target' share the
        # same count while the remaining segments are emitted with 0.
        zero = pl.lit(0).cast(pl.UInt32)
        df1 = (
            df.with_columns(segment = pl.lit('UNI'))
            .vstack(df.with_columns(segment = pl.lit('Target')))
            .vstack(df.with_columns(zero.alias(col),pl.lit('ALG-ONLY-TARGET').alias('segment')))
            .vstack(df.with_columns(zero.alias(col),pl.lit('Non-Target').alias('segment')))
            .select([g,d,spc,sg,col])
        )
        # Pull the target counts so that below = targets - optimal.
        df1 = (
            df1
            .join(target_hcps_counts[level_idx],on = [g,d,spc,sg],how='left')
            .with_columns(
                pl.when(pl.col('segment').is_in(['UNI','Target'])).then(pl.col('target_hcps')-pl.col('optimal')).otherwise(zero).alias('below')
            )
            .drop('target_hcps')
        )
        res.append(df1)
    return res

In [19]:
# Gets count of mean number of called months in last 12 months at all rollups-
def get_called_12m_counts():
    """Mean number of distinct called months (``call_month <= 12``) per HCP.

    The HCP population is every IID that either has a call in the window
    ``call_week <= num_weeks_calls`` and ``CallDate >= quarter_start`` (with
    ``Territory == GEO``) or appears in ``mp_spec_seg_dec`` with segment
    'Target'. Territory values are means over HCPs; each higher level is the
    unweighted mean of the rows of the level directly beneath it.

    Returns:
        list[pl.DataFrame]: frames for territory, region, area and nation,
        each holding a ``called_12m`` column.
    """
    sg,spc,d = 'segment','specialty_group','decile'
    g = 'geography_id'
    roll_up_value = {sg: pl.lit('UNI'), d: pl.lit('0-10'), spc: pl.lit('ALL SPEC')}
    res = []

    # Only the IID column of this frame is used below; the full join adds the
    # Target HCPs that had no call in the window.
    iw = (
        temp_calls_mp_spec.filter((pl.col('call_week')<=num_weeks_calls)).filter(pl.col('CallDate')>= quarter_start)
        .join(MASTER_UNI.select(['IID','Territory']),left_on = 'AttendeeIID', right_on = 'IID')
        .join(roster, on = 'SalesRepIID' , how = 'left')
        .filter(pl.col('Territory')==pl.col('GEO'))
        .group_by('AttendeeIID').agg(iw_calls = pl.col('CallID').n_unique())
        .rename({'AttendeeIID':'IID'})
        .join(mp_spec_seg_dec,on='IID',how='left')
        .join(mp_spec_seg_dec.filter(segment = 'Target'),on = ['IID','geography_id','specialty_group','segment','decile'], how = 'outer_coalesce')
    )
    qtd_iw_iids = list(iw['IID'].unique())

    source_df = (
         temp_calls_mp_spec.filter(pl.col('call_month')<=12)
        .filter(pl.col('AttendeeIID').is_in(qtd_iw_iids))
        .group_by('AttendeeIID').agg(called_months = pl.col('call_month').n_unique())
        .join(mp_spec_seg_dec,left_on='AttendeeIID',right_on='IID')
        .join(geo_code_mapper,on='geography_id',how='left')
    )

    # Territory level: detail rows followed by every roll-up combination.
    # Each entry is (group-by keys, dimensions replaced by their roll-up label).
    main_seq = [g,sg,d,spc] + ['called_12m']
    agg_dict = {'called_12m':pl.col('called_months').mean()}
    roll_plan = [
        ([g,sg,d,spc], []),
        ([g,d,spc], [sg]),
        ([g,sg,spc], [d]),
        ([g,d,sg], [spc]),
        ([g,spc], [sg,d]),
        ([g,d], [sg,spc]),
        ([g,sg], [d,spc]),
        ([g], [sg,d,spc]),
    ]
    terr = None
    for keys, rolled in roll_plan:
        part = source_df.group_by(keys).agg(**agg_dict)
        if rolled:
            part = part.with_columns(*[roll_up_value[c].alias(c) for c in rolled])
        part = part.select(main_seq)
        terr = part if terr is None else terr.vstack(part)
    res.append(terr)

    # Region = mean of its territory rows
    reg = (
        terr.join(geo_code_mapper,on = 'geography_id',how='left')
        .group_by(['region_geography_id',sg,d,spc]).agg(called_12m = pl.col('called_12m').mean())
    )
    res.append(reg)

    # Area = mean of its region rows
    area = (
        reg
        .join(geo_code_mapper.select(['region_geography_id','area_geography_id']).unique(),on='region_geography_id',how='left')
        .group_by(['area_geography_id',sg,d,spc]).agg(called_12m = pl.col('called_12m').mean())
    )
    res.append(area)

    # Nation = mean of its area rows
    nation = (
        area
        .join(geo_code_mapper.select(['area_geography_id','nation_geography_id']).unique(),on='area_geography_id',how='left')
        .group_by(['nation_geography_id',sg,d,spc]).agg(called_12m = pl.col('called_12m').mean())
    )
    res.append(nation)

    return res

In [20]:
# Gets call frequency for all roll ups -
def get_call_freq_counts():
    """Mean number of distinct calls per called HCP for every level and roll-up.

    Calls are limited to ``call_week <= num_weeks_calls`` and
    ``CallDate >= quarter_start`` and to rows where the HCP's ``Territory``
    equals ``GEO``.

    Returns:
        list[pl.DataFrame]: one frame per geography level (territory, region,
        area, nation) with a ``call_freq`` column.
    """
    sg,spc,d = 'segment','specialty_group','decile'
    levels = ['geography_id','region_geography_id','area_geography_id','nation_geography_id']
    roll_up_value = {sg: pl.lit('UNI'), d: pl.lit('0-10'), spc: pl.lit('ALL SPEC')}

    qtd_calls = (
        temp_calls_mp_spec.filter((pl.col('call_week')<=num_weeks_calls)).filter(pl.col('CallDate')>= quarter_start)
        .join(MASTER_UNI.select(['IID','Territory']),left_on = 'AttendeeIID', right_on = 'IID')
        .join(roster, on = 'SalesRepIID' , how = 'left')
        .filter(pl.col('Territory')==pl.col('GEO'))
    )
    calls_per_hcp = (
        qtd_calls
        .group_by('AttendeeIID').agg(calls = pl.col('CallID').n_unique())
        .join(mp_spec_seg_dec,left_on='AttendeeIID',right_on='IID')
        .join(geo_code_mapper,on='geography_id',how='left')
    )
    mean_calls = {'call_freq':pl.col('calls').mean()}

    res = []
    for g in levels:
        main_seq = [g,sg,d,spc] + ['call_freq']
        # (group-by keys, dimensions overwritten with their roll-up label),
        # listed in the order the pieces are stacked.
        roll_plan = [
            ([g,sg,d,spc], []),
            ([g,d,spc], [sg]),
            ([g,sg,spc], [d]),
            ([g,d,sg], [spc]),
            ([g,spc], [sg,d]),
            ([g,d], [sg,spc]),
            ([g,sg], [d,spc]),
            ([g], [sg,d,spc]),
        ]
        stacked = None
        for keys, rolled in roll_plan:
            part = calls_per_hcp.group_by(keys).agg(**mean_calls)
            if rolled:
                part = part.with_columns(*[roll_up_value[c].alias(c) for c in rolled])
            part = part.select(main_seq)
            stacked = part if stacked is None else stacked.vstack(part)
        res.append(stacked)
    return res

In [21]:
# DATA PREP - Function Calls
# Each get_* call returns a list with one frame per entry of `levels`
# (territory, region, area, nation); process_1 and process_2 later index these
# lists by level position.
# Order matters: get_optimal_below_counts() reads the module-level
# `target_hcps_counts`, so it must run after get_target_hcps_counts().
# `levels` is assigned again, with the same value, in a later cell.
levels = ['geography_id','region_geography_id','area_geography_id','nation_geography_id']
num_hcp_counts = get_num_hcp_counts()
call_freq_quarter_vals = get_call_freq_quarter_vals()
days_in_field_counts = get_days_in_field_counts()
num_calls_counts = get_num_calls_counts(temp_calls_mp_spec,'num_calls',1)
total_calls_counts = get_total_calls_counts()
num_abbv_calls_counts = get_num_abbv_calls_counts('num_abbv_calls')
target_hcps_counts = get_target_hcps_counts()
tgts_13wks_counts = get_tgts_13wks_counts()
num_sample_hcp_counts = get_num_sample_hcp_counts()
num_samples_count = get_num_samples_count()
total_samples_counts = get_total_samples_counts()
tgts3_counts = get_tgts3_counts()
optimal_below_counts = get_optimal_below_counts()
called_12m_counts = get_called_12m_counts()
call_freq_counts= get_call_freq_counts()

---

### Functions - 

In [22]:
# num_calls , num_hcps, days_in_field, call_freq_quarter, total_calls , num_abbv_calls_
def process_1(df):
    """Seed the four level frames with the call-activity counts.

    For each level the pre-computed frames ``num_calls_counts``,
    ``num_hcp_counts``, ``call_freq_quarter_vals``, ``num_abbv_calls_counts``
    and ``days_in_field_counts`` are full-joined on
    (geography, product, segment, specialty_group, decile);
    ``total_calls_counts`` is then joined on the geography key alone.

    The previous version also rebuilt a filtered and joined copy of
    ``temp_calls_mp_spec`` on every call without ever reading it; that dead
    computation has been removed.

    Args:
        df: list with four slots; whatever they hold is overwritten in place.

    Returns:
        The same list object, with one merged frame per level.
    """
    for i in range(4):
        g = levels[i]
        keys = [g,p,sg,spc,d]
        merged = num_calls_counts[i]
        # Join order fixes the column order of the result, so keep it stable.
        for other in (
            num_hcp_counts[i],
            call_freq_quarter_vals[i],
            num_abbv_calls_counts[i],
            days_in_field_counts[i],
        ):
            merged = merged.join(other,on=keys,how='outer_coalesce')
        df[i] = merged.join(total_calls_counts[i],on=[g],how='outer_coalesce')
    return df

In [23]:
# target_hcps, tgts_13wks, called_12m,optimal,below, tgts3 , wk_qtd
# num_samples ,num_sample_hcps ,total_samples , call_freq 
def process_2(df):
    """Merge target, sample, frequency and Rx measures into each level frame.

    Frames that carry no product column (targets, called_12m, optimal/below,
    tgts3, call_freq) are tagged with product 'LIN' before joining. After the
    target-side joins every null that accepts 0 is filled with 0, and only then
    is the result full-joined with the frame already stored in ``df[i]``.

    Args:
        df: list of four frames, as produced by ``process_1``.

    Returns:
        The same list object with each slot replaced by the merged frame.
    """
    rx_by_terr = laxdn_geoid_sum.join(geo_code_mapper,on=levels[0],how = 'left')
    as_lin = pl.lit('LIN').alias(p)
    wk_cols = ['wk_qtd_LIN','wk_qtd_LI1','wk_qtd_LI2','wk_qtd_LI3']

    for i in range(4):
        g = levels[i]
        keys = [g,p,sg,spc,d]

        # Quarter-to-date Rx summed up to this level.
        rx_totals = rx_by_terr.group_by(levels[i]).agg(**{c: pl.col(c).sum() for c in wk_cols})

        # Full joins, in the order that fixes the output column order.
        full_joins = [
            tgts_13wks_counts[i],
            num_samples_count[i],
            num_sample_hcp_counts[i],
            called_12m_counts[i].with_columns(as_lin),
            optimal_below_counts[i].with_columns(as_lin),
            tgts3_counts[i].with_columns(as_lin),
            call_freq_counts[i].with_columns(as_lin),
        ]
        merged = target_hcps_counts[i].with_columns(as_lin)
        for other in full_joins:
            merged = merged.join(other,on=keys,how='outer_coalesce')
        merged = (
            merged
            .join(total_samples_counts[i],on=[g],how='left')
            .join(rx_totals,on=[g],how='left')
            .fill_null(0)
        )

        # merging with process_1()
        df[i] = merged.join(df[i],on=keys,how='outer_coalesce')
    return df

In [24]:
#call_distribution, 
def process_3(df):
    """Add ``call_distribution`` = ``num_calls / total_calls`` to the four level frames.

    The list is updated in place and returned.
    """
    share_of_calls = (pl.col('num_calls') / pl.col('total_calls')).alias('call_distribution')
    for i in range(4):
        df[i] = df[i].with_columns(share_of_calls)
    return df

In [25]:
#calls_per_day
def process_3_1(df):
    """Add ``calls_per_day`` to every level frame.

    At territory level the value is ``num_calls / days_in_field`` taken from
    the fully rolled-up row (segment 'UNI', decile '0-10', specialty
    'ALL SPEC') and repeated on every row of that territory. Higher levels get
    the mean of ``calls_per_day`` over all territory rows beneath them.

    NOTE(review): the mean runs over territory *rows*, so a territory with more
    segment/decile/specialty rows weighs more - confirm this is intended.
    """
    is_overall_row = (
        (pl.col('segment')=='UNI')
        &(pl.col('decile')=='0-10')
        &(pl.col('specialty_group')=='ALL SPEC')
    )
    terr_rate = (
        df[0]
        .filter(is_overall_row)
        .with_columns((pl.col('num_calls')/pl.col('days_in_field')).alias('calls_per_day'))
        .select(['geography_id','calls_per_day'])
    )

    # Territory level
    terr = df[0].join(terr_rate,on = 'geography_id',how='left')
    df[0] = terr

    # Every other level averages its descendant territory rows.
    terr_with_parents = terr.join(geo_code_mapper,on = levels[0], how = 'left')
    for i in range(1,4):
        parent_key = levels[i]
        parent_mean = terr_with_parents.group_by(parent_key).agg(
            pl.col('calls_per_day').mean().alias('calls_per_day')
        )
        df[i] = df[i].join(parent_mean, on = parent_key,how = 'left')

    return df


In [26]:
#qtd reach , 13 week reach, QTD_Tgts_Not_Reached
def process_4(df):
    """Add the reach metrics to each level frame and drop ``tgts_13wks``.

    * ``prc_reach``      = num_hcps / target_hcps
    * ``tgt_reach_13wk`` = tgts_13wks / target_hcps
    * ``qtd_tgt_nreach`` = target_hcps - num_hcps, or target_hcps when
      num_hcps is null

    The list is updated in place and returned.
    """
    reached, targets = pl.col('num_hcps'), pl.col('target_hcps')
    not_reached = (
        pl.when(reached.is_null())
        .then(targets)
        .otherwise(targets - reached)
    )
    reach_metrics = [
        (reached/targets).alias('prc_reach'),
        (pl.col('tgts_13wks')/targets).alias('tgt_reach_13wk'),
        not_reached.alias('qtd_tgt_nreach'),
    ]
    for i in range(4):
        df[i] = df[i].with_columns(reach_metrics).drop('tgts_13wks')
    return df

In [27]:
# Trx Per Call
def process_5(df):
    """Add Rx-per-call columns: each ``wk_qtd_*`` total divided by ``num_calls``.

    The list is updated in place and returned.
    """
    # output column -> Rx column used as numerator
    rx_source = {
        'rx_per_call': 'wk_qtd_LIN',
        'rx_per_call_LI1': 'wk_qtd_LI1',
        'rx_per_call_LI2': 'wk_qtd_LI2',
        'rx_per_call_LI3': 'wk_qtd_LI3',
    }
    ratios = [
        (pl.col(rx_col)/pl.col('num_calls')).alias(out_col)
        for out_col, rx_col in rx_source.items()
    ]
    for i in range(4):
        df[i] = df[i].with_columns(ratios)
    return df

In [28]:
# sample_distribution, prc_sampled_phy, ,rx_per_sample
def process_6(df):
    """Add the sample ratios to each level frame.

    * ``sample_distribution``  = num_samples / total_samples_LIN
    * ``prc_sampled_phy_<X>``  = sampled HCPs of product X / target_hcps
    * ``rx_per_sample_<X>``    = wk_qtd_<X> / total_samples_<X>

    for X in LIN, LI1, LI2, LI3. The LIN sampled-HCP count lives in the
    unsuffixed ``num_sample_hcps`` column. The list is updated in place and
    returned.
    """
    products = ['LIN','LI1','LI2','LI3']

    sample_metrics = [
        (pl.col('num_samples')/pl.col('total_samples_LIN')).alias('sample_distribution')
    ]
    for prod in products:
        hcp_col = 'num_sample_hcps' if prod == 'LIN' else f'num_sample_hcps_{prod}'
        sample_metrics.append(
            (pl.col(hcp_col)/pl.col('target_hcps')).alias(f'prc_sampled_phy_{prod}')
        )
    for prod in products:
        sample_metrics.append(
            (pl.col(f'wk_qtd_{prod}')/pl.col(f'total_samples_{prod}')).alias(f'rx_per_sample_{prod}')
        )

    for i in range(4):
        df[i] = df[i].with_columns(sample_metrics)

    return df

In [29]:
# avg_sample_per_hcp
def process_6_1(df):
    """Add ``avg_sample_per_hcp_<X>`` (samples / sampled HCPs) for LIN, LI1, LI2, LI3.

    The averages are computed on 'Target' segment rows only; the same values
    are then copied onto the matching 'UNI' rows. Rows of any other segment
    receive nulls from the left join.

    NaN results (0 / 0) are turned into 0 with ``fill_nan``. The previous
    ``replace(np.nan, 0)`` is a value-matching replace rather than the
    NaN-filling API, so ``fill_nan`` states the intent directly.

    Args:
        df: list of four level frames; updated in place.

    Returns:
        The same list object.
    """
    # output column -> (numerator, denominator)
    ratio_inputs = {
        'avg_sample_per_hcp_LIN': ('num_samples','num_sample_hcps'),
        'avg_sample_per_hcp_LI1': ('num_samples_LI1','num_sample_hcps_LI1'),
        'avg_sample_per_hcp_LI2': ('num_samples_LI2','num_sample_hcps_LI2'),
        'avg_sample_per_hcp_LI3': ('num_samples_LI3','num_sample_hcps_LI3'),
    }
    raw_cols = [c for pair in ratio_inputs.values() for c in pair]
    averages = [
        (pl.col(num)/pl.col(den)).fill_nan(0).alias(out)
        for out, (num, den) in ratio_inputs.items()
    ]

    for i in range(4):
        g = levels[i]
        keys = [g,sg,d,spc,p]
        target_avg = (
            df[i]
            .select(keys + raw_cols)
            .filter(segment = 'Target')
            .with_columns(averages)
            .drop(raw_cols)
        )
        # UNI rows reuse the Target averages.
        uni_avg = target_avg.with_columns(pl.lit('UNI').alias('segment'))
        df[i] = df[i].join(target_avg.vstack(uni_avg),on = keys,how='left')
    return df

In [30]:
#call_freq_goal_prc
def process_7(df):
    """Add ``call_freq_goal_prc`` to each level frame.

    The value is ``total_calls / (call_freq_quarter * elapsed)``, where
    ``elapsed`` is the number of days from ``quarter_start`` to ``curr_date``
    divided by the number of days from ``quarter_start`` to ``quarter_end``.
    It is null wherever ``call_freq_quarter`` is null. The list is updated in
    place and returned.
    """
    elapsed = intck('DAY',quarter_start,curr_date) / intck('DAY',quarter_start,quarter_end)
    quarter_goal = pl.col('call_freq_quarter')
    goal_attainment = (
        pl.when(quarter_goal.is_null())
        .then(None)
        .otherwise(pl.col('total_calls')/(quarter_goal * elapsed))
        .alias('call_freq_goal_prc')
    )
    for i in range(4):
        df[i] = df[i].with_columns(goal_attainment)
    return df

In [31]:
# adding upper and lower limits : (use prc_reach)
# flow -
## for terr level -
# three cols , ind1,ind2,ind3
# for ind1 ->nation , ind2->area , ind3->region
# to create an upper limit and lower limit 
# upper : median + 0.5*stddev
# lower : median - 0.5*stddev
#bench_ind : if var > upper then A | if var < lower then B | if lower <= var <= upper then E

def process_reach_benchmark(df):
    """Add the three ``Reach_Prc_BnchMrk_Ind*`` columns to every level frame.

    For a peer group the limits are ``median(prc_reach) +/- 0.5 * std(prc_reach)``.
    A row gets 'A' above the upper limit, 'B' below the lower limit, 'E' when
    ``lower <= prc_reach <= upper``, and null when the value or a limit is null.

    * Ind1 compares against all rows of the same level (nation-wide peers).
    * Ind2 compares against peers in the same area (territory and region only).
    * Ind3 compares against peers in the same region (territory only).

    Indicators that do not apply to a level are null string columns, so the
    level frames keep one schema when they are stacked later.

    Changes from the previous version:
      * 'E' now includes values exactly equal to a limit, as the specification
        comment requires; those rows used to come out null.
      * the region branch no longer drops the nation key, which that frame
        never contained.

    Args:
        df: list of four level frames that already hold ``prc_reach``;
            updated in place.

    Returns:
        The same list object.
    """
    metric = 'prc_reach'
    dims = [p, sg, spc, d]
    ind1, ind2, ind3 = 'Reach_Prc_BnchMrk_Ind1', 'Reach_Prc_BnchMrk_Ind2', 'Reach_Prc_BnchMrk_Ind3'

    def limits(frame, parent_keys, upper, lower):
        # Upper/lower limits of the metric per peer group.
        median = pl.col(metric).median()
        half_std = 0.5*pl.col(metric).std()
        return frame.group_by(parent_keys + dims).agg(
            (median + half_std).alias(upper),
            (median - half_std).alias(lower),
        )

    def add_indicator(frame, ind_name, upper, lower):
        # Classify the metric against one pair of limit columns.
        value = pl.col(metric)
        return frame.with_columns(
            pl.when(value > pl.col(upper))
            .then(pl.lit('A'))
            .when(value < pl.col(lower))
            .then(pl.lit('B'))
            .when((value >= pl.col(lower)) & (value <= pl.col(upper)))
            .then(pl.lit('E'))
            .otherwise(None)  # null metric or null limits
            .alias(ind_name)
        )

    def blank(ind_name):
        # Placeholder for an indicator that is not defined at this level.
        return pl.lit(None, dtype=pl.Utf8).alias(ind_name)

    # Territory: nation, area and region peers
    terr = df[0]
    scored = terr.select([levels[0]] + dims + [metric]).join(geo_code_mapper,on = levels[0],how = 'left')
    scored = (
        scored
        .join(limits(scored, [], 'nul', 'nll'), on=dims, how='left')
        .join(limits(scored, [levels[2]], 'aul', 'all'), on=[levels[2]] + dims, how='left')
        .join(limits(scored, [levels[1]], 'rul', 'rll'), on=[levels[1]] + dims, how='left')
        .drop([levels[1], levels[2], levels[3]])
    )
    scored = add_indicator(scored, ind1, 'nul', 'nll')
    scored = add_indicator(scored, ind2, 'aul', 'all')
    scored = add_indicator(scored, ind3, 'rul', 'rll').drop(['nul','nll','aul','all','rul','rll',metric])
    df[0] = terr.join(scored,on=[levels[0]] + dims,how = 'left')

    # Region: nation and area peers
    region = df[1]
    scored = region.select([levels[1]] + dims + [metric]).join(
        geo_code_mapper[['region_geography_id','area_geography_id']].unique(),on = levels[1],how = 'left'
    )
    scored = (
        scored
        .join(limits(scored, [], 'nul', 'nll'), on=dims, how='left')
        .join(limits(scored, [levels[2]], 'aul', 'all'), on=[levels[2]] + dims, how='left')
        .drop(levels[2])
    )
    scored = add_indicator(scored, ind1, 'nul', 'nll')
    scored = add_indicator(scored, ind2, 'aul', 'all')
    scored = scored.with_columns(blank(ind3)).drop(['nul','nll','aul','all',metric])
    df[1] = region.join(scored,on=[levels[1]] + dims,how = 'left')

    # Area: nation peers only
    area = df[2]
    scored = area.select([levels[2]] + dims + [metric])
    scored = scored.join(limits(scored, [], 'nul', 'nll'), on=dims, how='left')
    scored = add_indicator(scored, ind1, 'nul', 'nll')
    scored = scored.with_columns(blank(ind2),blank(ind3)).drop(['nul','nll',metric])
    df[2] = area.join(scored,on=[levels[2]] + dims,how = 'left')

    # Nation: no peers, every indicator is empty
    df[3] = df[3].with_columns(blank(ind1),blank(ind2),blank(ind3))

    return df

In [32]:
#Prc_Of_Optimal_And_Above
def process_8(df):
    """Add ``perc_opt_above`` = ``optimal / (optimal + below)`` to each level frame.

    The list is updated in place and returned.
    """
    optimal, below = pl.col('optimal'), pl.col('below')
    optimal_share = (optimal/(optimal+below)).alias('perc_opt_above')
    for i in range(4):
        df[i] = df[i].with_columns(optimal_share)
    return df

---

In [33]:
# For Convert To Feed Ready data
def get_feed(temp1, period=None):
    """Turn the four level frames into the feed-ready table.

    Steps: stack the levels under a single ``Geography_id`` key, fan every row
    out to each ``product_id`` in ``prod_mapping``, copy the sub-product
    (LI1/LI2/LI3) measures into the generic columns for product ids 3, 4 and 5,
    rename to feed column names, add constant and placeholder columns, recode
    the segment, order the columns and round.

    Args:
        temp1: list of four frames (territory, region, area, nation). The
            elements of this list are reassigned while the feed is built.
        period: value used in the ``Period`` column as ``'<period>-WEEK'``.
            When omitted, the module-level ``period_num`` is used, which is
            what the previous version always did.

    Returns:
        pl.DataFrame with the feed columns in feed order.
    """
    period_label = period_num if period is None else period

    # One key name and one column order for all levels, then stack them.
    temp1[0] = temp1[0].rename({'geography_id': 'Geography_id'})
    parent_keys = ['region_geography_id','area_geography_id','nation_geography_id']
    for idx, key in enumerate(parent_keys, start=1):
        temp1[idx] = temp1[idx].rename({key: 'Geography_id'}).select(temp1[0].columns)

    final_feed = temp1[0].vstack(temp1[1]).vstack(temp1[2]).vstack(temp1[3])
    final_feed = final_feed.drop([
        'num_hcps','call_freq_quarter','total_calls',
        'total_samples_LIN','total_samples_LI1','total_samples_LI2','total_samples_LI3',
        'wk_qtd_LIN','wk_qtd_LI1','wk_qtd_LI2','wk_qtd_LI3'
    ])

    # Every mapping row is labelled 'LIN', so the join repeats each feed row
    # once per product_id.
    pm = prod_mapping.with_columns(pl.lit('LIN').alias('product')).select(['product','product_id'])
    final_feed2 = final_feed.join(pm,on='product',how='left').drop('product')

    # Product ids 3, 4, 5 are the sub-products; everything else keeps LIN values.
    is_sub_product = pl.col('product_id').is_in([3,4,5])
    final_feed2_LIN = final_feed2.filter(~is_sub_product)
    final_feed2_LIP = final_feed2.filter(is_sub_product)

    # Measures that exist once per sub-product. The last three also carry an
    # explicit '_LIN' suffix on their generic column.
    sub_bases = ['num_samples','num_sample_hcps','rx_per_call',
                 'prc_sampled_phy','rx_per_sample','avg_sample_per_hcp']
    lin_suffixed = ['prc_sampled_phy','rx_per_sample','avg_sample_per_hcp']
    sub_cols = [f'{base}_{sfx}' for base in sub_bases for sfx in ('LI1','LI2','LI3')]
    lin_rename = {f'{base}_LIN': base for base in lin_suffixed}

    final_feed2_LIN = final_feed2_LIN.drop(sub_cols).rename(lin_rename)

    # NOTE(review): product 3 reads the LI3 columns, 4 reads LI1 and 5 reads
    # LI2 - confirm this pairing against the product mapping file.
    sub_product_copy_logic = []
    for base in sub_bases:
        generic = f'{base}_LIN' if base in lin_suffixed else base
        sub_product_copy_logic.append(
            pl.when(pl.col('product_id')==3).then(pl.col(f'{base}_LI3'))
            .when(pl.col('product_id')==4).then(pl.col(f'{base}_LI1'))
            .when(pl.col('product_id')==5).then(pl.col(f'{base}_LI2'))
            .otherwise(pl.col(generic)).alias(generic)
        )

    final_feed2_LIP = (
        final_feed2_LIP
        .with_columns(*sub_product_copy_logic)
        .drop(sub_cols)
        .rename(lin_rename)
    )

    # Joining back together
    final_feed3 = final_feed2_LIN.vstack(final_feed2_LIP)

    # renaming columns
    col_mapping = {
        'product_id':'Product_id',
        'segment':'Segment',
        'decile':'Decile',
        'specialty_group':'Specialty',
        'num_calls':'Num_Of_Calls',
        'days_in_field':'Days_In_Field',
        'num_abbv_calls':'ABBV_Visits',
        'target_hcps':'Num_Of_Targets',
        'num_sample_hcps':'Num_Of_Sampled_Physicians',
        'num_samples':'Total_Samples',
        'tgts3':'Num_Of_QTD_Tgts_3Plus_Calls',
        'below':'Below',
        'optimal':'Optimal',
        'call_distribution':'Call_Distribution',
        'calls_per_day':'Calls_Per_Day',
        'call_freq':'Frequency',
        'prc_reach':'Prc_Reach',
        'tgt_reach_13wk':'Thirteen_Week_Tgt_Reach',
        'qtd_tgt_nreach':'QTD_Tgts_Not_Reached',
        'called_12m':'Num_Of_Called_Months_12M',
        'rx_per_call':'Rx_Per_Call',
        'sample_distribution':'Sample_Distribution',
        'prc_sampled_phy':'Prc_Of_Sampled_Physicians',
        'avg_sample_per_hcp':'Avg_Samples_Per_HCP',
        'rx_per_sample':'Rx_Per_Sample',
        'call_freq_goal_prc':'Call_Freq_Goal_Prc',
        'perc_opt_above':'Prc_Of_Optimal_And_Above'
    }
    final_feed = final_feed3.rename(col_mapping)

    # Constant columns required by the feed, then the columns that are always
    # exported as the '\N' placeholder.
    calls_cols = [f'Calls{n}' for n in range(1, 21)]
    constant_cols = {
        'ReportType': 'WEEKLY',
        'Period': f'{period_label}-WEEK',
        'No_Call': '.',
        'Above': '.',
    }
    placeholder_cols = [
        'Prc_Of_Surveyed_HCPs','Called_1_Time', 'Called_2_Times', 'Called_3_Times',
        'Called_4_Times', 'Called_5_Times', 'Called_6_Times', 'Total_Num_Of_Called_2_Times',
        'Prc_Of_Called_2_Times','Pharmacy_Calls_Per_Day', 'Num_Of_Calls2', *calls_cols,
    ]
    final_feed = final_feed.with_columns(
        *[pl.lit(value).alias(name) for name, value in constant_cols.items()],
        *[pl.lit('\\N').alias(name) for name in placeholder_cols],
    )

    # changing value of column to match with sas - 06/21
    final_feed = final_feed.with_columns(
        pl.when(pl.col('Segment')=='ALG-ONLY-TARGET')
        .then(pl.lit('AGNT'))
        .when(pl.col('Segment')=='Target')
        .then(pl.lit('T'))
        .when(pl.col('Segment')=='Non-Target')
        .then(pl.lit('NT'))
        .otherwise(pl.col('Segment'))
        .alias('Segment'))

    # arranging columns according to feed
    req_col = ['Geography_id', 'Product_id', 'Segment', 'Specialty', 'ReportType', 'Period', 'Decile',
               'Call_Distribution', 'Num_Of_Targets', 'Num_Of_Calls', 'Calls_Per_Day', 'Call_Freq_Goal_Prc',
               'Prc_Reach', 'Reach_Prc_BnchMrk_Ind1', 'Reach_Prc_BnchMrk_Ind2', 'Reach_Prc_BnchMrk_Ind3',
               'Frequency', 'Rx_Per_Call', 'Days_In_Field', 'No_Call', 'Below', 'Optimal', 'Above',
               'Prc_Of_Optimal_And_Above', 'Sample_Distribution', 'Prc_Of_Sampled_Physicians',
               'Total_Samples', 'Avg_Samples_Per_HCP', 'Rx_Per_Sample', 'Prc_Of_Surveyed_HCPs',
               'Called_1_Time', 'Called_2_Times', 'Called_3_Times', 'Called_4_Times', 'Called_5_Times',
               'Called_6_Times', 'Total_Num_Of_Called_2_Times', 'Prc_Of_Called_2_Times',
               'Num_Of_Sampled_Physicians', 'Pharmacy_Calls_Per_Day', 'Num_Of_Calls2',
               *calls_cols,
               'ABBV_Visits', 'Num_Of_Called_Months_12M',
               'Thirteen_Week_Tgt_Reach', 'QTD_Tgts_Not_Reached', 'Num_Of_QTD_Tgts_3Plus_Calls']
    final_feed = final_feed.select(req_col)

    # Override: Non-Target rows export the placeholder instead of a target count.
    final_feed = final_feed.with_columns(
        pl.when(pl.col('Segment')=='NT').then(pl.lit('\\N')).otherwise(pl.col('Num_Of_Targets')).alias('Num_Of_Targets')
    )

    columns_to_round1 = ['Num_Of_Called_Months_12M','Rx_Per_Call']
    columns_to_round10 = ['Thirteen_Week_Tgt_Reach','Calls_Per_Day','Call_Freq_Goal_Prc','Prc_Reach','Frequency','Days_In_Field',]

    final_feed = final_feed.with_columns([
        *[pl.col(col).round(1).alias(col) for col in columns_to_round1],
        *[pl.col(col).round(10).alias(col) for col in columns_to_round10],
    ])

    return final_feed
               #-----------------------------------------#

In [34]:
# Short aliases for the dimension column names, used by the process_* functions
# and read by them as module-level globals.
p,sg,spc,d = 'product','segment','specialty_group','decile'
# Geography key per level, in the order territory, region, area, nation.
levels = ['geography_id','region_geography_id','area_geography_id','nation_geography_id']
# Destination prefix for the exported feed files.
# NOTE(review): the bucket is hard-coded here while the input paths are built
# from the `bucket` value loaded from vars_wk.json - confirm this is intended.
OUT = 's3://vortex-staging-a65ced90/BIT/output/GeoSummary/Weekly/'

In [35]:
# Calling all functions and Exporting Feeds
# None of the process_* steps reads `period_num`, so the metric frames are the
# same for every period; build them once instead of once per period.
# NOTE(review): as a consequence the five exported feeds differ only in their
# Period column - confirm that per-period windows are not expected here.
prepared_levels = [pl.DataFrame() for _ in range(4)] # empty holder, one slot per level
for step in (
    process_1, process_2, process_3, process_3_1, process_4, process_5,
    process_6, process_6_1, process_7, process_8, process_reach_benchmark,
):
    prepared_levels = step(prepared_levels)

for period_num,PN in zip([1,4,13,26,'qtd'],[1,2,3,4,5]):
    # get_feed reassigns elements of the list it receives (and reads the
    # module-level `period_num`), so hand it a fresh shallow copy each time.
    temp1 = list(prepared_levels)
    feed_dataset = get_feed(temp1).to_pandas()

    # Missing values are exported as the literal \N marker.
    string_columns = feed_dataset.select_dtypes(include=['object']).columns.tolist()
    feed_dataset[string_columns] = feed_dataset[string_columns].fillna('\\N')
    feed_dataset = feed_dataset.replace('NaN', '\\N')
    feed_dataset = feed_dataset.replace([np.nan, np.inf, -np.inf], '\\N')

    feed_dataset.to_csv(f'{OUT}Weekly_GeoSummary_SalesActivity_P{PN}_Feed.txt', sep='|', lineterminator='\r\n',index=False)
    print('Exported :',PN)

Exported : 1
Exported : 2
Exported : 3
Exported : 4
Exported : 5


---