### GS Sales Activity pt1

In [1]:
import polars as pl
import pandas as pd
import gc
from datetime import datetime, timedelta,date
from dateutil.relativedelta import relativedelta
import numpy as np
import json

In [2]:
# Run parameters come from the weekly JSON config file.
with open('vars_wk.json', 'r') as json_file:
    js = json.load(json_file)

# The three date parameters are stored as ISO-style strings and converted to date objects.
_DATE_FMT = '%Y-%m-%d'
curr_date, quarter_start, quarter_end = (
    datetime.strptime(js[_key], _DATE_FMT).date()
    for _key in ('curr_date', 'quarter_start', 'quarter_end')
)

# The remaining parameters are used exactly as stored in the JSON.
qtr_data, num_weeks_calls, num_weeks_rx, bucket = (
    js[_key] for _key in ('qtr_data', 'num_weeks_calls', 'num_weeks_rx', 'bucket')
)

# S3 prefixes: default parquet library and the quarter-specific geography folder.
dflib = f's3://{bucket}/BIT/dataframes/'
geo = f's3://{bucket}/PYADM/quaterly/{qtr_data}/geography/'

In [3]:
# Utility Functions -
def load(df, lib=dflib):
    """Read ``{lib}{df}.parquet`` with polars and bind it to a global named ``df``.

    Parameters
    ----------
    df : str
        File stem of the parquet file; also the name of the global variable created.
    lib : str
        Path prefix the file is read from (defaults to ``dflib`` as it was when
        this function was defined).

    Returns nothing; the frame is only reachable through the created global.
    """
    parquet_path = f'{lib}{df}.parquet'
    frame = pl.read_parquet(parquet_path)
    globals()[df] = frame

def intck(interval, start_date, end_date):
    """Return the number of whole intervals elapsed between two dates.

    Parameters
    ----------
    interval : str
        One of ``'DAY'``, ``'WEEK'`` or ``'MONTH'``.
    start_date, end_date : datetime.date
        Interval bounds; the result is negative when ``end_date`` precedes
        ``start_date``.

    Returns
    -------
    int
        ``DAY``: difference in days. ``WEEK``: difference in days floor-divided
        by 7. ``MONTH``: ``years * 12 + months`` of the ``relativedelta``
        between the two dates.

    Raises
    ------
    ValueError
        If ``interval`` is not one of the supported values (previously this
        silently returned ``None``, which only failed later in arithmetic).

    NOTE(review): the name suggests SAS ``INTCK``; this counts elapsed whole
    intervals, not calendar boundaries crossed - confirm that is what callers expect.
    """
    if interval == 'DAY':
        return (end_date - start_date).days
    if interval == 'WEEK':
        return (end_date - start_date).days // 7
    if interval == 'MONTH':
        rd = relativedelta(end_date, start_date)
        return rd.years * 12 + rd.months
    raise ValueError(
        f"Unsupported interval {interval!r}; expected 'DAY', 'WEEK' or 'MONTH'"
    )

In [4]:
# Importing dependencies
# Parquet inputs: everything comes from the default library except 'hierarchy',
# which is read from the quarter-specific geography folder.
for _frame_name in ('temp_calls', 'temp_samples', 'temp_abbv', 'mp_spec_seg_dec'):
    load(_frame_name)
load('hierarchy', geo)
for _frame_name in ('wd_raw', 'lirwd_call_plan', 'laxdn_geoid_sum'):
    load(_frame_name)

# Geography mapping: every code except 'NATION' gets the '1111-' prefix.
_code = pl.col('Code')
geo_mapping = pl.read_csv(
    f's3://{bucket}/BIT/docs/GeographyMapping.txt', separator='|'
).with_columns(
    Code=pl.when(_code != 'NATION').then(pl.lit('1111-') + _code).otherwise(_code)
)
prod_mapping = pl.read_csv(f's3://{bucket}/BIT/docs/productmapping_pybit.txt', separator='|')

# The geography id lookup is an Excel sheet, read through pandas and converted to polars.
geo_id_full = pl.from_pandas(pd.read_excel(f's3://{bucket}/BIT/docs/geo_id_full.xlsx'))

# Fixes for the vortex import (probably caused by Polars upgrades):
# force the join keys to Int64.
_rep_id_as_int = pl.col('SalesRepIID').cast(pl.Int64)
temp_calls = temp_calls.with_columns(_rep_id_as_int)
temp_samples = temp_samples.with_columns(_rep_id_as_int)
temp_abbv = temp_abbv.with_columns(_rep_id_as_int)
wd_raw = wd_raw.with_columns(_rep_id_as_int)
laxdn_geoid_sum = laxdn_geoid_sum.with_columns(pl.col('geography_id').cast(pl.Int64))

In [5]:
# Processing  1. temp calls  2. temp samples  3. temp abbv datasets
# - doubt: is the physician terr id the same as salesrepterrid for every record?
def _attach_hcp_context(activity):
    """Apply the shared enrichment joins to one activity frame.

    The same four steps were previously copy-pasted for calls, samples and
    abbv visits; they are kept in one place so the three frames cannot drift.

    Steps, in order:
      1. left-join ``mp_spec_seg_dec`` (AttendeeIID -> IID) and drop rows whose
         ``geography_id`` is null afterwards,
      2. left-join ``geo_id_full`` on ``geography_id`` (adds the area/region codes),
      3. left-join ``days_in_field`` from ``wd_raw`` on ``SalesRepIID``,
      4. left-join ``lirwd_call_plan`` (AttendeeIID -> IID), which presumably
         supplies ``call_freq_quarter`` - verify against that frame's schema.
    """
    return (
        activity
        .join(mp_spec_seg_dec, left_on='AttendeeIID', right_on='IID', how='left')
        .filter(pl.col('geography_id').is_not_null())
        .join(geo_id_full, on='geography_id', how='left')
        .join(wd_raw[['SalesRepIID', 'days_in_field']], on='SalesRepIID', how='left')
        .join(lirwd_call_plan, left_on='AttendeeIID', right_on='IID', how='left')
    )


temp_calls_mp_spec = _attach_hcp_context(temp_calls)

# For supporting calculations ->
#geo_code_mapper = temp_calls_mp_spec[['geography_id','region_geography_id','area_geography_id','nation_geography_id']].unique()
geo_code_mapper = geo_id_full
geo_code_mapper.to_pandas().to_parquet(dflib+'geo_code_mapper.parquet') #exporting for other code use

temp_samples_mp_spec = _attach_hcp_context(temp_samples)
temp_abbv_mp_spec = _attach_hcp_context(temp_abbv)

### Functions ->

In [6]:
#num_calls, num_hcps, days_in_field,call_freq_quarter

def process_1(df):
    """Fill the four level slots of ``df`` with the base QTD call metrics.

    Only calls with ``call_week <= num_weeks_calls`` are used. For each entry of
    ``levels`` the calls are grouped by (level, product, segment, specialty
    group, decile) and aggregated into ``num_calls``, ``num_hcps``,
    ``days_in_field`` and ``call_freq_quarter``.

    ``df`` is a 4-element list; its previous contents are overwritten and the
    same list is returned.
    """
    qtd_calls = temp_calls_mp_spec.filter(pl.col('call_week') <= num_weeks_calls)
    base_metrics = {
        'num_calls': pl.col('CallID').n_unique(),              # distinct calls
        'num_hcps': pl.col('AttendeeIID').n_unique(),          # distinct attendees
        'days_in_field': pl.col('days_in_field').mean(),       # original note: all values are the same
        'call_freq_quarter': pl.col('call_freq_quarter').sum(),  # flagged as "doubt" by the author
    }
    for idx in range(4):  # one pass per geography level
        group_keys = [levels[idx], p, sg, spc, d]
        df[idx] = qtd_calls.group_by(group_keys).agg(**base_metrics)
    return df



#total_calls
def process_2(df):
    """Add ``total_calls``: the sum of ``num_calls`` over all rows of a geography.

    The per-geography total is left-joined back on the level column, so every
    row of the same geography carries the same value. Returns the same list.
    """
    for idx in range(4):
        level = levels[idx]
        frame = df[idx]
        per_geo_total = frame.group_by(level).agg(total_calls=pl.col('num_calls').sum())
        df[idx] = frame.join(per_geo_total, on=level, how='left')
    return df



# Abbv Visits -
def process_3(df):
    """Add ``num_abbv_calls``: distinct ``CallID`` count from the abbv visits frame.

    Uses abbv rows with ``call_week <= num_weeks_calls``, grouped by (level,
    product, segment, specialty group, decile) and left-joined onto each level
    frame. Returns the same list.
    """
    qtd_abbv = temp_abbv_mp_spec.filter(pl.col('call_week') <= num_weeks_calls)  # QTD only
    for idx in range(4):
        group_keys = [levels[idx], p, sg, spc, d]
        abbv_counts = qtd_abbv.group_by(group_keys).agg(
            num_abbv_calls=pl.col('CallID').n_unique()
        )
        df[idx] = df[idx].join(abbv_counts, on=group_keys, how='left')
    return df



# number of targets and 13wk tgts
def process_4(df):
    """Add per-geography target counts: ``target_hcps`` and ``tgts_13wks``.

    * ``target_hcps`` - distinct ``IID`` among ``mp_spec_seg_dec`` rows with
      segment 'Target', after attaching the geography hierarchy.
    * ``tgts_13wks``  - distinct ``AttendeeIID`` among 'Target' calls with
      ``call_week <= 13``.

    Both are grouped by the level column only and left-joined onto the level
    frame. Returns the same list.
    """
    is_target = pl.col('segment') == 'Target'
    targets_with_geo = mp_spec_seg_dec.filter(is_target).join(
        geo_code_mapper, on=levels[0], how='left'
    )
    target_calls_13wk = temp_calls_mp_spec.filter((pl.col('call_week') <= 13) & is_target)

    for idx in range(4):
        level = levels[idx]
        target_counts = targets_with_geo.group_by(level).agg(
            target_hcps=pl.col('IID').n_unique()
        )
        reached_counts = target_calls_13wk.group_by(level).agg(
            tgts_13wks=pl.col('AttendeeIID').n_unique()
        )
        df[idx] = (
            df[idx]
            .join(target_counts, on=level, how='left')
            .join(reached_counts, on=level, how='left')
        )
    return df



# wk qtd
def process_5(df):
    """Add ``wk_qtd``: the sum of ``laxdn_geoid_sum.wk_qtd`` per geography.

    ``laxdn_geoid_sum`` is joined to the geography hierarchy on the lowest
    level, summed per level column, and left-joined onto the level frame.
    Returns the same list.
    """
    rx_with_geo = laxdn_geoid_sum.join(geo_code_mapper, on=levels[0], how='left')
    for idx in range(4):
        level = levels[idx]
        rx_per_geo = rx_with_geo.group_by(level).agg(wk_qtd=pl.col('wk_qtd').sum())
        df[idx] = df[idx].join(rx_per_geo, on=level, how='left')
    return df



# num_samples, num_sample_hcps, total_samples
def process_6(df):
    """Add the sample metrics ``num_samples``, ``num_sample_hcps`` and ``total_samples``.

    Sample rows with ``sample_week <= num_weeks_calls`` are grouped by (level,
    product, segment, specialty group, decile) to count distinct ``CallID`` and
    ``AttendeeIID``. ``total_samples`` is the per-geography sum of those
    ``num_samples`` values. Both are left-joined onto the level frame.
    Returns the same list.
    """
    qtd_samples = temp_samples_mp_spec.filter(pl.col('sample_week') <= num_weeks_calls)
    for idx in range(4):
        level = levels[idx]
        group_keys = [level, p, sg, spc, d]

        sample_counts = qtd_samples.group_by(group_keys).agg(
            num_samples=pl.col('CallID').n_unique(),           # distinct CallIDs
            num_sample_hcps=pl.col('AttendeeIID').n_unique(),  # distinct attendees
        )
        geo_sample_totals = sample_counts.group_by(level).agg(
            total_samples=pl.col('num_samples').sum()
        )

        df[idx] = (
            df[idx]
            .join(sample_counts, on=group_keys, how='left')
            .join(geo_sample_totals, on=level, how='left')
        )
    return df



#tgts3 - Num_Of_QTD_Tgts_3Plus_Calls
def process_7(df, min_calls=3):
    """Add ``tgts3``: number of QTD targets that received at least ``min_calls`` calls.

    Parameters
    ----------
    df : list
        The four level frames; each gets a ``tgts3`` column via a left join
        (null where no target in the group reaches the threshold).
    min_calls : int, default 3
        Minimum number of distinct calls a target needs in order to be counted.

    Returns the same list.

    Fix: the previous version counted distinct calls per *group* and kept the
    whole group's target count whenever the group had more than 3 calls in
    total, so every target in a busy group was counted regardless of how often
    that target was called. The threshold was also ``> 3`` although the metric
    is "3Plus". Calls are now counted per target and compared with ``>=``.
    """
    qtd_target_calls = temp_calls_mp_spec.filter(
        (pl.col('call_week') <= num_weeks_calls) & (pl.col('segment') == 'Target')
    )
    for i in range(4):
        group_keys = [levels[i], p, sg, spc, d]
        tgts3_df = (
            qtd_target_calls
            # distinct calls received by each target within the group
            .group_by(group_keys + ['AttendeeIID'])
            .agg(hcp_calls=pl.col('CallID').n_unique())
            .filter(pl.col('hcp_calls') >= min_calls)
            # number of targets that met the threshold
            .group_by(group_keys)
            .agg(tgts3=pl.col('AttendeeIID').n_unique())
        )
        df[i] = df[i].join(tgts3_df, on=group_keys, how='left')
    return df



# Optimal and Below
def process_12(df):
    """Add ``optimal`` and ``below``: attendee counts by number of months called.

    For QTD calls (``call_week <= num_weeks_calls``) the number of distinct
    ``call_month`` values is computed per ``AttendeeIID``. Within each (level,
    product, segment, specialty group, decile) group:

    * ``optimal`` - distinct attendees called in exactly 3 months,
    * ``below``   - distinct attendees called in fewer than 3 months.

    The two counts are combined with a coalescing outer join and left-joined
    onto the level frame. Returns the same list.

    The per-attendee month count does not depend on the level, so it is now
    computed once instead of once per level.

    NOTE(review): ``how='outer_coalesce'`` is kept as written; newer Polars
    releases spell this ``how='full', coalesce=True`` - confirm against the
    pinned Polars version before changing it.
    """
    qtd_calls = temp_calls_mp_spec.filter(pl.col('call_week') <= num_weeks_calls)
    months_called = qtd_calls.group_by('AttendeeIID').agg(
        month_call_count=pl.col('call_month').n_unique()
    )
    calls_with_months = qtd_calls.join(months_called, on='AttendeeIID', how='left')
    optimal_calls = calls_with_months.filter(pl.col('month_call_count') == 3)
    below_calls = calls_with_months.filter(pl.col('month_call_count') < 3)

    for i in range(4):
        group_keys = [levels[i], p, sg, spc, d]
        optimal_df = optimal_calls.group_by(group_keys).agg(
            optimal=pl.col('AttendeeIID').n_unique()
        )
        below_df = below_calls.group_by(group_keys).agg(
            below=pl.col('AttendeeIID').n_unique()
        )
        combined = below_df.join(optimal_df, on=group_keys, how='outer_coalesce')
        df[i] = df[i].join(combined, on=group_keys, how='left')
    return df



### adding rollups 
def add_rollups_calls(all_df):
    """Append roll-up rows over segment, decile and specialty to each level frame.

    For every non-empty combination of the three dimensions the frame is
    re-aggregated with those dimensions collapsed, and the collapsed columns are
    filled with their roll-up labels: 'UNI' (segment), '0-10' (decile) and
    'ALL SPEC' (specialty group). The seven roll-up frames are stacked under the
    original rows in the order: segment, decile, specialty, segment+decile,
    segment+specialty, decile+specialty, all three.

    Aggregation rules for the metric columns (everything after the first five
    key columns of the frame):

    * default - sum,
    * ``days_in_field`` - mean,
    * geography-level columns (``total_calls``, ``target_hcps``,
      ``tgts_13wks``, ``wk_qtd``, ``total_samples``) - carried over unchanged.

    Fix: the geography-level columns were joined on the level column only, so
    every row of a geography holds the same value. Summing them in a roll-up
    multiplied them by the number of collapsed rows and distorted every ratio
    later derived from them. They are now taken as-is with ``first()``.

    Output column order is (level, product, segment, decile, specialty group,
    metrics...). Returns the same list.
    """
    from itertools import combinations

    rollup_labels = [(sg, 'UNI'), (d, '0-10'), (spc, 'ALL SPEC')]
    geo_level_cols = ('total_calls', 'target_hcps', 'tgts_13wks', 'wk_qtd', 'total_samples')

    # Looping over 4 levels (terr, reg, area, nation)
    for i in range(4):
        df = all_df[i]
        g = levels[i]
        metric_cols = df.columns[5:]  # relies on the five key columns coming first
        agg_dict = {metric: pl.col(metric).sum() for metric in metric_cols}
        agg_dict['days_in_field'] = pl.col('days_in_field').mean()
        for geo_col in geo_level_cols:
            if geo_col in agg_dict:
                # constant within a geography, so any row's value is the value
                agg_dict[geo_col] = pl.col(geo_col).first()
        main_seq = [g, p, sg, d, spc] + metric_cols  # common layout needed by vstack

        stacked = df.select(main_seq)
        for n_collapsed in (1, 2, 3):
            for collapsed in combinations(rollup_labels, n_collapsed):
                collapsed_cols = [col for col, _ in collapsed]
                kept_keys = [col for col in (g, p, sg, d, spc) if col not in collapsed_cols]
                rolled = (
                    df.group_by(kept_keys)
                    .agg(**agg_dict)
                    .with_columns([pl.lit(label).alias(col) for col, label in collapsed])
                    .select(main_seq)
                )
                stacked = stacked.vstack(rolled)

        # Store data back
        all_df[i] = stacked
    return all_df



#### adding formula backed metrics
#call_distribution, calls_per_day
def process_8(df):
    """Add the ratio metrics ``call_distribution`` and ``calls_per_day``.

    * ``call_distribution`` = ``num_calls / total_calls``
    * ``calls_per_day``     = ``num_calls / days_in_field``

    Returns the same list.
    """
    calls = pl.col('num_calls')
    ratios = {
        'call_distribution': calls / pl.col('total_calls'),
        'calls_per_day': calls / pl.col('days_in_field'),
    }
    for idx in range(4):
        df[idx] = df[idx].with_columns(**ratios)
    return df



# Frequency (call_freq)
def process_9(df):
    """Add ``call_freq``: mean number of distinct QTD calls per attendee, per geography.

    Distinct ``CallID`` values are counted per (attendee, level) and then
    averaged per level; the result is left-joined on the level column, so every
    row of a geography gets the same value. Returns the same list.
    """
    qtd_calls = temp_calls_mp_spec.filter(pl.col('call_week') <= num_weeks_calls)
    for idx in range(4):
        level = levels[idx]
        calls_per_hcp = qtd_calls.group_by(['AttendeeIID', level]).agg(
            calls=pl.col('CallID').n_unique()
        )
        freq_per_geo = calls_per_hcp.group_by(level).agg(call_freq=pl.col('calls').mean())
        df[idx] = df[idx].join(freq_per_geo, on=level, how='left')
    return df



#qtd reach , 13 week reach, QTD_Tgts_Not_Reached
def process_10(df):
    """Add the reach metrics and drop the helper column ``tgts_13wks``.

    * ``prc_reach``      = ``num_hcps / target_hcps``
    * ``tgt_reach_13wk`` = ``tgts_13wks / target_hcps``
    * ``qtd_tgt_nreach`` = ``target_hcps - num_hcps``

    Returns the same list.
    """
    targets = pl.col('target_hcps')
    called = pl.col('num_hcps')
    reach_metrics = {
        'prc_reach': called / targets,
        'tgt_reach_13wk': pl.col('tgts_13wks') / targets,
        'qtd_tgt_nreach': targets - called,
    }
    for idx in range(4):
        with_reach = df[idx].with_columns(**reach_metrics)
        df[idx] = with_reach.drop('tgts_13wks')
    return df



#Num_Of_Called_Months_12M
def process_11(df):
    """Add ``called_12m``: mean number of distinct months called per attendee.

    Uses calls with ``call_month <= 12``; distinct ``call_month`` values are
    counted per (level, attendee) and averaged per level, then left-joined on
    the level column. Returns the same list.
    """
    calls_12m = temp_calls_mp_spec.filter(pl.col('call_month') <= 12)
    for idx in range(4):
        level = levels[idx]
        months_per_hcp = calls_12m.group_by([level, 'AttendeeIID']).agg(
            called_months=pl.col('call_month').n_unique()
        )
        avg_months = months_per_hcp.group_by(level).agg(
            called_12m=pl.col('called_months').mean()
        )
        df[idx] = df[idx].join(avg_months, on=level, how='left')
    return df



# Trx Per Call
def process_13(df):
    """Add ``rx_per_call`` = ``wk_qtd / num_calls`` to every level frame.

    Returns the same list.
    """
    rx_per_call_expr = pl.col('wk_qtd') / pl.col('num_calls')
    for idx in range(4):
        df[idx] = df[idx].with_columns(rx_per_call=rx_per_call_expr)
    return df



# sample_distribution, prc_sampled_phy, avg_sample_per_hcp,rx_per_sample

def process_14(df):
    """Add the sample ratio metrics and drop the helper column ``wk_qtd``.

    * ``sample_distribution`` = ``num_samples / total_samples``
    * ``prc_sampled_phy``     = ``num_sample_hcps / num_hcps``
    * ``avg_sample_per_hcp``  = ``num_samples / target_hcps``
    * ``rx_per_sample``       = ``wk_qtd / total_samples``

    Returns the same list.
    """
    samples = pl.col('num_samples')
    all_samples = pl.col('total_samples')
    sample_metrics = {
        'sample_distribution': samples / all_samples,
        'prc_sampled_phy': pl.col('num_sample_hcps') / pl.col('num_hcps'),
        'avg_sample_per_hcp': samples / pl.col('target_hcps'),
        'rx_per_sample': pl.col('wk_qtd') / all_samples,
    }
    for idx in range(4):
        with_samples = df[idx].with_columns(**sample_metrics)
        df[idx] = with_samples.drop('wk_qtd')
    return df



#call_freq_goal_prc
def process_15(df):
    """Add ``call_freq_goal_prc``: total calls relative to the pro-rated call plan.

    The planned quarterly frequency is scaled by the share of the quarter that
    has elapsed (days from ``quarter_start`` to ``curr_date`` divided by days
    from ``quarter_start`` to ``quarter_end``). The metric is
    ``total_calls / (call_freq_quarter * share)`` and is null wherever
    ``call_freq_quarter`` is null. Returns the same list.
    """
    elapsed_days = intck('DAY', quarter_start, curr_date)
    quarter_days = intck('DAY', quarter_start, quarter_end)
    elapsed_share = elapsed_days / quarter_days

    planned = pl.col('call_freq_quarter')
    goal_expr = (
        pl.when(planned.is_null())
        .then(None)
        .otherwise(pl.col('total_calls') / (planned * elapsed_share))
    )
    for idx in range(4):
        df[idx] = df[idx].with_columns(call_freq_goal_prc=goal_expr)
    return df




#Prc_Of_Optimal_And_Above
def process_16(df):
    """Add ``perc_opt_above`` = ``optimal / (optimal + below)`` to every level frame.

    Returns the same list.
    """
    optimal = pl.col('optimal')
    share_expr = optimal / (optimal + pl.col('below'))
    for idx in range(4):
        df[idx] = df[idx].with_columns(perc_opt_above=share_expr)
    return df



# adding upper and lower limits : (use prc_reach)
# flow -
## for terr level -
# three cols , ind1,ind2,ind3
# for ind1 ->nation , ind2->area , ind3->region
# to create an upper limit and lower limit 
# upper : median + 0.5*stddev
# lower : median - 0.5*stddev
#bench_ind : if var > upper then A | if var < lower then B | if lower <= var <= upper then E

def process_reach_benchmark(df):
    """Add the three ``Reach_Prc_BnchMrk_Ind*`` columns to every level frame.

    Each indicator compares a row's ``prc_reach`` with a band computed over a
    peer group sharing the same product, segment, specialty group and decile:

        upper = median(prc_reach) + 0.5 * std(prc_reach)
        lower = median(prc_reach) - 0.5 * std(prc_reach)

        'A' if prc_reach > upper
        'B' if prc_reach < lower
        'E' if lower <= prc_reach <= upper
        null otherwise (e.g. when prc_reach or a limit is null)

    Peer groups per indicator, and the levels that receive them:

    * Ind1 - all rows of the frame (nation-wide): territory, region, area
    * Ind2 - rows in the same area:               territory, region
    * Ind3 - rows in the same region:             territory

    Indicators that do not apply to a level are added as all-null string
    columns, so the four frames keep the same schema. Returns the same list.

    Changes from the previous version:

    * The 'E' band is now inclusive at both ends. It used strict ``<`` on both
      sides, so a value exactly on a limit got null instead of 'E'.
    * Helper columns are removed by selecting the wanted columns. The region
      branch used to drop ``nation_geography_id``, which is not in that frame
      and raises on Polars versions where ``drop`` is strict.
    * Null placeholder columns are given an explicit string dtype.
    * The three hand-written branches are replaced by one table-driven loop.
    """
    keys = [p, sg, spc, d]
    # (indicator column, level column defining the peer group or None for the
    #  whole frame, highest level index that still receives the indicator)
    plan = [
        ('Reach_Prc_BnchMrk_Ind1', None, 2),
        ('Reach_Prc_BnchMrk_Ind2', levels[2], 1),
        ('Reach_Prc_BnchMrk_Ind3', levels[1], 0),
    ]

    def _limits(frame, by):
        """Return one row per peer group with its ``_upper`` / ``_lower`` limits."""
        centre = pl.col('prc_reach').median()
        half_sd = 0.5 * pl.col('prc_reach').std()
        return frame.group_by(by).agg(
            (centre + half_sd).alias('_upper'),
            (centre - half_sd).alias('_lower'),
        )

    def _indicator():
        """Expression mapping ``prc_reach`` against ``_upper`` / ``_lower`` to A / B / E."""
        value, upper, lower = pl.col('prc_reach'), pl.col('_upper'), pl.col('_lower')
        return (
            pl.when(value > upper).then(pl.lit('A'))
            .when(value < lower).then(pl.lit('B'))
            .when((value >= lower) & (value <= upper)).then(pl.lit('E'))
            .otherwise(None)
        )

    for i in range(4):
        f = df[i]
        g = levels[i]
        active = [(name, parent) for name, parent, last_level in plan if i <= last_level]
        inactive = [name for name, _, last_level in plan if i > last_level]

        if active:
            nf = f.select([g] + keys + ['prc_reach'])

            # Attach the parent geography columns needed to form the peer groups.
            parents = [parent for _, parent in active if parent is not None]
            if parents:
                nf = nf.join(
                    geo_code_mapper.select([g] + parents).unique(), on=g, how='left'
                )

            for name, parent in active:
                by = ([parent] if parent is not None else []) + keys
                nf = (
                    nf.join(_limits(nf, by), on=by, how='left')
                    .with_columns(_indicator().alias(name))
                    .drop(['_upper', '_lower'])
                )

            indicator_cols = [name for name, _ in active]
            f = f.join(nf.select([g] + keys + indicator_cols), on=[g] + keys, how='left')

        if inactive:
            f = f.with_columns(
                [pl.lit(None, dtype=pl.Utf8).alias(name) for name in inactive]
            )

        df[i] = f

    return df

In [7]:
# For Convert To Feed Ready data
def get_feed(temp1, period=None):
    """Turn the four level frames into one feed-ready frame.

    Parameters
    ----------
    temp1 : list
        The four level frames (territory, region, area, nation) after all
        ``process_*`` steps.
    period : optional
        Value used in the ``Period`` column as ``f'{period}-WEEK'``. When
        omitted, the module-level ``period_num`` is used, which is how the
        driver loop has always supplied it.

    Returns
    -------
    polars.DataFrame
        One frame with the four levels stacked and the columns renamed, padded
        and ordered as listed in ``req_col``.

    Changes from the previous version:

    * ``temp1`` is no longer modified. The level columns used to be renamed in
      place inside the caller's list, so a second call on the same list failed.
    * The period label can be passed explicitly instead of relying only on a
      global set by the calling loop.
    """
    if period is None:
        period = period_num  # backward-compatible fallback to the loop global

    # Give the four level columns a common name so the frames can be stacked.
    level_columns = ['geography_id', 'region_geography_id',
                     'area_geography_id', 'nation_geography_id']
    final_feed = None
    for level_frame, level_col in zip(temp1, level_columns):
        renamed = level_frame.rename({level_col: 'Geography_id'})
        final_feed = renamed if final_feed is None else final_feed.vstack(renamed)

    # fix for product_id
    # NOTE(review): every row of prod_mapping is labelled 'LIN' before the join,
    # so more than one row in prod_mapping would duplicate feed rows - confirm
    # the mapping file holds a single product.
    pm = prod_mapping.with_columns(pl.lit('LIN').alias('product')).select(['product','product_id'])
    final_feed = final_feed.join(pm,on='product',how='left').drop('product')

    # dropping extra columns
    final_feed = final_feed.drop(['num_hcps','call_freq_quarter','total_calls','num_samples'])

    # renaming columns
    col_mapping = {
        'product_id':'Product_id',
        'segment':'Segment',
        'decile':'Decile',
        'specialty_group':'Specialty',
        'num_calls':'Num_Of_Calls',
        'days_in_field':'Days_In_Field',
        'num_abbv_calls':'ABBV_Visits',
        'target_hcps':'Num_Of_Targets',
        'num_sample_hcps':'Num_Of_Sampled_Physicians',
        'total_samples':'Total_Samples',
        'tgts3':'Num_Of_QTD_Tgts_3Plus_Calls',
        'below':'Below',
        'optimal':'Optimal',
        'call_distribution':'Call_Distribution',
        'calls_per_day':'Calls_Per_Day',
        'call_freq':'Frequency',
        'prc_reach':'Prc_Reach',
        'tgt_reach_13wk':'Thirteen_Week_Tgt_Reach',
        'qtd_tgt_nreach':'QTD_Tgts_Not_Reached',
        'called_12m':'Num_Of_Called_Months_12M',
        'rx_per_call':'Rx_Per_Call',
        'sample_distribution':'Sample_Distribution',
        'prc_sampled_phy':'Prc_Of_Sampled_Physicians',
        'avg_sample_per_hcp':'Avg_Samples_Per_HCP',
        'rx_per_sample':'Rx_Per_Sample',
        'call_freq_goal_prc':'Call_Freq_Goal_Prc',
        'perc_opt_above':'Prc_Of_Optimal_And_Above'
    }
    final_feed = final_feed.rename(col_mapping)

    # Constant-valued columns required by the feed layout.
    calls_n = [f'Calls{n}' for n in range(1, 21)]  # 'Calls1' .. 'Calls20'
    col_to_addrt = ['ReportType']
    col_to_addp = ['Period']
    col_to_adds = ['No_Call','Above']
    # NOTE(review): 'Below' and 'Optimal' are listed here, so the counts that
    # were just renamed into those columns are overwritten with '\N' - confirm
    # this is intended.
    col_to_addna = ['Below','Optimal','Prc_Of_Surveyed_HCPs','Called_1_Time', 'Called_2_Times', 'Called_3_Times',
                    'Called_4_Times', 'Called_5_Times', 'Called_6_Times', 'Total_Num_Of_Called_2_Times',
                    'Prc_Of_Called_2_Times','Pharmacy_Calls_Per_Day', 'Num_Of_Calls2'] + calls_n

    constant_columns = (
        [(name, 'WEEKLY') for name in col_to_addrt]
        + [(name, f'{period}-WEEK') for name in col_to_addp]
        + [(name, '.') for name in col_to_adds]
        + [(name, '\\N') for name in col_to_addna]
    )
    final_feed = final_feed.with_columns(
        [pl.lit(value).alias(name) for name, value in constant_columns]
    )

    # changing value of column to match with sas - 06/21
    final_feed = final_feed.with_columns(
        pl.when(pl.col('Segment')=='ALG-ONLY-TARGET')
        .then(pl.lit('AGNT'))
        .when(pl.col('Segment')=='Target')
        .then(pl.lit('T'))
        .when(pl.col('Segment')=='Non-Target')
        .then(pl.lit('NT'))
        .otherwise(pl.col('Segment'))
        .alias('Segment'))

    # arranging columns according to feed
    req_col = ['Geography_id', 'Product_id', 'Segment', 'Specialty', 'ReportType', 'Period', 'Decile',
               'Call_Distribution', 'Num_Of_Targets', 'Num_Of_Calls', 'Calls_Per_Day', 'Call_Freq_Goal_Prc',
               'Prc_Reach', 'Reach_Prc_BnchMrk_Ind1', 'Reach_Prc_BnchMrk_Ind2', 'Reach_Prc_BnchMrk_Ind3',
               'Frequency', 'Rx_Per_Call', 'Days_In_Field', 'No_Call', 'Below', 'Optimal', 'Above',
               'Prc_Of_Optimal_And_Above', 'Sample_Distribution', 'Prc_Of_Sampled_Physicians',
               'Total_Samples', 'Avg_Samples_Per_HCP', 'Rx_Per_Sample', 'Prc_Of_Surveyed_HCPs',
               'Called_1_Time', 'Called_2_Times', 'Called_3_Times', 'Called_4_Times', 'Called_5_Times',
               'Called_6_Times', 'Total_Num_Of_Called_2_Times', 'Prc_Of_Called_2_Times',
               'Num_Of_Sampled_Physicians', 'Pharmacy_Calls_Per_Day', 'Num_Of_Calls2',
               *calls_n,
               'ABBV_Visits', 'Num_Of_Called_Months_12M',
               'Thirteen_Week_Tgt_Reach', 'QTD_Tgts_Not_Reached', 'Num_Of_QTD_Tgts_3Plus_Calls']
    final_feed = final_feed.select(req_col)

    return final_feed


#### Period: all periods carry the same data (QTD)
---

In [8]:
# Short aliases for the grouping columns, to keep the formulas compact:
p = 'product'
sg = 'segment'
spc = 'specialty_group'
d = 'decile'

# Geography level columns, from the lowest level up to the nation.
levels = [
    'geography_id',
    'region_geography_id',
    'area_geography_id',
    'nation_geography_id',
]

# Output prefix for the weekly feed files.
# NOTE(review): the bucket name is hard-coded here while the inputs use the
# `bucket` value read from the JSON config - confirm this is intended.
OUT = 's3://vortex-staging-a65ced90/BIT/output/GeoSummary/Weekly/'

In [9]:
# Calling all functions and exporting feeds
#
# None of the processing steps reads the period, so the metric frames are built
# once and reused for every period file. Previously the whole pipeline was
# recomputed on each of the five loop iterations with identical results.
_pipeline_steps = (
    process_1, process_2, process_3, process_4, process_5, process_6, process_7,
    process_12,
    add_rollups_calls,  # must run after the base metrics and before the ratio metrics
    process_8, process_9, process_10, process_11, process_13, process_14,
    process_15, process_16,
    process_reach_benchmark,
)
base_frames = [pl.DataFrame() for _ in range(4)]  # one slot per geography level
for _step in _pipeline_steps:
    base_frames = _step(base_frames)

for period_num,PN in zip([1,4,13,26,'qtd'],[1,2,3,4,5]):
    # get_feed reads the global `period_num` for the Period label; it gets its
    # own copy of the list so the shared frames are never replaced.
    temp1 = list(base_frames)
    feed_dataset = get_feed(temp1)
    # index=False: without it pandas writes its row index as an unnamed leading
    # column, which is not part of the feed layout selected in get_feed.
    feed_dataset.to_pandas().to_csv(
        f'{OUT}Weekly_GeoSummary_SalesActivity_P{PN}_Feed.txt', sep='|', index=False
    )