#### Prescriber View - Sales Activity pt1

In [1]:
import polars as pl
import pandas as pd
import gc
import numpy as np
from datetime import datetime, timedelta,date
import json

In [2]:
# Read the run parameters for this weekly refresh from the JSON config file
with open('vars_wk.json', 'r') as json_file:
    js = json.load(json_file)

# Plain settings are used exactly as stored in the config
bucket = js['bucket']
data_date = js['data_date']
num_weeks_rx = js['num_weeks_rx']
num_weeks_calls = js['num_weeks_calls']
num_of_months = js['num_of_months']

# Dates are stored as 'YYYY-MM-DD' strings; convert both to datetime.date
curr_date, quarter_start = (
    datetime.strptime(js[date_key], '%Y-%m-%d').date()
    for date_key in ('curr_date', 'quarter_start')
)

# S3 locations derived from the bucket and the data date
dflib = f's3://{bucket}/BIT/dataframes/'
xpn = f's3://{bucket}/PYADM/weekly/archive/{data_date}/xponent/'
call = f's3://{bucket}/PYADM/weekly/archive/{data_date}/calls_samples/'

In [3]:
# Utility Functions -
def load(df, lib=dflib):
    """Read ``{lib}{df}.parquet`` with Polars and bind it to a global named ``df``.

    Parameters
    ----------
    df : str
        Base name of the parquet file (no extension); also the name of the
        module-level variable that will hold the loaded DataFrame.
    lib : str
        Path prefix of the parquet file. The default is the value ``dflib`` had
        when this function was defined.

    Returns
    -------
    None
        The frame is published through ``globals()`` rather than returned.
    """
    frame = pl.read_parquet(f'{lib}{df}.parquet')
    globals()[df] = frame

In [4]:
# Importing dependencies
prod_mapping = pl.read_csv(f's3://{bucket}/BIT/docs/productmapping_pybit.txt',separator='|')
geo_code_mapper = pl.from_pandas(pd.read_excel(f's3://{bucket}/BIT/docs/geo_id_full.xlsx'))
fetch_products = ['LI1','LI2','LI3','TRU','AMT','LAC','MOT','LUB','IRL']

# load() binds each parquet frame to a global variable with the same name
for _frame_name in (
    'temp_calls',
    'mp_spec_seg_dec',
    'lirwd_call_plan',
    'temp_samples',
    'temp_abbv',
    'roster',
    'MASTER_UNI',
):
    load(_frame_name)

# Fixes for the vortex import -> probably caused by Polars upgrades:
# make sure SalesRepIID is Int64 in every call/sample frame.
_rep_id_as_int64 = pl.col('SalesRepIID').cast(pl.Int64)
temp_calls = temp_calls.with_columns(_rep_id_as_int64)
temp_samples = temp_samples.with_columns(_rep_id_as_int64)
temp_abbv = temp_abbv.with_columns(_rep_id_as_int64)

### Generator Functions- 

In [5]:
# For Voucher Removal - 
def get_lin_voucher():
    """Summarise voucher TUF volumes per IID for products LIN1, LIN2 and LIN3.

    Reads ``LIN_VOUCHER.parquet`` from the xponent folder and, for each of the
    three products, sums its weekly ``<prod>TUF<i>`` columns into current ('c'),
    prior ('p') and overall ('all') period totals. The per-product frames are
    stacked, tagged with ``PROD_CD`` ('LI1', 'LI2', 'LI3'), and nulls are
    replaced with 0.

    Returns
    -------
    pl.DataFrame
        Columns: IID, vTUF_1c, vTUF_4c, vTUF_13c, vTUF_26c, vTUF_qtdc, vTUF_1p,
        vTUF_4p, vTUF_13p, vTUF_26p, vTUF_qtdp, vTUF_all, PROD_CD.
    """
    voucher_raw = pl.read_parquet(f'{xpn}LIN_VOUCHER.parquet') # n_rows=500

    def summarise(prod):
        # Weekly column names for week numbers in [first, stop)
        def weeks(first, stop):
            return [f'{prod}TUF{i}' for i in range(first, stop)]

        return (
            voucher_raw.select(
                pl.col('IID'),
                # current periods
                pl.col(f'{prod}TUF1').alias('vTUF_1c'),
                pl.sum_horizontal(weeks(1,5)).alias('vTUF_4c'),
                pl.sum_horizontal(weeks(1,14)).alias('vTUF_13c'),
                pl.sum_horizontal(weeks(1,27)).alias('vTUF_26c'),
                pl.sum_horizontal(weeks(1,num_weeks_rx+1)).alias('vTUF_qtdc'),
                # prior periods
                pl.col(f'{prod}TUF2').alias('vTUF_1p'),
                pl.sum_horizontal(weeks(5,9)).alias('vTUF_4p'),
                pl.sum_horizontal(weeks(14,27)).alias('vTUF_13p'),
                pl.sum_horizontal(weeks(27,53)).alias('vTUF_26p'),
                pl.sum_horizontal(weeks(14,14+num_weeks_rx)).alias('vTUF_qtdp'),
                # every week available (1..105)
                pl.sum_horizontal(weeks(1,106)).alias('vTUF_all'),
            )
            .with_columns(pl.lit(f'LI{prod[-1]}').alias('PROD_CD'))
        )

    # Stack the per-product summaries in LIN1, LIN2, LIN3 order (LINV is not included)
    stacked = pl.concat([summarise(prod) for prod in ('LIN1','LIN2','LIN3')])

    return stacked.fill_null(0)

In [6]:
def get_summed_period_iid_metric(metric,prod_cd):
    """Build current/prior period totals of a weekly metric per (IID, PROD_CD).

    Reads ``IID``, ``PROD_CD`` and the weekly columns ``<metric>1`` ..
    ``<metric>105`` from ``LAX.parquet``, keeps the requested product codes and
    sums the weekly columns into 1/4/13/26-week and QTD windows for the current
    ('c') and prior ('p') period, plus an ``_all`` total.

    When ``metric == 'TUF'`` the voucher totals returned by
    ``get_lin_voucher()`` are subtracted from the matching columns.

    Only IIDs that appear in ``mp_spec_seg_dec`` with a non-null
    ``geography_id`` are kept.

    Parameters
    ----------
    metric : str
        Prefix of the weekly columns, e.g. 'TUF' or 'NUF'.
    prod_cd : list
        Product codes to keep (matched against ``PROD_CD``).

    Returns
    -------
    pl.DataFrame
        Columns: IID, PROD_CD, <metric>_1c, _4c, _13c, _26c, _qtdc, _1p, _4p,
        _13p, _26p, _qtdp, _all.
    """
    # Weekly column names for week numbers in [first, stop)
    def weeks(first, stop):
        return [metric+str(i) for i in range(first, stop)]

    columns = ['IID','PROD_CD'] + weeks(1,106)
    df = pl.read_parquet(xpn+'LAX.parquet',columns=columns).filter(pl.col('PROD_CD').is_in(prod_cd))

    # 1,4,13,26 for current and prior period for a given Metric
    df = df.select(
        pl.col('IID'),pl.col('PROD_CD'),
        pl.col(metric+'1').alias(metric+'_1c'),
        pl.sum_horizontal(weeks(1,5)).alias(metric+'_4c'),
        pl.sum_horizontal(weeks(1,14)).alias(metric+'_13c'),
        pl.sum_horizontal(weeks(1,27)).alias(metric+'_26c'),
        pl.sum_horizontal(weeks(1,num_weeks_rx+1)).alias(metric+'_qtdc'),

        pl.col(metric+'2').alias(metric+'_1p'),
        pl.sum_horizontal(weeks(5,9)).alias(metric+'_4p'),
        pl.sum_horizontal(weeks(14,27)).alias(metric+'_13p'),
        pl.sum_horizontal(weeks(27,53)).alias(metric+'_26p'),
        pl.sum_horizontal(weeks(14,14+num_weeks_rx)).alias(metric+'_qtdp'),

        pl.sum_horizontal(weeks(1,106)).alias(metric+'_all')
    )

    # For Voucher Removal -
    if metric == 'TUF':
        dfv = get_lin_voucher()
        # Voucher value columns = everything except the join keys. Selecting by
        # name avoids depending on the column order of get_lin_voucher().
        voucher_cols = [c for c in dfv.columns if c not in ('IID','PROD_CD')]
        period_suffixes = ['1c','4c','13c','26c','qtdc','1p','4p','13p','26p','qtdp','all']

        # Rows without a voucher match get 0 so the subtraction is a no-op for them
        df = df.join(dfv,on=['IID','PROD_CD'],how='left').fill_null(0)
        df = df.with_columns(
            [
                # Parenthesise before aliasing: `a - b.alias(x)` aliases only `b`
                # and yields the right name only via the left-most-name rule.
                (pl.col(f'{metric}_{sfx}') - pl.col(f'v{metric}_{sfx}')).alias(f'{metric}_{sfx}')
                for sfx in period_suffixes
            ]
        ).drop(voucher_cols)

    # Adding MP related columns (used only to filter on geography_id, then dropped)
    df = df.join(mp_spec_seg_dec,on='IID',how='left').filter(pl.col('geography_id').is_not_null())

    return(df.drop(['specialty_group','segment','decile','geography_id']))

# Period summaries per (IID, PROD_CD) for the products in fetch_products.
# Voucher volumes are subtracted inside the function for the 'TUF' metric only.
all_products_tuf = get_summed_period_iid_metric('TUF',fetch_products)
all_products_nuf = get_summed_period_iid_metric('NUF',fetch_products)

def add_parent_product_rows(df):
    """Append parent-product and all-product roll-up rows to a per-product frame.

    ``df`` is expected to have ``IID`` and ``PROD_CD`` as its first two columns,
    followed by the metric columns to be summed. ``PROD_CD`` is mapped to
    ``product_id`` / ``parent_product_id`` through ``prod_mapping``.

    Three sets of rows are stacked, in this order:
      1. the original rows, keyed by the mapped ``product_id``;
      2. per-IID sums for parent products 2 and 35, keyed by the parent id;
      3. per-IID sums over every row, keyed by ``product_id`` 1.

    Returns
    -------
    pl.DataFrame
        Columns: IID, the metric columns, product_id.
    """
    # Everything after IID and PROD_CD is summed in the roll-ups
    sums = {name: pl.col(name).sum() for name in df.columns[2:]}

    mapped = df.join(
        prod_mapping.select(['code','product_id','parent_product_id']),
        left_on='PROD_CD',
        right_on='code',
        how='left',
    )

    # Roll-up to the parent products 2 and 35
    parent_rows = (
        mapped.filter(pl.col('parent_product_id').is_in([2,35]))
        .group_by(['IID','parent_product_id'])
        .agg(**sums)
        .rename({'parent_product_id':'product_id'})
    )

    # Roll-up across every product of an IID, published as product_id 1
    total_rows = (
        mapped.group_by('IID')
        .agg(**sums)
        .with_columns(product_id = pl.lit(1).cast(pl.Int64))
    )

    # Drop the helper columns so all three frames share one layout, then stack
    base_rows = mapped.drop(['PROD_CD','parent_product_id'])
    layout = base_rows.columns

    return (
        base_rows
        .vstack(parent_rows.select(layout))
        .vstack(total_rows.select(layout))
    )

# Replace PROD_CD with product_id and append the parent (2, 35) and
# all-product (1) roll-up rows to both metric frames.
all_products_tuf = add_parent_product_rows(all_products_tuf)
all_products_nuf = add_parent_product_rows(all_products_nuf)

# tuf1 = all_products_tuf.filter(pl.col('TUF_all')!=0).select(['IID','product_id'])
# nuf1 = all_products_nuf.filter(pl.col('NUF_all')!=0).select(['IID','product_id'])
# xponent = tuf1.join(nuf1,on=['IID','product_id'],how='outer_coalesce')

# Distinct IIDs with a call inside the reporting window where the attendee's
# Territory equals GEO after the roster join; all tagged with product_id 2.
calls = (
    temp_calls
    .filter(
        (pl.col('call_week') <= num_weeks_calls)
        & (pl.col('CallDate') >= quarter_start)
    )
    .join(
        MASTER_UNI.select(['IID','Territory']),
        left_on='AttendeeIID',
        right_on='IID',
        how='inner',
    )
    .join(roster, on='SalesRepIID', how='left')
    .filter(pl.col('Territory') == pl.col('GEO'))
    .select(pl.col('AttendeeIID').alias('IID'))
    .unique()
    .with_columns(pl.lit(2).cast(pl.Int64).alias('product_id'))
)

# xponent_calls = xponent.join(calls,on=['IID','product_id'],how='outer_coalesce').filter(~pl.col('product_id').is_in([2,3,4,5]))

#delete extra dfs when optimizing

### Functions 

In [76]:
# KPI Indicator
def process_kpi_ind(df):
    """Add the ``kpi_ind`` column ('OPTIMAL' / 'BELOW' / '\\N') to ``df``.

    For each IID the number of distinct ``call_month`` values (restricted to
    ``call_month <= num_of_months``) is counted over calls that
      * belong to an attendee with a non-null ``geography_id`` in ``mp_spec_seg_dec``,
      * fall inside the window (``call_week <= num_weeks_calls`` and
        ``CallDate >= quarter_start``),
      * have ``Territory == GEO`` after joining ``MASTER_UNI`` and ``roster``.

    An IID is 'OPTIMAL' when that count is at least ``num_of_months``, otherwise
    'BELOW'. IIDs without a ``call_freq_quarter`` in ``lirwd_call_plan`` are
    overridden with '\\N'. When ``num_of_months == 0`` every row gets '\\N'.

    Side effect: ``load('wd_raw')`` binds the global ``wd_raw``.

    Parameters
    ----------
    df : pl.DataFrame
        Frame keyed by ``IID``.

    Returns
    -------
    pl.DataFrame
        ``df`` plus ``kpi_ind``. The helper column ``num_calls`` is never
        returned, so it cannot collide with the ``num_calls`` column added later
        by ``process_num_calls``.

    Raises
    ------
    ValueError
        If ``num_of_months`` is not 0, 1, 2 or 3. Previously this surfaced as an
        UnboundLocalError on ``result``.
    """
    if num_of_months not in (0, 1, 2, 3):
        raise ValueError(f'num_of_months must be 0, 1, 2 or 3, got {num_of_months!r}')

    geo_id_full = pl.from_pandas(pd.read_excel(f's3://{bucket}/BIT/docs/geo_id_full.xlsx'))
    load('wd_raw')
    temp_calls_mp_spec = (
        temp_calls
        .join(mp_spec_seg_dec,left_on = 'AttendeeIID',right_on = 'IID', how = 'left').filter(pl.col('geography_id').is_not_null())
        .join(geo_id_full,on = 'geography_id',how = 'left')
        .join(wd_raw[['SalesRepIID','days_in_field']],on = 'SalesRepIID', how = 'left')
        .join(lirwd_call_plan,left_on = 'AttendeeIID', right_on = 'IID', how = 'left')
    )

    source_df = (
        temp_calls_mp_spec.filter((pl.col('call_week')<=num_weeks_calls)).filter(pl.col('CallDate')>= quarter_start)
        .join(MASTER_UNI.select(['IID','Territory']),left_on = 'AttendeeIID', right_on = 'IID')
        .join(roster, on = 'SalesRepIID' , how = 'left')
        .filter(pl.col('Territory')==pl.col('GEO'))
    )

    # Number of distinct called months per IID within the first num_of_months
    # months. The column is named num_calls for historical reasons, but it
    # holds a month count, not a call count.
    num_calls_iid_month = (
        source_df
        .filter(pl.col('call_month')<=num_of_months)
        .group_by('AttendeeIID')
        .agg(pl.col('call_month').n_unique().alias('num_calls'))
        .rename({'AttendeeIID':'IID'})
    )

    df = df.join(num_calls_iid_month,on = 'IID',how = 'left')

    if num_of_months == 0:
        # No month to evaluate yet: indicator is not applicable for everyone.
        return (df.drop('num_calls').with_columns(kpi_ind = pl.lit('\\N')))

    # OPTIMAL only when the IID was called in every one of the num_of_months
    # months; a null count (no qualifying calls) falls through to BELOW.
    result = df.with_columns(
        pl.when(pl.col('num_calls')>=num_of_months).then(pl.lit('OPTIMAL')).otherwise(pl.lit('BELOW')).alias('kpi_ind')
    ).drop('num_calls') #dropping pulled columns

    # IIDs without a call_freq_quarter in the call plan get '\\N'
    result = result.join(lirwd_call_plan,on = 'IID',how = 'left'
    ).with_columns(
        pl.when(pl.col('call_freq_quarter').is_null()).then(pl.lit('\\N')).otherwise(pl.col('kpi_ind')).alias('kpi_ind')
    ).drop('call_freq_quarter')

    return(result)

In [77]:
#num of calls
def process_num_calls(df):
    """Left-join ``num_calls`` (distinct CallIDs per IID in the window) onto ``df``.

    A call counts when ``call_week <= num_weeks_calls``, ``CallDate >=
    quarter_start`` and the attendee's ``Territory`` (from ``MASTER_UNI``)
    equals ``GEO`` after the roster join. IIDs without such calls get a null.
    """
    in_window = (
        (pl.col('call_week') <= num_weeks_calls)
        & (pl.col('CallDate') >= quarter_start)
    )
    territory_lookup = MASTER_UNI.select(['IID','Territory'])

    calls_iid_qtd = (
        temp_calls
        .filter(in_window)
        .join(territory_lookup, left_on='AttendeeIID', right_on='IID')
        .join(roster, on='SalesRepIID', how='left')
        .filter(pl.col('Territory') == pl.col('GEO'))
        .group_by('AttendeeIID')
        .agg(pl.col('CallID').n_unique().alias('num_calls'))
        .rename({'AttendeeIID':'IID'})
    )

    return df.join(calls_iid_qtd, on='IID', how='left')

In [9]:
#num of samples
#- This step increases the row count because the left dataset is joined with the samples dataframe.
#- The samples dataframe is at IID and product_id level, not just IID, so it is not a 1-to-1 join.
#- The product_id in that dataframe is built manually from a dictionary variable, not from an external file.
def process_num_samples(df):
    """Fan ``df`` out to one row per (IID, product_id in 2..5) with ``num_samples``.

    Sample quantities are summed per attendee and strength for samples inside
    the window (``sample_week <= num_weeks_calls``, ``CallDate >=
    quarter_start``) whose ``Territory`` equals ``GEO`` after the roster join.
    ``CallProductDescription`` is mapped to product ids 3, 4 and 5; product 2 is
    the sum of those three.

    IIDs with at least one sample row get 0 for products they did not receive.
    IIDs with no sample rows at all still get four rows (products 2..5), with
    ``num_samples`` left null.
    """
    cpd_pid_mapping  = {'72 mcg' : '3', '145 mcg' : '4', '290 mcg' : '5'}
    sample_product_ids = [2, 3, 4, 5]

    in_window = (
        (pl.col('sample_week') <= num_weeks_calls)
        & (pl.col('CallDate') >= quarter_start)
    )

    # Samples per IID and strength (product ids 3, 4, 5)
    per_strength = (
        temp_samples
        .filter(in_window)
        .join(MASTER_UNI.select(['IID','Territory']), left_on='AttendeeIID', right_on='IID')
        .join(roster, on='SalesRepIID', how='left')
        .filter(pl.col('Territory') == pl.col('GEO'))
        .with_columns(
            pl.col('CallProductDescription').replace(cpd_pid_mapping).cast(pl.Int64).alias('product_id')
        )
        .group_by(['AttendeeIID','product_id'])
        .agg(pl.col('CallProductQuantity').sum().alias('num_samples'))
        .rename({'AttendeeIID':'IID'})
    )

    # Product 2 = sum over products 3, 4 and 5
    all_strengths = (
        per_strength
        .group_by('IID')
        .agg(pl.col('num_samples').sum().alias('num_samples'))
        .with_columns(pl.lit(2).cast(pl.Int64).alias('product_id'))
        .select(per_strength.columns)
    )
    sampled = per_strength.vstack(all_strengths)

    # Every sampled IID gets a row for each product, missing combinations become 0
    complete_samples = (
        sampled.select('IID').unique()
        .join(pl.DataFrame({'product_id': sample_product_ids}), how='cross')
        .join(sampled, on=['IID', 'product_id'], how='left')
        .with_columns(pl.col('num_samples').fill_null(0))
    )

    joined = df.join(complete_samples, on='IID', how='left')

    # IIDs with no samples have a null product_id: replicate them once per product
    matched = joined.filter(pl.col('product_id').is_not_null())
    unmatched = joined.filter(pl.col('product_id').is_null())
    replicated = [
        unmatched.with_columns(pl.col('product_id').fill_null(pid))
        for pid in sample_product_ids
    ]

    return pl.concat([matched] + replicated)

In [10]:
#Rx per sample
#- Rx dataset should be made at prod_cd = 3,4 or 5 level along with IID
#- Rx value should be QTD TUF.
def process_rx_per_sample(df):
    """Add ``rx_per_sample`` = QTD TUF / ``num_samples`` per (IID, product_id).

    QTD TUF comes from ``all_products_tuf`` for product ids 2..5 and is forced
    to 0 where ``PDRPOptOutFlag == 'Y'`` in ``MASTER_UNI``. The ratio is
    replaced with '\\N' when ``num_samples`` is 0 or null, when no TUF row
    matches, or when the IID's ``segment`` is a non-null value other than 'Target'.
    """
    opted_out = pl.col('PDRPOptOutFlag') == 'Y'
    qtd_rx = (
        all_products_tuf
        .select(['IID','TUF_qtdc','product_id'])
        .filter(pl.col('product_id').is_in([2,3,4,5]))
        .join(MASTER_UNI.select(['IID','PDRPOptOutFlag']), on='IID', how='left')
        .with_columns(
            pl.when(opted_out).then(pl.lit(0)).otherwise(pl.col('TUF_qtdc')).alias('TUF_qtdc')
        )
        .drop('PDRPOptOutFlag')
    )

    # Ratio only where there are samples and a TUF value to divide
    has_ratio = (pl.col('num_samples') != 0) & (pl.col('TUF_qtdc').is_not_null())
    with_ratio = (
        df.join(qtd_rx, on=['IID','product_id'], how='left')
        .with_columns(
            pl.when(has_ratio)
            .then(pl.col('TUF_qtdc') / pl.col('num_samples'))
            .otherwise(pl.lit('\\N'))
            .alias('rx_per_sample')
        )
        .drop('TUF_qtdc')
    )

    # Override rx_per_sample for every IID whose segment is not 'Target'
    segments = mp_spec_seg_dec.select(['IID','segment'])
    not_target = pl.col('segment') != 'Target'
    return (
        with_ratio
        .join(segments, on='IID', how='left')
        .with_columns(
            pl.when(not_target).then(pl.lit('\\N')).otherwise(pl.col('rx_per_sample')).alias('rx_per_sample')
        )
        .drop('segment')
    )

In [104]:
#Last Called Date
def process_last_called(df):
    """Left-join ``last_called_date`` (latest ``CallDate`` per IID) onto ``df``.

    All rows of ``temp_calls`` are used: no week, quarter or territory filter is
    applied here. The date is rendered as text and cut to its first 10 characters.
    """
    latest_call = pl.col('CallDate').max().alias('last_called_date')
    as_date_text = pl.col('last_called_date').cast(pl.Utf8).str.slice(0,10).alias('last_called_date')

    last_called_df = (
        temp_calls
        .group_by('AttendeeIID')
        .agg(latest_call)
        .rename({'AttendeeIID':'IID'})
        .with_columns(as_date_text)
    )

    return df.join(last_called_df, on='IID', how='left')

In [12]:
#Num of calls 12 months
def process_calls_12m(df):
    """Left-join ``num_calls_12m``: distinct CallIDs per IID with ``call_month <= 12``."""
    calls_12m_df = (
        temp_calls
        .filter(pl.col('call_month') <= 12)
        .group_by('AttendeeIID')
        .agg(pl.col('CallID').n_unique().alias('num_calls_12m'))
        .rename({'AttendeeIID':'IID'})
    )

    return df.join(calls_12m_df, on='IID', how='left')

In [13]:
#num of Called Months in 12 Months
def process_called_months_12m(df):
    """Left-join ``called_months_12m``: distinct ``call_month`` values (<= 12) per IID."""
    called_months_12m_df = (
        temp_calls
        .filter(pl.col('call_month') <= 12)
        .group_by('AttendeeIID')
        .agg(pl.col('call_month').n_unique().alias('called_months_12m'))
        .rename({'AttendeeIID':'IID'})
    )

    return df.join(called_months_12m_df, on='IID', how='left')

In [14]:
#Abbv Visits
def process_abbv_visits(df):
    """Left-join ``abbv_visits``: distinct CallIDs per IID in ``temp_abbv`` inside the window.

    The window is ``call_week <= num_weeks_calls`` and ``CallDate >=
    quarter_start``; no territory filter is applied.
    """
    in_window = (
        (pl.col('call_week') <= num_weeks_calls)
        & (pl.col('CallDate') >= quarter_start)
    )
    a_vist_df = (
        temp_abbv
        .filter(in_window)
        .group_by('AttendeeIID')
        .agg(pl.col('CallID').n_unique().alias('abbv_visits'))
        .rename({'AttendeeIID':'IID'})
    )

    return df.join(a_vist_df, on='IID', how='left')

In [15]:
#Target Reached Status
def process_tgt_reach_st(df):
    """Add ``tgt_rch_status`` from ``segment`` and the existing ``num_calls`` column.

    * segment == 'Target' and num_calls >= 1  -> 'Yes'
    * segment == 'Target' otherwise (including null num_calls) -> 'No'
    * any other or missing segment -> 'Non-Target'

    ``df`` must already contain ``num_calls``.
    """
    is_target = pl.col('segment') == 'Target'
    was_called = pl.col('num_calls') >= 1

    status = (
        pl.when(is_target & was_called).then(pl.lit('Yes'))
        .when(is_target).then(pl.lit('No'))
        .otherwise(pl.lit('Non-Target'))
        .alias('tgt_rch_status')
    )

    segments = mp_spec_seg_dec.select(['IID','segment'])
    return df.join(segments, on='IID', how='left').with_columns(status).drop('segment')

In [16]:
#13 Wk IW Calls
def process_num_calls_13wks(df):
    """Left-join ``num_calls_13wks``: distinct CallIDs per IID with ``call_week <= 13``."""
    calls_iid_13wk = (
        temp_calls
        .filter(pl.col('call_week') <= 13)
        .group_by('AttendeeIID')
        .agg(pl.col('CallID').n_unique().alias('num_calls_13wks'))
        .rename({'AttendeeIID':'IID'})
    )

    return df.join(calls_iid_13wk, on='IID', how='left')

In [17]:
# For converting to Feed Ready data -
def get_feed(temp1, period=None):
    """Convert the assembled prescriber frame into the feed layout.

    Renames the working columns to their feed names, adds the constant columns
    (``ReportType`` = 'WEEKLY', ``Period`` = '<period>-WEEK', and '\\N' for the
    columns not populated here) and returns the columns in feed order.

    Parameters
    ----------
    temp1 : pl.DataFrame
        Frame containing every key of ``column_mapping`` below.
    period : int or str, optional
        Period label used to build the ``Period`` column. When omitted, the
        module-level ``period_num`` is used, which is what this function always
        read before the parameter existed.

    Returns
    -------
    pl.DataFrame
        The feed-ready frame with exactly the columns in ``req_cols``.
    """
    if period is None:
        # Backward-compatible fallback to the loop variable of the calling cell
        period = period_num

    #Renaming columns according to feed
    column_mapping = {
        "IID": "Physician_ID",
        "geography_id": "Geography_id",
        "product_id": "Product_id",
        "kpi_ind": "KPI_Ind",
        "num_calls": "Num_Of_Calls",
        "num_samples": "Total_Samples",
        "rx_per_sample": "Rx_Per_Sample",
        "last_called_date": "Last_Called_Date",
        "num_calls_12m": "Num_Of_Calls_12Months",
        "called_months_12m": "Num_Of_Called_Months",
        "abbv_visits": "Num_Of_ABBV_Visits",
        "tgt_rch_status": "Target_Reached_Status",
        "num_calls_13wks": "Thirteen_Week_IW_Calls"
    }

    # Feed columns that this view does not populate; exported as '\\N'
    col_to_addna = ['Call_Attainment_Prc','Call_Goal','Surveyed_HCP'] + ['Calls' + str(i) for i in range(1,21)]

    # Add all constant columns in a single pass
    constant_cols = (
        [
            pl.lit('WEEKLY').alias('ReportType'),
            pl.lit(f'{period}-WEEK').alias('Period'),
        ]
        + [pl.lit('\\N').alias(my_col) for my_col in col_to_addna]
    )
    final_feed = temp1.rename(column_mapping).with_columns(constant_cols)

    # rearranging columns according to feed.
    req_cols = ["Physician_ID", "Geography_id", "Product_id", "ReportType", "Period", "KPI_Ind", "Call_Attainment_Prc", "Call_Goal", 
                "Num_Of_Calls", "Total_Samples", "Rx_Per_Sample", "Surveyed_HCP", "Last_Called_Date", "Num_Of_Calls_12Months", 
                "Num_Of_Called_Months", "Calls1", "Calls2", "Calls3", "Calls4", "Calls5", "Calls6", "Calls7", "Calls8", "Calls9", 
                "Calls10", "Calls11", "Calls12", "Calls13", "Calls14", "Calls15", "Calls16", "Calls17", "Calls18", "Calls19", 
                "Calls20", "Num_Of_ABBV_Visits", "Target_Reached_Status", "Thirteen_Week_IW_Calls"]
    return final_feed.select(req_cols) #final dataset

### Period Loop - 

In [105]:
# Calling Functions and Exporting -
# NOTE(review): OUT hard-codes a bucket name while the other paths in this
# notebook are built from the `bucket` config value — confirm this is intended.
OUT = 's3://vortex-staging-a65ced90/BIT/output/Prescriber/Weekly/'
# period_num selects the TUF_/NUF_ column suffix and the Period label; PN is the
# feed number used in the output file name.
for period_num,PN in zip([1,4,13,26,'qtd'],[1,2,3,4,5]):
    # Only the first iteration (period_num=1, PN=1) runs; the loop stops as soon as PN reaches 2.
    if PN>1:
        break
        
    # `period` is not referenced again within this cell.
    period = f'_{period_num}'
    # Start from one row per IID and add the activity columns step by step.
    # process_num_samples introduces product_id and fans each IID out to products 2..5.
    temp1 = mp_spec_seg_dec.select(['IID','geography_id'])
    temp1 = process_kpi_ind(temp1)
    temp1 = process_num_calls(temp1)
    temp1 = process_num_samples(temp1)
    temp1 = process_rx_per_sample(temp1)
    temp1 = process_last_called(temp1)
    temp1 = process_calls_12m(temp1)
    temp1 = process_called_months_12m(temp1)
    temp1 = process_abbv_visits(temp1)
    temp1 = process_tgt_reach_st(temp1)
    temp1 = process_num_calls_13wks(temp1)
    
    #prepping xponent -
    # (IID, product_id) pairs whose current + prior value for this period is non-zero
    tuf1 = (
        all_products_tuf
        .with_columns(psum = pl.col(f'TUF_{period_num}c')+pl.col(f'TUF_{period_num}p'))
        .filter(pl.col('psum')!=0).select(['IID','product_id'])
    )
    nuf1 = (
        all_products_nuf
        .with_columns(psum = pl.col(f'NUF_{period_num}c')+pl.col(f'NUF_{period_num}p'))
        .filter(pl.col('psum')!=0).select(['IID','product_id'])
    )
    # Union of the TUF and NUF pairs
    xponent = tuf1.join(nuf1,on=['IID','product_id'],how='outer_coalesce')

    # for duping rows on product_id-
    # only duping product_id = 2 rows.
    # Each product_id 2 row is copied once for every product_id that xponent holds for the same IID.
    temp1_dups = temp1.filter(pl.col('product_id')==2).drop('product_id')
    temp1_dups = (
        temp1_dups
        .join(xponent,on='IID',how= 'left')
        .select(temp1.columns)
        .filter(pl.col('product_id').is_not_null()) # if data not there in xpn then cant dup
        .with_columns(pl.col('product_id').cast(pl.Int64))
    )

    # Stack the copies and keep a single row per (IID, product_id).
    # NOTE(review): unique() is called without `keep`, so which duplicate survives is not pinned here.
    temp1 = temp1.vstack(temp1_dups).unique(['IID','product_id'])

    # for preserving rows where xponent information available but no calls - adding a new flag:
    # fl = 1 when the (IID, product_id) pair exists in xponent, null otherwise
    temp1 = temp1.join(xponent.with_columns(fl = 1),on=['IID','product_id'],how='left')

    # force keeping linzess = 
    # fl2 = 1 for product ids 2, 3, 4 and 5, null otherwise
    temp1 = temp1.with_columns(
        pl.when(pl.col('product_id').is_in([2,3,4,5])).then(pl.lit(1)).otherwise(pl.lit(None)).alias('fl2')
    )

    # Keep a row only if at least one activity metric is non-null AND it is flagged by fl or fl2.
    # NOTE(review): a row with xponent data but no activity metric at all is still dropped by this
    # condition — confirm against the intent stated in the comment above the `fl` join.
    null_check_cond = (
        (
            (pl.col('num_calls').is_not_null()) |
            (pl.col('num_samples').is_not_null()) | 
            (pl.col('num_calls_12m').is_not_null()) | 
            (pl.col('called_months_12m').is_not_null()) | 
            (pl.col('abbv_visits').is_not_null()) | 
            (pl.col('num_calls_13wks').is_not_null())
        )   & ((pl.col('fl').is_not_null()) | (pl.col('fl2').is_not_null()))
    )
    temp1 = temp1.filter(null_check_cond).drop(['fl','fl2'])

    # get_feed reads the loop variable period_num to build the Period column
    feed_dataset = get_feed(temp1)
    #===================================================
    # Final null handling is done in pandas: every missing marker becomes the literal \N
    feed_dataset = feed_dataset.to_pandas()
    # Select columns of type 'object' (string)
    string_columns = feed_dataset.select_dtypes(include=['object']).columns.tolist()
    feed_dataset[string_columns] = feed_dataset[string_columns].fillna('\\N')
    # Literal 'NaN' strings
    feed_dataset = feed_dataset.replace('NaN', '\\N')

    # Remaining NaN and +/- infinity values
    feed_dataset = feed_dataset.replace([np.nan, np.inf, -np.inf], '\\N')
    # Pipe-delimited text file with CRLF line endings and no index column
    feed_dataset.to_csv(f'{OUT}Weekly_Prescriber_SalesActivity_P{PN}_Feed.txt', sep='|',lineterminator='\r\n',index=False)
    print(f'Exported Feed {PN}')

Exported Feed 1


---