# Prescriber View - Profile Info

In [1]:
import polars as pl
import pandas as pd
import gc
import json
import numpy as np
from datetime import datetime, timedelta,date
from dateutil.relativedelta import relativedelta

In [2]:
# Run parameters are read from vars_wk.json.
with open('vars_wk.json', 'r') as json_file:
    js = json.load(json_file)

# Values copied from the config as-is.
data_date, num_weeks_rx, num_weeks_calls = js['data_date'], js['num_weeks_rx'], js['num_weeks_calls']
IBSC_ptype_file = js['IBSC_ptype_file']
# quarter_start is stored as a 'YYYY-MM-DD' string and kept as a datetime.date.
quarter_start = datetime.strptime(js['quarter_start'], '%Y-%m-%d').date()
qtr_data, bucket, YTD = js['qtr_data'], js['bucket'], js['YTD']
monthly_data_date = js['monthly_data_date']

# S3 prefixes built from the bucket and the data-cut identifiers above.
dflib = f's3://{bucket}/BIT/dataframes/'
xpn = f's3://{bucket}/PYADM/weekly/archive/{data_date}/xponent/'
# NOTE(review): 'quaterly' is reproduced exactly as found; confirm it matches
# the real S3 prefix before correcting the spelling.
lincall = f's3://{bucket}/PYADM/quaterly/{qtr_data}/target/post/'
mxpn = f's3://{bucket}/PYADM/monthly/archive/{monthly_data_date}/xponent/'

In [3]:
# Utility Functions -
def load(df, lib=dflib):
    """Read ``<lib><df>.parquet`` and bind the frame to a global named ``df``.

    df  : file stem of the parquet file; also the name of the global created.
    lib : path prefix to read from (defaults to ``dflib`` as it was when this
          function was defined).

    Nothing is returned; the frame is only reachable through the new global.
    """
    frame = pl.read_parquet(f'{lib}{df}.parquet')
    globals()[df] = frame

In [4]:
# Importing Dependencies
prod_mapping = pl.read_csv(f's3://{bucket}/BIT/docs/productmapping_pybit.txt',separator='|')
# geo_code_mapper and geo_id_full both come from geo_id_full.xlsx: read the
# workbook once and hand out a clone instead of downloading and parsing it twice.
geo_code_mapper = pl.from_pandas(pd.read_excel(f's3://{bucket}/BIT/docs/geo_id_full.xlsx'))
geo_id_full = geo_code_mapper.clone()
ibsc_ptype = pl.from_pandas(pd.read_excel(f's3://{bucket}/BIT/docs/{IBSC_ptype_file}.xlsx'))

# Each load() call creates a global with the same name as the parquet file.
load('mp_spec_seg_dec')
load('MASTER_UNI')
load('temp_calls')
load('lirwd_call_plan')
load('roster')
fetch_products = ['LI1','LI2','LI3','TRU','AMT','LAC','MOT','LUB','IRL']

# Fixes for vortex import -> probably caused by Polars upgrades.
# SalesRepIID is cast to Int64; it is later used as the join key against roster.
temp_calls = temp_calls.with_columns(pl.col('SalesRepIID').cast(pl.Int64))

# Calls enriched with attendee attributes; calls whose attendee has no
# geography_id after the mp_spec_seg_dec join are dropped.
temp_calls_mp_spec = (
    temp_calls
    .join(mp_spec_seg_dec, left_on='AttendeeIID', right_on='IID', how='left')
    .filter(pl.col('geography_id').is_not_null())
    .join(geo_id_full, on='geography_id', how='left')
    .join(lirwd_call_plan, left_on='AttendeeIID', right_on='IID', how='left')
)

Generator Functions
---

In [5]:
# For Voucher Removal -
def get_lin_voucher():
    """Build per-IID voucher TUF window totals from LIN_VOUCHER.parquet.

    For each of the LIN1, LIN2 and LIN3 column families, the
    ``<family>TUF<i>`` columns are collapsed into ``vTUF_*`` window columns
    (1 / 3 / 6 / 12 / pqtr / ytd in a ``c`` and a ``p`` variant, plus
    ``vTUF_all``) and the rows are tagged with ``PROD_CD`` 'LI1' / 'LI2' / 'LI3'.

    Returns the three per-family frames stacked vertically, with columns
    ``IID``, the ``vTUF_*`` columns, then ``PROD_CD``; nulls are replaced by 0.
    """
    voucher_raw = pl.read_parquet(f'{mxpn}LIN_VOUCHER.parquet')

    def month_sum(family, first, stop, tag):
        # Row-wise sum of <family>TUF<first> .. <family>TUF<stop - 1>.
        return pl.sum_horizontal(
            [f'{family}TUF{i}' for i in range(first, stop)]
        ).alias(f'vTUF_{tag}')

    per_family = []
    for family in ['LIN1','LIN2','LIN3']: # LINV
        window_exprs = [
            pl.col('IID'),
            pl.col(f'{family}TUF1').alias('vTUF_1c'),
            month_sum(family, 1, 4, '3c'),
            month_sum(family, 1, 7, '6c'),
            month_sum(family, 1, 13, '12c'),
            month_sum(family, 4, 7, 'pqtrc'),
            month_sum(family, 1, YTD+1, 'ytdc'),
            pl.col(f'{family}TUF2').alias('vTUF_1p'),
            month_sum(family, 4, 7, '3p'),
            month_sum(family, 7, 13, '6p'),
            month_sum(family, 13, 25, '12p'),
            month_sum(family, 7, 10, 'pqtrp'),
            month_sum(family, 13, 13+YTD, 'ytdp'),
            month_sum(family, 1, 25, 'all'), # added 105 week datacut
        ]
        per_family.append(
            voucher_raw
            .select(window_exprs)
            .with_columns(pl.lit(f'LI{family[-1]}').alias('PROD_CD'))
        )

    # voucher_mapping = {'LI1': 4, 'LI2': 5, 'LI3': 3, 'LIV': 2}
    return pl.concat(per_family).fill_null(0)

In [6]:
def get_summed_period_iid_metric(metric,prod_cd):
    """Collapse the 24 ``<metric><i>`` columns of LAX.parquet into window totals.

    Parameters
    ----------
    metric : str
        Column prefix in LAX.parquet (this notebook passes 'TUF' and 'NUF').
    prod_cd : list
        PROD_CD values to keep.

    Returns
    -------
    A frame with ``IID``, ``PROD_CD`` and ``<metric>_<suffix>`` columns for the
    suffixes 1c, 3c, 6c, 12c, pqtrc, ytdc, 1p, 3p, 6p, 12p, pqtrp, ytdp and all.
    Only IIDs that have a non-null ``geography_id`` in ``mp_spec_seg_dec`` are
    kept. For metric 'TUF', the voucher totals from ``get_lin_voucher()`` are
    subtracted from every window.
    """
    columns = ['IID','PROD_CD'] + [metric+str(i) for i in range(1,25)]
    df = pl.read_parquet(mxpn+'LAX.parquet',columns=columns).filter(pl.col('PROD_CD').is_in(prod_cd))

    def window(first, stop, suffix):
        # Row-wise sum of <metric><first> .. <metric><stop - 1>.
        return pl.sum_horizontal([metric+str(i) for i in range(first, stop)]).alias(f'{metric}_{suffix}')

    # 1,3,6,12,pqtd,ytd for current and prior period for a given Metric
    df = df.select(
        pl.col('IID'),pl.col('PROD_CD'),
        pl.col(metric+'1').alias(metric+'_1c'),
        window(1, 4, '3c'),
        window(1, 7, '6c'),
        window(1, 13, '12c'),
        window(4, 7, 'pqtrc'),
        window(1, YTD+1, 'ytdc'),

        pl.col(metric+'2').alias(metric+'_1p'),
        window(4, 7, '3p'),
        window(7, 13, '6p'),
        window(13, 25, '12p'),
        window(7, 10, 'pqtrp'),
        window(13, 13+YTD, 'ytdp'),

        window(1, 25, 'all'),
    )

    # For Voucher Removal -
    if metric == 'TUF':
        dfv = get_lin_voucher()
        # IID / PROD_CD pairs without a voucher row get 0 after the fill.
        df = df.join(dfv,on=['IID','PROD_CD'],how='left').fill_null(0)
        # Pick the voucher columns by name rather than by position in dfv.
        voucher_cols = [c for c in dfv.columns if c not in ('IID','PROD_CD')]
        suffixes = ['1c','3c','6c','12c','pqtrc','ytdc','1p','3p','6p','12p','pqtrp','ytdp','all']
        # The whole subtraction is parenthesised so the alias names the result,
        # instead of being attached to the right-hand operand only.
        df = df.with_columns(
            [
                (pl.col(f'{metric}_{sfx}') - pl.col(f'v{metric}_{sfx}')).alias(f'{metric}_{sfx}')
                for sfx in suffixes
            ]
        ).drop(voucher_cols)

    # Adding MP related columns: used only to drop IIDs without a geography_id;
    # the joined attribute columns are removed again before returning.
    df = df.join(mp_spec_seg_dec,on='IID',how='left').filter(pl.col('geography_id').is_not_null())

    return(df.drop(['specialty_group','segment','decile','geography_id']))

In [7]:
def add_parent_product_rows_iid(df):
    """Append parent-product and all-product rollup rows to a per-IID frame.

    ``df`` must have ``IID`` and ``PROD_CD`` as its first two columns; every
    column after them is summed in the rollups. ``PROD_CD`` is mapped to
    ``product_id`` / ``parent_product_id`` through ``prod_mapping``.

    Returns a frame without ``PROD_CD`` holding:
      * the original rows, keyed by their mapped ``product_id``;
      * one summed row per (IID, parent) for parent_product_id 2 and 35,
        stored under ``product_id`` = that parent id;
      * one summed row per IID over all of its rows, with ``product_id`` = 1.
    """
    # One sum expression per value column (everything after IID, PROD_CD).
    value_sums = [pl.col(name).sum() for name in df.columns[2:]]

    mapping = prod_mapping.select(['code','product_id','parent_product_id'])
    mapped = df.join(mapping, left_on = 'PROD_CD', right_on = 'code', how = 'left')

    parent_rows = (
        mapped
        .filter(pl.col('parent_product_id').is_in([2,35]))
        .group_by(['IID','parent_product_id'])
        .agg(value_sums)
        .rename({'parent_product_id':'product_id'})
    )

    total_rows = (
        mapped
        .group_by('IID')
        .agg(value_sums)
        .with_columns(pl.lit(1).cast(pl.Int64).alias('product_id'))
    )

    # Drop the mapping helpers so all three pieces share one column layout.
    base_rows = mapped.drop(['PROD_CD','parent_product_id'])
    layout = base_rows.columns

    return base_rows.vstack(parent_rows.select(layout)).vstack(total_rows.select(layout))

In [8]:
# Window-summed TUF / NUF per IID and product, with rollup rows appended.
# The rollup helper defined in this notebook is add_parent_product_rows_iid;
# calling it under any other name raises NameError on a fresh kernel.
all_products_tuf = add_parent_product_rows_iid(get_summed_period_iid_metric('TUF',fetch_products))
all_products_nuf = add_parent_product_rows_iid(get_summed_period_iid_metric('NUF',fetch_products))

# (IID, product_id) pairs whose 12c window is non-zero for TUF or for NUF.
tuf1 = all_products_tuf.filter(pl.col('TUF_12c')!=0).select(['IID','product_id'])
nuf1 = all_products_nuf.filter(pl.col('NUF_12c')!=0).select(['IID','product_id'])
xponent = tuf1.join(nuf1,on=['IID','product_id'],how='outer_coalesce')

In [9]:
# Distinct attendee IIDs with a call on or after quarter_start (and within
# num_weeks_calls call weeks) where Territory equals GEO, tagged product_id 2.
# NOTE(review): GEO presumably comes from roster - confirm.
calls = (
    temp_calls_mp_spec
    .filter(
        (pl.col('call_week') <= num_weeks_calls)
        & (pl.col('CallDate') >= quarter_start)
    )
    .join(MASTER_UNI.select(['IID','Territory']), left_on='AttendeeIID', right_on='IID')
    .join(roster, on='SalesRepIID', how='left')
    .filter(pl.col('Territory') == pl.col('GEO'))
    .rename({'AttendeeIID':'IID'})
    .select('IID')
    .unique()
    .with_columns(pl.lit(2).cast(pl.Int64).alias('product_id'))
)

# Union of the Rx-based pairs and the call-based pairs.
xponent_calls = xponent.join(calls, on=['IID','product_id'], how='outer_coalesce')

#delete extra dfs when optimizing

In [10]:
# Manual Addition for Call Plan HCPs
# IIDs from the call plan, followed by every IID whose specialty_group is 'PED'.
peds = mp_spec_seg_dec.filter(pl.col('specialty_group')=='PED').select('IID')
callplan_peds = lirwd_call_plan.select('IID').vstack(peds)

# Each of those IIDs is listed under both product_id 1 and product_id 2.
callplan_peds = (
    callplan_peds.with_columns(product_id = pl.lit(1))
    .vstack(callplan_peds.with_columns(product_id = pl.lit(2)))
    .with_columns(pl.col('product_id').cast(pl.Int64))
)

# Merging back with xponent_calls, keeping one row per (IID, product_id).
xponent_calls = (
    xponent_calls
    .vstack(callplan_peds)
    .unique(['IID','product_id'])
)
# Note : Xponent_calls is the main filtering dataset. Any IIDs not present in it , will get dropped out.

Processing Data
---

In [11]:
# Utility columns taken from the main MP frame (MASTER_UNI).
# NOTE(review): the next cell rebuilds temp1 from MASTER_UNI with this same
# column list, so the value assigned here is overwritten before it is used.
temp1 = MASTER_UNI.select(
    'IID','FirstName','LastName','CREDENTIAL','PDRPOptOutFlag','DECILE',
    'AddressLine1','AddressLine2','AddressLine3','AddressLine4','CityName','StateCode','ZIP',
)

In [12]:
# Cleaning and formatting prescriber info, then attaching segment attributes.
temp1 = (
    MASTER_UNI
    .select(
        'IID','FirstName','LastName','CREDENTIAL','PDRPOptOutFlag','DECILE',
        'AddressLine1','AddressLine2','AddressLine3','AddressLine4','CityName','StateCode','ZIP',
    )
    .with_columns(
        # Address lines joined by a space, skipping null parts.
        Address = pl.concat_str(
            [pl.col(f'AddressLine{i}') for i in range(1, 5)],
            separator=' ', ignore_nulls=True,
        ),
        # "LastName, FirstName", skipping null parts.
        Physician_Name = pl.concat_str(
            [pl.col('LastName'), pl.col('FirstName')],
            separator=', ', ignore_nulls=True,
        ),
        # 'Y' becomes 1, anything else (including null) becomes 0.
        PDRPOptOutFlag = pl.when(pl.col('PDRPOptOutFlag')=='Y').then(1).otherwise(0),
    )
    .join(mp_spec_seg_dec, on='IID', how='left')
    .join(ibsc_ptype, on='IID', how='left')
    # Raw name/address parts and the lowercase `decile` column are no longer needed.
    .drop([f'AddressLine{i}' for i in range(1, 5)] + ['FirstName','LastName','decile'])
)

Adding Product Id
---

In [13]:
# Calls and Rx: attach product ids from xponent_calls; an IID with no match
# keeps a single row with a null product_id at this point.
temp1 = (
    temp1
    .join(xponent_calls, on='IID', how='left')
    .unique(['IID','product_id'])
    .with_columns(
        # Rows still without a product id: Target segment gets 2; otherwise
        # PED specialty gets 1. Existing product ids are left untouched.
        pl.when(pl.col('product_id').is_null() & (pl.col('segment')=='Target'))
        .then(pl.lit(2))
        .when(pl.col('product_id').is_null() & (pl.col('specialty_group')=='PED'))
        .then(pl.lit(1))
        .otherwise(pl.col('product_id'))
        .alias('product_id')
    )
    # Dropping nulls: rows that still have no product id are removed.
    .filter(pl.col('product_id').is_not_null())
)

In [14]:
# For Converting to Feed ready data
def get_feed(temp1):
    """Shape the prescriber profile frame into the feed layout.

    ``temp1`` must contain the columns renamed below (IID, product_id, segment,
    specialty_group, CREDENTIAL, PDRPOptOutFlag, DECILE, CityName, StateCode,
    ZIP, geography_id, IBSC_VALUE) plus ``Physician_Name`` and ``Address``.

    Steps:
      * add the constant columns the feed requires ('\\N' placeholders and
        ReportType = 'MONTHLY');
      * rename the columns to the feed names;
      * recode Segment (ALG-ONLY-TARGET -> AGNT, Target -> T, Non-Target -> NT;
        any other value, including null, becomes null);
      * select the feed columns in feed order;
      * fill null IBSCPrimaryPayerType with 'N/A';
      * drop rows whose Physician_Name starts with 'ZIP'.

    Returns the feed-ready frame.
    """
    # Feed columns with no source here; they carry the '\N' placeholder.
    placeholder_cols = ['Urgent_Care_HCP','IMS_HCE_ID','NPI_ID','IC_Geography','AOSegment']
    final_feed = temp1.with_columns(
        [pl.lit('\\N').alias(name) for name in placeholder_cols]
        + [pl.lit('MONTHLY').alias('ReportType')]
    )

    # Renaming columns
    new_col_mapping = {
        'IID':'Physician_ID',
        'product_id':'Product_id',
        'segment': 'Segment',
        'specialty_group': 'Specialty',
        'CREDENTIAL':'Credential',
        'PDRPOptOutFlag':'IsPDRP',
        'DECILE':'Decile',
        'CityName':'City',
        'StateCode':'State',
        'ZIP':'Zip',
        'geography_id':'Geography_id',
        'IBSC_VALUE':'IBSCPrimaryPayerType',
    }
    final_feed = final_feed.rename(new_col_mapping)

    # changing value of column to match with sas - 06/21
    # There is no otherwise(): values outside these three become null.
    final_feed = final_feed.with_columns(
        pl.when(pl.col('Segment')=='ALG-ONLY-TARGET')
        .then(pl.lit('AGNT'))
        .when(pl.col('Segment')=='Target')
        .then(pl.lit('T'))
        .when(pl.col('Segment')=='Non-Target')
        .then(pl.lit('NT'))
        .alias('Segment'))

    # rearranging columns according to feed.
    req_cols = ['Physician_Name', 'Physician_ID', 'Geography_id','Product_id', 'ReportType', 'Specialty', 'Segment',
                'Urgent_Care_HCP', 'Decile', 'Address', 'City', 'State', 'Zip', 'IsPDRP', 'IMS_HCE_ID', 'NPI_ID',
                'IBSCPrimaryPayerType', 'IC_Geography', 'AOSegment', 'Credential']
    final_feed = final_feed.select(req_cols) # Final Dataset

    final_feed = final_feed.with_columns(pl.col('IBSCPrimaryPayerType').fill_null('N/A'))

    # Rows whose name starts with 'ZIP' are excluded from the feed.
    final_feed = final_feed.filter(~(pl.col('Physician_Name').str.starts_with('ZIP')))

    return final_feed

In [15]:
#Exporting Feeds-
# NOTE(review): the bucket name is hard-coded here, while the input paths in
# this notebook are built from the `bucket` config value - confirm this is intentional.
OUT = 's3://vortex-staging-a65ced90/BIT/output/Prescriber/Monthly/'
feed_dataset = get_feed(temp1)
#-----------------------------------#
# Switch to pandas for the placeholder handling and the CSV write.
feed_dataset = feed_dataset.to_pandas()
# Select columns of type 'object' (string)
string_columns = feed_dataset.select_dtypes(include=['object']).columns.tolist()
# Missing values in those columns become the literal \N marker.
feed_dataset[string_columns] = feed_dataset[string_columns].fillna('\\N')
# The literal text 'NaN' and any remaining NaN / +inf / -inf values get the same marker.
feed_dataset = feed_dataset.replace('NaN', '\\N')
feed_dataset = feed_dataset.replace([np.nan, np.inf, -np.inf], '\\N')
# Pipe-delimited, CRLF line endings, no index column.
feed_dataset.to_csv(f'{OUT}Monthly_Prescriber_ProfileInfo_Feed.txt', sep='|', lineterminator='\r\n',index=False)
print('Presc Profile Info Exported !')

Presc Profile Info Exported !
