# DenormalizedPrescriber ProfileInfo

In [2]:
import polars as pl
import pandas as pd
import gc
import json

In [3]:
# Read the weekly run parameters from the JSON config file
with open('vars_wk.json', 'r') as json_file:
    js = json.load(json_file)

# Unpack the settings used by this notebook (a missing key raises KeyError)
data_date, num_weeks_rx, num_weeks_calls, IBSC_ptype_file, bucket = (
    js[key]
    for key in ('data_date', 'num_weeks_rx', 'num_weeks_calls', 'IBSC_ptype_file', 'bucket')
)

# S3 prefixes derived from the configured bucket / data date
dflib = f's3://{bucket}/BIT/dataframes/'
xpn = f's3://{bucket}/PYADM/weekly/archive/{data_date}/xponent/'

In [4]:
# Utility Functions -
def load(df, lib=dflib):
    """Read ``{lib}{df}.parquet`` with polars and bind it to a global named ``df``.

    Parameters
    ----------
    df : str
        Base name of the parquet file (without extension). The loaded frame is
        stored in this module's globals under the same name.
    lib : str
        Location prefix the file name is appended to. Defaults to ``dflib``.

    Returns
    -------
    None
        The frame is exposed through ``globals()`` instead of being returned.
    """
    parquet_path = f'{lib}{df}.parquet'
    frame = pl.read_parquet(parquet_path)
    globals()[df] = frame

In [5]:
# Importing dependencies
# Parquet frames: load() binds each one to a global variable of the same name.
load('mp_spec_seg_dec')
load('MASTER_UNI')

# Reference documents. The Excel sheets are read with pandas and converted to polars.
prod_mapping = pl.read_csv(
    f's3://{bucket}/BIT/docs/productmapping_pybit.txt',
    separator='|'
)
ibsc_ptype = pl.from_pandas(
    pd.read_excel(f's3://{bucket}/BIT/docs/{IBSC_ptype_file}.xlsx')
)
geo_code_mapper = pl.from_pandas(
    pd.read_excel(f's3://{bucket}/BIT/docs/geo_id_full.xlsx')
)

load('temp_calls')

# Product codes pulled from the xponent data
fetch_products = [
    'LI1','LI2','LI3',
    'TRU','AMT','LAC',
    'MOT','LUB','IRL'
]

# Fix for the vortex import (probably caused by Polars upgrades): force SalesRepIID to Int64.
# `temp_calls` exists at this point because load('temp_calls') created it via globals().
temp_calls = temp_calls.with_columns(
    pl.col('SalesRepIID').cast(pl.Int64)
)

### Generator Functions

In [6]:
def get_summed_period_iid_metric(metric,prod_cd):
    """Sum the weekly ``metric`` columns of LAX.parquet into period totals.

    Reads ``IID``, ``PROD_CD`` and the columns ``<metric>1`` .. ``<metric>105``
    from ``LAX.parquet`` under ``xpn``, keeps the rows whose ``PROD_CD`` is in
    ``prod_cd`` and produces, per row:

    * current-period totals ``_1c``, ``_4c``, ``_13c``, ``_26c``, ``_qtdc``
      (columns 1, 1-4, 1-13, 1-26 and 1-``num_weeks_rx``),
    * prior-period totals ``_1p``, ``_4p``, ``_13p``, ``_26p``, ``_qtdp``
      (columns 2, 5-8, 14-26, 27-52 and 14-(13+``num_weeks_rx``)),
    * ``_all``, the total over all 105 columns.

    Rows whose ``IID`` has no non-null ``geography_id`` in ``mp_spec_seg_dec``
    are dropped.

    Parameters
    ----------
    metric : str
        Prefix of the weekly columns (called with 'TUF' and 'NUF' in this notebook).
    prod_cd : list
        ``PROD_CD`` values to keep.

    Returns
    -------
    polars.DataFrame
        ``IID``, ``PROD_CD`` and the summed columns, each prefixed with ``metric``.
    """
    def week_cols(first, last):
        # Names of the weekly columns <metric><first> .. <metric><last>, both ends included
        return [f'{metric}{wk}' for wk in range(first, last + 1)]

    def window_sum(first, last, suffix):
        # Row-wise total over the inclusive week window, named <metric><suffix>
        return pl.sum_horizontal(week_cols(first, last)).alias(metric + suffix)

    weekly = pl.read_parquet(
        xpn+'LAX.parquet',
        columns=['IID','PROD_CD'] + week_cols(1, 105)
    ).filter(pl.col('PROD_CD').is_in(prod_cd))

    summed = weekly.select(
        pl.col('IID'),
        pl.col('PROD_CD'),

        # current period: 1, 4, 13, 26 weeks and quarter-to-date
        pl.col(metric+'1').alias(metric+'_1c'),
        window_sum(1, 4, '_4c'),
        window_sum(1, 13, '_13c'),
        window_sum(1, 26, '_26c'),
        window_sum(1, num_weeks_rx, '_qtdc'),

        # prior period: the matching windows that follow each current window
        pl.col(metric+'2').alias(metric+'_1p'),
        window_sum(5, 8, '_4p'),
        window_sum(14, 26, '_13p'),
        window_sum(27, 52, '_26p'),
        window_sum(14, 13 + num_weeks_rx, '_qtdp'),

        # everything available
        window_sum(1, 105, '_all')
    )

    # Attach the MP columns only to test for a geography, then discard them again
    with_mp = summed.join(mp_spec_seg_dec, on='IID', how='left')
    in_geography = with_mp.filter(pl.col('geography_id').is_not_null())

    return in_geography.drop(['specialty_group','segment','decile','geography_id'])

# Period-summed TUF and NUF frames for the fetched products (evaluated in that order)
all_products_tuf, all_products_nuf = (
    get_summed_period_iid_metric(metric_name, fetch_products)
    for metric_name in ('TUF', 'NUF')
)

def add_parent_product_rows(df):
    """Append parent-product and all-product roll-up rows to a metric frame.

    ``df`` is joined to ``prod_mapping`` on ``PROD_CD`` = ``code`` to obtain
    ``product_id`` and ``parent_product_id``. Two sets of summed rows are then
    stacked underneath the product-level rows:

    * per ``IID`` and parent product, for ``parent_product_id`` 2 and 35 only,
      with the parent id written to ``product_id``;
    * per ``IID`` across every product, with ``product_id`` set to 1.

    Parameters
    ----------
    df : polars.DataFrame
        Frame whose first two columns are keys (``IID``, ``PROD_CD`` for the
        frames built in this notebook); every later column is summed.

    Returns
    -------
    polars.DataFrame
        The input rows without ``PROD_CD``, keyed by ``IID`` / ``product_id``,
        followed by the roll-up rows.
    """
    # One sum aggregation per column after the two leading key columns
    metric_sums = {name: pl.col(name).sum() for name in df.columns[2:]}

    mapped = df.join(
        prod_mapping[['code','product_id','parent_product_id']],
        left_on='PROD_CD',
        right_on='code',
        how='left'
    )

    # Roll-up to the parent products 2 and 35
    parent_rollup = (
        mapped
        .filter(pl.col('parent_product_id').is_in([2,35]))
        .group_by(['IID','parent_product_id'])
        .agg(**metric_sums)
        .rename({'parent_product_id':'product_id'})
    )

    # Roll-up over every product; the literal is cast to Int64 before stacking
    overall_rollup = (
        mapped
        .group_by('IID')
        .agg(**metric_sums)
        .with_columns(pl.lit(1).cast(pl.Int64).alias('product_id'))
    )

    # Product-level rows, reduced to the layout shared by all three pieces
    base = mapped.drop(['PROD_CD','parent_product_id'])
    layout = base.columns

    return base.vstack(parent_rollup.select(layout)).vstack(overall_rollup.select(layout))

# Add the parent-product / all-product roll-up rows to both metric frames.
# These two lines overwrite their inputs, so re-running them appends the roll-ups again.
all_products_tuf = add_parent_product_rows(all_products_tuf)
all_products_nuf = add_parent_product_rows(all_products_nuf)

# (IID, product_id) pairs whose total over all weeks is non-zero, per metric
tuf1 = (
    all_products_tuf
    .filter(pl.col('TUF_all')!=0)
    .select(['IID','product_id'])
)
nuf1 = (
    all_products_nuf
    .filter(pl.col('NUF_all')!=0)
    .select(['IID','product_id'])
)

# Pairs that appear in either metric
xponent = tuf1.join(
    nuf1,
    on=['IID','product_id'],
    how='outer_coalesce'
)

# Distinct call attendees within the call window, tagged with product_id 2 (as Int64)
calls = (
    temp_calls
    .filter(pl.col('call_week')<= num_weeks_calls)
    .rename({'AttendeeIID':'IID'})
    .select('IID')
    .unique('IID')
    .with_columns(pl.lit(2).cast(pl.Int64).alias('product_id'))
)

# Pairs that come from either the xponent data or the calls
xponent_calls = xponent.join(
    calls,
    on=['IID','product_id'],
    how='outer_coalesce'
)

#delete extra dfs when optimizing

---

In [15]:
# Processing -
# Build the prescriber profile frame from MASTER_UNI: derive the display name,
# a single-line address and the PDRP flag, then attach the mp_spec_seg_dec
# columns (without its 'decile') and the ibsc_ptype columns by IID.
temp1 = MASTER_UNI.select(
    [
        'IID','FirstName','LastName','CREDENTIAL','PDRPOptOutFlag',
        'AddressLine1','AddressLine2','AddressLine3','AddressLine4','CityName','StateCode','ZIP','DECILE'
    ]
).with_columns(
    # pl.concat_str yields null for the whole value as soon as ONE input is null,
    # so a prescriber with an empty AddressLine2/3/4 would lose the entire address.
    # Each part is therefore cast to string and null-filled with '' first.
    # Rows where every part is present are unchanged; the ' ' separators of
    # missing parts are kept as-is rather than trimmed.
    pl.concat_str(
        [
            pl.col(part).cast(pl.Utf8).fill_null('')
            for part in ('AddressLine1','AddressLine2','AddressLine3','AddressLine4')
        ],
        separator=' '
    ).alias('Address'),
    # Same null-safety for the name: a missing first or last name no longer blanks it.
    pl.concat_str(
        [
            pl.col(part).cast(pl.Utf8).fill_null('')
            for part in ('FirstName','LastName')
        ],
        separator=' '
    ).alias('Physician_Name'),
    # 'Y' -> 1, anything else (including null) -> 0
    pl.when(pl.col('PDRPOptOutFlag')=='Y').then(1).otherwise(0).alias('PDRPOptOutFlag'),
    pl.lit(1).alias('Product_id')
).join(
    mp_spec_seg_dec.drop('decile'),on='IID',how='left'
).join(
    ibsc_ptype,on='IID',how='left'
).drop(['AddressLine1','AddressLine2','AddressLine3','AddressLine4','FirstName','LastName']
).with_columns(
    # Constant feed columns: Urgent_Care_HCP is always empty, ReportType is always 'WEEKLY'
    pl.lit(None).alias('Urgent_Care_HCP'),
    pl.lit('WEEKLY').alias('ReportType')
)

In [16]:
# Filter step: keep only the prescribers whose IID occurs in xponent_calls
# (i.e. with xponent activity or a call, as assembled above).
temp1 = temp1.filter(pl.col('IID').is_in(xponent_calls['IID'].unique()))

In [18]:
# Creating feed-ready data: map the source column names to the feed's names,
# then keep exactly the feed columns in feed order.
# The rename is applied to temp1 in place, so this cell cannot be re-run on its own output.
rename_dict = dict(
    IID='Physician_ID',
    geography_id='Geography_id',
    CityName='City',
    StateCode='State',
    ZIP='Zip',
    PDRPOptOutFlag='IsPDRP',
    segment='Segment',
    IBSC_VALUE='IBSCPrimaryPayerType',
    specialty_group='Specialty',
    CREDENTIAL='Credential',
    DECILE='Decile',
)
select_list = [
    'Physician_Name',
    'Physician_ID',
    'Geography_id',
    'Product_id',
    'ReportType',
    'Urgent_Care_HCP',
    'Address',
    'City',
    'State',
    'Zip',
    'IsPDRP',
    'Segment',
    'IBSCPrimaryPayerType',
    'Specialty',
    'Credential',
    'Decile',
]
temp1 = temp1.rename(rename_dict).select(select_list)  # last dataset

In [7]:
#Exporting Feeds-
# NOTE(review): the destination bucket is hard-coded here, while every input path
# uses the `bucket` value read from vars_wk.json - confirm this is intended.
OUT = 's3://vortex-staging-a65ced90/BIT/output/DenormalizedPrescriber/Weekly/'
# index=False: without it pandas also writes its row index as an unnamed first
# column, so the file would start with a column that is not part of select_list.
# NOTE(review): this changes the file layout - confirm the feed consumer does not
# expect the extra leading column.
temp1.to_pandas().to_csv(
    f'{OUT}Weekly_DenormalizedPrescriber_ProfileInfo_Feed.txt',
    sep='|',
    index=False
)
print('Denorm Presc Profile Info Exported !')

Denorm Presc Profile Info Exported !
