### Presc Trend Feed v2

In [18]:
import polars as pl
import pandas as pd
import gc
import json
from datetime import datetime, timedelta,date

In [19]:
# Read the weekly run parameters from the JSON settings file.
with open('vars_wk.json', 'r') as json_file:
    js = json.load(json_file)

data_date, num_weeks_rx, bucket = (
    js[key] for key in ('data_date', 'num_weeks_rx', 'bucket')
)

# S3 prefixes derived from the run parameters.
dflib = f's3://{bucket}/BIT/dataframes/'  # prefix used by load() for parquet dataframes
xpn = f's3://{bucket}/PYADM/weekly/archive/{data_date}/xponent/'  # xponent files for this data_date

In [20]:
# Utility Functions -
def load(df, lib=dflib):
    """Read ``<lib><df>.parquet`` with polars and bind it to a module-level global.

    df  : dataframe name; used as the parquet file stem and as the global variable name.
    lib : path prefix that holds the parquet file (defaults to ``dflib``).

    Returns nothing; the frame is reachable afterwards through the global named ``df``.
    """
    parquet_path = f'{lib}{df}.parquet'
    module_scope = globals()
    module_scope[df] = pl.read_parquet(parquet_path)

In [21]:
# Importing Dependencies
prod_mapping = pl.read_csv(
    f's3://{bucket}/BIT/docs/productmapping_pybit.txt',
    separator='|',
)
geo_code_mapper = pl.from_pandas(
    pd.read_excel(f's3://{bucket}/BIT/docs/geo_id_full.xlsx')
)

# load() binds each parquet to a global of the same name.
load('mp_spec_seg_dec')
load('MASTER_UNI')

# Product codes pulled from the LAX file for the feed.
fetch_products = [
    'LI1', 'LI2', 'LI3',
    'TRU', 'AMT', 'LAC',
    'MOT', 'LUB', 'IRL',
]

### Generator Functions -

In [22]:
# For Voucher Removal -
def get_lin_voucher_13vols():
    """Build the 13-week voucher TUF volumes for products LI1, LI2 and LI3.

    Reads ``LIN_VOUCHER.parquet`` from ``xpn``. For each product the columns
    ``<LINn>TUF1`` .. ``<LINn>TUF13`` are renamed to ``vVol1_TUF`` .. ``vVol13_TUF``
    and a literal ``PROD_CD`` column is appended. The three per-product frames
    are stacked and nulls are replaced with 0.

    Returns a frame with columns ``IID``, ``vVol1_TUF`` .. ``vVol13_TUF``, ``PROD_CD``.
    """
    voucher_raw = pl.read_parquet(f'{xpn}LIN_VOUCHER.parquet')
    weeks = range(1, 14)

    per_product = []
    for prod_cd, voucher_prefix in (('LI1', 'LIN1'), ('LI2', 'LIN2'), ('LI3', 'LIN3')):
        source_cols = [f'{voucher_prefix}TUF{wk}' for wk in weeks]
        to_voucher_names = {f'{voucher_prefix}TUF{wk}': f'vVol{wk}_TUF' for wk in weeks}
        per_product.append(
            voucher_raw
            .select(['IID'] + source_cols)
            .rename(to_voucher_names)
            .with_columns(pl.lit(prod_cd).alias('PROD_CD'))
        )

    return pl.concat(per_product).fill_null(0)

In [23]:
def get_volumes(metric,prod_cd):
    """Return 13 weekly volume columns per (IID, PROD_CD) for one metric.

    metric  : column prefix in ``LAX.parquet`` (the notebook passes 'TUF' and 'NUF').
    prod_cd : collection of product codes to keep.

    Columns ``<metric>1`` .. ``<metric>13`` are renamed to ``Vol1_<metric>`` ..
    ``Vol13_<metric>``. For 'TUF' the matching voucher volumes from
    ``get_lin_voucher_13vols`` are subtracted week by week. Rows are kept only
    when the IID matches an ``mp_spec_seg_dec`` row with a non-null
    ``geography_id``; the columns brought in by that join are dropped again.
    """
    weeks = range(1, 14)
    source_cols = [f'{metric}{wk}' for wk in weeks]
    volume_cols = [f'Vol{wk}_{metric}' for wk in weeks]

    volumes = (
        pl.read_parquet(xpn + 'LAX.parquet', columns=['IID', 'PROD_CD'] + source_cols)
        .filter(pl.col('PROD_CD').is_in(prod_cd))
        .rename(dict(zip(source_cols, volume_cols)))
    )

    if metric == 'TUF':
        vouchers = get_lin_voucher_13vols()
        net_of_voucher = [
            (pl.col(f'Vol{wk}_TUF') - pl.col(f'vVol{wk}_TUF')).alias(f'Vol{wk}_TUF')
            for wk in weeks
        ]
        volumes = (
            volumes
            .join(vouchers, on=['IID', 'PROD_CD'], how='left')
            .fill_null(0)  # rows without a voucher match subtract 0
            .with_columns(net_of_voucher)
            # vouchers.columns is [IID, vVol*..., PROD_CD]; the slice is the vVol* columns
            .drop(vouchers.columns[1:-1])
        )

    # Adding MP related columns
    with_mp = volumes.join(mp_spec_seg_dec, on='IID', how='left')
    in_mp = with_mp.filter(pl.col('geography_id').is_not_null())
    return in_mp.drop(['specialty_group', 'segment', 'decile', 'geography_id'])

In [24]:
def get_lin_voucher_52wkvol():
    """Build the 52-week voucher TUF total for products LI1, LI2 and LI3.

    Reads ``LIN_VOUCHER.parquet`` from ``xpn``. For each product the weekly
    columns ``<LINn>TUF1`` .. ``<LINn>TUF52`` are summed row-wise into
    ``vTUF_52c`` and a literal ``PROD_CD`` column is attached. The three
    per-product frames are stacked and nulls are replaced with 0.

    Returns a frame with columns ``IID``, ``PROD_CD``, ``vTUF_52c``.
    """
    vch = pl.read_parquet(f'{xpn}LIN_VOUCHER.parquet')
    frames = []
    for prod, prod2 in zip(['LI1','LI2','LI3'],['LIN1','LIN2','LIN3']):
        # Weeks 1..52 inclusive, the same window get_summed_52_iid_metric sums
        # for TUF_52c, so the voucher total is subtracted like for like.
        # NOTE(review): assumes LIN_VOUCHER.parquet carries <LINn>TUF52 - confirm.
        week_cols = [f'{prod2}TUF{i}' for i in range(1,53)]
        vch_prod = (
            vch
            .select(['IID'] + week_cols)
            .with_columns(pl.sum_horizontal(week_cols).alias('vTUF_52c'))
            .with_columns(pl.lit(prod).alias('PROD_CD'))
            .select(['IID','PROD_CD','vTUF_52c'])
        )
        frames.append(vch_prod)

    vch1 = pl.concat(frames).fill_null(0)
    return (vch1)

In [25]:
def get_summed_52_iid_metric(metric,prod_cd):
    """Return the 52-week total of one metric per (IID, PROD_CD).

    metric  : column prefix in ``LAX.parquet`` (the notebook passes 'TUF' and 'NUF').
    prod_cd : collection of product codes to keep.

    Columns ``<metric>1`` .. ``<metric>52`` are summed row-wise into
    ``<metric>_52c``. For 'TUF' the voucher total from
    ``get_lin_voucher_52wkvol`` is subtracted. Rows are kept only when the IID
    matches an ``mp_spec_seg_dec`` row with a non-null ``geography_id``.

    Returns a frame with columns ``IID``, ``PROD_CD``, ``<metric>_52c``.
    """
    week_cols = [f'{metric}{wk}' for wk in range(1, 53)]
    total_col = f'{metric}_52c'

    weekly = pl.read_parquet(xpn + 'LAX.parquet', columns=['IID', 'PROD_CD'] + week_cols)
    weekly = weekly.filter(pl.col('PROD_CD').is_in(prod_cd))

    # 52 wk
    totals = weekly.select(
        pl.col('IID'),
        pl.col('PROD_CD'),
        pl.sum_horizontal(week_cols).alias(total_col),
    )

    if metric == 'TUF':
        voucher_totals = get_lin_voucher_52wkvol()
        net_total = (pl.col('TUF_52c') - pl.col('vTUF_52c')).alias('TUF_52c')
        totals = (
            totals
            .join(voucher_totals, on=['IID', 'PROD_CD'], how='left')
            .fill_null(0)  # rows without a voucher match subtract 0
            .with_columns(net_total)
            .drop('vTUF_52c')
        )

    # Adding MP related columns
    with_mp = totals.join(mp_spec_seg_dec, on='IID', how='left')
    in_mp = with_mp.filter(pl.col('geography_id').is_not_null())
    return in_mp.drop(['specialty_group', 'segment', 'decile', 'geography_id'])

In [26]:
##### adding parent product rows
def add_parent_product_rows(df):
    """Append aggregated parent-level rows to the per-product weekly volumes.

    df : frame with all products and the 13 weekly volumes for both metrics
         (``Vol1_TUF`` .. ``Vol13_TUF``, ``Vol1_NUF`` .. ``Vol13_NUF``), plus
         ``IID``, ``PROD_CD``, ``product_id`` and ``parent_product_id``.

    Two sets of rows are added, each summing the weekly volumes per IID:
      * one row per (IID, parent) for parent_product_id 2 and 35, stored with
        the parent id as ``product_id``;
      * one row per IID over every input row, stored as ``product_id`` 1.

    Returns the input rows (without ``PROD_CD`` / ``parent_product_id``) with
    the aggregated rows stacked underneath.
    """
    weekly_sums = {
        f'Vol{wk}_{metric}': pl.col(f'Vol{wk}_{metric}').sum()
        for wk in range(1, 14)
        for metric in ('TUF', 'NUF')
    }

    parent_rows = (
        df.filter(pl.col('parent_product_id').is_in([2, 35]))
        .group_by(['IID', 'parent_product_id'])
        .agg(**weekly_sums)
        .rename({'parent_product_id': 'product_id'})
    )
    total_rows = (
        df.group_by(['IID'])
        .agg(**weekly_sums)
        .with_columns(pl.lit(1).cast(pl.Int64).alias('product_id'))
    )

    # stack 1, 2_35 with df and return
    product_rows = df.drop(['PROD_CD', 'parent_product_id'])  # dropping to make same shape
    layout = product_rows.columns
    stacked = product_rows.vstack(parent_rows.select(layout))
    return stacked.vstack(total_rows.select(layout))

In [27]:
# Raw Data Prep
all_products_volume_tuf = get_volumes('TUF',fetch_products)
all_products_volume_nuf = get_volumes('NUF',fetch_products)

#for sub level groups -
prod_mapping1 = prod_mapping.select(['product_id','parent_product_id','code'])#.filter(pl.col('parent_product_id')!=1)

# TUF and NUF volumes side by side, tagged with product ids, then extended with parent-level rows.
all_products_volume = add_parent_product_rows(
    all_products_volume_tuf
    .join(all_products_volume_nuf, on=['IID','PROD_CD'], how='left')
    .join(prod_mapping1, left_on='PROD_CD', right_on='code', how='left')
)

In [None]:
# For Filtering Rows -
# (IID, product_id) pairs for LI1/LI2/LI3 with a positive 52-week total,
# plus a product_id 2 row when the IID's summed total is positive.
lin_52_TUF = (
    get_summed_52_iid_metric('TUF', ['LI1','LI2','LI3'])
    .join(prod_mapping1, left_on='PROD_CD', right_on='code', how='left')
    .drop('parent_product_id')
)
lin52tp = (
    lin_52_TUF
    .group_by('IID')
    .agg(TUF_52c=pl.col('TUF_52c').sum())
    .with_columns(pl.lit(2).cast(pl.Int64).alias('product_id'))
)
lin_52_TUF = (
    lin_52_TUF
    .select(['IID','TUF_52c','product_id'])
    .vstack(lin52tp)
    .filter(pl.col('TUF_52c') > 0)
    .drop('TUF_52c')
)

lin_52_NUF = (
    get_summed_52_iid_metric('NUF', ['LI1','LI2','LI3'])
    .join(prod_mapping1, left_on='PROD_CD', right_on='code', how='left')
    .drop('parent_product_id')
)
lin52np = (
    lin_52_NUF
    .group_by('IID')
    .agg(NUF_52c=pl.col('NUF_52c').sum())
    .with_columns(pl.lit(2).cast(pl.Int64).alias('product_id'))
)
lin_52_NUF = (
    lin_52_NUF
    .select(['IID','NUF_52c','product_id'])
    .vstack(lin52np)
    .filter(pl.col('NUF_52c') > 0)
    .drop('NUF_52c')
)

# A pair qualifies when either metric qualifies.
lin_52 = lin_52_TUF.join(lin_52_NUF, on=['IID','product_id'], how='outer_coalesce')
#adding row for product_id=1
lin_52p1 = (
    lin_52
    .select('IID')
    .unique('IID')
    .with_columns(pl.lit(1).cast(pl.Int64).alias('product_id'))
)
lin_52 = lin_52.vstack(lin_52p1)

### Functions -

In [30]:
# Volume
#this will increase nobs on temp1 as its one IID to many prod rows
def add_volume_cols(df):
    """Attach the rows of ``all_products_volume`` to ``df`` by ``IID``.

    The join is a left join, after which rows without a ``product_id``
    (IIDs that have no volume rows) are removed.
    """
    with_volumes = df.join(all_products_volume, on='IID', how='left')
    return with_volumes.filter(pl.col('product_id').is_not_null())

# Share
def process_share(df):
    """Add ``Shr<i>_<metric>`` columns: each row's volume over the IID's product_id 1 volume.

    df : frame with ``IID``, ``geography_id``, ``product_id`` and the weekly
         volume columns ``Vol1_<metric>`` .. ``Vol13_<metric>`` for TUF and NUF.

    The product_id 1 rows are renamed to ``lax_Vol*`` and joined back by IID;
    the ``lax_Vol*`` helper columns are dropped from the result.
    """
    week_metric = [(wk, metric) for wk in range(1, 14) for metric in ('TUF', 'NUF')]
    to_lax = {f'Vol{wk}_{metric}': f'lax_Vol{wk}_{metric}' for wk, metric in week_metric}
    shares = {
        f'Shr{wk}_{metric}': pl.col(f'Vol{wk}_{metric}') / pl.col(f'lax_Vol{wk}_{metric}')
        for wk, metric in week_metric
    }

    # this will contain LAX volumes for each IID
    lax_totals = (
        df.filter(pl.col('product_id') == 1)
        .rename(to_lax)
        .drop(['product_id', 'geography_id'])
    )

    joined = df.join(lax_totals, on='IID', how='left')
    return joined.with_columns(**shares).drop(list(to_lax.values()))

# Trend
def process_trend(df,metric):
    """Classify each (IID, product_id) row's 13-week volume trend for one metric.

    df     : frame with ``IID``, ``geography_id``, ``product_id`` and the
             columns ``Vol1_<metric>`` .. ``Vol13_<metric>``.
    metric : volume suffix (the notebook passes 'TUF' and 'NUF').

    Steps:
      * SLOPE_13 - weighted sum of the deviations from the 13-week mean, with
        weights +5.5 .. +0.5 on Vol1..Vol6 and -0.5 .. -5.5 on Vol8..Vol13
        (Vol7 has no term), divided by 143.
      * SLOPE_4 - the same construction on Vol1..Vol4 with weights
        +1.5, +0.5, -0.5, -1.5, divided by 15.
      * Each slope becomes an indicator in {-1, 0, 1} using THRE_13 / THRE_4.
      * FINAL_SLOPE combines the two indicators (rules below) and is mapped to
        'P' (1), 'Q' (-1) or 'S' (0).

    Returns ``df`` with one added column ``Trend_<metric>``, joined back on
    ``IID`` and ``product_id``.
    """
    THRE_13 = 1/26
    THRE_4 = 1/10
    #THRE = 1/4 # not used

    df2 = df.select(['IID','geography_id','product_id']+[f'Vol{i}_{metric}' for i in range(1,14)])
    #AVG_TUF (the helper column keeps this name for either metric)
    df2 = df2.with_columns(
        AVG_TUF = pl.mean_horizontal([f'Vol{i}_{metric}' for i in range(1,14)])
    )

    #SLOPE_13
    AVG_TUF = pl.col('AVG_TUF') #just to make formatting easier (polars.expr.expr.Expr obj)
    df2 = df2.with_columns(
        SLOPE_13 = (
        -5.5 * (pl.col(f'Vol13_{metric}') - AVG_TUF) 
        -4.5 * (pl.col(f'Vol12_{metric}') - AVG_TUF) 
        -3.5 * (pl.col(f'Vol11_{metric}') - AVG_TUF) 
        -2.5 * (pl.col(f'Vol10_{metric}') - AVG_TUF) 
        -1.5 * (pl.col(f'Vol9_{metric}') - AVG_TUF) 
        -0.5 * (pl.col(f'Vol8_{metric}') - AVG_TUF) 
        +0.5 * (pl.col(f'Vol6_{metric}') - AVG_TUF) 
        +1.5 * (pl.col(f'Vol5_{metric}') - AVG_TUF) 
        +2.5 * (pl.col(f'Vol4_{metric}') - AVG_TUF) 
        +3.5 * (pl.col(f'Vol3_{metric}') - AVG_TUF) 
        +4.5 * (pl.col(f'Vol2_{metric}') - AVG_TUF) 
        +5.5 * (pl.col(f'Vol1_{metric}') - AVG_TUF)
        ) / 143
    )

    #AVG_TUF_4
    # Mean of the same four weeks (Vol1..Vol4) that SLOPE_4 regresses on.
    # The SLOPE_4 weights sum to zero, so the mean cancels algebraically; taking
    # it from unrelated weeks could only inject a null into SLOPE_4.
    df2 = df2.with_columns(
        AVG_TUF_4 = pl.mean_horizontal([f'Vol{i}_{metric}' for i in range(1,5)])
    )

    #SLOPE_4
    AVG_TUF_4 = pl.col('AVG_TUF_4') # just for formatting
    df2 = df2.with_columns(
        SLOPE_4 = (
        -1.5 * (pl.col(f'Vol4_{metric}') - AVG_TUF_4) 
        -0.5 * (pl.col(f'Vol3_{metric}') - AVG_TUF_4) 
        +0.5 * (pl.col(f'Vol2_{metric}') - AVG_TUF_4) 
        +1.5 * (pl.col(f'Vol1_{metric}') - AVG_TUF_4)
        ) / 15
    )

    #INDICATOR_SLOPE13: 1 above +THRE_13, -1 below -THRE_13, else 0
    df2 = df2.with_columns(
        pl.when(pl.col('SLOPE_13')>THRE_13).then(pl.lit(1))
        .when(pl.col('SLOPE_13')<-1*THRE_13).then(pl.lit(-1))
        .otherwise(pl.lit(0)).alias('INDICATOR_SLOPE13')
    )

    #INDICATOR_SLOPE4: 1 above +THRE_4, -1 below -THRE_4, else 0
    df2 = df2.with_columns(
        pl.when(pl.col('SLOPE_4')>THRE_4).then(pl.lit(1))
        .when(pl.col('SLOPE_4')<-1*THRE_4).then(pl.lit(-1))
        .otherwise(pl.lit(0)).alias('INDICATOR_SLOPE4')
    )

    #PEAK_DETECTOR: largest single week as a fraction of the 13-week total
    # NOTE(review): PEAK_DETECTOR / INDICATOR_PEAK are computed but not read by
    # FINAL_SLOPE and are not part of the returned columns.
    cols_1_13 = [f'Vol{i}_{metric}' for i in range(1,14)]
    df2 = df2.with_columns(
        PEAK_DETECTOR = pl.max_horizontal(cols_1_13)/pl.sum_horizontal(cols_1_13)
    )

    #INDICATOR_PEAK
    df2 = df2.with_columns(
        pl.when(pl.col('PEAK_DETECTOR')>= 0.5).then(pl.lit(1))
        .otherwise(pl.lit(0)).alias('INDICATOR_PEAK')
    )

    #FINAL_SLOPE
    # Branches are evaluated top to bottom; the first match wins.
    #   13wk up   & 4wk not down            ->  1
    #   13wk flat & 4wk up                  ->  1
    #   13wk down & 4wk not up              -> -1
    #   13wk flat & 4wk down                -> -1
    #   13wk down & 4wk up with SLOPE_4>=1  ->  1  (strong recent reversal)
    #   13wk up & 4wk down with SLOPE_4<=-1 -> -1  (strong recent reversal)
    #   anything else                       ->  0
    df2 = df2.with_columns(
        pl.when((pl.col('INDICATOR_SLOPE13') == 1) & (pl.col('INDICATOR_SLOPE4') >= 0))
            .then(pl.lit(1))
        .when((pl.col('INDICATOR_SLOPE13') == 0) & (pl.col('INDICATOR_SLOPE4') == 1))
            .then(pl.lit(1))
        .when((pl.col('INDICATOR_SLOPE13') == -1) & (pl.col('INDICATOR_SLOPE4') <= 0))
            .then(pl.lit(-1))
        .when((pl.col('INDICATOR_SLOPE13') == 0) & (pl.col('INDICATOR_SLOPE4') == -1))
            .then(pl.lit(-1))
        .when((pl.col('INDICATOR_SLOPE13') == -1) & (pl.col('INDICATOR_SLOPE4') == 1) & (pl.col('SLOPE_4') >= 1))
            .then(pl.lit(1))
        .when((pl.col('INDICATOR_SLOPE13') == 1) & (pl.col('INDICATOR_SLOPE4') == -1) & (pl.col('SLOPE_4') <= -1))
            .then(pl.lit(-1))
        .otherwise(pl.lit(0))
        .alias('FINAL_SLOPE')
    )

    #GROWDECL: letter code written to the feed
    df2 = df2.with_columns(
        pl.when(pl.col('FINAL_SLOPE') == 1)
            .then(pl.lit("P"))
        .when(pl.col('FINAL_SLOPE') == -1)
            .then(pl.lit("Q"))
        .otherwise(pl.lit("S"))
        .alias(f'Trend_{metric}')
    )

    # Only the trend letter is carried back onto the caller's frame.
    # NOTE(review): the join keys are IID + product_id; if df can hold the same
    # pair more than once (e.g. several geography_id values) rows multiply - confirm.
    df2 = df2.select(['IID','product_id',f'Trend_{metric}'])
    df = df.join(df2,on = ['IID','product_id'],how = 'left')

    return(df)


---

### Processing -

In [123]:
# Destination prefix for the exported feed files.
# NOTE(review): this bucket is hard-coded while the inputs use `bucket` from vars_wk.json - confirm this is intended.
OUT = 's3://vortex-staging-a65ced90/BIT/output/Prescriber/Weekly/'

# Start from the IID / geography_id pairs in mp_spec_seg_dec, attach the
# per-product volume rows, then add the share columns.
temp1 = process_share(
    add_volume_cols(
        mp_spec_seg_dec.select(['IID','geography_id'])
    )
)

In [124]:
# For removal of extra rows -

#joining with 52wk flag dataset
# fl is 1 for every (IID, product_id) pair present in lin_52, null otherwise.
temp1 = temp1.join(
    lin_52.with_columns(fl = 1),
    on = ['IID','product_id'],
    how = 'left',
)

# For product_id 2-5 a missing flag becomes an explicit 0; other products keep fl as joined.
temp1 = temp1.with_columns(
    fl = pl.when(pl.col("product_id").is_in([2, 3, 4, 5]) & pl.col("fl").is_null())
    .then(0)
    .otherwise(pl.col("fl"))
)

In [125]:
# Row-wise 13-week volume total per metric, used by the row filter that follows.
temp1 = temp1.with_columns(
    **{
        f'flag_sum_{metric}': pl.sum_horizontal([f'Vol{wk}_{metric}' for wk in range(1,14)])
        for metric in ('TUF', 'NUF')
    }
)

In [126]:
# Bring specialty_group onto each row (by IID) for the row filter.
temp1 = temp1.join(
    mp_spec_seg_dec.select(['IID','specialty_group']),
    on='IID',
    how='left',
)

In [128]:
# Keep a row when any of the following holds:
#   * its 52-week flag is set (fl == 1);
#   * it has no 52-week flag and its 13-week TUF or NUF total is non-zero;
#   * the prescriber's specialty_group is 'PED'.
# The helper columns are dropped afterwards.
temp1 = temp1.filter(
    ((pl.col('fl').is_not_null()) & (pl.col('fl')==1)) | 
    ((pl.col('fl').is_null()) & ((pl.col('flag_sum_TUF')!=0) | (pl.col('flag_sum_NUF')!=0))) | 
    (pl.col('specialty_group')=='PED')
).drop(['fl','flag_sum_TUF','flag_sum_NUF','specialty_group'])

In [130]:
# Adds Trend_TUF, then Trend_NUF.
for metric in ('TUF', 'NUF'):
    temp1 = process_trend(temp1, metric)

In [131]:
# For Converting to Feed ready data

temp1 = temp1.with_columns(pl.lit('WEEKLY').alias('ReportType'))

id_cols = ['IID','geography_id','product_id','ReportType']
trx_cols = id_cols + [name for name in temp1.columns if '_TUF' in name]
nrx_cols = id_cols + [name for name in temp1.columns if '_NUF' in name]

# Setting up Sequence
final_sequence = (
    ['IID','geography_id','product_id','Metric','ReportType','Trend']
    + [f'Vol{i}' for i in range(1,14)]
    + [f'Shr{i}' for i in range(1,14)]
)
feed_key_names = {'IID':'Physician_ID','geography_id':'Geography_id','product_id':'Product_id'}


def _strip_metric_suffix(frame):
    """Drop the '_TUF' / '_NUF' suffix from column names so both metric frames share one layout (renaming for vstack)."""
    return frame.rename({
        name: name.replace(suffix, '')
        for name in frame.columns
        for suffix in ('_TUF', '_NUF')
        if name.endswith(suffix)
    })


# One frame per metric, labelled through the Metric column, in feed column order.
temp1_TUF = (
    _strip_metric_suffix(temp1.select(trx_cols).with_columns(Metric = pl.lit('TRX')))
    .select(final_sequence)
    .rename(feed_key_names)
)
temp1_NUF = (
    _strip_metric_suffix(temp1.select(nrx_cols).with_columns(Metric = pl.lit('NRX')))
    .select(final_sequence)
    .rename(feed_key_names)
)

temp2 = temp1_TUF.vstack(temp1_NUF) # final dataframe

# DS1_Vol1..13 and DS2_Vol1..13 are filled with the literal text \N.
#change this to /N later ? # null --> \N (harsh)
temp2 = temp2.with_columns([
    pl.lit('\\N').alias(f'{new_col}{wk}')
    for new_col in ('DS1_Vol', 'DS2_Vol')
    for wk in range(1,14)
])

In [132]:
# Columns from position 5 onward in temp2: 'Trend' followed by the Vol*, Shr*,
# DS1_Vol* and DS2_Vol* columns. Positions 0-4 (Physician_ID, Geography_id,
# Product_id, Metric, ReportType) are excluded. The slice is positional, so it
# depends on the column order set by final_sequence.
conversion_columns = temp2.columns[5:]

In [133]:
# Attach each physician's PDRPOptOutFlag from MASTER_UNI.
temp2 = temp2.join(
    MASTER_UNI.select(['IID','PDRPOptOutFlag']),
    left_on='Physician_ID',
    right_on='IID',
    how='left',
)

In [135]:
# Rows flagged PDRPOptOutFlag == "Y" get the literal text \N in every
# conversion column; other rows keep their values. The flag is then removed.
temp2 = temp2.with_columns([
    pl.when(pl.col("PDRPOptOutFlag") == "Y")
    .then(pl.lit('\\N'))
    .otherwise(pl.col(name))
    .alias(name)
    for name in conversion_columns
]).drop('PDRPOptOutFlag')

In [136]:
# Export
# Writes temp2 as a pipe-delimited text file under OUT, via pandas.
# NOTE(review): to_csv is called without index=False, so pandas also writes its
# row index as an unnamed leading column - confirm the feed consumer expects it.
temp2.to_pandas().to_csv(f'{OUT}Weekly_Prescriber_Trend_Feed.txt', sep='|')
print('Presc Trend Feed Exported !')

Presc Trend Feed Exported !


---

### Prescriber X Feed

In [42]:
# X feed: serial numbers 1..13 paired with dates stepping back one week at a
# time from data_date (X = 1 is data_date itself).
rx_date = datetime.strptime(data_date,'%Y%m%d')
serial_no = list(range(1,14))
list_of_dates = [rx_date - timedelta(days = 7*weeks_back) for weeks_back in range(13)]

date_df = pl.DataFrame(
    {
        'X':serial_no,
        'Name':list_of_dates
    }
).with_columns(
    pl.col('Name').dt.strftime('%m/%d/%Y')  # dates are written as MM/DD/YYYY text
)

date_df.to_pandas().to_csv(f'{OUT}Weekly_Prescriber_X_Feed.txt', sep='|')
print('Presc X Feed Exported !')

Presc X Feed Exported !


---