# Prescriber Payermix Feed
- In SAS, this view depends on (is connected to) a dataset from the mgd care view.

In [1]:
import polars as pl
import pandas as pd
import gc
import json

In [2]:
# Read the run parameters (data dates and S3 bucket) from the JSON config file
with open('vars_wk.json', 'r') as json_file:
    js = json.loads(json_file.read())

data_date, monthly_data_date, bucket = (
    js[key] for key in ('data_date', 'monthly_data_date', 'bucket')
)

# S3 locations derived from the run parameters:
#   dflib - shared dataframes folder
#   pln   - weekly plantrak archive for data_date
#   mpln  - monthly plantrak archive for monthly_data_date
dflib = f's3://{bucket}/BIT/dataframes/'
pln = f's3://{bucket}/PYADM/weekly/archive/{data_date}/plantrak/'
mpln = f's3://{bucket}/PYADM/monthly/archive/{monthly_data_date}/plantrak/'

In [3]:
# Utility Functions -
def load(df, lib=dflib):
    """Read ``{lib}{df}.parquet`` and bind it to a module-level name.

    Args:
        df: Base name of the parquet file (without the ``.parquet``
            suffix); also the name of the global variable that will hold
            the resulting frame.
        lib: Path prefix the file name is appended to. Defaults to the
            value ``dflib`` had when this function was defined.

    Returns:
        None. The frame is published through ``globals()`` instead of
        being returned.
    """
    frame = pl.read_parquet(f'{lib}{df}.parquet')
    globals()[df] = frame

In [4]:
# Importing Dependencies
# Product mapping: pipe-delimited text file. Its 'product_id', 'code' and
# 'parent_product_id' columns are used by later cells.
prod_mapping = pl.read_csv(f's3://{bucket}/BIT/docs/productmapping_pybit.txt',separator='|')
# Geo id mapping: read from Excel through pandas, then converted to polars.
geo_code_mapper = pl.from_pandas(pd.read_excel(f's3://{bucket}/BIT/docs/geo_id_full.xlsx'))
# load() binds each parquet frame to a global variable of the same name.
load('mp_spec_seg_dec')
load('MASTER_UNI')
# Product codes (PROD_CD values) kept when filtering the plantrak data.
fetch_products = ['LI1','LI2','LI3','TRU','AMT','LAC','MOT','LUB','IRL']

In [5]:
#FORMULARY
# Maps raw GROUP_TYPE values from the formulary file to plan_type buckets.
# The 'Commerical' spelling is the column name later cells rely on, so it
# must stay exactly as written.
group_type_mapping = {
    'HIX': 'Commerical',
    'Com': 'Commerical',
    'Cash': 'Cash',
    'Voucher': 'Others',
    'FFS': 'FFS',
    'Mgd Medicaid': 'Mgd Medicaid',
    'Part D': 'Part D',
    'MAC A': 'Others',
}

def classify_plan_class(status):
    """Collapse a raw formulary status string into a coarse plan class.

    The status is upper-cased and matched by prefix, in this order:

    * ``COVERED`` / ``ON PDL``        -> ``"COVERED"``
    * ``PREFERRED``                   -> ``"PREFERRED"``
    * ``NON-PREFERRED``               -> ``"NON PREFERRED"``
    * ``NON-PDL`` / ``NOT COVERED``   -> ``"NOT COVERED"``

    Args:
        status: Status text (must be a ``str``; no whitespace is stripped).

    Returns:
        One of the four classes above, or ``"N_A"`` when no prefix matches.
    """
    normalized = status.upper()
    # (accepted prefixes, resulting class); the first matching entry wins
    prefix_classes = (
        (("COVERED", "ON PDL"), "COVERED"),
        (("PREFERRED",), "PREFERRED"),
        (("NON-PREFERRED",), "NON PREFERRED"),
        (("NON-PDL", "NOT COVERED"), "NOT COVERED"),
    )
    for prefixes, plan_class in prefix_classes:
        if normalized.startswith(prefixes):
            return plan_class
    return "N_A"

# Build the formulary lookup `fm`: one distinct row per
# (PlanID, plan_type, plan_class, product_id) combination.
# Only the four columns needed below are read from the weekly FORMULARY file.
fm = pl.read_parquet(pln+'FORMULARY.parquet',columns = ['IMS_PLAN_ID','GROUP_TYPE','FORMULARY_GROUP_STATUS','PFAM_CD'])
fm = (
    fm
    .with_columns(
        # GROUP_TYPE -> plan_type through group_type_mapping
        pl.col('GROUP_TYPE').map_elements(lambda x: group_type_mapping.get(x,'Others'), return_dtype=pl.Utf8) #NOTE : IF new plan types flow , they will go to Others by default
        .fill_null('Others') # values still null after the mapping also become 'Others'
        .alias('plan_type'),
        pl.col('IMS_PLAN_ID').cast(pl.Int64)
    )
    .rename({'IMS_PLAN_ID':'PlanID'})
    .drop('GROUP_TYPE')
    # Null statuses become 'N_A' first, so classify_plan_class always gets a string
    .with_columns(pl.col('FORMULARY_GROUP_STATUS').fill_null(pl.lit('N_A')))
    .with_columns(pl.col('FORMULARY_GROUP_STATUS').map_elements(classify_plan_class,return_dtype=pl.String).alias('plan_class'))
    .drop('FORMULARY_GROUP_STATUS')
    # Translate PFAM_CD to product_id via prod_mapping.code; the left join plus the
    # not-null filter keeps only rows whose PFAM_CD has a match in prod_mapping
    .join(prod_mapping[['product_id','code']],left_on = 'PFAM_CD',right_on='code',how='left').drop('PFAM_CD')
    .filter(pl.col('product_id').is_not_null())
    .unique() # drop exact duplicate rows
)

In [6]:
#PLANTRAK - 6 MONTH TUF at PROD , IID , PLAN LEVEL
# Read the monthly LAX_N file, keep only the BIT products, parse the period
# key into a date and translate PROD_CD into product_id.
ln = (
    pl.read_parquet(mpln+'LAX_N.parquet',columns=['IID','MonthKey','PROD_CD','PlanID','TUF']) #read req cols only
    .rename({'MonthKey':'PeriodKey'})
    .filter(pl.col('PROD_CD').is_in(fetch_products)) #only keep data for BIT products
    .with_columns(pl.col('PeriodKey').cast(pl.Utf8).str.to_date("%Y%m%d")) #Convert Categorical column Back to date
    .join(prod_mapping[['product_id','code']],left_on = 'PROD_CD',right_on='code',how='left')
    .drop('PROD_CD')
)

# The window is the 6 most recent distinct period keys. Nulls are removed
# before ranking so a null key cannot occupy a slot and shift the cut-off,
# and a short history fails with an explicit message instead of a bare
# IndexError on the [5] lookup.
period_keys = ln['PeriodKey'].drop_nulls().unique().sort(descending=True)
if len(period_keys) < 6:
    raise ValueError(
        f'LAX_N has only {len(period_keys)} distinct period keys for the selected products; '
        'at least 6 are needed to build the 6 month TUF window'
    )
cut_off_date = period_keys[5] #6th most recent period = start of the 6 month window

ln = (
    ln
    .filter(pl.col('PeriodKey')>= cut_off_date) #filter date to only keep 6 months of data (applied once)
    .group_by(['IID','PlanID','product_id']) #Rolling up lax_n for 6 month TUF at : IID and PlanID, prod level.
    .agg(TUF = pl.col('TUF').sum()) #6 month TUF
)

# NOTE: make sure the lax_n data is complete - it must not have missing period keys,
# because the 6th most recent unique period key is used as the cut-off date here.
# If a period is missing, the wrong amount of TUF will flow into the code.

# Doubt : (NOTE Rows will drop here since we are removing non geo IIDs) - Should I remove white space ppl?

#Adding Parent Product Rows - DEFERRED TO POST-PIVOT CALCULATIONS. Reason: formulary does not have parent product IDs, so joining parent rows with formulary would cause their volume to go to N_A
# def add_parent_product_rows(df):
#     df = df.join(prod_mapping[['product_id','parent_product_id']],on='product_id',how='left')
#     df_2_35 = df.filter(pl.col('parent_product_id').is_in([2,35]))
#     df_2_35 = df_2_35.group_by(['IID','PlanID','parent_product_id']).agg(TUF = pl.col('TUF').sum()).rename({'parent_product_id':'product_id'})
#     df_1 = df.group_by('IID','PlanID').agg(TUF = pl.col('TUF').sum()).with_columns(product_id = pl.lit(1)).with_columns(pl.col('product_id').cast(pl.Int64))

#     # stack 1, 2_35 with df and return
#     df = df.drop(['parent_product_id']) #dropping to make same shape
#     vstack_helper = df.columns
#     df = df.vstack(
#         df_2_35.select(vstack_helper)
#     ).vstack(
#         df_1.select(vstack_helper)
#     )

#     return(df)

# ln = add_parent_product_rows(ln)

---

In [7]:
# Processing -
# Attach formulary info (plan_type / plan_class) to the 6 month TUF rows.
# This is a LEFT join, so no plantrak rows are dropped: rows without a
# formulary match get plan_type 'Others' and plan_class 'N_A' from the
# fill_null calls below.
# Doubt : what to do with rx which is not there in formulary info?
# NOTE(review): fm is only de-duplicated on the full row, so a (PlanID, product_id)
# pair with more than one plan_type/plan_class would repeat its TUF here - confirm.
ln1 = (
    ln.join(fm, on=['PlanID', 'product_id'], how='left') 
    .with_columns(
        pl.col('plan_type').fill_null(pl.lit('Others')),
        pl.col('plan_class').fill_null(pl.lit('N_A'))
    )
)

# Wide table: one row per (IID, product_id), one TUF-sum column per plan_class
ln_c = ln1.pivot(
    values = 'TUF',index = ['IID','product_id'],
    columns = 'plan_class',
    aggregate_function = 'sum'
).fill_null(0) # combinations with no volume become 0

# Wide table: one row per (IID, product_id), one TUF-sum column per plan_type
ln_t = ln1.pivot(
    values = 'TUF',index = ['IID','product_id'],
    columns = 'plan_type',
    aggregate_function = 'sum'
).fill_null(0) # combinations with no volume become 0

# Adding Parent Product Rows
def add_parent_product_rows(df):
    """Append parent-product roll-up rows to a pivoted (IID, product_id) table.

    Two kinds of rows are stacked under the original rows:

    * per (IID, parent product) sums for parent products 2 and 35, stored
      with the parent id in ``product_id``;
    * per IID sums over every row, stored with ``product_id`` 1.

    Args:
        df: Frame whose first two columns are the keys (``IID``,
            ``product_id``); every column after them is summed.

    Returns:
        The original rows followed by the parent rows and then the
        ``product_id`` 1 rows, with the same columns as ``df``.
    """
    # one sum expression per value column (everything after the two keys)
    sum_exprs = {name: pl.col(name).sum() for name in df.columns[2:]}

    parent_lookup = prod_mapping[['product_id','parent_product_id']]
    with_parent = df.join(parent_lookup, on = 'product_id', how = 'left')

    # roll-up for parent products 2 and 35
    parent_rows = (
        with_parent
        .filter(pl.col('parent_product_id').is_in([2,35]))
        .group_by(['IID','parent_product_id'])
        .agg(**sum_exprs)
        .rename({'parent_product_id':'product_id'})
    )

    # roll-up across all products of an IID, labelled product_id 1
    total_rows = (
        with_parent
        .group_by('IID')
        .agg(**sum_exprs)
        .with_columns(product_id = pl.lit(1))
        .with_columns(pl.col('product_id').cast(pl.Int64))
    )

    # drop the helper column so all three frames share one shape, then stack
    base_rows = with_parent.drop(['parent_product_id'])
    column_order = base_rows.columns
    return (
        base_rows
        .vstack(parent_rows.select(column_order))
        .vstack(total_rows.select(column_order))
    )
# Add the parent-product roll-up rows to both pivoted tables
ln_c = add_parent_product_rows(ln_c)
ln_t = add_parent_product_rows(ln_t)

# Combine plan_class columns (ln_c) with plan_type columns (ln_t) per (IID, product_id)
ln2 = ln_c.join(ln_t,on=['IID','product_id'],how='left')

# Convert the plan_type volumes into shares of the row total.
# NOTE(review): the six plan_type column names are hard-coded, so each of them
# presumably has to be present in the pivot output for this to run - confirm.
ln2 = (
    ln2
    .with_columns(plan_type_sum = pl.sum_horizontal(['Part D', 'Commerical', 'Mgd Medicaid', 'FFS', 'Cash', 'Others']))
    .with_columns(
        Commerical_prc = pl.col('Commerical')/pl.col('plan_type_sum'),
        Medicare_Part_D_prc = pl.col('Part D')/pl.col('plan_type_sum'),
        Managed_Medicaid_prc = pl.col('Mgd Medicaid')/pl.col('plan_type_sum'),
        FFS_prc = pl.col('FFS')/pl.col('plan_type_sum'),
        Cash_prc = pl.col('Cash')/pl.col('plan_type_sum'),
        Others_prc = pl.col('Others')/pl.col('plan_type_sum')
    )
    .with_columns(
        qc = pl.sum_horizontal(['Commerical_prc','Medicare_Part_D_prc','Managed_Medicaid_prc', 'FFS_prc', 'Cash_prc', 'Others_prc'])
    )# ONLY FOR QC | Will use it to drop 0 TUF rows
    .filter(~pl.col('qc').is_nan()) # rows whose share sum is NaN are removed
    .drop(['Part D', 'Commerical', 'Mgd Medicaid', 'FFS', 'Cash', 'Others','plan_type_sum','qc'])
    # The two shares computed above are overwritten with nulls, so they reach
    # the feed empty.
    .with_columns(
        pl.lit(None).alias('Managed_Medicaid_prc'),
        pl.lit(None).alias('FFS_prc')
    )# removing data from mgd , ffs
)

# Attach the physician name as 'LastName, FirstName' from MASTER_UNI;
# left join, so IIDs missing from MASTER_UNI keep a null name.
ln2 = ln2.join(
    MASTER_UNI.with_columns((pl.col('LastName')+', '+pl.col('FirstName')).alias('Physician_Name')).select(['IID','Physician_Name']),
    on='IID',how='left'
)

In [8]:
# BUG : What to do with plans with more than one coverage status ? eg : IID 1631628

----

In [9]:
# Final Feed Creation -
# Existing column name -> column name required by the feed
column_mapping = {
    "IID": "Physician_ID",
    "product_id": "Product_id",
    "PREFERRED": "Preferred",
    "COVERED": "Covered",
    "NOT COVERED": "Not_Covered",
    "N_A": "Not_Available",
    "Commerical_prc": "Commercial",
    "Medicare_Part_D_prc": "Medicare_Part_D",
    "Managed_Medicaid_prc": "Managed_Medicaid",
    "FFS_prc": "FFS",
    "Cash_prc": "Cash",
    "Others_prc": "Other",
}

# Start from ln2 and apply the feed names in one step
final_feed = ln2.rename(column_mapping)

# New columns the feed requires:
#   col_to_addrt - report type column
#   col_to_addna - columns with no data behind them (Not_Applicable also have 0 in feed)
col_to_addrt = ['ReportType']
col_to_addna = ['Covered_PA_ST', 'Unknown', 'Not_Applicable']
# func to add columns with desired value
def addcol(df,columns_to_add,wtl):
    """Return ``df`` with one constant-valued column added per requested name.

    Args:
        df: Frame to extend.
        columns_to_add: Iterable of column names; each is added (or
            overwritten) in the given order.
        wtl: The literal value written into every added column.

    Returns:
        The resulting frame; ``df`` itself is returned unchanged when
        ``columns_to_add`` is empty.
    """
    constant = pl.lit(wtl)
    result = df
    for name in columns_to_add:
        result = result.with_columns(constant.alias(name))
    return result
# Column order required by the feed
req_cols = [
    "Physician_Name", "Physician_ID", "Product_id", "ReportType",
    "Preferred", "Covered", "Not_Covered", "Not_Available",
    "Commercial", "Medicare_Part_D", "Managed_Medicaid", "FFS", "Cash", "Other",
    "Covered_PA_ST", "Unknown", "Not_Applicable",
]

# ReportType is the constant 'WEEKLY'; the no-data columns carry a backslash-N string
final_feed = addcol(final_feed, col_to_addrt, 'WEEKLY')
final_feed = addcol(final_feed, col_to_addna, '\\N')

# Keep only the feed columns, in feed order - final data set.
final_feed = final_feed.select(req_cols)

In [10]:
# Exporting -
# Writes the final feed as a pipe-delimited text file to S3 via pandas.
# NOTE(review): the bucket is hard-coded here while the other cells build their
# paths from the `bucket` value read from vars_wk.json - confirm this is intended.
# NOTE(review): to_csv is called without index=False, so the pandas index is
# presumably written as an extra leading column - confirm the feed expects it.
OUT = 's3://vortex-staging-a65ced90/BIT/output/Prescriber/Weekly/'
final_feed.to_pandas().to_csv(f'{OUT}Weekly_Prescriber_PayerMix_Feed.txt', sep='|')