#  Managed Care - 1

In [1]:
# importing modules
import polars as pl
import gc
import pandas as pd
from datetime import datetime, timedelta,date
import json
import numpy as np

In [2]:
# load variables from JSON
with open('vars_wk.json', 'r') as json_file:
    js = json.load(json_file)

bucket = js['bucket']
# data_date = js['data_date']
# monthly_data_date = js['monthly_data_date']
data_date = '20240712'
monthly_data_date = '202406'
QTD = 3
YTD = 6 
#TODO: CONNECT TO JSON LATER

dflib = f's3://{bucket}/BIT/dataframes/'
pln = f's3://{bucket}/PYADM/weekly/archive/{data_date}/plantrak/' 
mpln = f's3://{bucket}/PYADM/monthly/archive/{monthly_data_date}/plantrak/'

In [3]:
# Utility Functions -
def load(df, lib=dflib):
    globals()[df] = pl.read_parquet(f'{lib}{df}.parquet')

In [4]:
# Imporing Dependencies
prod_mapping = pl.read_csv(f's3://{bucket}/BIT/docs/productmapping_pybit.txt',separator='|')
geo_code_mapper = pl.from_pandas(pd.read_excel(f's3://{bucket}/BIT/docs/geo_id_full.xlsx'))
load('mp_spec_seg_dec')
#load('MASTER_UNI')
fetch_products = ['LI1','LI2','LI3','TRU','AMT','LAC','MOT','LUB','IRL']

---

## Importing Raw Data

### Formulary
- _Using Both Weekly and Monthly for payer name list_
- _only using Monthly for plan_type and plan_class_

In [5]:
# Processing Formulary Datasets -
columns_to_read = ['IMS_PLAN_ID','GROUP_TYPE','FORMULARY_GROUP_STATUS','PFAM_CD','PFAM_NAME','IRWD_FGN_NAME','BRAND']

fm_monthly = pl.read_parquet(
    mpln+'FORMULARY.parquet',columns = columns_to_read
)

fm_weekly = pl.read_parquet(
    pln+'FORMULARY.parquet',columns = columns_to_read
)

# Consolidating list of Unique Payer Names -
payer_names = (
    fm_monthly.select('IRWD_FGN_NAME')
    .vstack(fm_weekly.select('IRWD_FGN_NAME'))
    .unique()
    .sort('IRWD_FGN_NAME')
    .with_row_index(offset=1)
    .rename({'index':'payer_id'})
)

#FORMULARY
group_type_mapping = {
    'HIX' : 'Commercial','Com' : 'Commercial','Cash' : 'Cash','Voucher':'Voucher',
    'FFS' : 'FFS','Mgd Medicaid' : 'Mgd Medicaid','Part D' : 'Part D','MAC A' : 'Others',
}

def classify_plan_class(status):
    status = status.upper()
    if status[:7] == "COVERED" or status[:6] == "ON PDL":
        return "COVERED"
    elif status[:9] == "PREFERRED":
        return "PREFERRED"
    elif status[:13] == "NON-PREFERRED":
        return "NON PREFERRED"
    elif status[:7] == "NON-PDL" or status[:11] == "NOT COVERED":
        return "NOT COVERED"
    else:
        return "N_A"

fm = fm_monthly.with_columns(
        pl.when(pl.col('BRAND')=='IBR')
        .then(pl.lit('IRL'))
        .otherwise(pl.col('BRAND'))
        .alias('BRAND')
)

fm = fm.filter((pl.col('PFAM_CD')==(pl.col('BRAND'))) | (pl.col('BRAND')==''))

fm = (
    fm
    .with_columns(
        pl.col('GROUP_TYPE').map_elements(lambda x: group_type_mapping.get(x,'Others'), return_dtype=pl.Utf8) #NOTE : IF new plan types flow , they will go to Others by default
        .fill_null('Others')
        .alias('plan_type'),
        pl.col('IMS_PLAN_ID').cast(pl.Int64)
    )
    .rename({'IMS_PLAN_ID':'PlanID'})
    .drop('GROUP_TYPE')
    .with_columns(pl.col('FORMULARY_GROUP_STATUS').fill_null(pl.lit('N_A')))
    .with_columns(pl.col('FORMULARY_GROUP_STATUS').map_elements(classify_plan_class,return_dtype=pl.String).alias('plan_class'))
    .drop('FORMULARY_GROUP_STATUS')
    .unique()
)

###############
# HARD CODED - 
fm = fm.with_columns(pl.when(pl.col("PlanID") == 13670614).then(pl.lit('Others')).otherwise(pl.col("plan_type")).alias("plan_type"))
###############
fm2 = (
    fm
    .select('PFAM_CD','IRWD_FGN_NAME','plan_class').unique()
    .group_by(['IRWD_FGN_NAME','PFAM_CD'])
    .agg(
        pl.col('plan_class').unique().str.concat(' / ').alias('plan_class')
    )
    .with_columns(pl.col('plan_class').str.to_titlecase())
)

### Plantrak

In [6]:
# Import and prepare Raw data - # INPUT : Adm Files # OUTPUT : ln
ln = (
    pl.read_parquet(mpln+'LAX_N.parquet',columns=['IID','MonthKey','PFAM_CD','PROD_CD','PlanID','TUF','TRX','TUN']) 
    .rename({'MonthKey':'PeriodKey'})
    .filter(pl.col('PROD_CD').is_in(fetch_products)) #only keep data for BIT products
    .with_columns(pl.col('PeriodKey').cast(pl.Utf8).str.to_date("%Y%m%d")) #Convert Categorical column Back to date
)
date_list = ln['PeriodKey'].unique().sort(descending=True)

# Any PlanIds startign with -0000002 should be excluded
ln = (
    ln
    .with_columns(pl.col('PlanID').cast(pl.Utf8).str.zfill(10).alias('planid_chr'))
    .filter(~pl.col('planid_chr').str.starts_with('000002'))
    .drop('planid_chr')
)

ln = ln.join(
    (pl.DataFrame(date_list).with_row_index(offset = 1).rename({'index':'num_month'})),
    on = 'PeriodKey', how = 'left'
)

ln = (
    ln
    .join(fm.select(['PlanID','IRWD_FGN_NAME']).unique(),on='PlanID',how='left')
)

In [7]:
# GENETATOR FUNCTION FOR DATACUTS  # INPUT : ln # OUTPUT : ln1

#helper dict object - 
filter_cond_dict = {
    '1c' : pl.col('num_month')==1,'1p' : pl.col('num_month')==2,
    '3c' : pl.col('num_month').is_in([1,2,3]),'3p' : pl.col('num_month').is_in([4,5,6]),
    '6c' : pl.col('num_month').is_in([1,2,3,4,5,6]),'6p' : pl.col('num_month').is_in([7,8,9,10,11,12]),
    '12c' : pl.col('num_month').is_in([i for i in range(1,13)]),'12p' : pl.col('num_month').is_in([i for i in range(13,25)]),
    'qtdc' : pl.col('num_month').is_in([i for i in range(1,QTD+1)]),'qtdp' : pl.col('num_month').is_in([i for i in range(4,4+QTD)]),
    'ytdc' : pl.col('num_month').is_in([i for i in range(1,YTD+1)]),'ytdp' : pl.col('num_month').is_in([i for i in range(13,13+YTD)])
}

def get_data_cuts(df):
    result = pl.DataFrame()
    for period,cond in filter_cond_dict.items():
        df_filter = df.filter(cond)
        df_filter = (
            df_filter
            .group_by(['IID','IRWD_FGN_NAME','PFAM_CD','PROD_CD'])
            .agg(
                pl.col('TUF').sum().alias(f'TUF_{period}'),
                pl.col('TRX').sum().alias(f'TRX_{period}'),
                pl.col('TUN').sum().alias(f'TUN_{period}')
            )
        )

        if period == '1c':
            result = df_filter
        else:
            result = result.join(df_filter,on =['IID','IRWD_FGN_NAME','PFAM_CD','PROD_CD'],how = 'outer_coalesce')

    # Pulling in Plan Type -
    result = (
        result
        .join(
            fm.select(['IRWD_FGN_NAME','PFAM_CD','plan_type']).unique(),
            on = ['IRWD_FGN_NAME', 'PFAM_CD'], how = 'left'
        )
        .with_columns(
            pl.col('plan_type').fill_null(pl.lit('Others')),
        )
    )

    # Pulling in Plan Class
    result = (
        result.join(fm2, on=['IRWD_FGN_NAME', 'PFAM_CD'], how='left')
        .with_columns(
            pl.col('plan_class').fill_null(pl.lit('N_a'))
        )
    )

    # Dropping Records with Voucher , FFS , Medicaid
    result = result.filter(
        ~(pl.col('plan_type').is_in(['Voucher','Mgd Medicaid','FFS']))
    )

    #Joining Payer ID-
    result = result.join(payer_names, on ='IRWD_FGN_NAME', how = 'left')

    # adding product_id
    result = (
        result
        .join(
            prod_mapping.select(['code','product_id','parent_product_id']),
            left_on = 'PROD_CD', right_on='code', how = 'left'
        )
    )

    return (result)

ln1 = get_data_cuts(ln)

In [8]:
# Adding Parent Product Rows - # INPUT ln1 # OUTPUT : ln2
data_cut_list = [f'TUF_{p}' for p in filter_cond_dict.keys()] + [f'TRX_{p}' for p in filter_cond_dict.keys()] + [f'TUN_{p}' for p in filter_cond_dict.keys()]

prod_agg_expn_list = {
    col : pl.col(col).sum() for col in data_cut_list
}
prod_agg_expn_list.update({'plan_type':pl.col('plan_type').first()})

#lin and amt-

ln1_235 = (
    ln1
    .filter(pl.col('parent_product_id').is_in([2,35]))
    .group_by(['IID','IRWD_FGN_NAME','payer_id','parent_product_id'])
    .agg(
        **{**prod_agg_expn_list,'plan_class':pl.col('plan_class').first()}
    )
    .rename({'parent_product_id':'product_id'})
)


#for lax mkt - 
ln1_1 = (
    ln1
    .group_by(['IID','IRWD_FGN_NAME','payer_id'])
    .agg(**prod_agg_expn_list)
    .with_columns(pl.lit(1).alias('product_id').cast(pl.Int64),pl.lit('N_a').alias('plan_class'))
    .select(ln1_235.columns)
)

ln2 = (
    ln1.select(ln1_235.columns)
    .vstack(ln1_235)
    .vstack(ln1_1)
)

# Adding Geography Information and Removing Plans not present in Formulary & any White Space HCPs-
ln2 = (
    ln2
    .join(mp_spec_seg_dec[['IID','geography_id']],on='IID',how='left')
    .join(geo_code_mapper,on = 'geography_id', how = 'left')
    .filter(pl.col('payer_id').is_not_null())
    .filter(pl.col('geography_id').is_not_null()) 
    .fill_null(0.0) # Filling Nulls inside Data Cuts for Consistency.

    # DTYPE FIXES 
    .with_columns(
        pl.col('IID').cast(pl.Int64),
        pl.col('payer_id').cast(pl.Int64),
        pl.col('geography_id').cast(pl.Int64),
        pl.col('region_geography_id').cast(pl.Int64),
        pl.col('area_geography_id').cast(pl.Int64),
        pl.col('nation_geography_id').cast(pl.Int64),
    )
)

---

## Ranking

1. top_plans  - For a given Geography ID and given Payer Type : Top 10 Payer IDs [Based on IBSC 6m Volume]
2. top_hcps - For a given payer Top 30 HCPs.

In [24]:
#Top 10 Payers For a Given Geography and PlanType -> #INPUT : ln2 # OUTPUT : top_payers

levels = ['geography_id','region_geography_id','area_geography_id','nation_geography_id']
def get_top_payers(ln2,g):

    ln2 = ln2.filter(product_id = 1) # Only Keeping IBSC Market Volume.
    
    df = (
        ln2
        .group_by([g,'plan_type','payer_id'])
        .agg(TUF = pl.col('TUF_6c').sum())
        .with_columns(
            pl.col('TUF')
            .rank("ordinal",descending=True)
            .over([g,'plan_type'])
            .alias("rank")
        )
        .filter(pl.col('rank') <= 10)
    )
    
    df_total = (
        ln2
        .group_by([g,'payer_id'])
        .agg(TUF = pl.col('TUF_6c').sum())
        .with_columns(plan_type = pl.lit('Total'))
        .with_columns(
            pl.col('TUF')
            .rank("ordinal",descending=True)
            .over([g,'plan_type'])
            .alias("rank")
        )
        .filter(pl.col('rank') <= 10)
        .select(df.columns)
    )
    
    df_pdc = (
        ln2
        .filter(pl.col('plan_type').is_in(['Part D', 'Commercial']))
        .group_by([g,'payer_id'])
        .agg(TUF = pl.col('TUF_6c').sum())
        .with_columns(plan_type = pl.lit('Part D and Commercial'))
        .with_columns(
            pl.col('TUF')
            .rank("ordinal",descending=True)
            .over([g,'plan_type'])
            .alias("rank")
        )
        .filter(pl.col('rank') <= 20)
        .select(df.columns)
    )
    
    df = df.vstack(df_total).vstack(df_pdc).sort(by = [g,'plan_type','rank']).drop(['TUF','rank'])
    
    return (df)

# Consolidating results for all Geography Levels - 
top_payers = [ 
    get_top_payers(ln2,levels[0]),
    get_top_payers(ln2,levels[1]),
    get_top_payers(ln2,levels[2]),
    get_top_payers(ln2,levels[3])
]

In [25]:
#Top 30 HCPs For a Given Geography and PlanType and Payer_ID -> #INPUT : ln2 # OUTPUT : top_hcps | needs top_payers to be in memory
def get_top_hcps(ln2,g,i):
    
    # Pick Up LN2 - >
    ln2 = (
        ln2
        .filter(product_id = 1)
        .with_columns(
            pl.lit('Total').alias('plan_type_group1'),
            pl.when(pl.col('plan_type').is_in(['Part D', 'Commercial'])).then(pl.lit('Part D and Commercial')).otherwise(None).alias('plan_type_group2')
        )
    )
    
    # Join LN2 with top_payers to limit dataset
    ln2_filter = (
        ln2.join(top_payers[i],on = [g,'plan_type','payer_id'],how = 'inner')
    )
    
    ln2_filter_t = (
        ln2.join(
            top_payers[i],
            left_on = [g,'plan_type_group1','payer_id'],
            right_on = [g,'plan_type','payer_id'],how = 'inner'
        )
    )
    
    ln2_filter_pdc = (
        ln2.join(
            top_payers[i],
            left_on = [g,'plan_type_group2','payer_id'],
            right_on = [g,'plan_type','payer_id'],how = 'inner'
        )
    )
    
    # Top 30 HCPs -
    df = (
        ln2_filter
        .group_by([g,'plan_type','payer_id','IID'])
        .agg(TUF = pl.col('TUF_6c').sum())
        .with_columns(pl.col('TUF').rank("ordinal",descending=True).over([g,'plan_type','payer_id']).alias("rank"))
        .filter(pl.col('rank') <= 30)
    )
    
    df_total = (
        ln2_filter_t
        .group_by([g,'plan_type_group1','payer_id','IID'])
        .agg(TUF = pl.col('TUF_6c').sum())
        .with_columns(pl.col('TUF').rank("ordinal",descending=True).over([g,'plan_type_group1','payer_id']).alias("rank"))
        .filter(pl.col('rank') <= 30)
        .rename({'plan_type_group1':'plan_type'})
    )
    
    df_pdc = (
        ln2_filter_pdc
        .group_by([g,'plan_type_group2','payer_id','IID'])
        .agg(TUF = pl.col('TUF_6c').sum())
        .with_columns(pl.col('TUF').rank("ordinal",descending=True).over([g,'plan_type_group2','payer_id']).alias("rank"))
        .filter(pl.col('rank') <= 30)
        .rename({'plan_type_group2':'plan_type'})
    )
    
    df = (
        df
        .vstack(df_total)
        .vstack(df_pdc)
        .sort(by = [g,'plan_type','payer_id','rank'])
        .drop(['TUF','rank'])
    )
    
    return (df)

top_hcps = [
    get_top_hcps(ln2,levels[0],0),
    get_top_hcps(ln2,levels[1],1),
    get_top_hcps(ln2,levels[2],2),
    get_top_hcps(ln2,levels[3],3)
]

In [26]:
#adding columns to facilitate filter joins 
ln2 = (
    ln2
    .with_columns(
        pl.lit('Total').alias('plan_type_group1'),
        pl.when(pl.col('plan_type').is_in(['Part D', 'Commercial'])).then(pl.lit('Part D and Commercial')).otherwise(None).alias('plan_type_group2')
    )
)

---

First Drill-down functions -

In [46]:
# cur_vol , pri_vol , vol_change, prc_vol_growth, vol_change_ind, cur_trx, cur_tun
def process_1():
    res = []
    for i in range(4):
        g = levels[i]
        source_df = (
            ln2
            .select([g,'plan_type','plan_type_group1','plan_type_group2','payer_id','product_id',f'TUF{period}c',f'TUF{period}p',f'TRX{period}c',f'TUN{period}c'])
            .rename({f'TUF{period}c':'cur_vol',f'TUF{period}p':'pri_vol',f'TRX{period}c':'cur_trx',f'TUN{period}c':'cur_tun'})
        )
        agg_expn = {
			'cur_vol':pl.col('cur_vol').sum(),'pri_vol':pl.col('pri_vol').sum(),
			'cur_trx':pl.col('cur_trx').sum(),'cur_tun':pl.col('cur_tun').sum()
		}
        df = (source_df.group_by([g,'plan_type','product_id']).agg(**agg_expn))
        df_t = (source_df.group_by([g,'plan_type_group1','product_id']).agg(**agg_expn).rename({'plan_type_group1' : 'plan_type'}).select(df.columns))
        df_pdc = (
            source_df.filter(pl.col('plan_type_group2').is_not_null()).group_by([g,'plan_type_group2','product_id'])
            .agg(**agg_expn).rename({'plan_type_group2' : 'plan_type'}).select(df.columns)
        )
        df = df.vstack(df_t).vstack(df_pdc)

        df = (
            df
            .with_columns(
                vol_change = pl.col('cur_vol') - pl.col('pri_vol'),
                prc_vol_growth = ((pl.col('cur_vol')/pl.col('pri_vol'))-1).replace([np.inf,np.nan],[None,None]),
                avg_trx_size = (pl.col('cur_tun')/pl.col('cur_trx')).replace([np.inf,np.nan],[None,None])
            )
            .with_columns(
                pl.when(pl.col('vol_change')/pl.col('pri_vol') > 0.02).then(pl.lit('P'))
                .when(pl.col('vol_change')/pl.col('pri_vol') < -0.02).then(pl.lit('Q'))
                .when(pl.col('vol_change')==0).then(None)
                .otherwise(None).alias('vol_change_ind')
            )
            .drop(['cur_trx','cur_tun'])
        )
        res.append(df)
    return (res)


In [33]:
# sales_dist
def process_2(df):
    for i in range(4):
        g = levels[i]
        f = df[i]
        f_total = (
            f
            .filter(pl.col('plan_type')=='Total').select([g,'product_id','cur_vol'])
            .rename({'cur_vol':'Total_cur_vol'})
        )
        f = (
            f
            .join(f_total,on = [g,'product_id'])
            .with_columns((pl.col('cur_vol')/pl.col('Total_cur_vol')).replace(np.nan,0).alias('sales_dist'))
            .drop('Total_cur_vol')
        )
        df[i] = f
    return (df)

In [34]:
# sales_dist_bnch
def process_3(df):
    for i in range(4):
        g = levels[i]
        f = df[i]
        if i != 3:
            f_parent = (
                df[i+1]
                .select([levels[i+1],'plan_type','product_id','sales_dist'])
                .rename({'sales_dist':'sales_dist_bnch'})
            )
            f = (
                f
                .join(geo_code_mapper.select(levels[i],levels[i+1]).unique(),on = g , how='left')
                .join(f_parent, on = [levels[i+1],'plan_type','product_id'],how = 'left')
                .drop(levels[i+1])
            )
        else:
            f = (
                f
                .with_columns(sales_dist_bnch = pl.col('sales_dist'))
            )
        df[i] = f
    return (df)

In [35]:
# prc_vol_growth_bnch
def process_4(df):
    for i in range(4):
        g = levels[i]
        f = df[i]
        # for terr ->
        f_region = (
            df[1].select([levels[1],'plan_type','product_id','prc_vol_growth']).rename({'prc_vol_growth':'prc_vol_growth_bnch'})
        )
        # for Region, Area ->
        f_nation = (
            df[3].select([levels[3],'plan_type','product_id','prc_vol_growth']).rename({'prc_vol_growth':'prc_vol_growth_bnch'})
        )

        if i == 0:
            f = (
                f
                .join(geo_code_mapper.select(levels[i],levels[i+1]).unique(),on = g , how='left')
                .join(f_region, on = [levels[i+1],'plan_type','product_id'],how = 'left')
                .drop(levels[i+1])
            )
        elif (( i==1 ) | (i ==2)):
            f = (
                f
                .join(geo_code_mapper.select(levels[i],levels[3]).unique(),on = g , how='left')
                .join(f_nation, on = [levels[3],'plan_type','product_id'],how = 'left')
                .drop(levels[3])
            )
        else:
            f = (
                f
                .with_columns(prc_vol_growth_bnch = pl.col('prc_vol_growth'))
            )
        df[i] = f

    return (df)

In [36]:
# cur_shr, pri_shr, shr_change, prc_shr_growth, shr_change_ind
def process_5(df):
    for i in range(4):
        g = levels[i]
        f = df[i]
        f_ibsc = (
            f
            .filter(product_id = 1)
            .select([g,'plan_type','cur_vol','pri_vol'])
            .rename({'cur_vol':'lax_cur_vol','pri_vol':'lax_pri_vol'})
        )
        f = (
            f
            .join(f_ibsc,on = [g,'plan_type'],how = 'left')
            .with_columns(
                (pl.col('cur_vol') / pl.col('lax_cur_vol')).alias('cur_shr'),
                (pl.col('pri_vol') / pl.col('lax_pri_vol')).alias('pri_shr')
            )
            .with_columns(
                shr_change = pl.col('cur_shr') - pl.col('pri_shr'),
                prc_shr_growth = ((pl.col('cur_shr')/pl.col('pri_shr'))-1).replace([np.inf,np.nan],[None,None])
            )
            .with_columns(
                pl.when(pl.col('shr_change')/pl.col('pri_shr') > 0.02).then(pl.lit('P'))
                .when(pl.col('shr_change')/pl.col('pri_shr') < -0.02).then(pl.lit('Q'))
                .when(pl.col('shr_change')==0).then(None)
                .otherwise(None).alias('shr_change_ind')
            )
            .drop(['lax_cur_vol','lax_pri_vol'])
        )
        df[i] = f
    return (df)

In [37]:
# prc_shr_growth_bnch
def process_6(df):
    for i in range(4):
        g = levels[i]
        f = df[i]
        # for terr ->
        f_region = (
            df[1].select([levels[1],'plan_type','product_id','prc_shr_growth']).rename({'prc_shr_growth':'prc_shr_growth_bnch'})
        )
        # for Region, Area ->
        f_nation = (
            df[3].select([levels[3],'plan_type','product_id','prc_shr_growth']).rename({'prc_shr_growth':'prc_shr_growth_bnch'})
        )

        if i == 0:
            f = (
                f
                .join(geo_code_mapper.select(levels[i],levels[i+1]).unique(),on = g , how='left')
                .join(f_region, on = [levels[i+1],'plan_type','product_id'],how = 'left')
                .drop(levels[i+1])
            )
        elif (( i==1 ) | (i ==2)):
            f = (
                f
                .join(geo_code_mapper.select(levels[i],levels[3]).unique(),on = g , how='left')
                .join(f_nation, on = [levels[3],'plan_type','product_id'],how = 'left')
                .drop(levels[3])
            )
        else:
            f = (
                f
                .with_columns(prc_shr_growth_bnch = pl.col('prc_shr_growth'))
            )
        df[i] = f

    return (df)

In [38]:
# prc_vol_growth_ind ,prc_shr_growth_ind
def process_7(df):
    for i in range(4):
        g = levels[i]
        f = df[i]
        f = (
            f
            .with_columns(
                pl.when((pl.col('prc_vol_growth') >  pl.col('prc_vol_growth_bnch'))).then(pl.lit('L')).otherwise(pl.lit('\\N')).alias('prc_vol_growth_ind'),
                pl.when((pl.col('prc_shr_growth') >  pl.col('prc_shr_growth_bnch'))).then(pl.lit('L')).otherwise(pl.lit('\\N')).alias('prc_shr_growth_ind')
            )
        )
        df[i] = f
    return (df)

---

#### Period Loop

In [51]:
period = '_1'
temp1 = process_1() # cur_vol , pri_vol , vol_change, prc_vol_growth, vol_change_ind, avg_trx_size
temp1 = process_2(temp1) # sales dist
temp1 = process_3(temp1) # sales dist_bnch
temp1 = process_4(temp1) # prc_vol_growth_bnch
temp1 = process_5(temp1)# cur_shr, pri_shr, shr_change, prc_shr_growth, shr_change_ind
temp1 = process_6(temp1) # prc_shr_growth_bnch
temp1 = process_7(temp1)# prc_vol_growth_ind ,prc_shr_growth_ind

In [64]:
ln.head()

IID,PeriodKey,PFAM_CD,PROD_CD,PlanID,TUF,TRX,TUN,num_month,IRWD_FGN_NAME
i64,date,str,str,i64,f64,f64,f64,u32,str
32746,2024-01-31,"""LIN""","""LI2""",12830002,3.007,1.002,90.21,6,"""Bcbs Louisiana (La) (Com)"""
2542575,2022-11-30,"""LIN""","""LI2""",24291362,1.063,1.063,31.89,20,"""Uhc/Pacificare/Aarp Med D (Pa…"
1164750,2023-03-31,"""LIN""","""LI3""",7000271211,0.863,0.863,25.88,16,"""Caremark Unspec (Com)"""
158522,2023-12-31,"""LAC""","""LAC""",24290725,0.651,1.123,530.96,7,"""Uhc/Pacificare/Aarp Med D (Pa…"
336679,2022-12-31,"""LAC""","""LAC""",50720,2.207,1.333,1799.86,19,"""Cigna (Part D)"""


In [78]:
def get_data_cuts_planid(df):
    result = pl.DataFrame()
    for period,cond in filter_cond_dict.items():
        df_filter = df.filter(cond)
        df_filter = (df_filter.group_by(['IID','PlanID','PFAM_CD','PROD_CD']).agg(pl.col('TUF').sum().alias(f'TUF_{period}')))
        if period == '1c':
            result = df_filter
        else:
            result = result.join(df_filter,on =['IID','PlanID','PFAM_CD','PROD_CD'],how = 'outer_coalesce')
    
    result = (
        result
        # Pulling Payer Name
        .join(fm.select(['PlanID','IRWD_FGN_NAME']).unique(),on='PlanID',how='left') 
        # dropping PlanIDs not present in Formulary
        .filter(pl.col('IRWD_FGN_NAME').is_not_null())
        # Pulling Plan Type
        .join(fm.select(['IRWD_FGN_NAME','PFAM_CD','plan_type']).unique(),on = ['IRWD_FGN_NAME', 'PFAM_CD'], how = 'left') 
        .with_columns(pl.col('plan_type').fill_null(pl.lit('Others')))
        #Pulling Plan Class
        .join(fm2, on=['IRWD_FGN_NAME', 'PFAM_CD'], how='left') 
        .with_columns(pl.col('plan_class').fill_null(pl.lit('N_a')))
        # Dropping Records with Voucher , FFS , Medicaid
        .filter(~(pl.col('plan_type').is_in(['Voucher','Mgd Medicaid','FFS'])))
        #Joining Payer ID-
        .join(payer_names, on ='IRWD_FGN_NAME', how = 'left')
        # adding product_id
        .join(prod_mapping.select(['code','product_id','parent_product_id']),left_on = 'PROD_CD', right_on='code', how = 'left')
    )
    return(result)

In [80]:
ln1_planid = get_data_cuts_planid(ln)

In [81]:
temp1[0].head()

geography_id,plan_type,product_id,cur_vol,pri_vol,vol_change,prc_vol_growth,avg_trx_size,vol_change_ind,sales_dist,sales_dist_bnch,prc_vol_growth_bnch,cur_shr,pri_shr,shr_change,prc_shr_growth,shr_change_ind,prc_shr_growth_bnch,prc_vol_growth_ind,prc_shr_growth_ind
i64,str,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,str,f64,str,str
55,"""Cash""",5.0,1.946,0.976,0.97,0.993852,30.0,"""P""",0.000911,0.003712,-0.23063,0.056044,0.018526,0.037518,2.025203,"""P""",-0.139224,"""L""","""L"""
35,"""Cash""",3.0,0.0,4.918,-4.918,-1.0,,"""Q""",0.0,0.000795,-0.5198,0.0,0.143323,-0.143323,-1.0,"""Q""",-0.452921,"""\N""","""\N"""
65,"""Part D""",8.0,948.947,951.618,-2.671,-0.002807,1110.447728,,0.525013,0.56771,-0.102582,0.22055,0.210154,0.010395,0.049465,"""P""",-0.02105,"""L""","""L"""
81,"""Commercial""",35.0,439.871,428.128,11.743,0.027429,78.891459,"""P""",0.638851,0.548539,-0.039255,0.11043,0.104077,0.006353,0.061037,"""P""",0.000549,"""L""","""L"""
95,"""Commercial""",2.0,2541.306,2614.423,-73.117,-0.027967,50.45237,"""Q""",0.557729,0.498686,-0.025971,0.568248,0.56647,0.001778,0.003139,,-0.000916,"""\N""","""L"""
