In [1]:
import polars as pl
import gc
import pandas as pd
from datetime import datetime, timedelta,date
import json
import numpy as np

In [2]:
# Run configuration is read from vars_wk.json (bucket, archive dates, Rx window length).
with open('vars_wk.json', 'r') as json_file:
    js = json.load(json_file)

bucket, data_date, monthly_data_date = js['bucket'], js['data_date'], js['monthly_data_date']
num_wk_rx = js['num_weeks_rx']

# S3 locations built from the configuration values.
dflib = 's3://{}/BIT/dataframes/'.format(bucket)
pln = 's3://{}/PYADM/weekly/archive/{}/plantrak/'.format(bucket, data_date)
mpln = 's3://{}/PYADM/monthly/archive/{}/plantrak/'.format(bucket, monthly_data_date)
# Feed output folder. This path is a fixed literal; it is not built from `bucket`.
OUT = 's3://vortex-staging-a65ced90/BIT/output/PrescriberPayer/Weekly/'

In [3]:
# Utility Functions -
def load(df, lib=dflib):
    """Read ``{lib}{df}.parquet`` and bind the frame to a global variable named ``df``.

    df  : file stem of the parquet file; also the name of the global that is set.
    lib : path prefix the file is read from (defaults to ``dflib``).
    """
    parquet_path = f'{lib}{df}.parquet'
    frame = pl.read_parquet(parquet_path)
    globals()[df] = frame

In [4]:
# Importing dependencies: reference mappings and cached parquet frames.
# Product codes that are kept when the plantrak data is read.
fetch_products = ['LI1','LI2','LI3','TRU','AMT','LAC','MOT','LUB','IRL']
prod_mapping = pl.read_csv(
    f's3://{bucket}/BIT/docs/productmapping_pybit.txt',
    separator='|',
)
geo_code_mapper = pl.from_pandas(
    pd.read_excel(f's3://{bucket}/BIT/docs/geo_id_full.xlsx')
)
# load() binds each frame to a global variable of the same name.
load('mp_spec_seg_dec')
load('MASTER_UNI')
unique_iid = pl.read_parquet(f'{dflib}unique_iid.parquet')

---

<h2><b>Profile_info</b></h2>

In [5]:
# Profile columns taken from the master prescriber frame (MASTER_UNI).
# NOTE(review): the next cell rebuilds temp1 from MASTER_UNI with this same
# selection, so this assignment looks redundant - confirm before removing.
temp1 = MASTER_UNI.select(
    'IID', 'FirstName', 'LastName', 'PDRPOptOutFlag', 'DECILE',
    'AddressLine1', 'AddressLine2', 'AddressLine3', 'AddressLine4',
    'CityName', 'StateCode', 'ZIP',
)

In [6]:
# Profile frame: one combined address and name per prescriber, the PDRP flag as 1/0,
# specialty/segment attributes joined in, restricted to the IIDs present in unique_iid.
temp1 = (
    MASTER_UNI
    .select(
        'IID', 'FirstName', 'LastName', 'PDRPOptOutFlag', 'DECILE',
        'AddressLine1', 'AddressLine2', 'AddressLine3', 'AddressLine4',
        'CityName', 'StateCode', 'ZIP',
    )
    .with_columns(
        # Null address lines are skipped rather than nulling the whole address.
        Address=pl.concat_str(
            [pl.col(f'AddressLine{n}') for n in range(1, 5)],
            separator=' ',
            ignore_nulls=True,
        ),
        # "LastName, FirstName"
        Physician_Name=pl.concat_str(
            [pl.col('LastName'), pl.col('FirstName')],
            separator=', ',
            ignore_nulls=True,
        ),
        # 'Y' -> 1, anything else -> 0
        PDRPOptOutFlag=pl.when(pl.col('PDRPOptOutFlag') == 'Y').then(1).otherwise(0),
    )
    .join(mp_spec_seg_dec, on='IID', how='left')
    .join(unique_iid, on='IID', how='inner')
    .drop(['AddressLine1','AddressLine2','AddressLine3','AddressLine4','FirstName','LastName','decile'])
)

In [7]:
# Adding product id: every profile row gets the constant string '1'.
temp1 = temp1.with_columns(product_id=pl.lit('1'))


In [8]:
#feed generation
# Source column -> feed column names for the profile feed.
col_mapping = dict(
    IID='PHYSICIAN_ID',
    Physician_Name='PHYSICIAN_NAME',
    specialty_group='SPECIALTY',
    Address='ADDRESS',
    CityName='CITY',
    StateCode='STATE',
    segment='SEGMENT',
    geography_id='GEOGRAPHY_ID',
    product_id='PRODUCT_ID',
)
final_feed = temp1.rename(col_mapping)

# Constant-valued columns required by the feed, grouped by the value they receive.
col_to_addrt = ['REPORTTYPE']
col_to_addna = ['CATEGORY']
col_to_addn = ['AOSEGMENT']
# Helper: add constant-valued columns to a frame.
def addcol(df,columns_to_add,wtl):
    """Return ``df`` with one literal column per name in ``columns_to_add``, each set to ``wtl``.

    When ``columns_to_add`` is empty, ``df`` is returned as is.
    """
    result = df
    for name in columns_to_add:
        constant = pl.lit(wtl).alias(name)
        result = result.with_columns(constant)
    return result

# Add the constant columns, one addcol() call per value.
final_feed = (
    final_feed
    .pipe(addcol, col_to_addrt, 'WEEKLY')
    .pipe(addcol, col_to_addna, 'N/A')
    .pipe(addcol, col_to_addn, '\\N')
)

#rearranging columns
req_col = [
    'PHYSICIAN_NAME', 'PHYSICIAN_ID', 'GEOGRAPHY_ID', 'PRODUCT_ID', 'REPORTTYPE',
    'SPECIALTY', 'SEGMENT', 'DECILE', 'ADDRESS', 'CITY', 'STATE', 'ZIP',
    'CATEGORY', 'AOSEGMENT',
]
final_feed = final_feed.select(req_col)

# changing value of column to match with sas
# NOTE: the chain has no otherwise() branch, so any SEGMENT value other than the
# three listed below becomes null.
final_feed = final_feed.with_columns(
    SEGMENT=(
        pl.when(pl.col('SEGMENT') == 'ALG-ONLY-TARGET').then(pl.lit('AGNT'))
        .when(pl.col('SEGMENT') == 'Target').then(pl.lit('T'))
        .when(pl.col('SEGMENT') == 'Non-Target').then(pl.lit('NT'))
    )
)

In [9]:
#Exporting Feeds-
OUT = 's3://vortex-staging-a65ced90/BIT/output/PrescriberPayer/Weekly/'
#===================================================
# Convert to pandas, then write '\N' wherever a value is missing or non-finite.
final_feed = final_feed.to_pandas()
# Columns of dtype 'object' (strings)
string_columns = list(final_feed.select_dtypes(include=['object']).columns)
final_feed[string_columns] = final_feed[string_columns].fillna('\\N')
final_feed = (
    final_feed
    .replace('NaN', '\\N')
    .replace([np.nan, np.inf, -np.inf], '\\N')
)
# Pipe-delimited, CRLF line endings, no index column.
final_feed.to_csv(
    f'{OUT}Weekly_PrescriberPayer_ProfileInfo_Feed.txt',
    sep='|',
    lineterminator='\r\n',
    index=False,
)
print('Presc Payer Profile Info Exported !')

Presc Payer Profile Info Exported !


<h1><b>PayerInfo and SalesPerf</b></h1>

Formulary -
---

In [10]:
#FORMULARY
# GROUP_TYPE -> plan_type.
# Keep the 'Commerical' spelling: the PAYERTYPE recodes further down in this
# notebook match on that exact string.
group_type_mapping = {
    'HIX': 'Commerical',
    'Com': 'Commerical',
    'Cash': 'Cash',
    'Voucher': 'Voucher',
    'FFS': 'FFS',
    'Mgd Medicaid': 'Mgd Medicaid',
    'Part D': 'Part D',
    'MAC A': 'Others',
}

def classify_plan_class(status):
    """Map a formulary status string to a plan class by its (case-insensitive) prefix.

    Returns "COVERED", "PREFERRED", "NON PREFERRED" or "NOT COVERED" when the
    upper-cased status starts with one of the known prefixes, otherwise "N_A".
    """
    label = status.upper()
    # Checked top to bottom; the first matching prefix group wins.
    prefix_to_class = (
        (("COVERED", "ON PDL"), "COVERED"),
        (("PREFERRED",), "PREFERRED"),
        (("NON-PREFERRED",), "NON PREFERRED"),
        (("NON-PDL", "NOT COVERED"), "NOT COVERED"),
    )
    for prefixes, plan_class in prefix_to_class:
        if label.startswith(prefixes):
            return plan_class
    return "N_A"

# Reading the weekly formulary (only the columns used below)
fm = pl.read_parquet(
    f'{pln}FORMULARY.parquet',
    columns=['IMS_PLAN_ID','GROUP_TYPE','FORMULARY_GROUP_STATUS','PFAM_CD','PFAM_NAME','IRWD_FGN_NAME','BRAND'],
)
# Brand 'IBR' is recoded to 'IRL'; every other brand value is kept.
fm = fm.with_columns(
    BRAND=pl.when(pl.col('BRAND') == 'IBR').then(pl.lit('IRL')).otherwise(pl.col('BRAND'))
)
# Keep rows with an empty brand, or whose product family code equals the brand.
fm = fm.filter((pl.col('BRAND') == '') | (pl.col('PFAM_CD') == pl.col('BRAND')))
fm = (
    fm
    .with_columns(
        pl.col('IMS_PLAN_ID').cast(pl.Int64),
        # NOTE : IF new plan types flow , they will go to Others by default
        plan_type=(
            pl.col('GROUP_TYPE')
            .map_elements(lambda group: group_type_mapping.get(group, 'Others'), return_dtype=pl.Utf8)
            .fill_null('Others')
        ),
    )
    .rename({'IMS_PLAN_ID':'PlanID'})
    .drop('GROUP_TYPE')
    # A missing status is treated as 'N_A' before it is classified.
    .with_columns(
        plan_class=(
            pl.col('FORMULARY_GROUP_STATUS')
            .fill_null(pl.lit('N_A'))
            .map_elements(classify_plan_class, return_dtype=pl.String)
        )
    )
    .drop('FORMULARY_GROUP_STATUS')
    .unique()
)
# Payer id map: sorted monthly payer names come first, weekly-only names are appended
# after them, and ids are assigned 1..N in that order.
mfm = pl.read_parquet(f'{mpln}FORMULARY.parquet', columns=['IRWD_FGN_NAME'])

payer_month = mfm.get_column('IRWD_FGN_NAME').unique().sort().to_frame()  # mfm = monthly formulary
payer_week = fm.get_column('IRWD_FGN_NAME').unique().sort().to_frame()  # fm = weekly formulary
# Weekly payer names that have no match in the monthly list.
delta_payer = payer_week.join(payer_month, on="IRWD_FGN_NAME", how="anti")
payer_map_temp = payer_month.vstack(delta_payer)
payer_map = payer_map_temp.select(
    'IRWD_FGN_NAME',
    payer_id=pl.arange(1, payer_map_temp.height + 1),
)
fm = fm.join(payer_map, on='IRWD_FGN_NAME', how='left')
###############
# HARD CODED - plan 13670614 is forced to plan_type 'Others'
fm = fm.with_columns(
    plan_type=pl.when(pl.col("PlanID") == 13670614)
    .then(pl.lit('Others'))
    .otherwise(pl.col("plan_type"))
)
###############

# fm2: one plan_class label per (payer name, product family).
# When a payer/product family has several plan classes they are joined with ' / '.
# sort() fixes the order of the joined classes: unique() does not guarantee any
# order, so without it the same data could yield e.g. "Covered / Preferred" on one
# run and "Preferred / Covered" on the next.
fm2 = (
    fm
    .select('PFAM_CD','IRWD_FGN_NAME','plan_class').unique()
    .group_by(['IRWD_FGN_NAME','PFAM_CD'])
    .agg(
        pl.col('plan_class').unique().sort().str.concat(' / ').alias('plan_class')
    )
    .with_columns(pl.col('plan_class').str.to_titlecase())
)

Plantrak -
---

In [11]:
# #ranking
# payer_info_sort = ln4.sort(["IID",'plan_type','TUF_c','product_id'], descending=True)
# payer_info_sort = payer_info_sort.to_pandas()
# # Calculate the rank within each group
# payer_info_sort.loc[:, 'RANK'] = payer_info_sort.groupby(["IID", "product_id",'plan_type']).cumcount() + 1

# # Filter the DataFrame to keep only rows where RANK is less than or equal to 5
# payer_info_final = (payer_info_sort[payer_info_sort['RANK'] <= 5]).drop_duplicates(subset=["IID", "product_id",'plan_type','IRWD_FGN_NAME'])

# #Select the required columns (KEEP equivalent)
# payer_info_final = payer_info_final[["IID", "product_id",'IRWD_FGN_NAME','plan_type','plan_class','TUF_c','RANK']] #change IRWD_FGN_NAME,plan_class
# payer_info_final = pl.from_pandas(payer_info_final).filter(pl.col('TUF_c')!=0)

In [12]:
# # Output -> ln6
# # Feed Creation -
# ln6 = (
#     ln5_f
#     .drop(['TUF_p'])
#     .with_columns(
#         pl.col('vol_growth_prc').replace([np.nan, np.inf, -np.inf,None], '\\N'),
#         pl.when(pl.col('vol_growth_prc').is_in([np.nan, np.inf, -np.inf,None])).then(pl.lit('\\N')).otherwise(pl.col('vol_growth_ind')).alias('vol_growth_ind'),
#         pl.col('shr').replace([np.nan, np.inf, -np.inf,None,0], '\\N'),
#         pl.col('TUF_c').round(3).replace(0.0,'\\N'),
#         pl.col('plan_type').str.to_uppercase()
#     )
#     .rename(
#         {
#             'IID' : 'DOCTORID',
#             'IRWD_FGN_NAME' : 'PAYER',
#             'plan_type' : 'PAYERTYPE',
#             'plan_class' : 'COVERAGESTATUS',
#             'TUF_c' : 'VOL',
#             'product_id' : 'PRODUCT_ID',
#             'vol_growth_prc' : 'VOL_GROWTH_PRC',
#             'shr' : 'SHR',
#             'vol_growth_ind' : 'VOL_GROWTH_IND'
#         }
#     )
#     .select(['DOCTORID','PRODUCT_ID','PAYER','PAYERTYPE','COVERAGESTATUS','VOL','VOL_GROWTH_PRC','SHR','VOL_GROWTH_IND'])
# )

# #PDRP override - 
# pdrp = MASTER_UNI.select(['IID','PDRPOptOutFlag'])
# override_columns = ['VOL','VOL_GROWTH_PRC','SHR','VOL_GROWTH_IND']
# expression_list = [
#     pl.when(pl.col('PDRPOptOutFlag')=='Y').then(pl.lit('\\N')).otherwise(pl.col(c)).alias(c)
#     for c in override_columns
# ]
# ln6 = (
#     ln6
#     .join(pdrp, left_on = 'DOCTORID',right_on='IID',how='left')
#     .with_columns(expression_list)
#     .drop('PDRPOptOutFlag')
# )

In [13]:
# Output -> ln1

# TUF at PROD , IID , PLAN LEVEL, kept only for the products in fetch_products.
# Plan ids whose zero-padded (10 character) text starts with '000002' are dropped.
# NOTE(review): the earlier wording of this note gave the excluded prefix as
# "-0000002"; confirm that '000002' is the intended prefix.
# The payer name and payer_id come from the formulary through a left join on PlanID,
# so both WILL HAVE NULLS for plans that have no formulary row.
ln = (
    pl.read_parquet(
        f'{pln}LAX_N.parquet',
        columns=['IID','WK_END_DATE','PFAM_CD','PROD_CD','PlanID','TUF'],  # read req cols only
    )
    .rename({'WK_END_DATE':'PeriodKey'})
    .filter(pl.col('PROD_CD').is_in(fetch_products))
    .with_columns(planid_chr=pl.col('PlanID').cast(pl.Utf8).str.zfill(10))
    .filter(~pl.col('planid_chr').str.starts_with('000002'))
    .drop('planid_chr')
    .join(
        fm.select(['PlanID','IRWD_FGN_NAME','payer_id']).unique(),
        on='PlanID',
        how='left',
    )
)

# Distinct week-ending dates, newest first: date_list[k] is the (k+1)-th most recent week.
date_list = ln['PeriodKey'].unique().sort(descending=True)

# Length, in weeks, of the trailing window behind each TUF column. These were previously
# the bare indexes date_list[25], [12], [3], [0] and [num_wk_rx - 1].
tuf_window_weeks = {
    'TUF_P4': 26,
    'TUF_P3': 13,
    'TUF_P2': 4,
    'TUF_P1': 1,
    'TUF_P5': num_wk_rx,
}
# Fail early with a readable message instead of an out-of-bounds index error.
for _tuf_col, _n_weeks in tuf_window_weeks.items():
    if not 1 <= _n_weeks <= len(date_list):
        raise ValueError(
            f'{_tuf_col} needs a window of {_n_weeks} week(s) but the data has '
            f'{len(date_list)} distinct PeriodKey value(s)'
        )

ln1_keys = ['IID','IRWD_FGN_NAME','PFAM_CD','PROD_CD','payer_id']


def _tuf_window(out_col):
    """Sum TUF per ln1_keys over the trailing window configured for ``out_col``."""
    cutoff = date_list[tuf_window_weeks[out_col] - 1]
    return (
        ln
        .filter(pl.col('PeriodKey') >= cutoff)
        .group_by(ln1_keys)
        .agg(pl.col('TUF').sum().alias(out_col))
    )


ln1_1 = _tuf_window('TUF_P4')
ln1_2 = _tuf_window('TUF_P3')
ln1_3 = _tuf_window('TUF_P2')
ln1_4 = _tuf_window('TUF_P1')
ln1_5 = _tuf_window('TUF_P5')
# -----------------------

# ln1_1 (TUF_P4) is the base frame; the other windows are left-joined onto it.
# join_nulls=True is required: IRWD_FGN_NAME / payer_id are null for plans with no
# formulary match, and by default null keys never match in a Polars join. Without it
# those rows keep TUF_P4 but get TUF_P3 / P2 / P1 / P5 = 0 after the fill below.
ln1 = ln1_1
for _window_frame in (ln1_2, ln1_3, ln1_4, ln1_5):
    ln1 = ln1.join(_window_frame, on=ln1_keys, how='left', join_nulls=True)

# Keys with no rows inside a shorter window have nulls there; fill them with 0.
ln1 = ln1.fill_null(0)

In [14]:
# Output -> ln2
# Steps, in order:
#   1. pull in plan_type from the formulary (no match -> 'Others')
#   2. pull in plan_class from fm2 (no match -> 'N_a')
#   3. drop records whose plan_type is Voucher, Mgd Medicaid or FFS
ln2 = (
    ln1
    .join(
        fm.select(['IRWD_FGN_NAME','PFAM_CD','plan_type']).unique(),
        on=['IRWD_FGN_NAME', 'PFAM_CD'],
        how='left',
    )
    .with_columns(pl.col('plan_type').fill_null(pl.lit('Others')))
    .join(fm2, on=['IRWD_FGN_NAME', 'PFAM_CD'], how='left')
    .with_columns(pl.col('plan_class').fill_null(pl.lit('N_a')))
    .filter(~pl.col('plan_type').is_in(['Voucher','Mgd Medicaid','FFS']))
)

<h2><b>PayerInfo</b></h2>

In [15]:
# For Payer Info: one row per prescriber (IID) and payer.
ln3 = (
    ln2
    .group_by(['IID','IRWD_FGN_NAME','payer_id'])
    .agg(
        pl.col('TUF_P4').sum().alias('TUF_P4'),
        pl.col('plan_type').first().alias('plan_type'),
        # sort() pins the order of the joined labels; unique() alone does not
        # guarantee one, so the label could differ between runs on the same data.
        pl.col('plan_class').unique().sort().str.concat(' / ').alias('plan_class'),
    )
)
# for plan_type = total
# ln3 already holds exactly one row per (IID, IRWD_FGN_NAME, payer_id), so the TOTAL
# rows are the same rows relabelled; re-grouping on those keys would change nothing.
ln4 = ln3.with_columns(pl.lit("TOTAL").alias("plan_type"))
# Stack the per-plan-type rows and the TOTAL rows; product_id is the constant 1.
ln4 = ln3.vstack(ln4).with_columns(pl.lit(1).alias('product_id').cast(pl.Int64))

In [16]:
#ranking using pandas
# Sort descending so the largest TUF_P4 comes first inside each IID / plan_type,
# then number the rows within each (IID, product_id, plan_type) group.
payer_info_sort = (
    ln4
    .sort(["IID",'plan_type','TUF_P4','product_id'], descending=True)
    .to_pandas()
)
payer_info_sort['RANK'] = (
    payer_info_sort.groupby(["IID", "product_id",'plan_type']).cumcount().add(1)
)

# Keep ranks 1-5 per group, de-duplicate, and keep only the feed columns (KEEP equivalent).
payer_info_final = (
    payer_info_sort
    .loc[payer_info_sort['RANK'].le(5)]
    .drop_duplicates(subset=["IID", "product_id",'plan_type','IRWD_FGN_NAME','payer_id'])
    .loc[:, ["IID", "product_id",'IRWD_FGN_NAME','payer_id','plan_type','plan_class','TUF_P4','RANK']]
)
# Back to polars; rows with zero volume are removed after ranking.
payer_info_final = pl.from_pandas(payer_info_final).filter(pl.col('TUF_P4') != 0)

In [17]:
#Feed1 ready for export
# The volume column is only needed for ranking, so it is dropped here.
feed1_df = payer_info_final.drop('TUF_P4')
# Source column -> feed column names for the payer info feed.
col_mapping = dict(
    payer_id='PAYER_ID',
    IID='PHYSICIAN_ID',
    IRWD_FGN_NAME='PAYER_NAME',
    plan_type='PAYERTYPE',
    product_id='PRODUCT_ID',
    plan_class='FORMULARY',
)

final_feed1 = feed1_df.rename(col_mapping)
# Constant-valued columns required by the feed, grouped by the value they receive.
col_to_addrt = ['REPORTTYPE']
col_to_addna = ['ACCESS','FORMULARY_BY_PROD','PA_REQUIRED']
# Helper: add constant-valued columns to a frame.
# NOTE(review): an identical addcol is already defined earlier in this notebook;
# this definition replaces it with the same behaviour.
def addcol(df,columns_to_add,wtl):
    """Return ``df`` with one literal column per name in ``columns_to_add``, each set to ``wtl``.

    When ``columns_to_add`` is empty, ``df`` is returned as is.
    """
    result = df
    for name in columns_to_add:
        constant = pl.lit(wtl).alias(name)
        result = result.with_columns(constant)
    return result

# Add the constant columns, then recode the payer type labels to the feed spelling.
# Labels that are not listed (for example 'TOTAL') pass through unchanged.
final_feed1 = (
    final_feed1
    .pipe(addcol, col_to_addrt, 'WEEKLY')
    .pipe(addcol, col_to_addna, '\\N')
    .with_columns(
        PAYERTYPE=(
            pl.when(pl.col('PAYERTYPE') == 'Commerical').then(pl.lit('COMMERCIAL'))
            .when(pl.col('PAYERTYPE') == 'Cash').then(pl.lit('CASH'))
            .when(pl.col('PAYERTYPE') == 'Part D').then(pl.lit('PART D'))
            .when(pl.col('PAYERTYPE') == 'Others').then(pl.lit('ALL OTHERS'))
            .otherwise(pl.col('PAYERTYPE'))
        )
    )
)
#rearranging col
req_col = [
    'PAYER_NAME', 'PAYER_ID', 'PAYERTYPE', 'PHYSICIAN_ID', 'PRODUCT_ID', 'REPORTTYPE',
    'RANK', 'FORMULARY', 'ACCESS', 'FORMULARY_BY_PROD', 'PA_REQUIRED',
]
final_feed1 = final_feed1.select(req_col)


In [18]:
#exporting
# Convert to pandas, then write '\N' wherever a value is missing or non-finite.
final_feed1 = final_feed1.to_pandas()
# Columns of dtype 'object' (strings)
string_columns = list(final_feed1.select_dtypes(include=['object']).columns)
final_feed1[string_columns] = final_feed1[string_columns].fillna('\\N')
final_feed1 = (
    final_feed1
    .replace('NaN', '\\N')
    .replace([np.nan, np.inf, -np.inf], '\\N')
)
# Pipe-delimited, CRLF line endings, no index column.
final_feed1.to_csv(
    f'{OUT}Weekly_PrescriberPayer_PayerInfo_Feed.txt',
    sep='|',
    lineterminator='\r\n',
    index=False,
)
print('Weekly_PrescriberPayer_PayerInfo_Feed exported!!!!')

Weekly_PrescriberPayer_PayerInfo_Feed exported!!!!


<h2><b>SalesPerf</b></h2>

In [19]:
# for sales performance
# Attach product_id / parent_product_id to each row through its PROD_CD, then keep
# only the keys, the five TUF window columns and the plan attributes.
ln5 = (
    ln2
    .join(
        prod_mapping.select('code', 'product_id', 'parent_product_id'),
        left_on='PROD_CD',
        right_on='code',
        how='left',
    )
    .select(
        'IID', 'IRWD_FGN_NAME', 'payer_id', 'product_id', 'parent_product_id',
        'TUF_P5', 'TUF_P1', 'TUF_P2', 'TUF_P3', 'TUF_P4',
        'plan_type', 'plan_class',
    )
)

#lin and amt-
# Roll the rows whose parent product is 2 or 35 up to the parent level; the parent id
# then takes the place of product_id.
ln5_235 = (
    ln5
    .filter(pl.col('parent_product_id').is_in([2,35]))
    .group_by(['IID','IRWD_FGN_NAME','payer_id','parent_product_id'])
    .agg(
        pl.col(['TUF_P4', 'TUF_P3', 'TUF_P2', 'TUF_P1', 'TUF_P5']).sum(),
        pl.col(['plan_type', 'plan_class']).first(),
    )
    .rename({'parent_product_id':'product_id'})
)

#lax mkt-
# Market level: every product summed per prescriber and payer, stored under
# product_id 1 with plan_class fixed to 'N_a'.
ln5_1 = (
    ln5
    .group_by(['IID','IRWD_FGN_NAME','payer_id'])
    .agg(
        pl.col(['TUF_P4', 'TUF_P3', 'TUF_P2', 'TUF_P1', 'TUF_P5']).sum(),
        pl.col('plan_type').first(),
    )
    .with_columns(
        product_id=pl.lit(1).cast(pl.Int64),
        plan_class=pl.lit('N_a'),
    )
    .select(ln5_235.columns)
)

# Stack product-level, parent-level and market-level rows in one column layout.
# NOTE: ln5 is rebound here, so this cell cannot be re-run on its own - the drop of
# 'parent_product_id' fails once ln5 has already been replaced.
ln5 = (
    ln5
    .drop('parent_product_id')
    .select(ln5_235.columns)
    .vstack(ln5_235)
    .vstack(ln5_1)
)

In [20]:
# ln6: attach geography_id from mp_spec_seg_dec, keep only rows that have one, and
# drop the attributes that are not needed for the sales performance feed.
ln6 = (
    ln5
    .join(mp_spec_seg_dec, on='IID', how='left')
    .filter(pl.col('geography_id').is_not_null())
    .drop(['specialty_group', 'segment', 'decile', 'plan_class'])
)

# ln7: swap product_id for its product code, which becomes the pivot column later.
ln7 = (
    ln6
    .join(prod_mapping.select(['product_id', 'code']), on='product_id', how='left')
    .drop('product_id')
)


In [21]:
# Wide layout: one row per geography / payer / prescriber / plan type and one column
# per (TUF window, product code) pair; missing combinations are filled with 0.
ln8 = (
    ln7
    .pivot(
        values=["TUF_P4",'TUF_P3','TUF_P2','TUF_P1','TUF_P5'],
        index=["geography_id",'IRWD_FGN_NAME','payer_id','IID', 'plan_type'],
        columns="code",
    )
    .unique(subset=['geography_id','IRWD_FGN_NAME','IID','plan_type'])
    .fill_null(0)
)

# TOTAL rows: sum every pivoted column across plan types.
# ln8.columns[5:] skips the five index columns listed in the pivot above.
ln8_1 = (
    ln8
    .group_by(['geography_id', 'IRWD_FGN_NAME','payer_id','IID'])
    .agg(
        *[pl.col(value_col).sum() for value_col in ln8.columns[5:]],
        plan_type=pl.lit('TOTAL'),
    )
    .select(ln8.columns)
)
# Keep only rows with non-zero market (LAX) volume in the P4 window.
ln9 = ln8.vstack(ln8_1).filter(pl.col('TUF_P4_code_LAX') != 0)

In [22]:
# For Converting To feed ready Data
def getfeed(df):
    """Turn one period's sales performance frame into the feed layout.

    The frame's id and product-code columns are renamed to their feed names, the
    constant columns (REPORTTYPE, PRODUCT_ID, METRIC) and the unused product slots /
    BENCHMARK are added, the payer type labels are recoded, and the columns are
    returned in feed order. ``df`` is expected to carry every column named in the
    rename mapping below plus SHARE.
    """
    feed_names = {
        'IID': 'PHYSICIAN_ID',
        'plan_type': 'PAYERTYPE',
        'payer_id': 'PAYER_ID',
        'geography_id': 'GEOGRAPHY_ID',
        'LAX': 'MARKET',
        'LIN': 'PRODUCT1',
        'LI3': 'PRODUCT2',
        'LI1': 'PRODUCT3',
        'LI2': 'PRODUCT4',
        'TAMT': 'PRODUCT5',
        'AMT': 'PRODUCT6',
        'LUB': 'PRODUCT7',
        'TRU': 'PRODUCT8',
        'MOT': 'PRODUCT9',
        'IRL': 'PRODUCT14',
        'LAC': 'PRODUCT11',
    }
    # Constant columns and the value each one receives.
    constants = {'REPORTTYPE': 'WEEKLY', 'PRODUCT_ID': '1', 'METRIC': 'TRX'}
    # Product slots with no product behind them, plus BENCHMARK, get the null marker.
    unused_slots = ['PRODUCT10', 'PRODUCT12', 'PRODUCT13', 'PRODUCT15', 'PRODUCT16',
                    'PRODUCT17', 'PRODUCT18', 'PRODUCT19', 'PRODUCT20', 'BENCHMARK']
    constants.update(dict.fromkeys(unused_slots, '\\N'))

    # Payer type labels in feed spelling; labels not listed pass through unchanged.
    payer_type = pl.col('PAYERTYPE')
    payer_type_feed = (
        pl.when(payer_type == 'Commerical').then(pl.lit('COMMERCIAL'))
        .when(payer_type == 'Cash').then(pl.lit('CASH'))
        .when(payer_type == 'Part D').then(pl.lit('PART D'))
        .when(payer_type == 'Others').then(pl.lit('ALL OTHERS'))
        .otherwise(payer_type)
    )

    #rearranging columns
    feed_order = [
        "GEOGRAPHY_ID", "PRODUCT_ID", "METRIC", "PHYSICIAN_ID", "PAYER_ID", "PAYERTYPE", "REPORTTYPE",
        "MARKET", "PRODUCT1", "PRODUCT2", "PRODUCT3", "PRODUCT4", "PRODUCT5", "PRODUCT6", "PRODUCT7",
        "PRODUCT8", "PRODUCT9", "PRODUCT10", "PRODUCT11", "PRODUCT12", "PRODUCT13", "PRODUCT14",
        "PRODUCT15", "PRODUCT16", "PRODUCT17", "PRODUCT18", "PRODUCT19", "PRODUCT20", "BENCHMARK", "SHARE",
    ]

    return (
        df
        .rename(feed_names)
        .with_columns([pl.lit(value).alias(name) for name, value in constants.items()])
        .with_columns(payer_type_feed.alias('PAYERTYPE'))
        .select(feed_order)
    )



In [23]:
# One SalesPerformance feed file per TUF window (P1..P5).
for i in range(1, 6):
    period_prefix = f"TUF_P{i}_code_"

    # Id columns plus every column whose name contains "P{i}".
    sales_perf_df1 = ln9.select(
        ['geography_id', 'IRWD_FGN_NAME', 'payer_id', 'IID', 'plan_type']
        + [pl.col(f"^.*P{i}.*$")]
    )
    # Strip the window prefix so only the product code is left as the column name.
    renamed_columns = {
        name: (name.replace(period_prefix, "") if name.startswith(period_prefix) else name)
        for name in sales_perf_df1.columns
    }
    sales_perf_df2 = sales_perf_df1.rename(renamed_columns)

    # SHARE = LIN / LAX when the market volume is non-zero.
    # NOTE(review): the fallback is the string "0", not a number - confirm that mixing a
    # string literal with the numeric ratio is intended for this column.
    sales_perf_df3 = sales_perf_df2.with_columns(
        SHARE=pl.when(pl.col('LAX') != 0)
        .then(pl.col('LIN') / pl.col('LAX'))
        .otherwise(pl.lit("0"))
    )
    sales_perf_final = getfeed(sales_perf_df3)
    #===================================================
    # Convert to pandas, then write '\N' wherever a value is missing or non-finite.
    feed_dataset = sales_perf_final.to_pandas()
    # Columns of dtype 'object' (strings)
    string_columns = list(feed_dataset.select_dtypes(include=['object']).columns)
    feed_dataset[string_columns] = feed_dataset[string_columns].fillna('\\N')
    feed_dataset = (
        feed_dataset
        .replace('NaN', '\\N')
        .replace([np.nan, np.inf, -np.inf], '\\N')
    )
    # Pipe-delimited, CRLF line endings, no index column.
    feed_dataset.to_csv(
        f'{OUT}Weekly_PrescriberPayer_SalesPerformance_P{i}_Feed.txt',
        sep='|',
        lineterminator='\r\n',
        index=False,
    )
    print(f'PrescriberPayer_SalesPerformance_P{i}_exported!')

PrescriberPayer_SalesPerformance_P1_exported!


PrescriberPayer_SalesPerformance_P2_exported!


PrescriberPayer_SalesPerformance_P3_exported!


PrescriberPayer_SalesPerformance_P4_exported!


PrescriberPayer_SalesPerformance_P5_exported!
