# mp — Master Profile (MP) build

In [1]:
import polars as pl
import pandas as pd
import datetime
import gc
import json

In [2]:
# Read the weekly run parameters from vars_wk.json
with open('vars_wk.json', 'r') as json_file:
    js = json.load(json_file)

# Plain string parameters, used below to build S3 locations and file names
bucket = js['bucket']
data_date = js['data_date']
qtr_data = js['qtr_data']
qtr_ntnw = js['qtr_ntnw']
targeting_folder = js['targeting_folder']

# 'fir_nqrt' is stored as a 'YYYY-MM-DD' string; convert it to a datetime.date
fir_nqrt = datetime.datetime.strptime(js['fir_nqrt'], '%Y-%m-%d').date()

# FOR QC: echo every configured value, in file order
print(list(js.values()))

['20240524', '2024Q2', 'Q2', '2024-07-01', '2024 Q2', 'vortex-staging-a65ced90', '2024-06-07', '2024-04-01', '2024-06-30', 10, 8, 3, '202405', '20240610', '2024-02-23', '20240603', 'IBSC Primary Payer Type_2024Q1']


In [3]:
# S3 prefixes used by the rest of the notebook; every prefix ends with '/'.
# NOTE(review): 'quaterly' is spelled this way in the path literals — presumably it matches
# the bucket layout; confirm before "correcting" it.

# Quarter-specific inputs (keyed by qtr_data)
frzmstr = f's3://{bucket}/PYADM/quaterly/{qtr_data}/reference/'
geo = f's3://{bucket}/PYADM/quaterly/{qtr_data}/geography/'
lincall = f's3://{bucket}/PYADM/quaterly/{qtr_data}/target/post/'
inex = f's3://{bucket}/PYADM/reference/{qtr_data}/'

# Weekly archive inputs (keyed by data_date)
master = f's3://{bucket}/PYADM/weekly/archive/{data_date}/reference/'

# Location the resulting dataframes are written to
dflib = f's3://{bucket}/BIT/dataframes/'

In [4]:
# Frozen MP: read, align column names with Net New, filter, and trim in one pipeline
FROZEN_MASTER = (
    pl.read_parquet(f'{frzmstr}CUSTOMER_MASTER_IC_LIN.parquet')
    # Rename so the territory columns match the Net New column names
    .rename(
        {
            'Territory': 'Territory_IW1',
            'Territory_Name': 'Territory_Name_IW1',
        }
    )
    # Keep included specialties, active customers, and every match code except '01'
    .filter(
        (pl.col('SPEC_INCL_LIN') == 'Y')
        & (pl.col('CustomerStatusCode') == 'Active')
        & (pl.col('MatchCode') != '01')
    )
    # Dropped to make the schema the same as Net New
    .drop(['ACCT_TERR_END_DATE', 'ACCT_TERR_START_DATE'])
)

In [5]:
# Net New: read, filter, and bring the schema in line with Frozen MP in one pipeline
NET_NEW = (
    pl.read_parquet(f'{master}NET_NEW_{qtr_ntnw}.parquet')
    # Same row rules as Frozen MP, plus the IC inclusion flag and a start date
    # strictly before fir_nqrt
    .filter(
        (pl.col('SPEC_INCL_LIN') == 'Y')
        & (pl.col('CustomerStatusCode') == 'Active')
        & (pl.col('IC_INCL_LIN') == 1)
        & (pl.col('MatchCode') != '01')
        & (pl.col('CustomerEffectiveStartDate') < fir_nqrt)
    )
    # Dropping to make the schema the same as Frozen MP
    .drop(['IC_INCL_LIN', 'SOURCE', 'NPI_ID', 'NPI_StartDate', 'NPI_EndDate'])
    .with_columns(pl.col('DeceasedYear').cast(pl.Utf8))
)

In [6]:
## To fix Schema error - cast all of these columns to strings in a single pass
NET_NEW = NET_NEW.with_columns(
    pl.col(
        ['FormerName', 'DegreeName', 'AddressSiteEmail', 'SpecialtyGroupCode', 'TAXONOMYID']
    ).cast(pl.Utf8)
)

In [7]:
# Combining Net New and Frozen MP
# Equalizing Schema: select the Frozen MP columns in exactly the Net New column order
# so the two frames can be stacked.
FROZEN_MASTER = FROZEN_MASTER.select(pl.col(NET_NEW.columns))

# Net New rows come first, Frozen MP rows follow. The result is assigned directly;
# no empty placeholder frame is created first because it would be overwritten at once.
MASTER_UNI = NET_NEW.vstack(FROZEN_MASTER)
#Note for Dev : vstack,concat,extend all work differently to get same result, read more on them.

#cleaning -
del FROZEN_MASTER
del NET_NEW
gc.collect()

0

In [8]:
#"Update by devanshi" - #CHANGE:Change Sequence for cond2 and cond3 and test - UPDATE  | REMOVE cond3
# CREDENTIAL buckets ProfessionalDesignation into 'MD/DO', 'NP/PA' or 'Other'.
#
# A third branch (ProfessionalDesignation == "NP" and DegreeName == "REGISTERED NURSE"
# -> 'Other') used to sit after the NP/PA branch. when/then takes the first matching
# branch, and every row that branch could match was already caught by NP/PA, so it never
# fired. It is removed here; the resulting column is unchanged.
# NOTE(review): if those rows are meant to be 'Other', add that check *before* the NP/PA branch.
designation = pl.col('ProfessionalDesignation')

MASTER_UNI = MASTER_UNI.with_columns(
    pl.when(
        designation.is_in(
            ["DC", "DDS", "DMD", "DO", "DOM", "DPM", "DVM", "MD", "ND", "OD", "OP", "PHD", "VMD"]
        )
    ).then(pl.lit("MD/DO"))
    .when(designation.is_in(["NP", "PA"])).then(pl.lit("NP/PA"))
    # Everything else, including a missing designation, falls through to 'Other'
    .otherwise(pl.lit("Other"))
    .alias("CREDENTIAL")
)

#sorting -
MASTER_UNI = MASTER_UNI.sort('IID')

In [9]:
# Removing legal exclusions
# Only the 'Legal Removals' and 'Unknown Address' entries of the include/exclude list are used.
LEGAL_EXCLUSIONS = pl.read_parquet(
    f'{inex}jami_inclexcl.parquet',
    columns=['IID', 'TYPE'],
).filter(pl.col('TYPE').is_in(['Legal Removals', 'Unknown Address']))

MASTER_UNI_2 = (
    MASTER_UNI
    # Keep only IIDs that are not on the exclusion list
    .filter(~pl.col('IID').is_in(LEGAL_EXCLUSIONS['IID'].unique()))
    #Dropping Geo columns so that we can pull them in from ztt
    .drop(['Territory_IW1', 'Territory_Name_IW1', 'Region', 'Region_Name', 'Area', 'Area_Name'])
)

#cleaning -
del MASTER_UNI
del LEGAL_EXCLUSIONS
gc.collect()

0

In [10]:
# # FOR QC ##
# Number of records with no ZIP: count rows per ZIP value, order by ZIP, and show the
# first row (the blank ZIP in the output recorded below).
zip_freq = MASTER_UNI_2.get_column('ZIP').value_counts().sort('ZIP')
zip_freq.columns = ['ZIP', 'Frequency']
zip_freq[0]

ZIP,Frequency
str,u32
,1334


In [11]:
# ADDING ZIP ALIGNMENT AND HIERARCHY INFORMATION
ZIP_TO_TERR = pl.read_parquet(f'{geo}zip_to_terr.parquet').rename({'Zip': 'ZIP'})

# Left join: every master row is kept, the ZIP_TO_TERR columns are added where the ZIP matches
MASTER_UNI_3 = MASTER_UNI_2.join(ZIP_TO_TERR, on='ZIP', how='left')

#cleaning -
del MASTER_UNI_2
del ZIP_TO_TERR
gc.collect()

# Territory values treated as whitespace (blank plus the 1111-99999-xx codes)
whitespace_terrs = [
    "", "1111-99999-11", "1111-99999-21", "1111-99999-12", "1111-99999-13",  "1111-99999-99"
]

# Removing whitespace territories, then sorting by IID
# NOTE(review): rows whose ZIP found no match carry a null Territory after the left join;
# confirm whether they are expected to survive this filter.
MASTER_UNI_3 = (
    MASTER_UNI_3
    .filter(~pl.col('Territory').is_in(whitespace_terrs))
    .sort('IID')
)

In [12]:
#Adding target and call plan info to master

# IW call plan: P1 becomes IW_P1, IW_P2/IW_P3 are added as empty strings, and a
# marker column records that the IID is on the plan.
IWCALL = (
    pl.read_parquet(f'{lincall}IRWD_CALL_PLAN.parquet', columns=['IID', 'P1'])
    .sort('IID')
    .rename({'P1': 'IW_P1'})
    .with_columns(
        [
            pl.lit('').alias('IW_P2'),
            pl.lit('').alias('IW_P3'),
            pl.lit(1).alias('IW_CALL_PLAN_FLAG'),
        ]
    )
)

# ALG target list: only the IID plus a marker column
ALG_CALL_PLAN = (
    pl.read_parquet(f'{lincall}ABBVIE_TARGET.parquet', columns=['IID'])
    .with_columns([pl.lit(1).alias('ALG_CALL_PLAN_FLAG')])
    .sort('IID')
)

# After the left joins a marker is non-null exactly when the IID was found in that list
on_iw_plan = pl.col('IW_CALL_PLAN_FLAG').is_not_null()
on_alg_plan = pl.col('ALG_CALL_PLAN_FLAG').is_not_null()

MASTER_UNI_4 = (
    MASTER_UNI_3
    .rename({'SpecialtyCode': 'specialty_cd'})
    .join(IWCALL, on='IID', how='left')
    .join(ALG_CALL_PLAN, on='IID', how='left')
    .with_columns(
        [
            pl.when(on_iw_plan).then(1).otherwise(0).alias('iw_target_flag'),
            # Set only for IIDs on the ALG list that are NOT on the IW call plan
            pl.when(on_alg_plan & ~on_iw_plan).then(1).otherwise(0).alias('alg_tgt_flag'),
        ]
    )
    # The markers have served their purpose
    .drop(['IW_CALL_PLAN_FLAG', 'ALG_CALL_PLAN_FLAG'])
)

del MASTER_UNI_3
del IWCALL
del ALG_CALL_PLAN
gc.collect()

0

In [13]:
#Adding specialty inclusion/exclusion and ff info to master
# Read the two needed columns and rename them to the lower-case names used on the master
SPEC_INCL = pl.read_parquet(
    f'{frzmstr}qtrspec_SPEC_INCL_LIN.parquet',
    columns=['SPECIALTY_CD', 'SPEC_INCL'],
).rename(
    {
        'SPEC_INCL': 'spec_incl_lin',
        'SPECIALTY_CD': 'specialty_cd',
    }
)

In [14]:
# Attach spec_incl_lin by specialty code, then derive 'ff' from the Territory string
MASTER_UNI_5 = (
    MASTER_UNI_4
    .join(SPEC_INCL, on='specialty_cd', how='left')
    .with_columns(
        # 'ff' is the two characters of Territory starting at offset 11
        # Unsure of use, subject to removal
        pl.col('Territory').str.slice(offset=11, length=2).alias('ff')
    )
)



In [15]:
#Adding Spec_Group info
# The 'Specialty Groupings' sheet is read through pandas, converted to polars,
# and reduced to a specialty_cd -> specialty_group lookup.
SPEC_GROUPS = (
    pl.from_pandas(
        pd.read_excel(
            f's3://{bucket}/BIT/docs/Product Def.xlsx',
            sheet_name='Specialty Groupings',
        )
    )
    .drop('Market')
    .rename({'Specialty Group': 'specialty_group', 'Specialty': 'specialty_cd'})
)

# NOTE(review): this join assumes one row per specialty_cd in the sheet once 'Market'
# is dropped — confirm, since repeated codes would repeat master rows.
MASTER_UNI_5 = (
    MASTER_UNI_5
    .join(SPEC_GROUPS, on='specialty_cd', how='left')
    # Specialties missing from the sheet are labelled "A/O"
    .with_columns(pl.col('specialty_group').fill_null("A/O"))
    # The spelled-out 'All Others' label is folded into the same 'A/O' value
    .with_columns(
        pl.when(pl.col('specialty_group') == 'All Others')
        .then(pl.lit('A/O'))
        .otherwise(pl.col('specialty_group'))
        .alias('specialty_group')
    )
)

In [16]:
# Keep only the output columns, in the final column order
MASTER_UNI_5 = MASTER_UNI_5.select(
    [
        # identifiers and classification
        'IID', 'specialty_cd', 'SpecialtyDescription', 'specialty_group', 'CREDENTIAL', 'ZIP',
        # alignment
        'ff', 'Territory', 'Territory_Name', 'Region', 'Region_Name', 'Area',
        'Area_Name',
        # call plan and status
        'IW_P1', 'IW_P2', 'IW_P3', 'CustomerStatusCode',
        'iw_target_flag', 'alg_tgt_flag', 'spec_incl_lin',
        # name and address
        'FirstName', 'LastName', 'MiddleName', 'PDRPOptOutFlag',
        'AddressLine1', 'AddressLine2', 'AddressLine3', 'AddressLine4',
        'CityName', 'StateCode',
    ]
)

In [17]:
#Moved this section UP
# Load the per-IID decile file for this quarter and add a banded label column.
decile = pl.read_parquet(f'{lincall}ironwood_tgts_decile_{qtr_ntnw.lower()}pass1.parquet', columns = ['IID','KMK_LINZESS_DECILE'])

decile.columns = ['IID','LINZESS_RATING'] #rename step: KMK_LINZESS_DECILE -> LINZESS_RATING
# NOTE(review): this assigns through DataFrame item assignment with a list key and fills nulls
# with the string 'zero'. Whether that is accepted, and which dtype results, depends on the
# column's dtype and the installed polars version — confirm. The string 'zero' is also not a
# key of decile_mapping below, so a value filled here would not be mapped to a band.
decile[['LINZESS_RATING']] = decile[['LINZESS_RATING']].fill_null('zero')
# Integer decile (0-10) -> band label
decile_mapping = {
    0:'0-2',1:'0-2',2:'0-2',
    3:'3-4',4:'3-4',
    5:'5-7',6:'5-7',7:'5-7',
    8:'8-10',9:'8-10',10:'8-10'
}
# LINZESS_RATING2 holds the band label; LINZESS_RATING itself is kept for now because the
# next cell joins it onto the master before dropping it.
# NOTE(review): replace() here maps integer keys to string values; confirm the installed
# polars version allows the dtype change.
decile = decile.with_columns(LINZESS_RATING2 = pl.col('LINZESS_RATING').replace(decile_mapping))
# Intentionally disabled — the drop happens in the next cell instead:
#decile = decile.drop('LINZESS_RATING') 



In [18]:
# Attach the raw rating to the master as DECILE; null DECILE values are filled with 0
MASTER_UNI_5 = (
    MASTER_UNI_5
    .join(decile.select(['IID', 'LINZESS_RATING']), on='IID', how='left')
    .rename({'LINZESS_RATING': 'DECILE'})
    .with_columns(pl.col('DECILE').fill_null(0))
)

# From here on `decile` no longer carries the raw rating column
decile = decile.drop('LINZESS_RATING') #this is called 'deciles' in sas

In [19]:
# Write the master profile to the dataframe library; the frame is converted to
# pandas first and written as snappy-compressed parquet.
MASTER_UNI_5.to_pandas().to_parquet(
    f'{dflib}MASTER_UNI.parquet',
    compression='snappy',
)
# Master Profile Done !

In [20]:
# Next: build the per-IID specialty group / segment / decile table (mp_spec_seg_dec) ->

In [21]:
# Geography mapping: pipe-delimited text file
geo_mapping = pl.read_csv(
    f's3://{bucket}/BIT/docs/GeographyMapping.txt',
    separator='|',
)

# Codes that are exactly 8 characters long get a '1111-' prefix; all others are left as-is
geo_mapping = geo_mapping.with_columns(
    pl.when(pl.col('Code').str.len_chars() == 8)
    .then(pl.lit('1111-') + pl.col('Code'))
    .otherwise(pl.col('Code'))
    .alias('Code')
)

In [22]:
# Per-IID table: specialty group, segment, decile band and geography id.
# MASTER_UNI_5 itself is not modified; each step below yields a new frame.
mp_spec_seg_dec = (
    MASTER_UNI_5
    .with_columns(
        # alg_tgt_flag is checked first, then iw_target_flag; everything else is Non-Target
        pl.when(pl.col('alg_tgt_flag') == 1).then(pl.lit('ALG-ONLY-TARGET'))
        .when(pl.col('iw_target_flag') == 1).then(pl.lit('Target'))
        .otherwise(pl.lit('Non-Target'))
        .alias('segment')
    )
    # Decile band per IID
    .join(decile, on=['IID'], how='left')
    # Geography id, looked up by territory code
    .join(geo_mapping, left_on='Territory', right_on='Code', how='left')
    # Final five columns under their output names
    .select(
        [
            pl.col('IID'),
            pl.col('specialty_group'),
            pl.col('segment'),
            pl.col('LINZESS_RATING2').alias('decile'),
            pl.col('Geography_id').alias('geography_id'),
        ]
    )
)

In [23]:
# update :
# To account for net new HCPs, a null decile band is set to '0-2'
mp_spec_seg_dec = mp_spec_seg_dec.with_columns(
    pl.col('decile').fill_null('0-2')
)

In [24]:
# Write the specialty / segment / decile table next to the master profile,
# via pandas, as snappy-compressed parquet.
mp_spec_seg_dec.to_pandas().to_parquet(
    f'{dflib}mp_spec_seg_dec.parquet',
    compression='snappy',
)