## calls and samples & abbv

In [1]:
import polars as pl
import pandas as pd
import json
import gc
from datetime import datetime, timedelta,date
from dateutil.relativedelta import relativedelta

In [2]:
# Read the run parameters from the JSON config file
with open('vars_wk.json', 'r') as json_file:
    js = json.load(json_file)

# Format of the date strings parsed below
_ISO_DAY_FMT = '%Y-%m-%d'

# Values taken from the config as stored; date strings are parsed to date objects
data_date, qtr_data, qtr_ntnw = js['data_date'], js['qtr_data'], js['qtr_ntnw']
fir_nqrt = datetime.strptime(js['fir_nqrt'], _ISO_DAY_FMT).date()
targeting_folder, working_day_file, roster_file = (
    js['targeting_folder'],
    js['working_day_file'],
    js['roster_file'],
)
curr_date = datetime.strptime(js['curr_date'], _ISO_DAY_FMT).date()
quarter_start = datetime.strptime(js['quarter_start'], _ISO_DAY_FMT).date()
num_weeks_calls, num_weeks_rx = js['num_weeks_calls'], js['num_weeks_rx']

# Look-back anchors measured in whole weeks before curr_date
curr_date_p26 = curr_date - timedelta(weeks=26)
curr_date_p13 = curr_date - timedelta(weeks=13)
# NOTE(review): the name says "m13" but the offset is 53 weeks - confirm the intended window
curr_date_m13 = curr_date - timedelta(weeks=53)

bucket = js['bucket']

In [3]:
# S3 prefixes used throughout the notebook, grouped by refresh cadence.
# NOTE(review): 'quaterly' is spelled this way inside the path strings -
# confirm against the bucket layout before "fixing" it.

# Quarterly snapshot keyed by qtr_data
frzmstr = f's3://{bucket}/PYADM/quaterly/{qtr_data}/reference/'
geo = f's3://{bucket}/PYADM/quaterly/{qtr_data}/geography/'
lincall = f's3://{bucket}/PYADM/quaterly/{qtr_data}/target/post/'
inex = f's3://{bucket}/PYADM/reference/{qtr_data}/'

# Weekly archive keyed by data_date
master = f's3://{bucket}/PYADM/weekly/archive/{data_date}/reference/'
call = f's3://{bucket}/PYADM/weekly/archive/{data_date}/calls_samples/'
xpn = f's3://{bucket}/PYADM/weekly/archive/{data_date}/xponent/'

# Location the intermediate dataframes are written to / read from
dflib = f's3://{bucket}/BIT/dataframes/'

In [4]:
def intck(interval, start_date, end_date):
    """
    Calculates the difference between two dates in terms of the specified interval.

    Args:
        interval (str): The unit to count in. Must be exactly 'DAY', 'MONTH'
            or 'WEEK' (the match is case-sensitive).
        start_date (datetime.date): The start date.
        end_date (datetime.date): The end date.

    Returns:
        int: The difference between the dates in terms of the specified
        interval; negative when end_date falls before start_date.
            - 'DAY': number of elapsed days.
            - 'MONTH': number of calendar-month boundaries between the two
              dates; the day of the month is ignored, so Jan 31 -> Feb 1 is 1.
            - 'WEEK': elapsed days floor-divided by 7.

    Raises:
        ValueError: If interval is not one of the supported units.
    """
    if interval == 'DAY':
        return (end_date - start_date).days
    if interval == 'MONTH':
        # Pure year/month arithmetic: same result as snapping both dates to
        # the 1st of their month and diffing, without any time-of-day effects.
        return (end_date.year - start_date.year) * 12 + (end_date.month - start_date.month)
    if interval == 'WEEK':
        return (end_date - start_date).days // 7
    # Add more intervals as needed
    # Fail loudly instead of returning None, which would otherwise surface
    # later as an unrelated TypeError in the caller's arithmetic.
    raise ValueError(f"Unsupported interval: {interval!r}")

# Example usage
# start_date = date(2023, 1, 1)
# end_date = date(2023, 2, 15)
# interval = 'DAY'

# result = intck(interval, start_date, end_date)
# print(f"Difference in {interval}: {result}")
    
def filter_duplicate(df,col,val,new):
    """
    Append a relabelled copy of every row whose `col` equals `val`.

    The rows matching `val` are copied, their `col` value is replaced by
    `new` (cast to Utf8), and the copies are stacked underneath the
    original frame. The original rows are kept untouched.

    Args:
        df (pl.DataFrame): Frame to extend.
        col (str): Name of the column to match on and to overwrite in the copies.
        val: Value in `col` that selects the rows to duplicate.
        new: Replacement value written into `col` of the duplicated rows.

    Returns:
        pl.DataFrame: `df` followed by the relabelled copies.

    NOTE(review): the replacement is cast to Utf8 and vstack needs matching
    dtypes, so `col` is presumably a string column - confirm with callers.
    """
    matched = df.filter(pl.col(col) == val)
    # Every selected row holds `val`, so the replacement is a constant: a
    # literal expression avoids a per-row Python callback (and the builtin
    # `dict` is no longer shadowed by a local lookup table).
    relabelled = matched.with_columns([
        pl.lit(new).cast(pl.Utf8).alias(col)
    ])
    return df.vstack(relabelled)

In [5]:
# All calls

# Only these columns are read from the calls file
read_cols = [
    'CallID', 'CallProductDescription', 'CallDateTime', 'SalesRepIID', 'AttendeeIID',
    'CallProductQuantity', 'CALL_VARNM', 'SalesRepTerritoryID', 'PhysicianTerritoryID',
    'CallType',
]

# Read the file from ADM, derive CallDate from CallDateTime,
# then drop the now-redundant CallDateTime column
AC = (
    pl.read_parquet(f'{call}ALL_CALLS.parquet', columns=read_cols)
    .with_columns(pl.col("CallDateTime").cast(pl.Date).alias('CallDate'))
    .drop("CallDateTime")
)

In [6]:
# Keep only calls dated from curr_date_m13 through curr_date (both inclusive),
# then split CALL_VARNM into its fixed-position parts
active_calls = (
    AC.filter(
        (pl.col('CallDate') >= curr_date_m13) & (pl.col('CallDate') <= curr_date)
    )  # used to be pr_13_wk_date
    .with_columns([
        pl.col('CALL_VARNM').str.slice(3, 3).alias('product'),   # chars 3-5
        pl.col('CALL_VARNM').str.slice(0, 3).alias('source'),    # chars 0-2
        pl.col('CALL_VARNM').str.slice(7, 2).alias('type'),      # chars 7-8
        pl.col('CALL_VARNM').str.slice(6, 1).alias('priority'),  # char 6
    ])
    .drop('CALL_VARNM')
    .with_columns(pl.col('AttendeeIID').cast(pl.Int64))
)

# The full calls frame is no longer needed; release it
del AC
gc.collect()

0

#### temp_calls
- this should contain all non-FRX and non-sample-only rows

In [7]:
# Eliminating all calls made by FRX and Sample Only calls
temp_calls = active_calls.filter(
    pl.col('source').is_in(['EGM', 'IRN', 'REG', 'RFT', 'ROT', 'RTE', 'RWE', 'RZO'])
    & (~pl.col('type').is_in(['SO']))
)

# Adding week number and month number, both counted back from curr_date:
# week 1 is the 7 days ending on curr_date, month 1 is curr_date's calendar month
temp_calls = temp_calls.with_columns([
    pl.col('CallDate')
    .map_elements(lambda d: (intck('DAY', d, curr_date) // 7) + 1, return_dtype=pl.Int64)
    .alias('call_week'),
    pl.col('CallDate')
    .map_elements(lambda d: intck('MONTH', d, curr_date) + 1, return_dtype=pl.Int64)
    .alias('call_month'),
])

# only have 13 months of data
temp_calls = temp_calls.filter(pl.col('call_month') <= 13)
temp_calls.to_pandas().to_parquet(dflib + 'temp_calls.parquet')

#### temp_samples
- this should contain all rows with CallType = 'Group Detail with Sample' or 'Detail with Sample'
- source should still be non FRX

In [8]:
# Product descriptions (compared after upper-casing) and call types that count as a sample
sample_CallProductDescriptions = [
    "72 MCG", "145 MCG", "145 MCG SAMPLE 30CT", "290 MCG", "290 MCG SAMPLE 30CT",
    "CANASA", "DELZICOL",
]
sample_CallTypes = ['Group Detail with Sample', 'Detail with Sample']

# Sample rows from every source
in_active_samples = active_calls.filter(
    pl.col('CallProductDescription').str.to_uppercase().is_in(sample_CallProductDescriptions)
    & pl.col('CallType').is_in(sample_CallTypes)
)

# Restrict to the listed (non-FRX) sources
temp_samples = in_active_samples.filter(
    pl.col('source').is_in(['EGM', 'IRN', 'REG', 'RFT', 'ROT', 'RTE', 'RWE', 'RZO'])
)

# Adding week number and month number, both counted back from curr_date:
# week 1 is the 7 days ending on curr_date, month 1 is curr_date's calendar month
temp_samples = temp_samples.with_columns([
    pl.col('CallDate')
    .map_elements(lambda d: (intck('DAY', d, curr_date) // 7) + 1, return_dtype=pl.Int64)
    .alias('sample_week'),
    pl.col('CallDate')
    .map_elements(lambda d: intck('MONTH', d, curr_date) + 1, return_dtype=pl.Int64)
    .alias('sample_month'),
])

# only have 13 months of data
temp_samples = temp_samples.filter(pl.col('sample_month') <= 13)
temp_samples.to_pandas().to_parquet(dflib + 'temp_samples.parquet')

#### temp_abbv
- this should contain all FRX and AEM rows with non Sample Only Records

In [9]:
# FRX and AEM rows, excluding Sample Only records
temp_abbv = active_calls.filter(
    pl.col('source').is_in(['FRX', 'AEM']) & (pl.col('type') != 'SO')
)

# Adding week number and month number, both counted back from curr_date:
# week 1 is the 7 days ending on curr_date, month 1 is curr_date's calendar month
temp_abbv = temp_abbv.with_columns([
    pl.col('CallDate')
    .map_elements(lambda d: (intck('DAY', d, curr_date) // 7) + 1, return_dtype=pl.Int64)
    .alias('call_week'),
    pl.col('CallDate')
    .map_elements(lambda d: intck('MONTH', d, curr_date) + 1, return_dtype=pl.Int64)
    .alias('call_month'),
])

# only have 13 months of data
temp_abbv = temp_abbv.filter(pl.col('call_month') <= 13)
temp_abbv.to_pandas().to_parquet(dflib + 'temp_abbv.parquet')

##### Working Day File
- Used to get number of working days (aggregated at rep_id level)

In [10]:
# Importing the working day file (read with pandas, then converted to polars)
wd_raw = pl.from_pandas(
    pd.read_excel(f's3://{bucket}/BIT/working_day/Working Day Data for KMK_{working_day_file}.xlsx')
)
# Columns are renamed by position
wd_raw.columns = ['rep_name','rep_id','day','wd']

# Keep days from quarter_start through curr_date (inclusive)
wd_raw = wd_raw.with_columns(pl.col('day').cast(pl.Date))
wd_raw = wd_raw.filter((pl.col('day') >= quarter_start) & (pl.col('day') <= curr_date))

# Sum the `wd` values per rep
wd_raw = wd_raw.group_by(['rep_name','rep_id']).agg(days_in_field=pl.col('wd').sum())

  warn("Workbook contains no default style, apply openpyxl's default")


In [11]:
# Roster: EmpCode -> SalesRepIID lookup, with both columns cast to Int64
roster = pl.read_parquet(
    f's3://{bucket}/BIT/roster/MasterRoster_{roster_file}.parquet',
    columns=['EmpCode','SalesRepIID'],
).with_columns([
    pl.col('EmpCode').cast(pl.Int64),
    pl.col('SalesRepIID').cast(pl.Int64),
])

# Attach SalesRepIID to the working-day totals (rep_id matched against EmpCode), then save
wd_raw = wd_raw.join(roster, left_on='rep_id', right_on='EmpCode', how='left')
wd_raw.to_pandas().to_parquet(dflib + 'wd_raw.parquet')

##### call plan file import 
- importing and exporting ironwood call plan (we use the call_freq column here)

In [12]:
# Read only the two needed columns of the call plan
lirwd_call_plan = pl.read_parquet(
    lincall + 'IRWD_CALL_PLAN.parquet',
    columns=['IID','CALL_FREQ'],
)
# Columns are renamed by position
lirwd_call_plan.columns = ['IID','call_freq_quarter']

# Save a copy alongside the other intermediate dataframes
lirwd_call_plan.to_pandas().to_parquet(dflib + 'lirwd_call_plan.parquet')

In [13]:
# Drop every intermediate frame to make space for the RX import
del wd_raw, lirwd_call_plan, temp_abbv, temp_calls, temp_samples, active_calls
gc.collect()

14

In [14]:
# Importing mp so that we can pull in geography_id and roll up the rx data
mp_spec_seg_dec = pl.read_parquet(dflib + 'mp_spec_seg_dec.parquet')

# Columns to read: IID plus LINFTUF1 .. LINFTUF<num_weeks_rx>
weekly_cols = ['LINFTUF' + str(i) for i in range(1, num_weeks_rx + 1)]
cols = ['IID'] + weekly_cols

# Sum the weekly columns per IID, attach geography_id, then total per geography_id
laxdn = (
    pl.read_parquet(xpn + 'LAX_DN.parquet', columns=cols)
    .with_columns(wk_qtd=pl.sum_horizontal(weekly_cols))
    .select(['IID','wk_qtd'])
    .join(mp_spec_seg_dec.select(['IID','geography_id']), on='IID', how='left')
    .group_by('geography_id')
    .agg(wk_qtd=pl.col('wk_qtd').sum())
)
laxdn.to_pandas().to_parquet(dflib + 'laxdn_geoid_sum.parquet')