Import Libraries

In [1]:
import pandas as pd
import datetime
from datetime import datetime, timedelta
import os
import polars as pl
import gc
import numpy as np
from tqdm import tqdm
import pyarrow as pa
import pyarrow.parquet as pq

import s3fs
import boto3
from io import BytesIO as bo

pl.enable_string_cache()

Date Variables

In [2]:
# Processing-week anchors, all derived from the day the notebook is run.
this_day = datetime.today()
# To replay an earlier cycle while testing, shift the anchor back, e.g.:
# this_day = this_day - timedelta(days=7)

# weekday() is 0 on Monday, so it already equals the number of days back to Monday.
days_to_monday = this_day.weekday()
monday = this_day - timedelta(days=days_to_monday)

# All week labels below are formatted as YYYYMMDD strings.
CUR_PROC_WK = monday.strftime("%Y%m%d")

# Monday of the week before the current processing week.
PRE_PROC_WK0 = monday - timedelta(days=7)
PRE_PROC_WK = PRE_PROC_WK0.strftime("%Y%m%d")

# 17 days before a Monday always lands on a Friday.
CUR_WK0 = monday - timedelta(days=17)
CUR_WK = CUR_WK0.strftime("%Y%m%d")

# One further week back (24 days before the processing Monday, also a Friday).
PRE_WK0 = monday - timedelta(days=24)
PRE_WK = PRE_WK0.strftime("%Y%m%d")


In [3]:
# S3 bucket name; every s3:// URI read or written in this notebook is built from it.
bucket = 'vortex-staging-a65ced90'

Library Names and Paths

In [4]:
# Key prefixes inside `bucket`. Each one ends with '/' so a file name can be
# appended directly when building an s3:// URI.
raw_path = f'PYADM/raw/{CUR_PROC_WK}/inbound/'        # current processing week, inbound files
curwk = f'PYADM/raw/{CUR_PROC_WK}/dataframes/'        # current processing week, intermediate parquet frames (read below)
prewk = f'PYADM/raw/{PRE_PROC_WK}/dataframes/'        # same location for the previous processing week
wkxpn = 'PYADM/weekly/staging/xponent/'               # destination of the weekly LAX.parquet written below
mthxpn = 'PYADM/monthly/staging/xponent/'             # destination of the monthly LAX.parquet written below
pwkxpn = f'PYADM/weekly/archive/{PRE_WK}/xponent/'    # weekly archive prefix keyed by PRE_WK

In [5]:
# Load the frame produced by the upstream Rx_2 step from the current-week
# dataframes prefix. It is the input to the weekly aggregation below.
WK_TMP1_LAX = pl.read_parquet(f's3://{bucket}/{curwk}temp_WK_TMP1_LAX.parquet') # NOTE: this source is subject to change; a full version may be added in future.

In [6]:
# Weekly roll-up: collapse WK_TMP1_LAX to one row per distinct combination of
# group_cols, summing every metric in sum_cols.
group_cols = ['IID','PROD_CD','ChannelID','ChannelName','WK_END_DATE','MKT_CD','MarketName','G_B','RO_TYPE','PFAM_CD','PROD_NAME']
sum_cols = ['TRX','NRX','TUN','NUN','TUF','NUF']

WK_TMP2_LAX = (
    WK_TMP1_LAX
    # Nulls are replaced by "" before grouping so that the blank-market
    # check below can compare against "".
    .fill_null("")
    .group_by(group_cols)
    .agg([pl.col(c).sum() for c in sum_cols])
    # Rows without a market code must not carry a market name either.
    .with_columns(
        pl.when(pl.col("MKT_CD") == "")
        .then(pl.lit(""))
        .otherwise(pl.col("MarketName"))
        .alias("MarketName")
    )
)

# NOTE: an earlier `WK_TMP2_LAX.sort(by=['IID','PROD_CD'])` was removed here.
# DataFrame.sort returns a new frame and the result was never assigned, so it
# only cost a full sort (time and memory) without changing WK_TMP2_LAX.

# The raw input is no longer needed; release it before the next steps.
del WK_TMP1_LAX
gc.collect()

10

In [7]:
# Load the weekly date-parameter table written by the previous step and attach
# it to the Rx data on WK_END_DATE. The column `I` used to build CH_WK later
# on is expected to come from this table (TODO confirm against the upstream step).
date_parm_wk = pl.read_parquet(f's3://{bucket}/{curwk}curwk_DATE_PARM_WK.parquet')
# Reduce WK_END_DATE to a plain date so it can be matched against the Rx data.
date_parm_wk = date_parm_wk.with_columns(pl.col('WK_END_DATE').dt.date()) 

# Both join keys must share a dtype: the join did not work until the Rx-side
# key was cast to Date as well.
WK_TMP2_LAX = WK_TMP2_LAX.with_columns(pl.col('WK_END_DATE').cast(pl.Date))

WK_TMP2_LAX = WK_TMP2_LAX.join(date_parm_wk,on='WK_END_DATE',how='left') # TODO(review): the monthly flow uses how='inner'; with 'left', weeks missing from date_parm_wk keep a null `I` - decide whether this should be inner too.

In [8]:
# ChannelID -> channel code: "1" is RTL, "2" is MO.
chnl_dict = {"1": 'RTL', "2": 'MO'}

# Build the pivot key CH_WK = "<channel code>_<I>".
# Expr.replace does the dictionary lookup inside polars instead of calling a
# Python lambda once per row (map_elements). Like the former
# `chnl_dict.get(x, x)`, it leaves values that are not dictionary keys unchanged.
# The cast to Utf8 keeps the lookup on plain strings, matching the string keys.
WK_TMP3_LAX = WK_TMP2_LAX.with_columns(
    (
        pl.col('ChannelID').cast(pl.Utf8).replace(chnl_dict)
        + "_"
        + pl.col("I").cast(pl.Utf8)
    ).alias('CH_WK')
)

# The pre-join frame is no longer needed; release it.
del WK_TMP2_LAX
gc.collect()

0

In [9]:
# Keep only what the pivot below needs: IID, PROD_CD, MarketName, CH_WK and
# the metric columns. Everything else is dropped to reduce memory.
# ('ChannelName' used to be listed twice; each column is now named once.)
WK_TMP3_LAX = WK_TMP3_LAX.drop(
    ['ChannelID','ChannelName','WK_END_DATE','I','MKT_CD','G_B','RO_TYPE','PROD_NAME','PFAM_CD']
)
# Sort by IID before the IID-batched pivot that follows.
WK_TMP3_LAX = WK_TMP3_LAX.sort('IID')

In [10]:
# QC: count how many distinct weeks each channel contributes to CH_WK.
# Every CH_WK value has the form "<channel>_<week index>".
ucols = list(WK_TMP3_LAX['CH_WK'].unique())
tdf = pl.DataFrame({
    'chn' : [s.split('_')[0] for s in ucols],
    'wk' : [s.split('_')[1] for s in ucols],
})
tdf1 = tdf.filter(pl.col('chn')=='MO').sort(by='wk')
tdf2 = tdf.filter(pl.col('chn')=='RTL').sort(by='wk')
# tdf1 holds the MO rows and tdf2 the RTL rows. The two counts were previously
# printed under each other's label.
print("Number of Weeks of Data for RTL type records : ",tdf2.shape[0])
print("Number of Weeks of Data for MO type records : ",tdf1.shape[0])

Number of Weeks of Data for RTL type records :  105
Number of Weeks of Data for MO type records :  105


In [11]:
# Build the complete list of columns every pivoted chunk must end up with:
# the three id columns followed by one "<channel>_<metric><week>" name per
# metric for each distinct CH_WK value.
unique_vals = list(WK_TMP3_LAX['CH_WK'].unique())
metrics = ['TRX','NRX','TUF','NUF','TUN','NUN']
full_unique_vals = []

def unique_vals_prod_wk(col_name):
    """Append one '<channel>_<metric><period>' name per metric to full_unique_vals.

    col_name is a CH_WK value: the text before the first '_' is taken as the
    channel and the text after the last '_' as the period index.
    """
    pieces = col_name.split('_')
    channel, period = pieces[0], pieces[-1]
    full_unique_vals.extend(channel + '_' + m + period for m in metrics)

for i in unique_vals:
    unique_vals_prod_wk(i)

full_unique_vals = ['IID','PROD_CD','MarketName'] + sorted(full_unique_vals)

In [12]:
# Split the distinct IIDs into consecutive batches of at most chunk_size
# values; the pivot below processes one batch at a time.
unique_iids = WK_TMP3_LAX['IID'].unique()
chunk_size = 30000
iid_chunks = []
for start in range(0, len(unique_iids), chunk_size):
    iid_chunks.append(unique_iids[start:start + chunk_size])

In [13]:
# Pivot WK_TMP3_LAX to one row per (IID, PROD_CD, MarketName) with one column
# per channel/metric/week, add MO+RTL totals per metric and week, and stream
# the result to <wkxpn>LAX.parquet in batches of IID chunks.

# Loop-invariant definitions: these used to be re-created on every iteration.
id_columns = ['IID','PROD_CD','MarketName']
metrics = ['TRX','NRX','TUF','NUF','TUN','NUN']
prefixes = ['MO', 'RTL']
target_uri = f's3://{bucket}/{wkxpn}LAX.parquet'


def rename_pivot_column(col_name):
    """Turn a pivot output name into '<channel>_<metric><week>'.

    Names containing 'MO' or 'RTL' are rebuilt from their '_'-separated parts:
    second-to-last part (channel), first part (metric), last part (week).
    Any other name is returned unchanged.
    """
    if ('MO' in col_name) or ('RTL' in col_name):
        parts = col_name.split('_')
        return parts[-2] + '_' + parts[0] + parts[-1]
    return col_name


def sort_key(col):
    """Order channel columns by (channel, metric, numeric week index)."""
    for prefix in prefixes:
        for metric in metrics:
            if f'{prefix}_{metric}' in col:
                return (prefix, metric, int(col.replace(f'{prefix}_{metric}', '')))
    # Falling through used to return None, which only showed up later as an
    # opaque comparison error inside sort(); fail with the offending name instead.
    raise ValueError(f'Unexpected pivot column name: {col!r}')


def append_to_parquet(writer, frame, uri):
    """Write frame to the parquet file at uri, opening the writer on first use.

    Returns the writer so the caller can keep using and finally close it.
    """
    table = frame.to_arrow()
    if writer is None:
        writer = pq.ParquetWriter(uri, table.schema)
    writer.write_table(table)
    return writer


writer = None
wkxpn_LAX = pl.DataFrame()
loop_counter = 0

for iid_chunk in tqdm(iid_chunks):

    df_chunk = WK_TMP3_LAX.filter(pl.col('IID').is_in(iid_chunk))

    df_pivot_chunk = df_chunk.pivot(
        values = metrics,
        index = id_columns,
        columns = 'CH_WK',
        aggregate_function = None, sort_columns = True
    )

    df_pivot_chunk = df_pivot_chunk.select(pl.all().name.map(rename_pivot_column))

    # Every chunk must expose the same columns so the parquet schema stays
    # stable. Channel/week combinations absent from this chunk are added as
    # all-null Float64 columns, in a single with_columns call.
    present_cols = set(df_pivot_chunk.columns)
    missing_cols = [col for col in full_unique_vals if col not in present_cols]
    if missing_cols:
        n_rows = len(df_pivot_chunk)
        df_pivot_chunk = df_pivot_chunk.with_columns(
            [pl.Series(col, [None]*n_rows, dtype=pl.Float64) for col in missing_cols]
        )

    # Enforce a deterministic column order: id columns, then channel columns
    # sorted by channel, metric and numeric week.
    code_columns = [col for col in df_pivot_chunk.columns if (('MO' in col) or ('RTL' in col))]
    code_columns.sort(key=sort_key)
    code_columns = id_columns + code_columns
    df_pivot_chunk = df_pivot_chunk.select(code_columns)

    # Per metric and week, add '<metric><week>' as the sum across channels.
    # All expressions are collected first and applied in one pass; the
    # resulting column order is the same as adding them one by one.
    total_exprs = []
    for metric in metrics:
        relevant_columns = [col for col in code_columns if any(prefix+'_'+metric in col for prefix in prefixes)]
        week_numbers = sorted(set(int(col.split(metric)[-1]) for col in relevant_columns))
        for week_number in week_numbers:
            new_column = metric+str(week_number)
            week_columns = [col for col in relevant_columns if col.endswith(new_column)]
            total_exprs.append(pl.sum_horizontal(week_columns).alias(new_column))
    if total_exprs:
        df_pivot_chunk = df_pivot_chunk.with_columns(total_exprs)

    # Buffer the finished chunk; totals had to be added before stacking.
    wkxpn_LAX = wkxpn_LAX.vstack(df_pivot_chunk)

    # Flush the buffer whenever the counter reaches a non-zero multiple of 10.
    if loop_counter % 10 == 0 and loop_counter != 0:
        writer = append_to_parquet(writer, wkxpn_LAX, target_uri)
        wkxpn_LAX = pl.DataFrame()

    loop_counter += 1

# Write any remaining chunks to the Parquet file
if len(wkxpn_LAX) > 0:
    writer = append_to_parquet(writer, wkxpn_LAX, target_uri)

# Close the ParquetWriter
if writer is not None:
    writer.close()

100%|██████████| 26/26 [02:16<00:00,  5.26s/it]


In [14]:
# The weekly output has been written; drop the weekly working frame and ask
# the garbage collector to reclaim memory before the monthly section starts.
del WK_TMP3_LAX
gc.collect()

0

In [15]:
# POTENTIAL SOLUTION 1 - FOR IW DEV - Read Me ----------------------------
# PIVOT THE DATA 2 METRICS AT A TIME AND COMBINE THE RESULTS COLUMN-WISE AT THE END
# Test results - the system is able to pivot the data successfully, and the shape of the datasets
# after pivoting for TRX,NRX | TUF,NUF | TUN,NUN is the same - so technically they can be combined.
# But practical runs suggest that the kernel is unable to allocate any more memory after just 1 pivot operation.
# Workaround - store df_pivot_1 externally and delete it, do the same for 2 and 3, then
# read, combine and export in memory,
# or come up with a way to do the same outside memory.
# Time taken for a single pivot action was pretty reasonable, around 30 sec, considering that this is done on data containing
# ALL HCPs for ALL products and weeks. POTENTIAL SOLUTION 1 is a good approach for future RAM upgrades.
##### pseudo code - 
# df_pivot_1 = sub_test_1.pivot(values = ['TRX','NRX'], index = ['IID','PROD_CD','MarketName'], columns = 'CH_WK', aggregate_function = None, sort_columns = True)
# df_pivot_2 = sub_test_1.pivot(values = ['TUF','NUF'], index = ['IID','PROD_CD','MarketName'], columns = 'CH_WK', aggregate_function = None, sort_columns = True)
# df_pivot_3 = sub_test_1.pivot(values = ['TUN','NUN'], index = ['IID','PROD_CD','MarketName'], columns = 'CH_WK', aggregate_function = None, sort_columns = True)
# df_final = pd.concat([df_pivot_1, df_pivot_2, df_pivot_3], axis=1)

### Monthly Level

In [16]:
# Re-read the Rx_2 frame for the monthly roll-up: the copy loaded for the
# weekly section was deleted after aggregation. (Previously read from a local
# path; now read from S3.)
WK_TMP1_LAX= pl.read_parquet(f's3://{bucket}/{curwk}temp_WK_TMP1_LAX.parquet')

In [17]:
# Monthly roll-up: same aggregation as the weekly section, but keyed by
# CCYYMM instead of WK_END_DATE.
group_cols = ['IID','PROD_CD','ChannelID','ChannelName','CCYYMM','MKT_CD','MarketName','G_B','RO_TYPE','PFAM_CD','PROD_NAME']
sum_cols = ['TRX','NRX','TUN','NUN','TUF','NUF']

MTH_TMP2_LAX = (
    WK_TMP1_LAX
    # Nulls are replaced by "" before grouping so that the blank-market
    # check below can compare against "".
    .fill_null("")
    .group_by(group_cols)
    .agg([pl.col(c).sum() for c in sum_cols])
    # Rows without a market code must not carry a market name either.
    .with_columns(
        pl.when(pl.col("MKT_CD") == "")
        .then(pl.lit(""))
        .otherwise(pl.col("MarketName"))
        .alias("MarketName")
    )
)

# NOTE: an earlier `MTH_TMP2_LAX.sort(by=['IID','PROD_CD'])` was removed here.
# DataFrame.sort returns a new frame and the result was never assigned, so it
# only cost a full sort (time and memory) without changing MTH_TMP2_LAX.

# The raw input is no longer needed; release it before the next steps.
del WK_TMP1_LAX
gc.collect()

0

In [18]:
# Load the monthly date-parameter table written by the previous step and
# attach it to the Rx data on CCYYMM. The column `I` used to build CH_MT
# later on is expected to come from this table (TODO confirm against the upstream step).
# (Previously read from a local path; now read from S3.)
date_parm_mt = pl.read_parquet(f's3://{bucket}/{curwk}curwk_DATE_PARM_MTH.parquet')
# Inner join: rows whose CCYYMM is not present in date_parm_mt are dropped.
MTH_TMP2_LAX = MTH_TMP2_LAX.join(date_parm_mt,on='CCYYMM',how='inner') 

In [19]:
# INVESTIGATE - THE DATE PARM JOIN HAD TO BE MADE INNER AT MONTH LEVEL BUT IS STILL A LEFT JOIN AT WEEK LEVEL

In [20]:
# ChannelID -> channel code: "1" is RTL, "2" is MO.
chnl_dict = {"1": 'RTL', "2": 'MO'}

# Build the pivot key CH_MT = "<channel code>_<I>".
# Expr.replace does the dictionary lookup inside polars instead of calling a
# Python lambda once per row (map_elements). Like the former
# `chnl_dict.get(x, x)`, it leaves values that are not dictionary keys unchanged.
# The cast to Utf8 keeps the lookup on plain strings, matching the string keys.
MTH_TMP3_LAX = MTH_TMP2_LAX.with_columns(
    (
        pl.col('ChannelID').cast(pl.Utf8).replace(chnl_dict)
        + "_"
        + pl.col("I").cast(pl.Utf8)
    ).alias('CH_MT')
)
# The pre-pivot-key frame is no longer needed; release it.
del MTH_TMP2_LAX
gc.collect()

0

In [21]:
# Keep only what the pivot below needs: IID, PROD_CD, MarketName, CH_MT and
# the metric columns. Everything else is dropped to reduce memory.
# ('ChannelName' used to be listed twice; each column is now named once.)
MTH_TMP3_LAX = MTH_TMP3_LAX.drop(
    ['ChannelID','ChannelName','CCYYMM','I','MKT_CD','G_B','RO_TYPE','PROD_NAME','PFAM_CD','DATE_AS_OF']
)
# Sort by IID before the IID-batched pivot that follows.
MTH_TMP3_LAX = MTH_TMP3_LAX.sort('IID')

In [22]:
# QC: count how many distinct months each channel contributes to CH_MT.
# Every CH_MT value has the form "<channel>_<month index>".
ucols = list(MTH_TMP3_LAX['CH_MT'].unique())
tdf = pl.DataFrame({
    'chn' : [s.split('_')[0] for s in ucols],
    'mt' : [s.split('_')[1] for s in ucols],
})
tdf1 = tdf.filter(pl.col('chn')=='MO').sort(by='mt')
tdf2 = tdf.filter(pl.col('chn')=='RTL').sort(by='mt')
# tdf1 holds the MO rows and tdf2 the RTL rows. The two counts were previously
# printed under each other's label.
print("Number of Months of Data for RTL type records : ",tdf2.shape[0])
print("Number of Months of Data for MO type records : ",tdf1.shape[0])

Number of Months of Data for RTL type records :  24
Number of Months of Data for MO type records :  24


In [23]:
# Build the complete list of columns every pivoted chunk must end up with:
# the three id columns followed by one "<channel>_<metric><month>" name per
# metric for each distinct CH_MT value.
unique_vals = list(MTH_TMP3_LAX['CH_MT'].unique())
metrics = ['TRX','NRX','TUF','NUF','TUN','NUN']
full_unique_vals = []

def unique_vals_prod_wk(col_name):
    """Append one '<channel>_<metric><period>' name per metric to full_unique_vals.

    col_name is a CH_MT value: the text before the first '_' is taken as the
    channel and the text after the last '_' as the period index.
    """
    pieces = col_name.split('_')
    channel, period = pieces[0], pieces[-1]
    full_unique_vals.extend(channel + '_' + m + period for m in metrics)

for i in unique_vals:
    unique_vals_prod_wk(i)

full_unique_vals = ['IID','PROD_CD','MarketName'] + sorted(full_unique_vals)

In [24]:
# Split the distinct IIDs into consecutive batches of at most chunk_size
# values; the pivot below processes one batch at a time.
unique_iids = MTH_TMP3_LAX['IID'].unique()
chunk_size = 30000
iid_chunks = []
for start in range(0, len(unique_iids), chunk_size):
    iid_chunks.append(unique_iids[start:start + chunk_size])

In [25]:
# Pivot MTH_TMP3_LAX to one row per (IID, PROD_CD, MarketName) with one column
# per channel/metric/month, add MO+RTL totals per metric and month, and stream
# the result to <mthxpn>LAX.parquet in batches of IID chunks.

# Loop-invariant definitions: these used to be re-created on every iteration.
id_columns = ['IID','PROD_CD','MarketName']
metrics = ['TRX','NRX','TUF','NUF','TUN','NUN']
prefixes = ['MO', 'RTL']
target_uri = f's3://{bucket}/{mthxpn}LAX.parquet'


def rename_pivot_column(col_name):
    """Turn a pivot output name into '<channel>_<metric><month>'.

    Names containing 'MO' or 'RTL' are rebuilt from their '_'-separated parts:
    second-to-last part (channel), first part (metric), last part (month).
    Any other name is returned unchanged.
    """
    if ('MO' in col_name) or ('RTL' in col_name):
        parts = col_name.split('_')
        return parts[-2] + '_' + parts[0] + parts[-1]
    return col_name


def sort_key(col):
    """Order channel columns by (channel, metric, numeric month index)."""
    for prefix in prefixes:
        for metric in metrics:
            if f'{prefix}_{metric}' in col:
                return (prefix, metric, int(col.replace(f'{prefix}_{metric}', '')))
    # Falling through used to return None, which only showed up later as an
    # opaque comparison error inside sort(); fail with the offending name instead.
    raise ValueError(f'Unexpected pivot column name: {col!r}')


def append_to_parquet(writer, frame, uri):
    """Write frame to the parquet file at uri, opening the writer on first use.

    Returns the writer so the caller can keep using and finally close it.
    """
    table = frame.to_arrow()
    if writer is None:
        writer = pq.ParquetWriter(uri, table.schema)
    writer.write_table(table)
    return writer


writer = None
mthxpn_LAX = pl.DataFrame()
loop_counter = 0

for iid_chunk in tqdm(iid_chunks):

    df_chunk = MTH_TMP3_LAX.filter(pl.col('IID').is_in(iid_chunk))

    df_pivot_chunk = df_chunk.pivot(
        values = metrics,
        index = id_columns,
        columns = 'CH_MT',
        aggregate_function = None, sort_columns = True
    )

    df_pivot_chunk = df_pivot_chunk.select(pl.all().name.map(rename_pivot_column))

    # Every chunk must expose the same columns so the parquet schema stays
    # stable. Channel/month combinations absent from this chunk are added as
    # all-null Float64 columns, in a single with_columns call.
    present_cols = set(df_pivot_chunk.columns)
    missing_cols = [col for col in full_unique_vals if col not in present_cols]
    if missing_cols:
        n_rows = len(df_pivot_chunk)
        df_pivot_chunk = df_pivot_chunk.with_columns(
            [pl.Series(col, [None]*n_rows, dtype=pl.Float64) for col in missing_cols]
        )

    # Enforce a deterministic column order: id columns, then channel columns
    # sorted by channel, metric and numeric month.
    code_columns = [col for col in df_pivot_chunk.columns if (('MO' in col) or ('RTL' in col))]
    code_columns.sort(key=sort_key)
    code_columns = id_columns + code_columns
    df_pivot_chunk = df_pivot_chunk.select(code_columns)

    # Per metric and month, add '<metric><month>' as the sum across channels.
    # pl.sum_horizontal is used, as in the weekly flow. The former
    # `sum(pl.col(c) ...)` expanded to `0 + MO + RTL`, which is null whenever
    # either channel is null, so an IID present in only one channel lost its
    # monthly total.
    total_exprs = []
    for metric in metrics:
        relevant_columns = [col for col in code_columns if any(prefix+'_'+metric in col for prefix in prefixes)]
        month_numbers = sorted(set(int(col.split(metric)[-1]) for col in relevant_columns))
        for month_number in month_numbers:
            new_column = metric+str(month_number)
            month_columns = [col for col in relevant_columns if col.endswith(new_column)]
            total_exprs.append(pl.sum_horizontal(month_columns).alias(new_column))
    if total_exprs:
        df_pivot_chunk = df_pivot_chunk.with_columns(total_exprs)

    # Buffer the finished chunk; totals had to be added before stacking.
    mthxpn_LAX = mthxpn_LAX.vstack(df_pivot_chunk)

    # Flush the buffer whenever the counter reaches a non-zero multiple of 10.
    if loop_counter % 10 == 0 and loop_counter != 0:
        writer = append_to_parquet(writer, mthxpn_LAX, target_uri)
        mthxpn_LAX = pl.DataFrame()

    loop_counter += 1

# Write any remaining chunks to the Parquet file
if len(mthxpn_LAX) > 0:
    writer = append_to_parquet(writer, mthxpn_LAX, target_uri)

# Close the ParquetWriter
if writer is not None:
    writer.close()

100%|██████████| 25/25 [00:34<00:00,  1.36s/it]
