Import Libraries

In [1]:
import pandas as pd
import datetime
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import os
import polars as pl
import gc
import numpy as np
from tqdm import tqdm
import pyarrow as pa
import pyarrow.parquet as pq
import s3fs
import boto3
from io import BytesIO as bo

Date Variables

In [2]:
# Date variables: every key below is derived from the Monday of the current week.
this_day = datetime.today()
# ### FOR TESTING - REMOVE LATER ###
# this_day = this_day - timedelta(days=7)
# ####

# weekday() is 0 on Monday, so it already equals the number of days since Monday.
days_to_monday = this_day.weekday()
monday = this_day - timedelta(days=days_to_monday)

# Processing weeks: this week's Monday and the Monday before it, as CCYYMMDD strings.
CUR_PROC_WK = monday.strftime("%Y%m%d")
PRE_PROC_WK0 = monday - timedelta(days=7)
PRE_PROC_WK = PRE_PROC_WK0.strftime("%Y%m%d")

# Data weeks: 17 and 24 days before this week's Monday, as CCYYMMDD strings.
CUR_WK0 = monday - timedelta(days=17)
CUR_WK = CUR_WK0.strftime("%Y%m%d")
PRE_WK0 = monday - timedelta(days=24)
PRE_WK = PRE_WK0.strftime("%Y%m%d")

# Last day of the month preceding CUR_WK0's month.
month_end_date = CUR_WK0.replace(day=1) - timedelta(days=1)

# "Month ending" means CUR_WK0 falls fewer than 7 days after that month end.
is_month_ending = 0 <= (CUR_WK0 - month_end_date).days < 7

# CCYYMM_OFF is CUR_WK0's own month when month ending, otherwise the following month.
month_off_start = (
    CUR_WK0.replace(day=1)
    if is_month_ending
    else CUR_WK0.replace(day=1) + relativedelta(months=1)
)
CCYYMM_OFF = month_off_start.strftime("%Y%m")



In [3]:
# S3 bucket used by every parquet read and write in the cells below.
bucket = "vortex-staging-a65ced90"

Library names and paths

In [4]:
# S3 key prefixes, relative to the bucket. Each ends with '/' so a file name can be appended directly.
raw_path = 'PYADM/raw/{}/inbound/'.format(CUR_PROC_WK)       # current processing week, inbound files
curwk = 'PYADM/raw/{}/dataframes/'.format(CUR_PROC_WK)       # current processing week, dataframes
prewk = 'PYADM/raw/{}/dataframes/'.format(PRE_PROC_WK)       # previous processing week, dataframes
wkxpn = 'PYADM/weekly/staging/xponent/'                      # weekly staging area
mthxpn = 'PYADM/monthly/staging/xponent/'                    # monthly staging area
pwkxpn = 'PYADM/weekly/archive/{}/xponent/'.format(PRE_WK)   # weekly archive keyed by PRE_WK

In [5]:
# Picking up data from Rx_2 (monthly).
# The source of this file is subject to change; a full version may be added in future.
mthxpn_LAX_N = (
    pl.read_parquet(f's3://{bucket}/{curwk}mthxpn_LAX_N.parquet')
    # Exclude the CCYYMM_OFF month.
    .filter(pl.col('CCYYMM') != pl.lit(CCYYMM_OFF))
    # Keep only rows with a product code: the comparison is null for a null
    # PROD_CD, and filter() drops null results along with empty strings.
    .filter(pl.col('PROD_CD') != "")
)

# Monthly date parameters; DATE_AS_OF is not needed downstream.
date_parm_mth = pl.read_parquet(f's3://{bucket}/{curwk}curwk_DATE_PARM_MTH.parquet').drop('DATE_AS_OF')

# Attach the date parameters by month, then drop the descriptive columns that are not transposed.
LAX_N_1 = (
    mthxpn_LAX_N
    .join(date_parm_mth, on='CCYYMM', how='inner')
    .drop(['MKT_CD','MarketName','G_B','PFAM_NAME','PROD_NAME','WK_END_DATE','RO_TYPE'])
)

In [6]:
# INVESTIGATE - 3% of the data had a period key outside the date range. Why?
#             - Is the number of HCPs the same or not? [Chunks were 100 instead of 101]

In [7]:
metrics = ['TRX','NRX','TUN','NUN','TUF','NUF']

# PROD_MT is the key the data is transposed on: PROD_CD, then "P_", then I
# (presumably the month number of the transaction - confirm against the date parameters).
prod_mt_expr = pl.col("PROD_CD").cast(pl.Utf8) + "P_" + pl.col("I").cast(pl.Utf8)
df_with_new_cols = LAX_N_1.with_columns(prod_mt_expr.alias("PROD_MT"))

# PROD_MT now carries the product and period information, so these columns are no longer needed.
df_dropped = df_with_new_cols.drop(['PFAM_CD','PROD_CD','I'])

# Sort at IID level so the data can be chunked and filtered effectively.
df_sorted = df_dropped.sort('IID')

# Memory protection: release the joined frame now that df_sorted has been derived from it.
del LAX_N_1
gc.collect()

0

In [8]:
# 'full_unique_vals' lists every column name that can exist after transposing.
# It is used to give every chunk the same shape and column order, so the chunks
# can be stacked on each iteration.

# Distinct PROD_MT keys, taken from the full data set.
unique_vals = list(df_sorted['PROD_MT'].unique())
# NOTE: some periods may be missing from the data entirely; those columns may
# have to be added manually at some point.

full_unique_vals = []

def unique_vals_prod_wk(col_name):
    """Append one final column name per metric for a single PROD_MT key.

    The key is split on '_'; its first and last pieces are wrapped around each
    metric code, e.g. 'LI1P_5' yields 'LI1PTRX5', 'LI1PNRX5', ...
    Results are appended to the module-level ``full_unique_vals`` list.
    """
    pieces = col_name.split('_')
    full_unique_vals.extend(pieces[0] + m + pieces[-1] for m in metrics)

for prod_mt_key in unique_vals:
    unique_vals_prod_wk(prod_mt_key)

# Could add a modifier here to check for and enforce the full set of periods?

# Sorted column names with 'IID' first: the list also fixes the column order of every chunk.
full_unique_vals[:] = ['IID'] + sorted(full_unique_vals)

In [9]:
def split_col_names(value):
    """Split a transposed column name into (prod, metric, mtnum) by fixed position.

    Characters 0-2 are the product, character 3 is skipped, characters 4-6 are
    the metric and the remainder is the month number,
    e.g. 'LI1PTUF12' -> ('LI1', 'TUF', '12').
    Assumes 3-character product codes - TODO confirm for all products.
    """
    prod = value[:3]
    metric = value[4:7]
    mtnum = value[7:]
    return prod, metric, mtnum

# One row per transposed column name. full_unique_vals[0] is 'IID', so it is skipped.
# The frame is built straight from the Python tuples instead of round-tripping
# through an Object-typed column and per-element UDFs.
mt_tst = pl.DataFrame(
    [split_col_names(col_name) for col_name in full_unique_vals[1:]],
    schema={'prod': pl.Utf8, 'metric': pl.Utf8, 'mtnum': pl.Utf8},
    orient='row',
)

# Count distinct month numbers per (prod, metric) and keep those that do not have 24.
res = mt_tst.group_by(['prod','metric']).agg([pl.col('mtnum').n_unique().alias('num_of_mnts')])
missing_mtnum = res.filter(pl.col('num_of_mnts') != 24).sort(by='prod')

# maintain_order keeps the sorted product order, so the report is reproducible between runs.
missmps = list(missing_mtnum['prod'].unique(maintain_order=True))
print("Number of products in data which do not have 24 months of data : ",len(missmps))
print(missmps)
missing_mtnum_print = (
    missing_mtnum
    .select(pl.col(['prod','num_of_mnts']))
    .unique(subset=['prod','num_of_mnts'], maintain_order=True)
)
print(missing_mtnum_print)

# Distinct (prod, mtnum) pairs for the affected products, with mtnum as an integer.
wk_conti = (
    mt_tst
    .filter(pl.col('prod').is_in(missmps))
    .drop('metric')
    .unique(subset=['prod','mtnum'])
    .with_columns(pl.col("mtnum").cast(pl.Int32))
)

print('BUT !  - ')
for prod in missmps:
    f1 = wk_conti.filter(pl.col('prod')== prod )
    # A product is flagged when its number of distinct months differs from its
    # highest month number.
    # NOTE(review): this treats any series that does not start at month 1 as having
    # gaps, and the message says "weeks" although the values are month numbers - confirm intent.
    if f1['mtnum'].n_unique() != f1['mtnum'].max():
        print(prod," has gaps in weeks")
    

Number of products in data which do not have 24 months of data :  1
['ZEL']
shape: (1, 2)
┌──────┬─────────────┐
│ prod ┆ num_of_mnts │
│ ---  ┆ ---         │
│ str  ┆ u32         │
╞══════╪═════════════╡
│ ZEL  ┆ 14          │
└──────┴─────────────┘
BUT !  - 
ZEL  has gaps in weeks


In [10]:
# Distinct IIDs; their count is the number of rows in the final result.
unique_iids = df_sorted['IID'].unique()

# Each chunk covers this many HCPs (distinct IIDs), NOT rows - row counts may differ per chunk.
chunk_size = 30000

# iid_chunks is a list of consecutive slices of unique_iids, each holding up to
# chunk_size IIDs; the number of slices is the number of chunks.
chunk_starts = range(0, len(unique_iids), chunk_size)
iid_chunks = [unique_iids[start:start + chunk_size] for start in chunk_starts]
#So IID_chunks is a list of lists, each list contains 5000 HCPs and number of lists is our number of chunks

In [11]:
def _flat_metric_name(col_name):
    """Rename a pivot output column to the flat '<product><metric><month>' form.

    Names containing 'PROD_MT_' are split on '_' and rebuilt from pieces 3, 0 and
    the last one, e.g. 'TRX_PROD_MT_LI1P_5' -> 'LI1PTRX5'. Any other name (such
    as 'IID') is returned unchanged.
    """
    if 'PROD_MT_' not in col_name:
        return col_name
    parts = col_name.split('_')
    return parts[3] + parts[0] + parts[-1]

writer = None  # pyarrow ParquetWriter, created lazily once the first table's schema is known
df_final2 = pl.DataFrame()  # buffer of pivoted chunks that have not been written yet
loop_counter = 0  # index of the chunk being processed

# try/finally guarantees the writer is closed even if a chunk fails part-way,
# so the output handle is never left open.
try:
    for iid_chunk in tqdm(iid_chunks):

        # Transpose this chunk: one row per IID, one column per (metric, PROD_MT) pair.
        df_chunk = df_sorted.filter(pl.col('IID').is_in(iid_chunk))
        df_pivot_chunk = df_chunk.pivot(values=metrics,index='IID',columns='PROD_MT',maintain_order=True,sort_columns=True)
        df_pivot_chunk = df_pivot_chunk.select(pl.all().name.map(_flat_metric_name))

        # Standardize the shape: add every expected column this chunk lacks as an
        # all-null Float64 column, in a single with_columns call.
        # NOTE(review): assumes the pivoted metric columns are Float64 as well, otherwise
        # chunk schemas would differ - confirm.
        present_cols = set(df_pivot_chunk.columns)
        missing_cols = [col for col in full_unique_vals if col not in present_cols]
        if missing_cols:
            df_pivot_chunk = df_pivot_chunk.with_columns(
                [pl.lit(None, dtype=pl.Float64).alias(col) for col in missing_cols]
            )

        # Same column order for every chunk so they can be stacked.
        df_pivot_chunk = df_pivot_chunk.select(full_unique_vals)

        df_final2 = df_final2.vstack(df_pivot_chunk)

        if loop_counter % 10 == 0 and loop_counter != 0: # THIS CONTROLS HOW MANY CHUNKS TO APPEND BEFORE WRITING
            table = df_final2.to_arrow()
            if writer is None:
                writer = pq.ParquetWriter(f's3://{bucket}/{curwk}df_final2.parquet',table.schema)
            writer.write_table(table)
            df_final2 = pl.DataFrame() # Reset the buffer after writing to file

        loop_counter += 1

    # Write any remaining chunks to the Parquet file
    if len(df_final2) > 0:
        table = df_final2.to_arrow()
        if writer is None:
            writer = pq.ParquetWriter(f's3://{bucket}/{curwk}df_final2.parquet',table.schema)
        writer.write_table(table)
finally:
    # Close the ParquetWriter
    if writer is not None:
        writer.close()


100%|██████████| 18/18 [00:32<00:00,  1.80s/it]


In [12]:
# Roll-up definitions: each key is a roll-up code and each value lists the
# product codes whose columns are summed into it.
# (Presumably keys ending in 'F' are product families and keys ending in 'M' are markets - confirm.)
prod_family_market_buckets = dict(
    MRXF=["MRGP", "MRBP", "GLYP"],
    LINF=["LI1P", "LI2P", "LI3P"],
    LUBF=["AMTP", "LUBP"],
    GENM=["FLXP", "LACP", "LUBP", "MRGP", "GLYP"],
    BRDM=["AMTP", "MRBP", "LI1P", "LI2P", "LI3P", "TRUP", "MOTP", "ZELP", "IRLP"],
    LAXM=[
        "AMTP", "FLXP", "LACP", "LUBP", "MRGP", "MRBP", "LI1P",
        "LI2P", "LI3P", "TRUP", "GLYP", "MOTP", "ZELP", "IRLP",
    ],
)

# Metric codes embedded in the transposed column names.
metrics = ['TRX', 'NRX', 'TUN', 'NUN', 'TUF', 'NUF']

In [13]:
def _family_month_columns(all_columns, family_buckets, metric_names):
    """Plan the roll-up columns: map each new column name to the source columns it sums.

    Args:
        all_columns: column names of the transposed frame (e.g. 'LI1PTRX5').
        family_buckets: dict of roll-up code -> list of product codes.
        metric_names: metric codes embedded in the column names.

    Returns:
        dict of '<roll-up><metric><month>' -> list of source column names, ordered
        by roll-up, then metric, then ascending month number.
    """
    plan = {}
    for prod_family, prod_codes in family_buckets.items():
        for metric in metric_names:
            columns_by_month = {}
            for col in all_columns:
                if any(prod_code + metric in col for prod_code in prod_codes):
                    # Group on the parsed month number rather than re-matching a string
                    # suffix, so a zero-padded suffix ('01') still lands under month 1.
                    month_number = int(col.split(metric)[-1])
                    columns_by_month.setdefault(month_number, []).append(col)
            for month_number in sorted(columns_by_month):
                plan[prod_family + metric + str(month_number)] = columns_by_month[month_number]
    return plan

# Transposed product-level data written by the previous step.
parquet_file = pq.ParquetFile(f's3://{bucket}/{curwk}df_final2.parquet')

writer = None  # pyarrow ParquetWriter, created lazily once the first table's schema is known
mthxpn_LAX_DN = pl.DataFrame()  # buffer of processed batches that have not been written yet
loop_counter = 0  # index of the batch being processed
rollup_exprs = None  # roll-up expressions, built once from the first batch

# try/finally guarantees the writer is closed even if a batch fails part-way.
try:
    for batch in tqdm(parquet_file.iter_batches(batch_size=30000)):

        pl_batch = pl.from_arrow(batch)

        # Every batch comes from the same file and therefore has the same columns,
        # so the roll-up plan is computed only once.
        if rollup_exprs is None:
            rollup_plan = _family_month_columns(pl_batch.columns, prod_family_market_buckets, metrics)
            rollup_exprs = [
                pl.sum_horizontal(month_columns).alias(new_column)
                for new_column, month_columns in rollup_plan.items()
            ]

        # All roll-ups read only the product-level columns, so they are added in a
        # single with_columns call instead of one call per new column.
        pl_batch = pl_batch.with_columns(rollup_exprs)

        mthxpn_LAX_DN = mthxpn_LAX_DN.vstack(pl_batch)

        # Flush the buffer every 40 batches (never on the very first batch).
        if loop_counter % 40 == 0 and loop_counter != 0:
            table = mthxpn_LAX_DN.to_arrow()
            if writer is None:
                writer = pq.ParquetWriter(f's3://{bucket}/{mthxpn}LAX_DN.parquet', table.schema)
            writer.write_table(table)
            mthxpn_LAX_DN = pl.DataFrame()

        loop_counter += 1

    # Write any remaining batches to the Parquet file
    if len(mthxpn_LAX_DN) > 0:
        table = mthxpn_LAX_DN.to_arrow()
        if writer is None:
            writer = pq.ParquetWriter(f's3://{bucket}/{mthxpn}LAX_DN.parquet', table.schema)
        writer.write_table(table)
finally:
    # Close the ParquetWriter
    if writer is not None:
        writer.close()

# LAX_DN for the MONTHLY level is complete!

18it [01:37,  5.40s/it]
