### GS Trend Feed pt1

In [1]:
import polars as pl
import pandas as pd
import gc
import json
from datetime import datetime, timedelta,date

In [2]:
# Run configuration: every tunable for this notebook comes from vars_wk.json.
with open('vars_wk.json') as json_file:
    js = json.load(json_file)

# Pull out the three settings used below.
data_date, num_weeks_rx, bucket = (
    js['data_date'],
    js['num_weeks_rx'],
    js['bucket'],
)

# S3 prefixes derived from the configured bucket and data date.
dflib = f's3://{bucket}/BIT/dataframes/'
xpn = f's3://{bucket}/PYADM/weekly/archive/{data_date}/xponent/'

In [3]:
# Utility functions
def load(df, lib=dflib):
    """Read ``{lib}{df}.parquet`` with polars and publish it as a global.

    Parameters
    ----------
    df : str
        Base name of the parquet file (without extension). The same string is
        used as the name of the module-level variable that receives the frame.
    lib : str
        Path prefix the file name is appended to; defaults to ``dflib`` as it
        was when this function was defined.

    Returns
    -------
    None. The DataFrame is stored in ``globals()[df]`` as a side effect.
    """
    parquet_path = f'{lib}{df}.parquet'
    frame = pl.read_parquet(parquet_path)
    globals()[df] = frame

In [4]:
# Importing dependencies
# Only these product codes are read from the LAX file.
fetch_products = ['LI1','LI2','LI3','TRU','AMT','LAC','MOT','LUB','IRL']

# Pipe-delimited product mapping (read later for 'code', 'product_id', 'parent_product_id').
prod_mapping = pl.read_csv(
    f's3://{bucket}/BIT/docs/productmapping_pybit.txt',
    separator='|',
)

# Geography hierarchy workbook, read through pandas and converted to polars.
geo_code_mapper = pl.from_pandas(
    pd.read_excel(f's3://{bucket}/BIT/docs/geo_id_full.xlsx')
)

# Publishes the global `mp_spec_seg_dec` (see load()).
load('mp_spec_seg_dec')

### Generator Functions

In [5]:
def get_volumes_geos(metric,prod_cd):
    """Aggregate 13 periods of one LAX metric at four geography levels.

    Reads ``IID``, ``PROD_CD`` and the columns ``{metric}1`` .. ``{metric}13``
    from ``LAX.parquet`` under ``xpn``, keeps the rows whose ``PROD_CD`` is in
    ``prod_cd``, renames the metric columns to ``Vol{i}_{metric}``, attaches
    the columns of the global ``mp_spec_seg_dec`` by ``IID`` and drops rows
    that end up without a ``geography_id``.

    Parameters
    ----------
    metric : str
        Column-name prefix of the metric in the LAX file (called with 'TUF'
        and 'NUF' in this notebook).
    prod_cd : list
        Product codes to keep.

    Returns
    -------
    tuple of four polars DataFrames ``(df_terr, df_reg, df_area, df_nation)``.
    Each is grouped by its geography key (``geography_id``,
    ``region_geography_id``, ``area_geography_id``, ``nation_geography_id``)
    plus ``specialty_group``, ``segment``, ``decile`` and ``PROD_CD``, with the
    13 volume columns summed. Parent keys are looked up in the global
    ``geo_code_mapper``.
    """
    periods = range(1,14)
    source_cols = [metric+str(n) for n in periods]
    volume_cols = [f'Vol{n}_{metric}' for n in periods]
    dimension_cols = ['specialty_group','segment','decile','PROD_CD']
    agg_dict = {name: pl.col(name).sum() for name in volume_cols}

    lax = pl.read_parquet(xpn+'LAX.parquet',columns=['IID','PROD_CD'] + source_cols)
    lax = lax.filter(pl.col('PROD_CD').is_in(prod_cd))
    lax = lax.rename(dict(zip(source_cols,volume_cols)))

    # Adding MP related columns; rows with no geography assignment are dropped.
    lax = lax.join(mp_spec_seg_dec,on='IID',how='left')
    lax = lax.filter(pl.col('geography_id').is_not_null())

    def _summed_by(frame, geo_col):
        # Sum the volume columns per (geo key, specialty, segment, decile, product).
        return frame.group_by([geo_col] + dimension_cols).agg(**agg_dict)

    def _summed_by_parent(parent_col):
        # Attach the parent geography key from geo_code_mapper, then aggregate on it.
        with_parent = lax.join(
            geo_code_mapper[['geography_id',parent_col]],
            on='geography_id',
            how='left',
        )
        return _summed_by(with_parent, parent_col)

    df_terr = _summed_by(lax, 'geography_id')
    df_reg = _summed_by_parent('region_geography_id')
    df_area = _summed_by_parent('area_geography_id')
    df_nation = _summed_by_parent('nation_geography_id')

    return (df_terr, df_reg, df_area, df_nation)


In [6]:
def add_parent_product_rows(all_prod_df):
    """Append parent-product and all-product total rows to each level frame.

    For each of the first four frames in ``all_prod_df``:

    * the first four columns are treated as grouping keys and every column
      from position 5 onward as a volume column to be summed (position 4 is
      expected to be ``PROD_CD``);
    * ``PROD_CD`` is mapped to ``product_id`` / ``parent_product_id`` through
      the global ``prod_mapping`` (joined on its ``code`` column);
    * rows whose ``parent_product_id`` is 2 or 35 are summed per parent and
      emitted with that parent id as ``product_id``;
    * all rows are summed together and emitted with ``product_id`` 1.

    Parameters
    ----------
    all_prod_df : tuple of polars DataFrames (at least four), as returned by
        ``get_volumes_geos``.

    Returns
    -------
    tuple with the same length; the first four frames no longer carry
    ``PROD_CD`` / ``parent_product_id`` and end with a ``product_id`` column.
    """
    # Work on a list copy: tuples do not support item assignment.
    frames = list(all_prod_df)
    product_lookup = prod_mapping[['code','product_id','parent_product_id']]

    for idx in range(4):
        level_df = frames[idx]
        key_cols = level_df.columns[0:4]
        sums = {name: pl.col(name).sum() for name in level_df.columns[5:]}

        mapped = level_df.join(product_lookup, left_on='PROD_CD', right_on='code', how='left')

        # Totals per parent product, restricted to parents 2 and 35.
        parent_rows = (
            mapped
            .filter(pl.col('parent_product_id').is_in([2,35]))
            .group_by(key_cols + ['parent_product_id'])
            .agg(**sums)
            .rename({'parent_product_id':'product_id'})
        )

        # Total over every fetched product, labelled product_id 1.
        overall_rows = (
            mapped
            .group_by(key_cols)
            .agg(**sums)
            .with_columns(pl.lit(1).cast(pl.Int64).alias('product_id'))
        )

        # Drop the helper columns so all three pieces share one layout, then stack.
        detail_rows = mapped.drop(['PROD_CD','parent_product_id'])
        layout = detail_rows.columns
        frames[idx] = (
            detail_rows
            .vstack(parent_rows.select(layout))
            .vstack(overall_rows.select(layout))
        )

    return tuple(frames)


In [7]:
def add_full_rollups(all_prod_df):
    """Append every segment / decile / specialty rollup to each level frame.

    For each of the first four frames, the first column is taken as the
    geography key, the last column is expected to be ``product_id`` and
    columns ``[4:-1]`` are the volume columns. Seven extra row sets are
    produced by summing the volumes with one, two or all three of
    ``segment``, ``decile`` and ``specialty_group`` collapsed; a collapsed
    column is filled with 'UNI', '0-10' or 'ALL SPEC' respectively.

    Parameters
    ----------
    all_prod_df : tuple of polars DataFrames (at least four), as returned by
        ``add_parent_product_rows``.

    Returns
    -------
    tuple with the same length; the first four frames have columns ordered
    ``[geo, product_id, segment, decile, specialty_group, *volumes]`` and
    contain the original rows followed by the seven rollups.
    """
    frames = list(all_prod_df)

    product_col, segment_col, decile_col, specialty_col = 'product_id','segment','decile','specialty_group'
    rollup_label = {
        segment_col: pl.lit('UNI'),
        decile_col: pl.lit('0-10'),
        specialty_col: pl.lit('ALL SPEC'),
    }

    # (dimensions kept as group keys, dimensions collapsed to their rollup label),
    # listed in the order the results are stacked.
    rollup_plan = [
        # one dimension collapsed
        ([decile_col, specialty_col], [segment_col]),
        ([segment_col, specialty_col], [decile_col]),
        ([decile_col, segment_col], [specialty_col]),
        # two dimensions collapsed
        ([specialty_col], [segment_col, decile_col]),
        ([decile_col], [segment_col, specialty_col]),
        ([segment_col], [decile_col, specialty_col]),
        # all three collapsed
        ([], [segment_col, decile_col, specialty_col]),
    ]

    # Four geography levels: territory, region, area, nation.
    for idx in range(4):
        level_df = frames[idx]
        geo_col = level_df.columns[0]
        metric_cols = level_df.columns[4:-1]
        main_seq = [geo_col, product_col, segment_col, decile_col, specialty_col] + metric_cols
        sums = {name: pl.col(name).sum() for name in metric_cols}

        stacked = level_df.select(main_seq)
        for kept_dims, collapsed_dims in rollup_plan:
            rollup = (
                level_df
                .group_by([geo_col, product_col] + kept_dims)
                .agg(**sums)
                .with_columns([rollup_label[dim].alias(dim) for dim in collapsed_dims])
                .select(main_seq)
            )
            stacked = stacked.vstack(rollup)

        frames[idx] = stacked

    return tuple(frames)

In [8]:
# Raw data prep (original author's note: ETA 5 seconds).
# Each metric goes through the same three stages: geography-level volumes ->
# parent/total product rows -> segment/decile/specialty rollups.
all_products_tuf = add_full_rollups(
    add_parent_product_rows(
        get_volumes_geos('TUF',fetch_products)
    )
)

all_products_nuf = add_full_rollups(
    add_parent_product_rows(
        get_volumes_geos('NUF',fetch_products)
    )
)

### Feed Assembly Functions

In [9]:
def process_1(df):
    """Join the TUF and NUF frames of each geography level side by side.

    Relies on module-level names defined elsewhere in the notebook:
    ``levels`` (the four geography key columns), ``spc``, ``sg``, ``d``, ``p``
    (dimension column names) and the prepared tuples ``all_products_tuf`` /
    ``all_products_nuf``.

    Parameters
    ----------
    df : list
        Four-slot container; slots 0-3 are overwritten in place.

    Returns
    -------
    The same list object, where slot ``i`` holds ``all_products_tuf[i]``
    left-joined to ``all_products_nuf[i]`` on
    ``[levels[i], spc, sg, d, p]``.
    """
    for level_idx in range(4):
        join_keys = [levels[level_idx],spc,sg,d,p]
        df[level_idx] = all_products_tuf[level_idx].join(
            all_products_nuf[level_idx],
            on = join_keys,
            how = 'left',
        )
    return df
	
def process_2(df):
    """Add share columns: each row's volume divided by the product_id 1 volume.

    For each of the four level frames, the rows with ``product_id == 1`` are
    taken as the denominator, their ``Vol{i}_{metric}`` columns renamed to
    ``lax_Vol{i}_{metric}`` and joined back on the geography key plus
    specialty, segment and decile. ``Shr{i}_{metric}`` is then
    ``Vol{i}_{metric} / lax_Vol{i}_{metric}`` and the ``lax_`` columns are
    dropped again.

    Relies on the module-level names ``levels``, ``p``, ``spc``, ``sg``, ``d``.

    Parameters
    ----------
    df : list
        Four frames as produced by ``process_1``; slots 0-3 are overwritten
        in place.

    Returns
    -------
    The same list object with 26 share columns added to each frame.
    """
    # The mappings do not depend on the level, so build them once.
    # Ordering is period-major, TUF before NUF; it fixes the order of the new columns.
    metric_by_period = [(n, metric) for n in range(1,14) for metric in ['TUF','NUF']]
    rename_dict = {
        f'Vol{n}_{metric}': f'lax_Vol{n}_{metric}'
        for n, metric in metric_by_period
    }
    expn_dict = {
        f'Shr{n}_{metric}': pl.col(f'Vol{n}_{metric}')/pl.col(f'lax_Vol{n}_{metric}')
        for n, metric in metric_by_period
    }
    denominator_cols = list(rename_dict.values())

    for level_idx in range(4):
        level_df = df[level_idx]

        # Denominator rows: product_id 1, with volumes renamed out of the way.
        totals = level_df.filter(pl.col(p)==1).rename(rename_dict).drop(p)

        with_totals = level_df.join(totals,on=[levels[level_idx],spc,sg,d],how='left')
        with_shares = with_totals.with_columns(**expn_dict)
        df[level_idx] = with_shares.drop(denominator_cols)

    return df

In [10]:
# For converting to feed-ready data
def get_feed(temp1):
    """Build the long-format trend feed from the four geography-level frames.

    The four frames are given a common ``Geography_id`` key and stacked. The
    result is split into a TRX half (columns ending in 'TUF') and an NRX half
    (columns ending in 'NUF'), the 4-character metric suffix is stripped so
    both halves share column names, and the halves are stacked with a
    ``Metric`` column telling them apart. Placeholder columns filled with the
    literal ``\\N`` and a ``ReportType`` of 'WEEKLY' are added, the dimension
    columns are renamed and everything is put into feed column order.

    Parameters
    ----------
    temp1 : sequence of four polars DataFrames
        Territory, region, area and nation frames, in that order, keyed by
        ``geography_id``, ``region_geography_id``, ``area_geography_id`` and
        ``nation_geography_id`` respectively. The sequence is not modified,
        so the function can be called again on the same input.

    Returns
    -------
    polars.DataFrame
        One frame with the TRX rows first and the NRX rows second, columns:
        Geography_id, Product_id, Segment, Specialty, Metric, ReportType,
        Decile, Vol1-13, Shr1-13, Pres1-13, DS1_Vol1-13, DS2_Vol1-13.
    """
    periods = range(1, 14)
    geo_key_by_level = [
        'geography_id', 'region_geography_id', 'area_geography_id', 'nation_geography_id'
    ]

    # Rename into fresh frames rather than writing back into `temp1`: mutating
    # the caller's list made a second call fail, because the original key
    # columns no longer existed.
    level_frames = [
        temp1[pos].rename({geo_key: 'Geography_id'})
        for pos, geo_key in enumerate(geo_key_by_level)
    ]
    final_feed = (
        level_frames[0]
        .vstack(level_frames[1])
        .vstack(level_frames[2])
        .vstack(level_frames[3])
    )

    def select_columns_by_condition(df, metric):
        # Keep every column whose name does NOT end with `metric`.
        kept_columns = [col for col in df.columns if not col.endswith(metric)]
        return df.select(kept_columns)

    def rename_columns_by_condition(df, metric):
        # Strip the trailing 4 characters ('_TUF' / '_NUF') from matching columns.
        return df.rename({col: col[:-4] for col in df.columns if col.endswith(metric)})

    # Dropping the TUF columns leaves the NUF ones, labelled NRX (and vice versa).
    final_feed_nrx = select_columns_by_condition(final_feed, 'TUF')
    final_feed_nrx = final_feed_nrx.with_columns(pl.lit('NRX').alias("Metric"))
    final_feed_trx = select_columns_by_condition(final_feed, 'NUF')
    final_feed_trx = final_feed_trx.with_columns(pl.lit('TRX').alias("Metric"))

    # Make both halves share column names so they can be stacked.
    final_feed_nrx = rename_columns_by_condition(final_feed_nrx, 'NUF')
    final_feed_trx = rename_columns_by_condition(final_feed_trx, 'TUF')
    final_feed = final_feed_trx.vstack(final_feed_nrx)

    # Feed columns this notebook does not compute are filled with the literal \N.
    col_to_addna = [
        f'{prefix}{n}'
        for prefix in ('Pres', 'DS1_Vol', 'DS2_Vol')
        for n in periods
    ]
    # One with_columns call instead of one call per placeholder column.
    final_feed = final_feed.with_columns(
        [pl.lit('\\N').alias(name) for name in col_to_addna]
        + [pl.lit('WEEKLY').alias('ReportType')]
    )

    # Rename the dimension columns to their feed names.
    new_col_mapping = {
        'product_id': 'Product_id',
        'segment': 'Segment',
        'specialty_group': 'Specialty',
        'decile': 'Decile'
    }
    final_feed = final_feed.rename(new_col_mapping)

    # Arrange the columns in feed order.
    req_cols = (
        ['Geography_id', 'Product_id', 'Segment', 'Specialty', 'Metric', 'ReportType', 'Decile']
        + [f'Vol{n}' for n in periods]
        + [f'Shr{n}' for n in periods]
        + col_to_addna
    )
    final_feed = final_feed.select(req_cols)

    return(final_feed)

---

In [11]:
# Short aliases for the dimension column names, used to keep the formulas readable.
p = 'product_id'
sg = 'segment'
spc = 'specialty_group'
d = 'decile'

# Geography key column for each level, ordered territory -> region -> area -> nation.
levels = [
    'geography_id',
    'region_geography_id',
    'area_geography_id',
    'nation_geography_id',
]

# Output prefix for the feed files.
# NOTE(review): the bucket name is hard-coded here, whereas the input paths use
# `bucket` from vars_wk.json - confirm whether this should follow `bucket` too.
OUT = 's3://vortex-staging-a65ced90/BIT/output/GeoSummary/Weekly/'

In [12]:
# Build the trend feed and export it.
# process_1 only needs a four-slot list to fill, so it is seeded with empty frames.
temp1 = process_2(process_1([pl.DataFrame() for _ in range(4)]))
feed_dataset = get_feed(temp1)

# NOTE(review): pandas to_csv writes the row index as an unnamed first column
# by default - confirm the feed consumer expects that extra column.
trend_feed_path = f'{OUT}Weekly_GeoSummary_Trend_Feed.txt'
feed_dataset.to_pandas().to_csv(trend_feed_path, sep='|')
print('GS Trend Feed Exported !')

GS Trend Feed Exported !


### Geosummary X Feed -

In [13]:
# X feed: maps period numbers 1..13 to dates, starting at the data date and
# stepping back one week (7 days) per period.
rx_date = datetime.strptime(data_date,'%Y%m%d')
serial_no = list(range(1,14))
list_of_dates = [rx_date - timedelta(days = 7*weeks_back) for weeks_back in range(13)]

date_df = (
    pl.DataFrame({'X': serial_no, 'Name': list_of_dates})
    # Render the dates as MM/DD/YYYY strings.
    .with_columns(pl.col('Name').dt.strftime('%m/%d/%Y'))
)

# NOTE(review): pandas to_csv writes the row index as an unnamed first column
# by default - confirm the feed consumer expects that extra column.
x_feed_path = f'{OUT}Weekly_GeoSummary_X_Feed.txt'
date_df.to_pandas().to_csv(x_feed_path, sep='|')
print('GS X Feed Exported !')

GS X Feed Exported !
