In [None]:
#tag cell with parameters
# Notebook-level configuration constants. The note above asks for this cell to be
# tagged `parameters` (presumably so a runner such as papermill can override the
# values - TODO confirm). Lines marked `# change` appear to be the values that are
# expected to be edited when adapting the notebook - confirm with the author.
# Only PROJECT_ID, DATASET_ID, RESOURCE_BUCKET, STACK_NAME, PIPELINE_PATH,
# HS_NBA_UTILS_PATH, MODEL_TYPE and AGGREGATE_RESULTS_TABLE_ID are re-exported as
# lowercase aliases in the next cell.
PROJECT_ID =  'divg-groovyhoon-pr-d2eab4'
DATASET_ID = 'nba_features_prospect'
TABLE_ID = 'master_features_set_prospect_train_vw'
RESOURCE_BUCKET = 'divg-groovyhoon-pr-d2eab4-default'
FILE_BUCKET = 'divg-groovyhoon-pr-d2eab4_hs_nba_prospects' # change
REGION = 'northamerica-northeast1'
MODEL_ID = '7002' # change
STACK_NAME = 'ffh_nba'
MODEL_NAME = 'hs_nba_prospects' # change
SERVICE_TYPE = 'hs_nba_prospects' # change
SERVICE_TYPE_NAME = 'hs-nba-prospects' # change
# NOTE(review): PIPELINE_TYPE says 'training_pipeline' while PIPELINE_PATH below
# points at a 'serving_pipeline' folder - confirm this mix is intentional.
PIPELINE_TYPE='training_pipeline'
PIPELINE_PATH = 'vertex_pipelines/hs_nba_prospects/serving_pipeline' # change
HS_NBA_UTILS_PATH = 'vertex_pipelines/hs_nba_utils/notebook'
MODEL_TYPE='acquisition' # change
LOAD_SQL='load_train_data.sql'
PREPROCESS_OUTPUT_CSV='df_train.csv'
SAVE_FILE_NAME='df_test_exp.csv'
STATS_FILE_NAME='df_stats.csv'
AGGREGATE_RESULTS_TABLE_ID= 'bq_product_recommendation_ranked'

In [None]:
# Lowercase aliases of the parameter constants; the cells below read these names
# (e.g. project_id, resource_bucket, stack_name, pipeline_path, hs_nba_utils_path).
project_id = PROJECT_ID
dataset_id = DATASET_ID
aggregate_results_table_id = AGGREGATE_RESULTS_TABLE_ID
resource_bucket = RESOURCE_BUCKET
stack_name = STACK_NAME
pipeline_path = PIPELINE_PATH
hs_nba_utils_path = HS_NBA_UTILS_PATH
model_type=MODEL_TYPE

In [None]:
# import global modules
from google.cloud import storage
from google.cloud import bigquery
from pathlib import Path
from yaml import safe_load
import sys
import os
from typing import Any
# global import
import pandas as pd
import numpy as np

# working paths, all resolved against the directory the notebook runs from
pth_project = Path.cwd()
pth_queries = pth_project / 'queries'
pth_output_table_config = pth_project / 'output_table_config.yaml'

# make modules downloaded into the working directory importable
sys.path.insert(0, pth_project.as_posix())

# GCP clients: BigQuery is bound to the configured project, Storage uses the default
bq_client = bigquery.Client(project=project_id)
storage_client = storage.Client()

def extract_dir_from_bucket(
    bucket: Any, local_path: Path, prefix: str, split_prefix: str = 'serving_pipeline'
):
    """
    Download every blob under `prefix` from `bucket` into `local_path`.

    The local location of a blob is the part of its name that follows the last
    occurrence of '<split_prefix>/'; when that marker does not occur, the full
    blob name is used. Blob names ending in '/' are skipped. Missing parent
    directories are created before each download.

    Parameters:
    - bucket: The bucket object from which to download files. It must provide
      `list_blobs(prefix=...)` yielding objects with a `name` attribute and a
      `download_to_filename(path)` method.
    - local_path: The local directory the files are written under.
    - prefix: Only blobs whose name starts with this prefix are downloaded.
    - split_prefix: Path segment that is stripped (together with everything
      before it) from the blob name. Default is 'serving_pipeline'.
    """
    marker = f'{split_prefix}/'
    for blob in bucket.list_blobs(prefix=prefix):
        # names ending in '/' have no file component to write
        if blob.name.endswith('/'):
            continue

        target = local_path / blob.name.split(marker)[-1]
        # Path.parent also works when the target has no '/' at all (e.g. a
        # relative local_path and a top-level file), where slicing the string
        # at rindex('/') raised ValueError
        target.parent.mkdir(parents=True, exist_ok=True)
        blob.download_to_filename(target.as_posix())

# download utils and output table config locally
# (the storage_client created earlier in this cell is reused; no second client is needed)
bucket = storage_client.bucket(resource_bucket)
extract_dir_from_bucket(
    bucket, pth_project, f'{stack_name}/{hs_nba_utils_path}', split_prefix='notebook'
)
extract_dir_from_bucket(
    bucket, pth_project, f'{stack_name}/{pipeline_path}/queries'
)
blob = bucket.blob(f'{stack_name}/{pipeline_path}/output_table_config.yaml')
blob.download_to_filename(pth_output_table_config)

# import local modules
# (must stay below the download above: hs_nba_utils is fetched into the working
# directory, which was put on sys.path earlier in this cell)
from hs_nba_utils.etl.extract import extract_bq_data
from hs_nba_utils.etl.load import create_temp_table, insert_from_temp_table

# load output table config; the context manager closes the file handle
with pth_output_table_config.open() as f_output_table_config:
    d_output_table_config = safe_load(f_output_table_config)

# load scores from bq
pth_extract_scores = pth_queries / 'extract_model_scores.sql'
df_scores = extract_bq_data(bq_client, pth_query=pth_extract_scores)

# postprocessing of the scores is done step by step in the cells below


In [None]:
# Work on a copy: the cells below add columns to `df` in place, and aliasing
# `df_scores` directly would mutate the extracted scores and make re-running
# the notebook from this cell non-idempotent.
df = df_scores.copy()
d_data_config = d_output_table_config

In [None]:
# inspect the loaded output table configuration
d_data_config

In [None]:
"""
Builds the output dataframe based on the input dataframe and data configuration.

Args:
    df (pd.DataFrame): The input dataframe.
    d_data_config (dict): The data configuration dictionary.

Returns:
    pd.DataFrame: The output dataframe.

"""
##########
# Rank main target variables
##########
# extract main target names from data config dictionary
l_targets = [target['name'] for target in d_data_config['target_variables']]
print(f'Ranking {len(l_targets)} main variables')

# build index - target name mapping from targets
d_target_idx_mapping = dict(enumerate(l_targets))

# build reco rank columns from targets: reco_0 is the best ranked target
l_recos = [f'reco_{i}' for i in range(len(l_targets))]

# sort scores of target variables in descending order
# kind='stable' makes ties deterministic: equal scores keep the order of l_targets
# (the default argsort algorithm gives no guarantee about tie order)
np_scores = df[l_targets].to_numpy()
target_ranked = np.argsort(-np_scores, axis=1, kind='stable')
df[l_recos] = target_ranked

# loop over reco columns and map idx to target names
for reco in l_recos:
    df[reco] = df[reco].map(d_target_idx_mapping)

# preview only; avoids rendering the full frame
df.head()

In [None]:
# ad-hoc check: main target scores of a single ban
df.loc[df['ban'] == 37714603, l_targets]

In [None]:

##########
# Add combos:
#  - a combo score is generated for every customer
#  - combo score = max score among the products of the combo
##########
print('Adding combos')
# extract distinct number of products in combos
unique_number_of_product_in_combos = {
    len(combo['target_variables']) for combo in d_data_config['combos']
}
## --> unique_number_of_product_in_combos = {2, 3}

# extract unique name of products in combos
unique_combo_target_names = {
    target
    for combo in d_data_config['combos'] for target in combo['target_variables']
}
## --> unique_combo_target_names = {'shs_acquisition', 'ttv_acquisition', 'hsic_acquisition'}

# flag, for each product used in combos, whether it is in the top n recos
# (kept as columns of df; the combo scores below do not depend on these flags)
for target_name in unique_combo_target_names:
    for n in unique_number_of_product_in_combos:
        df[f'is_{target_name}_in_top_{n}'] = (df[l_recos[:n]] == target_name).any(axis=1)

# loop over combos
for combo in d_data_config['combos']:
    print(f"Combo: {combo['name']}")

    # every row gets the combo score: max score of the combo's products
    # (previously wrapped in np.where with a constant True condition, a no-op)
    df[combo['name']] = df[combo['target_variables']].max(axis=1).to_numpy()

# preview only; avoids rendering the full frame
df.head()

In [None]:
##########
# Rank scores again with combos included
##########
# extract combo target names from data config dictionary
l_combo_targets = [combo['name'] for combo in d_data_config['combos']]
print(f'Ranking {len(l_combo_targets)} combos variables')
## --> l_combo_targets = ['hsic_tv_shs_combo_acquisition', 'hsic_tv_combo_acquisition', 'hsic_shs_combo_acquisition', 'tv_shs_combo_acquisition']

# aggregate combos and main target names
# order is important, so combos can be ranked first when score
# of combo is equal to the highest score of main targets
l_targets_with_combos = l_combo_targets + l_targets
print(f'Ranking {len(l_targets_with_combos)} total combos and main variables')
## --> l_targets_with_combos = ['hsic_tv_shs_combo_acquisition', 'hsic_tv_combo_acquisition', 'hsic_shs_combo_acquisition',
##                              'tv_shs_combo_acquisition', 'hsic_acquisition', 'ttv_acquisition', 'shs_acquisition', 'sing_acquisition',
##                              'tos_acquisition', 'lwc_acquisition', 'sws_acquisition', 'wifi_acquisition', 'whsia_acquisition', 'hpro_acquisition']

# rebuild index - target name mapping from targets including combos
d_target_idx_mapping_with_combos = dict(enumerate(l_targets_with_combos))
## --> d_target_idx_mapping_with_combos = {0: 'hsic_tv_shs_combo_acquisition', 1: 'hsic_tv_combo_acquisition', ...,
##                                         4: 'hsic_acquisition', ..., 13: 'hpro_acquisition'}

# rebuild reco rank columns from targets including combos
l_recos_with_combos = [f'reco_{i}' for i in range(len(l_targets_with_combos))]
## --> l_recos_with_combos = ['reco_0', 'reco_1', ..., 'reco_13']

# sort scores of combos and target variables in descending order
# kind='stable' is required for the tie rule above: a combo score equals the
# score of one of its products, and only a stable sort guarantees that the
# earlier column (the combo) is ranked before the later one (the product)
np_scores = df[l_targets_with_combos].to_numpy()
target_ranked = np.argsort(-np_scores, axis=1, kind='stable')
df[l_recos_with_combos] = target_ranked

# loop over reco columns and map idx to combo and target names
for reco in l_recos_with_combos:
    df[reco] = df[reco].map(d_target_idx_mapping_with_combos)

# preview only; avoids rendering the full frame
df.head()

In [None]:
# inspect the columns present on df at this point
df.columns

In [None]:
# inspect the names of the reco rank columns (combos included)
print(l_recos_with_combos)

In [None]:

##########
# Explode columns to match output table format
# (one output row per customer and recommendation rank)
##########
# extract columns intersection from data config dictionary and current dataframe
l_intersection_cols = [
    col for col in d_data_config['output_columns']
    if col in df.columns
]
## --> l_intersection_cols = ['part_dt', 'cust_id', 'ban', 'ban_src_id', 'lpds_id']

# extract unique tier names from data config dictionary
unique_tier_names = {
    tier['name']
    for tiers in d_data_config['tiers'].values() for tier in tiers
}
## --> unique_tier_names = {'hsic_ultimate_tier_acquisition', 'tos_ultimate_tier_acquisition', 'hsic_complete_tier_acquisition', 'tos_complete_tier_acquisition'}

# sorted so the column order of the output does not depend on set iteration order
l_tier_cols = sorted(unique_tier_names)

# score matrix used to fetch the score of each recommended product by position;
# replaces DataFrame.lookup, which was deprecated in pandas 1.2 and removed in 2.0
np_all_scores = df[l_targets_with_combos].to_numpy()
idx_score_columns = pd.Index(l_targets_with_combos)
row_positions = np.arange(len(df))

l_dfs = []
# loop over recommendations and build output dataframe
for i, reco in enumerate(l_recos_with_combos):
    print(f'Processing {reco}')

    # build helper dataframe
    df_helper = df[l_intersection_cols + [reco] + l_tier_cols].copy()

    # set rank and rename reco column
    df_helper['rank'] = i + 1
    df_helper = df_helper.rename(columns={reco: 'product_name'})

    # extract model scores: for each row, the score column named by product_name
    col_positions = idx_score_columns.get_indexer(df_helper['product_name'])
    if (col_positions < 0).any():
        # get_indexer marks unknown names with -1, which would silently index
        # the last column; fail loudly instead (lookup raised KeyError here)
        raise KeyError(f'{reco} contains product names without a score column')
    df_helper['score'] = np_all_scores[row_positions, col_positions]

    l_dfs.append(df_helper)

df_concat = pd.concat(l_dfs)
print(f'Concat dataframe df.shape {df_concat.shape}')

# remove rows with score zero, usually combos
# .copy() so later column assignments on df_concat do not act on a slice
df_concat = df_concat[df_concat['score'] > 0].copy()
print(f'Concat dataframe without zero scores df.shape {df_concat.shape}')

In [None]:
# debug export of the exploded frame; the bucket comes from the notebook
# parameters instead of being hard-coded a second time
df_concat.to_csv(f'gs://{resource_bucket}/downloads/df_concat.csv')

In [None]:
# ad-hoc check: all exploded rows of a single ban
df_concat.loc[df_concat['ban'] == 40331338]

In [None]:
# ad-hoc check: all exploded rows recommending tos_acquisition
df_concat.loc[df_concat['product_name'].eq('tos_acquisition')]

In [None]:
# debug: print each key of the 'tiers' config together with its list of tier entries
for tier_name, l_tiers_values in d_data_config['tiers'].items():
    print(f'tier_name: {tier_name}')
    print(f'l_tiers_values: {l_tiers_values}')

In [None]:
# debug: print every tier entry of every 'tiers' key with its position in the list
for tier_name, l_tiers_values in d_data_config['tiers'].items():
    for i, d_tier in enumerate(l_tiers_values):
        print(f'i: {i}')
        print(f'd_tier: {d_tier}')

In [None]:

##########
# Add tiers
##########
# create a new column to store tier results, initialised with the plain product name
# NOTE(review): the full tier cell further down starts with this same assignment,
# so this cell looks like a preview step only - confirm it is still needed.
df_concat['product_name_tier'] = df_concat['product_name']

df_concat.head(10)

In [None]:
# inspect the 'tiers' section of the configuration
d_data_config['tiers']

In [None]:

##########
# Add tiers
# For rows whose product has tiers configured, replace the product name with
# the best scoring tier and keep that tier's score in 'tier_score'.
##########
# create a new column to store tier results
df_concat['product_name_tier'] = df_concat['product_name']
# start from 0 so rows without a tier product end up as None in the final step,
# and so re-running the cell starts from a clean column
df_concat['tier_score'] = 0.0

# loop over tiers
for tier_name, l_tiers_values in d_data_config['tiers'].items():

    # rows whose recommended product is the one owning this group of tiers
    conditions = (df_concat['product_name'] == tier_name)
    print(f'Processing tier: {tier_name}')

    # loop over tier targets
    for i, d_tier in enumerate(l_tiers_values):
        print(f"Tier: {d_tier['name']}")

        if i == 0:
            # first tier of the group: always taken for the matching rows.
            # Only the matching rows are touched; writing the tier score to
            # every row (as the previous np.where with two identical branches
            # did) overwrote the scores chosen for earlier tier groups.
            update_conditions = conditions
        else:
            # later tiers: taken only when they beat the score kept so far
            update_conditions = conditions & (df_concat[d_tier['name']] > df_concat['tier_score'])

        df_concat['product_name_tier'] = np.where(
            update_conditions, d_tier['name'], df_concat['product_name_tier']
        )
        df_concat['tier_score'] = np.where(
            update_conditions, df_concat[d_tier['name']], df_concat['tier_score']
        )

# set tier zeros scores to None
conditions = (df_concat['tier_score'] != 0)
df_concat['tier_score'] = np.where(
    conditions, df_concat['tier_score'], None
)


In [None]:
# debug export of the rows assigned to the tos ultimate tier; the bucket comes
# from the notebook parameters instead of being hard-coded a second time
df_concat[df_concat['product_name_tier'] == 'tos_ultimate_tier_acquisition'].to_csv(
    f'gs://{resource_bucket}/downloads/df_concat_tos_u.csv'
)

In [None]:

##########
# Map target names to their abbreviation name
##########
print('Mapping target name with reco name')
# extract main target name - reco mapping
d_main_target_reco_mapping = {
    target['name']: target['reco']
    for target in d_data_config['target_variables']
}

# extract tier target name - reco mapping
d_tier_target_reco_mapping = {
    tier['name']: tier['reco']
    for tiers in d_data_config['tiers'].values() for tier in tiers
}

# extract combos target name - reco mapping
d_combo_target_reco_mapping = {
    combo['name']: combo['reco']
    for combo in d_data_config['combos']
}

# create final mapping dictionary
# on duplicate names the later dictionary wins (main < tier < combo), the same
# precedence as updating an empty dict with the three mappings in this order
d_target_reco_mapping = {
    **d_main_target_reco_mapping,
    **d_tier_target_reco_mapping,
    **d_combo_target_reco_mapping,
}

# map names and recos
df_concat['reco'] = df_concat['product_name_tier'].map(d_target_reco_mapping)

# select output columns
# .copy() so the assignment below writes to an independent frame, not a slice
df_output = df_concat[d_data_config['output_columns']].copy()
# positional assignment: df_output has exactly the rows of df_concat, in order
df_output['product_name'] = df_concat['product_name_tier'].to_numpy()
print(f'Final dataframe df.shape {df_output.shape}')

# a top-level `return` is a SyntaxError in a notebook cell; show a preview instead
df_output.head()
