## Requirements

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import src.utils as ut
from src.data_handler import data_handler
from src.sagemaker_utils import generate_df_jobs, SagemakerHandler

## Params

In [None]:
# Run parameters shared by every cell below.
# `environment` selects which configuration the ut.import_* helpers load.
environment = 'seed'
# `run_name` labels the generated jobs and the refining / SageMaker configs.
run_name = 'test'
# Cutoffs to process; each one is matched against `week_id` values further down
# (presumably YYYYWW week identifiers — TODO confirm).
list_cutoff = [202109]

## Configs

In [None]:
# Load the raw configuration for the selected environment once; every lookup
# below reads from this single object.
confs = ut.import_raw_config(environment)

# Bucket / prefix the global refined tables are read from.
global_bucket = confs['buckets']['refined_data_global']
global_path = confs['paths']['refined_global_path']

# Bucket / prefix combined into the run-specific refined-data path.
specific_bucket = confs['buckets']['refined_data_specific']
specific_path = confs['paths']['refined_specific_path']

# Reuse the configuration loaded above instead of importing it a second time.
algorithm = confs['modeling_parameters']['algorithm']

## Load data

In [None]:
# Global refined tables: all three live under the same bucket and prefix and
# are read in the order sales, tree, mrp.
(df_model_week_sales,
 df_model_week_tree,
 df_model_week_mrp) = (
    ut.read_multipart_parquet_s3(global_bucket, global_path + table_name)
    for table_name in ('model_week_sales', 'model_week_tree', 'model_week_mrp')
)

# NOTE(review): the bucket below is hard-coded rather than taken from the
# configuration like `global_bucket` — confirm this is intentional.
df_imputed_sales_lockdown_1 = ut.read_multipart_parquet_s3(
    'fcst-refined-demand-forecast-dev',
    global_path + 'imputed_sales_lockdown_1.parquet',
)

# Local CSV inputs used for the dynamic features.
df_store_openings, df_holidays = map(
    pd.read_csv,
    ('data/store_openings.csv', 'data/holidays.csv'),
)

In [None]:
# When the `patch_first_lockdown` flag is set, zero out both store-closure
# percentages for week_id 202011 through 202028 (bounds included).
if confs['refining_specific_parameters']['patch_first_lockdown']:
    print("True")
    # The mask depends only on `week_id`, so one computation serves both columns.
    in_patch_window = df_store_openings['week_id'].between(202011, 202028)
    df_store_openings.loc[in_patch_window, 'perc_store_closed'] = 0
    df_store_openings.loc[in_patch_window, 'perc_store_partially_closed'] = 0

## Generate df_jobs

In [None]:
# Location of the run-specific refined data, built from its bucket and prefix.
refined_data_specific_path = ut.to_uri(specific_bucket, specific_path)

# Job table for this run; later cells look up `cutoff`, `train_path` and
# `predict_path` in it.
df_jobs = generate_df_jobs(
    list_cutoff=list_cutoff,
    run_name=run_name,
    algorithm=algorithm,
    refined_data_specific_path=refined_data_specific_path,
)
# Bare expression so the notebook displays the table.
df_jobs

## Generate modeling specific data

In [None]:
# Build and run the cutoff-specific data refining, one cutoff at a time.
for cutoff in list_cutoff:

    print(cutoff)

    # Base data: the input tables, keyed by the names handed to the data handler.
    # Rebuilt on every iteration so each handler receives its own dict.
    base_data = dict(
        model_week_sales=df_model_week_sales,
        model_week_tree=df_model_week_tree,
        model_week_mrp=df_model_week_mrp,
        imputed_sales_lockdown_1=df_imputed_sales_lockdown_1,
    )

    # Static features: taken from the tree rows whose week_id equals the cutoff.
    df_static_tree = df_model_week_tree.loc[df_model_week_tree['week_id'] == cutoff].copy()

    # `model_identifier` exposes model_id itself as a feature column; every
    # other static feature is the (model_id, <column>) pair from the tree.
    static_features = {
        'model_identifier': pd.DataFrame({'model_id': df_static_tree['model_id'],
                                          'model_identifier': df_static_tree['model_id']}),
    }
    static_features.update(
        (tree_col, df_static_tree[['model_id', tree_col]])
        for tree_col in ('family_id',
                         'sub_department_id',
                         'department_id',
                         'univers_id',
                         'product_nature_id')
    )

    # Dynamic features: weekly store-closure percentages plus holidays, all
    # declared with the 'as_provided' projection.
    global_dynamic_features = {
        closure_col: {'dataset': df_store_openings[['week_id', closure_col]],
                      'projection': 'as_provided'}
        for closure_col in ('perc_store_closed', 'perc_store_partially_closed')
    }
    global_dynamic_features['holidays'] = {'dataset': df_holidays, 'projection': 'as_provided'}

    specific_dynamic_features = None

    # Import refining config: the train / predict paths come from the first
    # df_jobs row matching this cutoff.
    jobs_for_cutoff = df_jobs[df_jobs['cutoff'] == cutoff]
    train_path = jobs_for_cutoff['train_path'].values[0]
    predict_path = jobs_for_cutoff['predict_path'].values[0]

    refining_params = ut.import_refining_config(
        environment=environment,
        cutoff=cutoff,
        run_name=run_name,
        train_path=train_path,
        predict_path=predict_path,
    )

    # The refining parameters are forwarded to the handler as keyword arguments.
    dh = data_handler(
        base_data=base_data,
        static_features=static_features,
        global_dynamic_features=global_dynamic_features,
        specific_dynamic_features=specific_dynamic_features,
        **refining_params,
    )

    dh.execute_data_refining_specific()

In [None]:
# Free some memory before launching the SageMaker jobs.
#
# Dropping only the top-level DataFrame names does not release them while the
# objects built in the refining loop (`base_data`, `static_features`,
# `global_dynamic_features`, `df_static_tree`, and presumably the handler `dh`,
# which was constructed from them) still reference the same frames, so those
# names are dropped too. `df_imputed_sales_lockdown_1` is released as well.
#
# Names are removed with `pop(..., None)` so the cell can be re-run, or run
# when the loop above never executed, without raising NameError.
import gc

for name_to_drop in (
    # Input tables
    'df_model_week_sales',
    'df_model_week_tree',
    'df_model_week_mrp',
    'df_imputed_sales_lockdown_1',
    'df_store_openings',
    'df_holidays',
    # Loop leftovers that keep the input tables alive
    'df_static_tree',
    'base_data',
    'static_features',
    'global_dynamic_features',
    'dh',
):
    globals().pop(name_to_drop, None)
del name_to_drop

gc.collect()

## Launch parallel Fit-Transform

In [None]:
# SageMaker settings for the selected environment; forwarded to the handler
# as keyword arguments alongside the run name and the job table.
sagemaker_params = ut.import_sagemaker_params(environment=environment)

sh = SagemakerHandler(run_name=run_name, df_jobs=df_jobs, **sagemaker_params)

# Launch the training jobs first, then the transform jobs.
sh.launch_training_jobs()
sh.launch_transform_jobs()