In [None]:
from pyspark import SparkContext, SparkConf, StorageLevel
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
from datetime import datetime, timedelta

import pyspark.sql.functions as F
import sys
import time
import math

In [None]:
# JVM flags shared by the executors and the YARN application master:
# G1 garbage collector, verbose GC logging, and kill the JVM on OutOfMemoryError.
jvm_gc_options = "-XX:+UseG1GC -XX:+UnlockDiagnosticVMOptions -XX:+G1SummarizeConcMark -XX:InitiatingHeapOccupancyPercent=35 -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p'"

# Static resource allocation: 8 executors x 5 cores, with 80 partitions
# (two per core) for both RDD operations and SQL shuffles.
conf = SparkConf().setAll([
    ('spark.app.name', 'data_refining_specific'),
    ('spark.maximizeResourceAllocation', 'false'),
    ('spark.dynamicAllocation.enabled', 'false'),
    # Fixed key: the Spark property is `spark.executor.cores` (singular).
    # The previous `spark.executors.cores` is not a property Spark reads, so
    # the executor core count was left at its default instead of 5.
    ('spark.executor.cores', 5),
    ('spark.executor.memory', '37g'),
    ('spark.executor.memoryOverhead', '5g'),
    ('spark.executor.instances', 8),
    ('spark.default.parallelism', 80),
    ('spark.sql.shuffle.partitions', 80),
    ('spark.yarn.am.cores' , 5),
    ('spark.yarn.am.memory' , '37g'),
    ('spark.yarn.am.memoryOverhead' , '5g'),
    ('spark.executor.extraJavaOptions' , jvm_gc_options),
    ('spark.yarn.am.extraJavaOptions' , jvm_gc_options),
    # NOTE(review): these two are YARN NodeManager settings; presumably they
    # have no effect when set on a SparkConf -- confirm and move them to the
    # cluster configuration if they are needed.
    ('yarn.nodemanager.vmem-check-enabled' , 'false'),
    ('yarn.nodemanager.pmem-check-enabled' , 'false')
])

# Reuse an already running session if there is one, otherwise create it
# with the configuration above.
spark = SparkSession.builder \
    .config(conf=conf) \
    .getOrCreate()

In [None]:
#for item in sorted(sc._conf.getAll()): print(item)

## Utils

In [None]:
def read_parquet_s3(app, bucket, file_path):
    """Read the parquet dataset located at `bucket + file_path`.

    Args:
        app: object exposing `read.parquet(path)`; the notebook passes the
            Spark session.
        bucket: path prefix, concatenated as-is with `file_path`.
        file_path: path suffix of the dataset to read.

    Returns:
        Whatever `app.read.parquet` returns for the concatenated path.
    """
    source_path = bucket + file_path
    return app.read.parquet(source_path)

def write_parquet_s3(spark_df, bucket, file_path):
    """Write `spark_df` as parquet to `bucket + file_path`, in overwrite mode.

    Args:
        spark_df: object exposing `write.parquet(path, mode=...)`; the
            notebook passes Spark DataFrames.
        bucket: path prefix, concatenated as-is with `file_path`.
        file_path: path suffix of the destination.

    Returns:
        None.
    """
    destination = bucket + file_path
    spark_df.write.parquet(destination, mode="overwrite")
    
def get_current_week_id():
    """Return the ISO week id of tomorrow's date as an int shaped YYYYWW.

    The reference date is today plus one day, so on the last day of an ISO
    week the id of the following week is already returned.
    """
    iso_year, iso_week = (datetime.today() + timedelta(days=1)).isocalendar()[:2]
    # year * 100 + week gives the same number as concatenating the year with
    # the zero-padded two-digit week.
    return iso_year * 100 + iso_week

## Configs

In [None]:
#only_last = True
#s3_path_refine_global = "s3://fcst-refined-demand-forecast-prod/global/"
#s3_path_refine_specific = "s3://fcst-refined-demand-forecast-dev/specific/"
#filter_type, filter_val = 'family', [224, 12072, 600]
#scope = 'domyos_nov_2019'
#first_test_cutoff = 201922

In [None]:
#only_last = False
#s3_path_refine_global = "s3://fcst-refined-demand-forecast-prod/global/"
#s3_path_refine_specific = "s3://fcst-refined-demand-forecast-dev/specific/"
#filter_type, filter_val = 'department', [402, 403, 404, 406, 408, 473, 474]
#scope = 'racket_sports'
#first_test_cutoff = 201922

In [None]:
# Active run configuration (full scope, dev buckets).

# True: only the current-week cutoff is generated.
# False: every week from `first_test_cutoff` onwards, plus the current week.
only_last = False
first_test_cutoff = 201922

# Input (global refined data) and output (scope-specific data) S3 prefixes.
s3_path_refine_global = "s3://fcst-refined-demand-forecast-dev/global/"
s3_path_refine_specific = "s3://fcst-refined-demand-forecast-dev/specific/"

# `scope` names the output folder; the model_info column / values filter
# is only applied when scope is not 'full_scope'.
scope = 'full_scope'
filter_type = ''
filter_val = []

## Load global refined data

In [None]:
# Read the three global refined datasets and mark each one for in-memory
# caching (`persist` returns the DataFrame it was called on).
actual_sales = read_parquet_s3(spark, s3_path_refine_global, 'actual_sales/') \
    .persist(StorageLevel.MEMORY_ONLY)

active_sales = read_parquet_s3(spark, s3_path_refine_global, 'active_sales/') \
    .persist(StorageLevel.MEMORY_ONLY)

model_info = read_parquet_s3(spark, s3_path_refine_global, 'model_info/') \
    .persist(StorageLevel.MEMORY_ONLY)

In [None]:
def _keep_models_in_scope(sales_df):
    """Keep the rows of `sales_df` whose model has `filter_type` in `filter_val`.

    The `filter_type` column is looked up in `model_info`, used for the
    filter, then dropped again so the input schema is unchanged.
    """
    model_filter_values = model_info.select(['model'] + [filter_type])
    return sales_df \
        .join(model_filter_values, on='model', how='left') \
        .filter(F.col(filter_type).isin(filter_val)) \
        .drop(filter_type)


# Restrict both sales frames to the configured scope (skipped for full scope).
if scope != 'full_scope':

    actual_sales = _keep_models_in_scope(actual_sales)
    active_sales = _keep_models_in_scope(active_sales)

    active_sales.persist(StorageLevel.MEMORY_ONLY)
    actual_sales.persist(StorageLevel.MEMORY_ONLY)

In [None]:
# Row counts of actual_sales, active_sales and model_info, in that order
# (each `count()` is a Spark action).
for frame in (actual_sales, active_sales, model_info):
    print(frame.count())

## Define History Reconstruction Function

In [None]:
def reconstruct_history(train_data_cutoff, actual_sales, model_info,
                        cluster_keys=['product_nature', 'family'], min_ts_len=160):
    """Complete each model's weekly series with a "fake" history derived from its cluster.

    For every model of `train_data_cutoff` and every week of `actual_sales`
    up to the last training week, a fake quantity is computed as
    `mean sales of the model's cluster that week * model scale factor`, where
    the scale factor is the model's average ratio `y / cluster mean`.
    The fake quantity replaces `y`:
      * before the model's first observed week, for the weeks that are at most
        `min_ts_len` weeks away from its last observed week;
      * from its first observed week until the first week where the real `y`
        exceeds the fake one (the estimated implementation period).

    Args:
        train_data_cutoff: DataFrame with at least `model`, `week_id`, `date`, `y`.
        actual_sales: DataFrame with at least `model`, `week_id`, `date`, `y`.
        model_info: DataFrame with `model` and the `cluster_keys` columns.
        cluster_keys: `model_info` columns defining the clusters.
        min_ts_len: number of weeks up to which a series is padded backwards.

    Returns:
        DataFrame with columns `week_id`, `date`, `model`, `y` (int), without
        null rows, ordered by `week_id` then `model`.

    Raises:
        AssertionError: if at least one row ends up without a model scale factor.
    """
    # Private copy: the default list is shared between calls, and this also
    # accepts any iterable of column names.
    cluster_keys = list(cluster_keys)

    # Create a complete TS dataframe (every model x every week up to max_week).
    # No sort before drop_duplicates: it does not change the distinct set and
    # the result is ordered explicitly at the end.
    max_week = train_data_cutoff.select(F.max('week_id')).collect()[0][0]

    all_model = train_data_cutoff.select('model').drop_duplicates()
    all_week = actual_sales \
        .filter(actual_sales.week_id <= max_week) \
        .select('week_id') \
        .drop_duplicates()

    complete_ts = all_model.crossJoin(all_week)

    # Add corresponding date
    complete_ts = complete_ts \
        .join(actual_sales.select(['week_id', 'date']).drop_duplicates(),
              on=['week_id'],
              how='inner')

    # Add cluster_keys info from model_info
    # /!\ dropna because in very rare cases, the models are too old or too recent
    #     and do not have descriptions in d_sku
    cluster_info = model_info.select(['model'] + cluster_keys)

    complete_ts = complete_ts \
        .join(cluster_info, on='model', how="left") \
        .dropna(subset=cluster_keys)

    # Add active sales from train_data_cutoff (y stays null where there is no row)
    complete_ts = complete_ts.join(train_data_cutoff,
                                   on=['model', 'week_id', 'date'],
                                   how="left")

    # Calculate the average sales per cluster and week from actual_sales
    # NOTE(review): this dropna() removes rows with a null in ANY column, not
    # only in the cluster keys -- confirm that this is intended.
    all_sales = actual_sales \
        .join(cluster_info, on='model', how='left') \
        .dropna() \
        .groupBy(['week_id', 'date'] + cluster_keys) \
        .agg(F.mean('y').alias('mean_cluster_y'))

    # Add it to complete_ts. Join on the actual cluster keys: the previous
    # hard-coded ['product_nature', 'family'] broke any non-default cluster_keys.
    complete_ts = complete_ts.join(all_sales,
                                   on=['week_id', 'date'] + cluster_keys,
                                   how='left')

    # Compute the scale factor by row
    complete_ts = complete_ts \
        .withColumn('row_scale_factor', complete_ts.y / complete_ts.mean_cluster_y)

    # Compute the scale factor by model
    model_scale_factor = complete_ts \
        .groupBy('model') \
        .agg(F.mean('row_scale_factor').alias('model_scale_factor'))

    complete_ts = complete_ts.join(model_scale_factor, on='model', how='left')

    # Does each model have a scale factor?
    missing_scale_factor = complete_ts.filter(complete_ts.model_scale_factor.isNull()).count()
    assert missing_scale_factor == 0, \
        '{} rows have no model scale factor'.format(missing_scale_factor)

    # Compute a fake Y by row (if unknown, fill with 0)
    complete_ts = complete_ts \
        .withColumn('fake_y',
                    (complete_ts.mean_cluster_y * complete_ts.model_scale_factor).cast('int'))
    complete_ts = complete_ts.fillna(0, subset=['fake_y'])

    # Calculate real age & total length of each TS
    # And estimate the implementation period: while fake y > y
    ts_start_end_date = complete_ts \
        .filter(complete_ts.y.isNotNull()) \
        .groupBy('model') \
        .agg(F.min('date').alias('start_date'), F.max('date').alias('end_date'))

    complete_ts = complete_ts.join(ts_start_end_date, on='model', how='left')

    # age: weeks since the first observed week (1 on that week, <= 0 before it)
    # length: weeks remaining until the last observed week (1 on that week)
    # is_y_sup: real boolean flag; a null y makes the comparison null, which
    #           falls through to False exactly like the former 'false' string.
    complete_ts = complete_ts \
        .withColumn('age', (F.datediff(F.col('date'), F.col('start_date')) / 7) + 1) \
        .withColumn('length', (F.datediff(F.col('end_date'), F.col('date')) / 7) + 1) \
        .withColumn('is_y_sup',
                    F.when(complete_ts.y > complete_ts.fake_y, True).otherwise(False))

    # First age at which the real sales exceed the fake ones
    end_impl_period = complete_ts \
        .filter(F.col('is_y_sup')) \
        .groupBy('model') \
        .agg(F.min('age').alias('end_impl_period'))

    complete_ts = complete_ts.join(end_impl_period, on='model', how='left')

    # Update y from "min_ts_len" weeks ago to the end of the implementation period
    complete_ts = complete_ts \
        .withColumn('y',
                    F.when(((complete_ts.age <= 0) & (complete_ts.length <= min_ts_len)) | \
                           ((complete_ts.age > 0) & (complete_ts.age < complete_ts.end_impl_period)),
                           complete_ts.fake_y.cast('int')) \
                    .otherwise(complete_ts.y).cast('int'))

    # Keep the output columns only, drop the weeks that still have no y
    complete_ts = complete_ts \
        .select(['week_id', 'date', 'model', 'y']) \
        .dropna() \
        .orderBy(['week_id', 'model'])

    return complete_ts

## Generate training data used to forecast cutoffs
- For each cutoff, keep only the models that were sold at least once before the cutoff and were still active in the last week before it

In [None]:
def generate_cutoff_train_data(actual_sales, active_sales, model_info, only_last):
    """Build the training set of each cutoff week and write it to S3.

    The cutoffs are either the current week only (`only_last` truthy) or every
    distinct `week_id` of `active_sales` from the notebook-level
    `first_test_cutoff` onwards, followed by the current week.
    For each cutoff, the active sales strictly before it are restricted to the
    models that have a positive total `y` and whose latest week is the latest
    week of that history, then passed to `reconstruct_history` and written under
    `<scope>/train_data_cutoff/train_data_cutoff_<cutoff>` in
    `s3_path_refine_specific` (`scope` and the S3 prefix are notebook-level
    variables).

    Args:
        actual_sales: DataFrame forwarded to `reconstruct_history`.
        active_sales: DataFrame with at least `model`, `week_id`, `y`.
        model_info: DataFrame forwarded to `reconstruct_history`.
        only_last: when truthy, only the current week cutoff is generated.

    Returns:
        None; progress and timings are printed.
    """
    latest_cutoff = get_current_week_id()

    if not only_last:
        test_weeks = (
            active_sales
            .filter(active_sales.week_id >= first_test_cutoff)
            .select('week_id')
            .drop_duplicates()
            .orderBy('week_id')
        )
        cutoffs = [row['week_id'] for row in test_weeks.collect()]
        cutoffs.append(latest_cutoff)
    else:
        cutoffs = [latest_cutoff]

    for cutoff in cutoffs:

        print('Generating train data for cutoff', str(cutoff))

        started_at = time.time()

        # Everything known strictly before the cutoff week
        history = active_sales.filter(active_sales.week_id < cutoff)

        # Models with a strictly positive total quantity in that history
        sold_models = (
            history
            .groupBy('model')
            .agg(F.sum('y').alias('qty_sold'))
            .filter(F.col('qty_sold') > 0)
            .select('model')
        )

        # Models whose latest week is the latest week of the whole history
        last_week = history.agg(F.max('week_id').alias('last_week'))
        last_week_per_model = (
            history
            .groupBy('model')
            .agg(F.max('week_id').alias('last_active_week'))
        )
        active_models = (
            last_week_per_model
            .join(last_week,
                  on=last_week.last_week == last_week_per_model.last_active_week,
                  how='inner')
            .select('model')
        )

        # Intersection of both criteria, applied to the history
        kept_models = active_models.join(sold_models, 'model', 'inner')
        history = history.join(kept_models, on='model', how='inner')

        # Fill the gaps of each series with a reconstructed history
        rebuilt = reconstruct_history(history, actual_sales, model_info)

        cutoff_path = '{}/train_data_cutoff/train_data_cutoff_{}'.format(scope, str(cutoff))
        write_parquet_s3(rebuilt, s3_path_refine_specific, cutoff_path)

        elapsed = time.time() - started_at
        print('Loop time {} {}:'.format(str(cutoff), elapsed))

In [None]:
generate_cutoff_train_data(actual_sales, active_sales, model_info, only_last)

In [None]:
# Cutoffs to process: the current week alone, or every week of active_sales
# from first_test_cutoff onwards followed by the current week.
current_cutoff = get_current_week_id()

if not only_last:
    cutoff_week_id = (
        active_sales
        .filter(active_sales.week_id >= first_test_cutoff)
        .select('week_id')
        .drop_duplicates()
        .orderBy('week_id')
    )

    l_cutoff_week_id = [row['week_id'] for row in cutoff_week_id.collect()]
    l_cutoff_week_id.append(current_cutoff)
else:
    l_cutoff_week_id = [current_cutoff]

In [None]:
# Uncomment the next line to replace the computed cutoffs with a fixed list of weeks.
#l_cutoff_week_id = [202004, 202005, 202006, 202007]
# Display the cutoffs that will be processed.
l_cutoff_week_id

In [None]:
def generate_cutoff_train_data(cutoff_week_id):
    """Build the training set of a single cutoff week and print the elapsed time.

    This single-cutoff variant replaces the earlier four-argument definition of
    the same name and reads `active_sales`, `actual_sales`, `model_info` and
    `scope` from the notebook namespace.

    NOTE(review): the S3 write is disabled in this variant (see the end of the
    body), so the reconstructed frame is built but neither stored nor returned.

    Args:
        cutoff_week_id: week id; only active sales strictly before it are used.

    Returns:
        None.
    """
    started_at = time.time()

    print('Generating train data for cutoff', str(cutoff_week_id))

    # Everything known strictly before the cutoff week
    history = active_sales.filter(active_sales.week_id < cutoff_week_id)

    # Models with a strictly positive total quantity in that history
    sold_models = (
        history
        .groupBy('model')
        .agg(F.sum('y').alias('qty_sold'))
        .filter(F.col('qty_sold') > 0)
        .select('model')
    )

    # Models whose latest week is the latest week of the whole history
    last_week = history.agg(F.max('week_id').alias('last_week'))
    last_week_per_model = (
        history
        .groupBy('model')
        .agg(F.max('week_id').alias('last_active_week'))
    )
    active_models = (
        last_week_per_model
        .join(last_week,
              on=last_week.last_week == last_week_per_model.last_active_week,
              how='inner')
        .select('model')
    )

    # Intersection of both criteria, applied to the history
    kept_models = active_models.join(sold_models, 'model', 'inner')
    history = history.join(kept_models, on='model', how='inner')

    # Fill the gaps of each series with a reconstructed history
    train_data_cutoff = reconstruct_history(history, actual_sales, model_info)

    # Destination of the (currently disabled) write below
    path_cutoff = '{}/train_data_cutoff/train_data_cutoff_{}'.format(scope, str(cutoff_week_id))

    # Disabled; `ut` is not imported in the visible part of this notebook:
    #ut.write_parquet_s3(train_data_cutoff, s3_path_refine_specific, path_cutoff)

    elapsed = time.time() - started_at
    print('Loop time {} {}:'.format(str(cutoff_week_id), elapsed))

In [None]:
# Process every cutoff and time the whole pass.
# An explicit loop replaces the bare `map(...)` call: in Python 3 `map` returns
# a lazy iterator, so the function was never invoked and the timer measured
# nothing.
t0 = time.time()
for cutoff in l_cutoff_week_id:
    generate_cutoff_train_data(cutoff)
t1 = time.time()
print(t1-t0)

In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()