In [1]:
import sys

from pyspark import SparkContext, SparkConf, StorageLevel
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, BooleanType
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer

import datetime
from datetime import datetime, timedelta
import numpy as np
import time

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
6,application_1580280484752_0007,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
# Spark settings for this job.
# NOTE(review): resource settings (driver/executor memory, cores, instances) are
# normally read when the application starts; if a session already exists,
# getOrCreate() below returns it -- confirm these values actually take effect.
conf = SparkConf().setAll([
 ('spark.sql.shuffle.partitions', 110),
 ('spark.default.parallelism', 110),
 # The broadcast-join threshold is a Spark SQL property, so it needs the
 # 'spark.sql.' prefix; without it the setting is silently ignored.
 ('spark.sql.autoBroadcastJoinThreshold', 15485760),
 ('spark.dynamicAllocation.enabled', 'false'),
 ('spark.executor.instances', 11),
 ('spark.executor.memory', '19g'),
 ('spark.driver.memory', '19g'),
 ('spark.driver.cores', 5),
 ('spark.memory.storageFraction', 0.4),
 ('spark.memory.fraction', 0.6),
 ('spark.executor.memoryOverhead', '2g'),
 ('spark.executor.cores', 5),
 ('spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version', 2)
])

spark = SparkSession.builder \
    .appName("data_refining_part_2_history_reconstruction") \
    .config(conf=conf)\
    .getOrCreate()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
def read_parquet_s3(app, s3_path):
    """Read a parquet dataset into a spark Dataframe.

    Parameters:
    app (pyspark.sql.SparkSession): the session whose ``read.parquet`` is used.
    s3_path (string): full path of the parquet dataset, e.g. 's3://bucket/dir/'.

    Returns:
    The Dataframe returned by ``app.read.parquet(s3_path)``.
    """
    return app.read.parquet(s3_path)


def write_parquet_s3(spark_df, bucket, file_path):
    """Save a spark Dataframe to s3 as parquet, replacing any existing data.

    Parameters:
    spark_df (pyspark.sql.dataframe): the Dataframe to save.
    bucket (string): name of the target s3 bucket.
    file_path (string): table name or directory inside the bucket.

    Returns:
    None
    """
    # Destination is always s3://<bucket>/<file_path>.
    destination = 's3://{0}/{1}'.format(bucket, file_path)
    writer = spark_df.write
    writer.parquet(destination, mode="overwrite")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Configs

In [4]:
# Forecast settings (horizon is expressed in weeks, see the '1W-SUN' frequencies).
horizon = 10
horizon_freq = '1W-SUN'

prediction_length = horizon
prediction_freq = '1W-SUN'
season_length = 52

# Available scopes: each one names the model_info column to filter on
# ('filter_type') and the values to keep ('filter_val').
dict_scope = {'marco': {'filter_type' : 'family', 'filter_val' : [12151, 230]},
              'racket_sports': {'filter_type' : 'department', 'filter_val' : [402, 403, 404, 406, 408, 473, 474]},
              'full_scope': {'filter_type' : '', 'filter_val' : []},
              "domyos_nov_2019": {"filter_type" : "family", "filter_val" : [224, 12072, 600]}}

scope = 'domyos_nov_2019' # change the scope here
first_test_cutoff = 201922 # change test period here, should be >= 201922

# Read each setting by key: unpacking dict.values() depends on dict iteration
# order and swaps the two names when the dict preserves insertion order.
filter_type = dict_scope[scope]['filter_type']
filter_val = dict_scope[scope]['filter_val']

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Load the three part_1 tables, keep each one cached in memory and preview it.
actual_sales = read_parquet_s3(
    spark, 's3://fcst-refined-demand-forecast-dev/part_1/actual_sales/'
).persist(StorageLevel.MEMORY_ONLY)
actual_sales.printSchema()
actual_sales.show(1)

active_sales = read_parquet_s3(
    spark, 's3://fcst-refined-demand-forecast-dev/part_1/active_sales/'
).persist(StorageLevel.MEMORY_ONLY)
active_sales.printSchema()
active_sales.show(1)

model_info = read_parquet_s3(
    spark, 's3://fcst-refined-demand-forecast-dev/part_1/model_info/'
).persist(StorageLevel.MEMORY_ONLY)
model_info.printSchema()
model_info.show(1)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- week_id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- model: long (nullable = true)
 |-- y: long (nullable = true)

+-------+----------+-------+---+
|week_id|      date|  model|  y|
+-------+----------+-------+---+
| 201629|2016-07-17|8048316|  1|
+-------+----------+-------+---+
only showing top 1 row

root
 |-- model: long (nullable = true)
 |-- week_id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- y: long (nullable = true)

+-------+-------+----------+---+
|  model|week_id|      date|  y|
+-------+-------+----------+---+
|8330598| 201520|2015-05-10|104|
+-------+-------+----------+---+
only showing top 1 row

root
 |-- model: long (nullable = true)
 |-- model_label: string (nullable = true)
 |-- family: long (nullable = true)
 |-- family_label: string (nullable = true)
 |-- sub_department: long (nullable = true)
 |-- sub_department_label: string (nullable = true)
 |-- department: long (nullable = true)
 |-- department_label: string (

## Load clean data

In [6]:
# Restrict both sales tables to the models that belong to the selected scope.
if scope != 'full_scope':
    # (model, <filter_type>) lookup taken from model_info, shared by both joins.
    scope_lookup = model_info.select(model_info['model'], model_info[filter_type])

    actual_sales = actual_sales.join(scope_lookup, 'model', how='left')
    actual_sales = actual_sales.filter(F.col(filter_type).isin(filter_val)).drop(filter_type)

    active_sales = active_sales.join(scope_lookup, 'model', how='left')
    active_sales = active_sales.filter(F.col(filter_type).isin(filter_val)).drop(filter_type)

    # Cache the filtered tables: they are reused for every cutoff below.
    active_sales.persist(StorageLevel.MEMORY_ONLY)
    actual_sales.persist(StorageLevel.MEMORY_ONLY)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[model: bigint, week_id: int, date: date, y: bigint]
DataFrame[model: bigint, week_id: int, date: date, y: bigint]

## Define History Reconstruction Function

In [7]:
def sup_week(week_id):
    """Return the id of the week immediately before ``week_id``.

    Parameters:
    week_id (int or string): week identifier formatted as YYYYWW (e.g. 201901).

    Returns:
    int: the previous week identifier, formatted as YYYYWW.

    Raises:
    ValueError: if the week part is lower than 1.
    """
    week_id = str(week_id)
    year, week = int(week_id[:4]), int(week_id[4:])

    if week < 1:
        raise ValueError('invalid week id: {}'.format(week_id))

    if week == 1:
        # Roll back to the last week of the previous year. December 28th always
        # belongs to the last ISO week of its year, so this yields 52 or 53
        # (find_weeks enumerates ISO weeks and can therefore produce week 53).
        year -= 1
        week = datetime(year, 12, 28).isocalendar()[1]
    else:
        week -= 1

    # year * 100 + week is the integer form of the zero-padded 'YYYYWW' string.
    return year * 100 + week


def next_week(week_id):
    """Return the id of the week immediately after ``week_id``.

    Parameters:
    week_id (int or string): week identifier formatted as YYYYWW (e.g. 201852).

    Returns:
    int: the following week identifier, formatted as YYYYWW.
    """
    week_id = str(week_id)
    year, week = int(week_id[:4]), int(week_id[4:])

    # December 28th always belongs to the last ISO week of its year, so its
    # week number is the number of weeks (52 or 53) in that year. find_weeks
    # enumerates ISO weeks and can therefore produce week 53.
    weeks_in_year = datetime(year, 12, 28).isocalendar()[1]

    if week >= weeks_in_year:
        year += 1
        week = 1
    else:
        week += 1

    # year * 100 + week is the integer form of the zero-padded 'YYYYWW' string.
    return year * 100 + week


def __add_week(week, nb):
    """Shift a YYYYWW week id by ``nb`` weeks.

    A negative ``nb`` moves backwards (one sup_week call per week), any other
    value moves forwards (one next_week call per week).
    """
    step = sup_week if nb < 0 else next_week

    for _ in range(abs(nb)):
        week = step(week)

    return week


def find_weeks(start, end):
    """List the week ids covered between ``start`` and ``end``.

    Both bounds are YYYYWW week ids. The result always contains ``start`` and
    ``end`` themselves, plus the ISO (year, week) of every day from the Sunday
    of week ``start`` to the Sunday of the week before ``end``, where both
    Sundays are located with the '%Y%W%w' format.

    Returns:
    list of int: the sorted, de-duplicated week ids.
    """
    weeks = {int(start), int(end)}

    # Appending '0' selects weekday 0 (Sunday) of the given week number.
    first_day = datetime.strptime('{}0'.format(start), '%Y%W%w')
    last_day = datetime.strptime('{}0'.format(sup_week(end)), '%Y%W%w')

    n_days = (last_day - first_day).days + 1
    for offset in range(n_days):
        current_day = first_day + timedelta(days=offset)
        iso_year, iso_week = current_day.isocalendar()[:2]  # e.g. (2011, 52)
        weeks.add(int('{}{:02}'.format(iso_year, iso_week)))  # e.g. 201152

    return sorted(weeks)


def reconstruct_history(train_data_cutoff, actual_sales, model_info,
                        cluster_keys=['product_nature', 'family'], min_ts_len=160):
    """Complete each model's sales history with values scaled from its cluster.

    Steps:
    1. keep only the models whose last week in ``train_data_cutoff`` is the
       last week of the whole table;
    2. build the full (week, model) grid between the first and last week;
    3. compute the mean sales per week and cluster from ``actual_sales``;
    4. scale that cluster mean by the model's average ratio to it ('fake_y');
    5. use 'fake_y' instead of 'y' before the model's first sale (for at most
       ``min_ts_len`` weeks before its last sale) and during its first weeks,
       until 'y' first exceeds 'fake_y'.

    Parameters:
    train_data_cutoff (pyspark.sql.dataframe): sales to complete, with columns
        'week_id', 'date', 'model' and 'y'.
    actual_sales (pyspark.sql.dataframe): sales used for the week/date mapping
        and the cluster averages, with columns 'week_id', 'date', 'model', 'y'.
    model_info (pyspark.sql.dataframe): one row per model, with a 'model'
        column and every column listed in ``cluster_keys``.
    cluster_keys (list of string): model_info columns defining a cluster.
    min_ts_len (int): maximum value of 'length' (weeks up to the model's last
        sale, inclusive) for which weeks before the first sale are filled.

    Returns:
    pyspark.sql.dataframe: columns 'week_id', 'date', 'model', 'y', without
    null values, ordered by 'week_id' then 'model'.

    Note: the week grid is created with the notebook-level ``spark`` session.
    """
    # Work on a copy so the shared default list can never be modified here.
    cluster_keys = list(cluster_keys)

    # Keep only the models that are still present on the last available week.
    last_week = train_data_cutoff.agg(F.max('week_id').alias('last_week'))
    model_to_keep = train_data_cutoff.groupBy('model').agg(F.max('week_id').alias('last_active_week'))
    model_to_keep = model_to_keep.join(last_week, last_week.last_week == model_to_keep.last_active_week, 'inner').select(model_to_keep.model)
    train_data_cutoff = train_data_cutoff.join(model_to_keep, 'model', how='inner')

    df_date = actual_sales.select(['week_id', 'date']).distinct()
    y_not_null = train_data_cutoff.where(train_data_cutoff.y.isNotNull())

    # Both bounds are fetched with a single Spark job.
    week_bounds = train_data_cutoff.agg(F.min('week_id').alias('min_week'),
                                        F.max('week_id').alias('max_week')).collect()[0]
    min_week, max_week = week_bounds['min_week'], week_bounds['max_week']

    # Full (week, model) grid, left-joined with the known sales.
    list_weeks = find_weeks(min_week, max_week)
    list_weeks = spark.createDataFrame(list_weeks, IntegerType()).selectExpr('value as week_id')
    list_models = train_data_cutoff.select(train_data_cutoff.model).distinct()

    full = list_weeks.crossJoin(list_models)
    full_actives_sales = full.join(train_data_cutoff, ['week_id', 'model'], how='left')

    # Add cluster infos; 'date' is re-derived from the week so that the grid
    # rows without sales get one too.
    mdl_inf = model_info.select(['model'] + cluster_keys)

    complete_ts = full_actives_sales.join(mdl_inf, 'model', how='left').drop('date')
    complete_ts = complete_ts.join(df_date, 'week_id', how='inner')

    # Calculate the average sales per cluster and week from actual_sales.
    all_sales = actual_sales.join(mdl_inf, 'model', how='left')
    all_sales = all_sales.dropna()
    join_key = ['week_id', 'date'] + cluster_keys
    all_sales = all_sales.groupBy(join_key).agg(F.mean('y').alias('mean_cluster_y'))

    # Add it to complete_ts, joining on the same keys used for the aggregation
    # so that a custom ``cluster_keys`` is honoured.
    complete_ts = complete_ts.join(all_sales, join_key, how='left')

    # Scale factor: average ratio between the model's sales and its cluster mean.
    complete_ts = complete_ts.withColumn('row_scale_factor', complete_ts.y / complete_ts.mean_cluster_y)

    model_scale_factor = complete_ts.groupBy('model').agg(F.mean('row_scale_factor').alias('model_scale_factor'))

    complete_ts = complete_ts.join(model_scale_factor, ['model'], how='left')

    # Compute fake y (0 when it cannot be computed).
    complete_ts = complete_ts.withColumn('fake_y', (complete_ts.mean_cluster_y * complete_ts.model_scale_factor).cast('int'))
    complete_ts = complete_ts.fillna(0, subset=['fake_y'])

    # First and last dates with a known 'y' for each model.
    start_end = y_not_null.groupBy('model').agg(F.min('date').alias('start_date'), F.max('date').alias('end_date'))
    complete_ts = complete_ts.join(start_end, 'model', how='left')

    # age: weeks since the first sale (1 on the first sale week, <= 0 before it).
    # length: weeks remaining until the last sale (1 on the last sale week).
    # is_y_sup: real boolean flag, true when the known 'y' exceeds 'fake_y'.
    complete_ts = complete_ts.withColumn('age', (F.datediff(F.col('date'), F.col('start_date'))) / (7) + 1 )\
                             .withColumn('length', (F.datediff(F.col('end_date'), F.col('date'))) / (7) + 1 )\
                             .withColumn('is_y_sup', F.when(complete_ts.y.isNull(), False)\
                                                      .when(complete_ts.y > complete_ts.fake_y, True)\
                                                      .otherwise(False))

    # End of the implantation period: first age at which 'y' exceeds 'fake_y'.
    end_impl_period = complete_ts.filter(complete_ts.is_y_sup).select(['model', 'age']).groupBy('model').agg(F.min('age').alias('end_impl_period'))

    complete_ts = complete_ts.join(end_impl_period, on=['model'], how='left')

    # Use fake_y before the first sale (within min_ts_len weeks of the last
    # sale) and during the implantation period; keep the real y elsewhere.
    complete_ts = complete_ts.withColumn('y',
                F.when(
                    ((complete_ts.age <= 0) & (complete_ts.length <= min_ts_len)) | \
                    ((complete_ts.age > 0) & (complete_ts.age < complete_ts.end_impl_period)), complete_ts.fake_y)\
                 .otherwise(complete_ts.y).cast('int'))

    complete_ts = complete_ts.select(['week_id', 'date', 'model', 'y']).dropna(subset=('week_id', 'date', 'model', 'y'))

    complete_ts = complete_ts.orderBy(['week_id', 'model'])

    return complete_ts

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Generate training data used to forecast validation & test cutoffs
- For each cutoff, keep only models active the week before the cutoff

In [8]:
# The current cutoff is the week right after the last week of actual sales.
last_actual_week = actual_sales.agg(F.max('week_id')).collect()[0][0]
current_cutoff = next_week(last_actual_week)

# Test cutoffs: every active week from first_test_cutoff onwards...
cutoff_week_test = (active_sales
                    .where(active_sales.week_id >= first_test_cutoff)
                    .select(active_sales.week_id)
                    .distinct()
                    .orderBy('week_id'))

# ...followed by the current cutoff.
nRow = spark.createDataFrame([[current_cutoff]])
cutoff_week_test = cutoff_week_test.union(nRow)

cutoff_week_test.show(3)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+
|week_id|
+-------+
| 201922|
| 201923|
| 201924|
+-------+
only showing top 3 rows

In [9]:
# Validation cutoffs are picked among the active weeks before the test period.
cutoff_week_val = active_sales.filter(active_sales.week_id < first_test_cutoff).select(active_sales.week_id).distinct().orderBy('week_id')

max_cutoff_week_val = cutoff_week_val.select(F.max('week_id')).collect()[0][0]

# keep the last 60 dates before test set for validation
_sup_week = __add_week(max_cutoff_week_val, (-horizon + 1))
_inf_week = __add_week(max_cutoff_week_val, -(60 + horizon - 1))

cutoff_week_val = cutoff_week_val.filter((cutoff_week_val.week_id > _inf_week) & (cutoff_week_val.week_id <= _sup_week))

# keep only one cutoff every 10 dates: number the weeks in ascending order and
# keep the 10th, 20th, ... This is done in Spark directly, which avoids the
# extra count() job and also works when the validation window is empty.
cutoff_week_val = cutoff_week_val.withColumn('id', F.row_number().over(Window.orderBy('week_id')))
cutoff_week_val = cutoff_week_val.filter(cutoff_week_val.id % 10 == 0).drop('id')

cutoff_week_val.show(3)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+
|week_id|
+-------+
| 201814|
| 201824|
| 201834|
+-------+
only showing top 3 rows

In [10]:
# Cutoff weeks to process: validation cutoffs followed by test cutoffs,
# collected on the driver as a plain list of week ids.
iterate_week = [row['week_id'] for row in cutoff_week_val.union(cutoff_week_test).collect()]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
# Build and save the training data of each cutoff.
# Sort BEFORE slicing: the collected list has no guaranteed order, so slicing
# first would pick an arbitrary subset of cutoffs.
# NOTE(review): [:3] limits the run to the three earliest cutoffs -- presumably
# a debugging limit; remove it to process every cutoff.
for cutoff_week_id in sorted(iterate_week)[:3]:

    t0 = time.time()
    print('Generating train data for cutoff', str(cutoff_week_id))

    # Only the weeks strictly before the cutoff can be used for training.
    train_data_cutoff = active_sales.filter(active_sales.week_id < cutoff_week_id)

    # Models with at least one unit sold before the cutoff. No sort is needed
    # here: the result is only used as a join key.
    model_sold = train_data_cutoff.groupBy('model').agg(F.sum('y').alias('qty_sold'))
    model_sold = model_sold.filter(model_sold.qty_sold > 0).select('model')

    # Models still active on the last week before the cutoff.
    last_week = train_data_cutoff.agg(F.max('week_id').alias('last_week'))

    model_active = train_data_cutoff.groupBy('model').agg(F.max('week_id').alias('last_active_week'))

    model_active = model_active.join(last_week, last_week.last_week == model_active.last_active_week, 'inner').select(model_active.model)

    model_to_keep = model_active.join(model_sold, 'model', 'inner')

    train_data_cutoff = train_data_cutoff.join(model_to_keep, 'model', how='inner')

    # Reconstruct a fake history
    train_data_cutoff = reconstruct_history(train_data_cutoff, actual_sales, model_info)

    # Same destination as before, written through the notebook's helper.
    write_parquet_s3(train_data_cutoff,
                     'fcst-refined-demand-forecast-dev',
                     'scope/{}/train_data_cutoff/train_data_cutoff_{}'.format(scope, str(cutoff_week_id)))

    t1 = time.time()
    total = t1-t0

    print('temps boucle {} {}:'.format(str(cutoff_week_id), total))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

('Generating train data for cutoff', '201814')
temps boucle 201814 27.1882758141:
('Generating train data for cutoff', '201824')
temps boucle 201824 28.7923500538:
('Generating train data for cutoff', '201834')
temps boucle 201834 28.1657111645:

In [None]:
# Output location: s3://fcst-refined-demand-forecast-dev/scope/domyos_nov_2019/train_data_cutoff/

In [14]:
# Stop the Spark session now that every cutoff has been written.
spark.stop()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…