#### Setup

In [1]:
import pyspark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
0,application_1641875220807_0003,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
%%configure -f
{
"conf" :
{
"spark.serializer" : "org.apache.spark.serializer.KryoSerializer",
"spark.sql.legacy.parquet.int96RebaseModeInRead" : "CORRECTED",
"spark.sql.legacy.parquet.datetimeRebaseModeInWrite" : "CORRECTED",
"spark.sql.legacy.parquet.datetimeRebaseModeInRead" : "CORRECTED",
"spark.sql.legacy.timeParserPolicy" : "LEGACY"
}
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1,application_1641875220807_0004,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1,application_1641875220807_0004,pyspark,idle,Link,Link,,✔


In [3]:
import time

from datetime import datetime, timedelta
from functools import reduce

from pyspark import SparkConf, StorageLevel
from pyspark.sql import SparkSession, HiveContext, Window
import pyspark.sql.functions as F

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

#### Utils

In [4]:
def to_uri(bucket, key):
    """
    Transforms bucket & key strings into S3 URI

    Args:
        bucket (string): name of the S3 bucket
        key (string): S3 key

    Returns:
        object (string): URI format
    """
    return 's3://{}/{}'.format(bucket, key)


def spark_read_parquet_s3(spark, bucket, path):
    """
    Read parquet file(s) hosted on a S3 bucket, load and return as spark dataframe

    Args:
        spark (SparkSession): spark app
        bucket (string): S3 bucket
        path (string): full path to the parquet directory or file within the S3 bucket

    Returns:
        (SparkDataframe): data loaded
    """
    return spark.read.parquet(to_uri(bucket, path))


def spark_write_parquet_s3(df, bucket, dir_path, repartition=10, mode='overwrite'):
    """
    Write a in-memory SparkDataframe to parquet files on a S3 bucket

    Args:
        df (SparkDataframe): the data to save
        bucket (string): S3 bucket
        dir_path (string): full path to the parquet directory within the S3 bucket
        repartition (int): number of partitions files to write
        mode (string): writing mode
    """
    df.repartition(repartition).write.parquet(to_uri(bucket, dir_path), mode=mode)
    

def spark_write_parquet_s3_coal(df, bucket, dir_path, mode='overwrite'):
    df.coalesce(10).write.parquet(to_uri(bucket, dir_path), mode=mode)
    
    
def get_timer(starting_time):
    """
    Displays the time that has elapsed between the input timer and the current time.

    Args:
        starting_time (timecode): timecode from Python 'time' package
    """
    end_time = time.time()
    minutes, seconds = divmod(int(end_time - starting_time), 60)
    print("{} minute(s) {} second(s)".format(int(minutes), seconds))


def union_all(l_df):
    """
    Apply union function on all spark dataframes in l_df

    """
    return reduce(lambda df1, df2: df1.union(df2.select(df1.columns)), l_df)


def date_to_week_id(date):
    """
    Turn a date to Decathlon week id
    Args:
        date (str, pd.Timestamp or pd.Series): the date or pandas column of dates
    Returns:
        (int): the week id

    """
    day_of_week = date.strftime("%w")
    date = date if (day_of_week != '0') else date + timedelta(days=1)
    return int(str(date.isocalendar()[0]) + str(date.isocalendar()[1]).zfill(2))


def get_current_week_id():
    """
    Return current week id (international standard ISO 8601 - first day of week
    is Sunday, with format 'YYYYWW', as integer

    """
    return date_to_week_id(datetime.today())


def get_shift_n_week(week_id, nb_weeks):
    """
    Return input week_id shifted by nb_weeks (could be negative)

    """
    shifted_date = datetime.strptime(str(week_id) + '1', '%G%V%u') + timedelta(weeks=nb_weeks)
    ret_week_id = date_to_week_id(shifted_date)
    return ret_week_id

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

#### Fetch data & Processing

In [5]:
bucket_refined = 'fcst-workspace/forecast-cn/fcst-refined-demand-forecast-dev'

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# fetch sales to get the list of new model
df_sales = spark_read_parquet_s3(spark,bucket_refined,'global/model_week_sales')
selling_weeks = df_sales.groupby(['model_id']).agg(F.min('week_id').alias('start_selling_week'),
                                                   F.max('week_id').alias('last_selling_week'),
                                                   F.count('week_id').alias('selling_weeks')
                                                  )
selling_weeks_new = selling_weeks.filter('start_selling_week >= 201838').filter('last_selling_week >= 202137')
selling_weeks_new.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

12741

In [9]:
selling_weeks_ref = selling_weeks.filter('start_selling_week < 201838').filter('last_selling_week >= 202137')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
selling_weeks_ref.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

6397

In [11]:
df_sales_model = df_sales.groupby(['model_id', 'week_id']).agg(F.sum('sales_quantity').alias('sales_quantity'))
df_sales_ref = df_sales_model.withColumnRenamed('model_id', 'ref_model')
df_sales_ref.show(2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+-------+--------------+
|ref_model|week_id|sales_quantity|
+---------+-------+--------------+
|  1041951| 201844|             1|
|  8064899| 202020|            10|
+---------+-------+--------------+
only showing top 2 rows

In [12]:
# fetch sports tree
df_tree = spark_read_parquet_s3(spark,bucket_refined,'global/model_week_tree')
df_tree = df_tree.filter('week_id == 202138').select('model_id','family_id','product_nature_id')
df_tree.show(2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+---------+-----------------+
|model_id|family_id|product_nature_id|
+--------+---------+-----------------+
| 8356685|    34003|            25014|
| 8501382|     3180|            25216|
+--------+---------+-----------------+
only showing top 2 rows

In [13]:
# fetch price
df_price = spark_read_parquet_s3(spark,bucket_refined,'global/model_week_price')
df_price = df_price.groupby(['model_id']).agg(F.mean('average_price').alias('average_price'))
df_price.show(2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+------------------+
|model_id|     average_price|
+--------+------------------+
|  964396| 635.5735808955442|
| 8098218|2.6470588235294117|
+--------+------------------+
only showing top 2 rows

In [14]:
# fetch d_sku
d_sku = spark.read.parquet('s3://fcst-clean-prod/datalake/d_sku/')
d_sku = d_sku.withColumn('rn',F.row_number().over(Window.partitionBy('mdl_num_model_r3').orderBy(F.desc('rs_technical_date'))))\
.filter('rn = 1').drop('rn')\
.selectExpr('mdl_num_model_r3 as model_id','dsm_code').filter('dsm_code is not null')
d_sku.show(2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+--------+
|model_id|dsm_code|
+--------+--------+
|  611790| X611790|
|  621230| X621230|
+--------+--------+
only showing top 2 rows

In [15]:
new_model_pool = selling_weeks_new.selectExpr('model_id').join(df_tree,on='model_id',how='left')\
.join(d_sku,on='model_id',how='left')\
.join(df_price,on='model_id',how='left')
new_model_pool.show(2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+---------+-----------------+--------+------------------+
|model_id|family_id|product_nature_id|dsm_code|     average_price|
+--------+---------+-----------------+--------+------------------+
|  960734|     8607|            25032|  307300|179.81955445583887|
|  962836|    11956|            25015|  121764| 62.89103098512449|
+--------+---------+-----------------+--------+------------------+
only showing top 2 rows

In [16]:
# ref_model_pool
ref_model_pool = spark_read_parquet_s3(spark,bucket_refined,'test_data/cold_start_dev/ref_model_pool_202138.parquet')
ref_model_pool = selling_weeks_ref.selectExpr('model_id').join(ref_model_pool, on = 'model_id', how = 'inner')
ref_model_pool = ref_model_pool.selectExpr('model_id as ref_model','family_id as family_id_ref','dsm_code as dsm_code_ref',
                                           'product_nature_id as product_nature_id_ref','average_price as average_price_ref')
ref_model_pool.show(2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+-------------+------------+---------------------+-----------------+
|ref_model|family_id_ref|dsm_code_ref|product_nature_id_ref|average_price_ref|
+---------+-------------+------------+---------------------+-----------------+
|  8369958|        11504|      146051|                25111| 7.96458341362338|
|  8371831|         2502|      144349|                10855| 76.2989024230731|
+---------+-------------+------------+---------------------+-----------------+
only showing top 2 rows

In [17]:
# create ref_model_base with random sampling
ref_model_base = new_model_pool.alias('df1')\
.join(ref_model_pool.alias('df2'),F.col('df1.model_id') != F.col('df2.ref_model'),how='left')
ref_model_base.show(2)
ref_model_base.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+---------+-----------------+--------+------------------+---------+-------------+------------+---------------------+-----------------+
|model_id|family_id|product_nature_id|dsm_code|     average_price|ref_model|family_id_ref|dsm_code_ref|product_nature_id_ref|average_price_ref|
+--------+---------+-----------------+--------+------------------+---------+-------------+------------+---------------------+-----------------+
|  960734|     8607|            25032|  307300|179.81955445583893|  8369958|        11504|      146051|                25111| 7.96458341362338|
|  960734|     8607|            25032|  307300|179.81955445583893|  8371831|         2502|      144349|                10855| 76.2989024230731|
+--------+---------+-----------------+--------+------------------+---------+-------------+------------+---------------------+-----------------+
only showing top 2 rows

43523256

In [18]:
ref_model_base = ref_model_base.withColumn('price_difference', ref_model_base.average_price - ref_model_base.average_price_ref)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [19]:
ref_model_base.agg(F.min('price_difference')).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------------+
|min(price_difference)|
+---------------------+
|   -13650.96939190476|
+---------------------+

In [20]:
ref_model_base.agg(F.max('price_difference')).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------------+
|max(price_difference)|
+---------------------+
| 1.2544574503583174E9|
+---------------------+

In [21]:
ref_model_base = ref_model_base.withColumn('price_difference_norm',(ref_model_base.price_difference - (-13650.96939190476)) / (25998.000000000004 - (-13650.96939190476)))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [22]:
ref_model_base.show(2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+---------+-----------------+--------+------------------+---------+-------------+------------+---------------------+------------------+------------------+---------------------+
|model_id|family_id|product_nature_id|dsm_code|     average_price|ref_model|family_id_ref|dsm_code_ref|product_nature_id_ref| average_price_ref|  price_difference|price_difference_norm|
+--------+---------+-----------------+--------+------------------+---------+-------------+------------+---------------------+------------------+------------------+---------------------+
|  960734|     8607|            25032|  307300|179.81955445583887|  8371831|         2502|      144349|                10855|  76.2989024230731|103.52065203276577|  0.34690662216168017|
|  960734|     8607|            25032|  307300|179.81955445583887|  8488150|        34010|      165554|                12183|26.532270075187963| 153.2872843806509|   0.3481618031439643|
+--------+---------+-----------------+--------+------------------+----

In [23]:
df_model_tran = new_model_pool.join(df_sales_model, on = 'model_id', how = 'left')
df_ref_tran = ref_model_pool.join(df_sales_ref, 'ref_model', how = 'left')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [24]:
def calculate_corr(feature):
    
    df_model = df_model_tran.groupby([feature,'week_id']).agg(F.sum('sales_quantity').alias('sales_quantity'))
    df_ref = df_ref_tran.groupby([f'{feature}_ref','week_id']).agg(F.sum('sales_quantity').alias('sales_quantity_ref'))
    base = df_model.join(df_ref, on = 'week_id', how='left')
    base = base.groupby([feature, f'{feature}_ref']).agg(F.corr('sales_quantity', 'sales_quantity_ref').alias(f'{feature}_corr'))

    return base

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [25]:
df_family_corr = calculate_corr('family_id')
print('Finish processing family...')
df_dsm_corr = calculate_corr('dsm_code')
print('Finish processing dsm...')
df_product_nature_corr = calculate_corr('product_nature_id')
print('Finish processing product nature...')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Finish processing family...
Finish processing dsm...
Finish processing product nature...

In [26]:
df_family_corr.show(2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+-------------+--------------------+
|family_id|family_id_ref|      family_id_corr|
+---------+-------------+--------------------+
|    10778|        11689| 0.21983954186138918|
|     5062|        10767|-0.17044132426778932|
+---------+-------------+--------------------+
only showing top 2 rows

In [27]:
ref_model_base = ref_model_base.join(df_family_corr, ['family_id', 'family_id_ref'], how = 'left')
ref_model_base = ref_model_base.join(df_dsm_corr, ['dsm_code', 'dsm_code_ref'], how = 'left')
ref_model_base = ref_model_base.join(df_product_nature_corr, ['product_nature_id', 'product_nature_id_ref'], how = 'left')
ref_model_base.show(2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------------+---------------------+--------+------------+---------+-------------+--------+-------------+---------+------------------+------------------+---------------------+--------------+-------------+----------------------+
|product_nature_id|product_nature_id_ref|dsm_code|dsm_code_ref|family_id|family_id_ref|model_id|average_price|ref_model| average_price_ref|  price_difference|price_difference_norm|family_id_corr|dsm_code_corr|product_nature_id_corr|
+-----------------+---------------------+--------+------------+---------+-------------+--------+-------------+---------+------------------+------------------+---------------------+--------------+-------------+----------------------+
|             null|                11009|X8768419|       15282|     null|        10943| 8768419|        169.0|  8319140|249.79833348061334|-80.79833348061334|  0.34225785099964806|          null|          1.0|                  null|
|             null|                11009|X8763948|       15282|     

In [103]:
null_total = ref_model_base.select('dsm_code', 'family_id', 'product_nature_id', 'price_difference_norm')
null_total.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in null_total.columns]).show() 

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+---------+-----------------+---------------------+
|dsm_code|family_id|product_nature_id|price_difference_norm|
+--------+---------+-----------------+---------------------+
| 8385930|   105705|           105705|               155034|
+--------+---------+-----------------+---------------------+

In [28]:
# fill na with 0
ref_model_base = ref_model_base.fillna({'dsm_code_corr': 0, 'family_id_corr': 0, 'product_nature_id_corr': 0, 'price_difference_norm': 0})
#null_total = ref_model_base.select('dsm_code_corr', 'family_id_corr', 'product_nature_id_corr', 'price_difference_norm')
#null_total.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in null_total.columns]).show() 

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [29]:
'''
dsm_code	1.622797071815100
family_id	0.606589832402213
price_difference	0.010834530380563
product_nature_id	0.121562762967326
intercept	-0.143325296539284
'''
ref_model_base = ref_model_base.withColumn('similarity',
                                           (ref_model_base.dsm_code_corr * 1.622797071815100 + ref_model_base.family_id_corr * 0.606589832402213 \
                                            + ref_model_base.price_difference_norm * 0.010834530380563 + ref_model_base.product_nature_id_corr * 0.121562762967326))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [30]:
ref_model_base.sort('similarity', ascending = False).show(2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------------+---------------------+--------+------------+---------+-------------+--------+--------------------+---------+------------------+--------------------+---------------------+------------------+------------------+----------------------+------------------+
|product_nature_id|product_nature_id_ref|dsm_code|dsm_code_ref|family_id|family_id_ref|model_id|       average_price|ref_model| average_price_ref|    price_difference|price_difference_norm|    family_id_corr|     dsm_code_corr|product_nature_id_corr|        similarity|
+-----------------+---------------------+--------+------------+---------+-------------+--------+--------------------+---------+------------------+--------------------+---------------------+------------------+------------------+----------------------+------------------+
|            25014|                25014|  306876|      165437|    10861|        34378| 8556896|1.2544574522583175E9|  8488039|30.188868460064185| 1.254457422069449E9|   31639.437097070408|0

In [31]:
ref_model_base.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

43523256

In [32]:
# Save to S3
spark_write_parquet_s3(ref_model_base.select('model_id','ref_model','similarity'), 'fcst-workspace',
                       'forecast-cn/fcst-refined-demand-forecast-dev/global/ref_model_base_simple_v3',
                       repartition=10, mode='overwrite')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…