#### Setup

In [1]:
import pyspark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
2,application_1641875220807_0005,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
%%configure -f
{
"conf" :
{
"spark.serializer" : "org.apache.spark.serializer.KryoSerializer",
"spark.sql.legacy.parquet.int96RebaseModeInRead" : "CORRECTED",
"spark.sql.legacy.parquet.datetimeRebaseModeInWrite" : "CORRECTED",
"spark.sql.legacy.parquet.datetimeRebaseModeInRead" : "CORRECTED",
"spark.sql.legacy.timeParserPolicy" : "LEGACY"
}
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3,application_1641875220807_0006,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1,application_1641875220807_0004,pyspark,idle,Link,Link,,
3,application_1641875220807_0006,pyspark,idle,Link,Link,,✔


In [3]:
import time

from datetime import datetime, timedelta
from functools import reduce

from pyspark import SparkConf, StorageLevel
from pyspark.sql import SparkSession, HiveContext, Window
import pyspark.sql.functions as F

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

#### Utils

In [4]:
def to_uri(bucket, key):
    """
    Build the S3 URI that designates `key` inside `bucket`.

    Args:
        bucket (string): name of the S3 bucket
        key (string): S3 key

    Returns:
        (string): URI of the form 's3://<bucket>/<key>'
    """
    return f's3://{bucket}/{key}'


def spark_read_parquet_s3(spark, bucket, path):
    """
    Load parquet data stored on S3 into a spark dataframe.

    Args:
        spark (SparkSession): spark app
        bucket (string): S3 bucket
        path (string): full path to the parquet directory or file within the S3 bucket

    Returns:
        (SparkDataframe): data loaded
    """
    load_parquet = spark.read.parquet
    source_uri = to_uri(bucket, path)
    return load_parquet(source_uri)


def spark_write_parquet_s3(df, bucket, dir_path, repartition=10, mode='overwrite'):
    """
    Repartition a SparkDataframe and write it as parquet files on a S3 bucket.

    Args:
        df (SparkDataframe): the data to save
        bucket (string): S3 bucket
        dir_path (string): full path to the parquet directory within the S3 bucket
        repartition (int): number of partitions the data is split into before writing
        mode (string): writing mode
    """
    save_parquet = df.repartition(repartition).write.parquet
    target_uri = to_uri(bucket, dir_path)
    save_parquet(target_uri, mode=mode)
    

def spark_write_parquet_s3_coal(df, bucket, dir_path, mode='overwrite', n_partitions=10):
    """
    Write a SparkDataframe to parquet files on a S3 bucket, calling `coalesce`
    (rather than `repartition`) on the dataframe before writing.

    Args:
        df (SparkDataframe): the data to save
        bucket (string): S3 bucket
        dir_path (string): full path to the parquet directory within the S3 bucket
        mode (string): writing mode
        n_partitions (int): partition count passed to `coalesce`; defaults to 10,
            the value that used to be hard-coded
    """
    df.coalesce(n_partitions).write.parquet(to_uri(bucket, dir_path), mode=mode)
    
    
def get_timer(starting_time):
    """
    Print how much time has elapsed since `starting_time`, as whole minutes and seconds.

    Args:
        starting_time (float): timestamp in seconds, comparable with `time.time()`
    """
    elapsed_seconds = int(time.time() - starting_time)
    whole_minutes = elapsed_seconds // 60
    leftover_seconds = elapsed_seconds % 60
    print("{} minute(s) {} second(s)".format(whole_minutes, leftover_seconds))


def union_all(l_df):
    """
    Fold all spark dataframes of l_df into a single one with successive unions.

    Each dataframe is first re-selected on the accumulated dataframe's columns,
    so the column order of the first element drives the result.
    """
    def _append(accumulated, following):
        # Align the next frame on the accumulated column order before the union.
        aligned = following.select(accumulated.columns)
        return accumulated.union(aligned)

    return reduce(_append, l_df)


def date_to_week_id(date):
    """
    Turn a date into a Decathlon week id.

    The week id is the ISO year followed by the two-digit ISO week number,
    except that a Sunday is counted with the week of the following Monday.

    Args:
        date: a date-like object exposing `strftime`, `isocalendar` and
            addition with `timedelta` (e.g. `datetime`)
    Returns:
        (int): the week id, formatted YYYYWW

    """
    if date.strftime("%w") == '0':
        # Sunday: move to the next day so it lands in the following ISO week.
        date = date + timedelta(days=1)
    iso_parts = date.isocalendar()
    iso_year, iso_week = iso_parts[0], iso_parts[1]
    return iso_year * 100 + iso_week


def get_current_week_id():
    """
    Return the week id of today's date, as computed by `date_to_week_id`
    (ISO 8601 year and week number, with Sunday counted as the first day
    of the week), formatted 'YYYYWW' as an integer.

    """
    today = datetime.today()
    return date_to_week_id(today)


def get_shift_n_week(week_id, nb_weeks):
    """
    Return the week id located nb_weeks after week_id (before it when nb_weeks is negative).

    """
    # Parse week_id as ISO year + ISO week, anchored on day 1 (Monday) of that week.
    week_monday = datetime.strptime('{}1'.format(week_id), '%G%V%u')
    return date_to_week_id(week_monday + timedelta(weeks=nb_weeks))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

#### Fetch data & Processing

In [5]:
# Base S3 location of the refined data read by the cells below.
# NOTE(review): the value looks like a bucket name followed by a key prefix rather than a bare
# bucket; it still works because to_uri() simply formats it into 's3://<bucket>/<key>'.
bucket_refined = 'fcst-workspace/forecast-cn/fcst-refined-demand-forecast-dev'

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Fetch sales and summarise, per model, its first week, last week and number of rows with a week_id.
df_sales = spark_read_parquet_s3(spark, bucket_refined, 'global/model_week_sales')
selling_weeks = (
    df_sales
    .groupby(['model_id'])
    .agg(
        F.min('week_id').alias('start_selling_week'),
        F.max('week_id').alias('last_selling_week'),
        F.count('week_id').alias('selling_weeks'),
    )
)
# New models: first sale in week 201838 or later, last sale in week 202137 or later.
selling_weeks_new = (
    selling_weeks
    .filter('start_selling_week >= 201838')
    .filter('last_selling_week >= 202137')
)
selling_weeks_new.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

12741

In [7]:
# Reference models: first sale before week 201838, last sale in week 202137 or later.
selling_weeks_ref = (
    selling_weeks
    .filter('start_selling_week < 201838')
    .filter('last_selling_week >= 202137')
)
selling_weeks_ref.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

6397

In [8]:
# Fetch the sports tree, keeping only the week 202138 rows and the columns needed downstream.
df_tree = (
    spark_read_parquet_s3(spark, bucket_refined, 'global/model_week_tree')
    .filter('week_id == 202138')
    .select('model_id', 'family_id', 'product_nature_id')
)
df_tree.show(2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+---------+-----------------+
|model_id|family_id|product_nature_id|
+--------+---------+-----------------+
| 8356685|    34003|            25014|
| 8501382|     3180|            25216|
+--------+---------+-----------------+
only showing top 2 rows

In [9]:
# Fetch prices and reduce them to one mean `average_price` per model (over all available rows).
df_price = (
    spark_read_parquet_s3(spark, bucket_refined, 'global/model_week_price')
    .groupby(['model_id'])
    .agg(F.mean('average_price').alias('average_price'))
)
df_price.show(2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+------------------+
|model_id|     average_price|
+--------+------------------+
| 8185920|134.04879167562228|
| 8217708|17.400000000000002|
+--------+------------------+
only showing top 2 rows

In [10]:
# Fetch d_sku and keep, for each model, the row with the most recent rs_technical_date.
# The null dsm_code filter runs after that pick, so a model whose latest row has no dsm_code is dropped.
latest_sku_first = Window.partitionBy('mdl_num_model_r3').orderBy(F.desc('rs_technical_date'))
d_sku = (
    spark.read.parquet('s3://fcst-clean-prod/datalake/d_sku/')
    .withColumn('rn', F.row_number().over(latest_sku_first))
    .filter('rn = 1')
    .drop('rn')
    .selectExpr('mdl_num_model_r3 as model_id', 'dsm_code')
    .filter('dsm_code is not null')
)
d_sku.show(2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+--------+
|model_id|dsm_code|
+--------+--------+
|  611790| X611790|
|  621230| X621230|
+--------+--------+
only showing top 2 rows

In [11]:
# Attach tree, dsm_code and price attributes to every new model.
# Left joins keep models with missing attributes (the missing values are null).
new_model_pool = (
    selling_weeks_new
    .selectExpr('model_id')
    .join(df_tree, on='model_id', how='left')
    .join(d_sku, on='model_id', how='left')
    .join(df_price, on='model_id', how='left')
)
new_model_pool.show(2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+---------+-----------------+--------+------------------+
|model_id|family_id|product_nature_id|dsm_code|     average_price|
+--------+---------+-----------------+--------+------------------+
|  960734|     8607|            25032|  307300|179.81955445583893|
|  962836|    11956|            25015|  121764| 62.89103098512449|
+--------+---------+-----------------+--------+------------------+
only showing top 2 rows

In [12]:
# ref_model_pool: stored reference-model attributes, restricted to the models of
# selling_weeks_ref and renamed with a `_ref` suffix (model_id becomes ref_model).
ref_model_pool = (
    selling_weeks_ref
    .selectExpr('model_id')
    .join(
        spark_read_parquet_s3(spark, bucket_refined, 'test_data/cold_start_dev/ref_model_pool_202138.parquet'),
        on='model_id',
        how='inner',
    )
    .selectExpr(
        'model_id as ref_model',
        'family_id as family_id_ref',
        'dsm_code as dsm_code_ref',
        'product_nature_id as product_nature_id_ref',
        'average_price as average_price_ref',
    )
)
ref_model_pool.show(2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+-------------+------------+---------------------+------------------+
|ref_model|family_id_ref|dsm_code_ref|product_nature_id_ref| average_price_ref|
+---------+-------------+------------+---------------------+------------------+
|  8371831|         2502|      144349|                10855|  76.2989024230731|
|  8488150|        34010|      165554|                12183|26.532270075187963|
+---------+-------------+------------+---------------------+------------------+
only showing top 2 rows

In [13]:
# Create ref_model_base: pair each new model with every reference model whose id differs
# from its own (non-equi left join). No sampling is applied in this cell.
pair_condition = F.col('df1.model_id') != F.col('df2.ref_model')
ref_model_base = new_model_pool.alias('df1').join(
    ref_model_pool.alias('df2'),
    pair_condition,
    how='left',
)
ref_model_base.show(2)
ref_model_base.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+---------+-----------------+--------+------------------+---------+-------------+------------+---------------------+-----------------+
|model_id|family_id|product_nature_id|dsm_code|     average_price|ref_model|family_id_ref|dsm_code_ref|product_nature_id_ref|average_price_ref|
+--------+---------+-----------------+--------+------------------+---------+-------------+------------+---------------------+-----------------+
|  960734|     8607|            25032|  307300|179.81955445583893|  8369958|        11504|      146051|                25111| 7.96458341362338|
|  960734|     8607|            25032|  307300|179.81955445583893|  8371831|         2502|      144349|                10855| 76.2989024230731|
+--------+---------+-----------------+--------+------------------+---------+-------------+------------+---------------------+-----------------+
only showing top 2 rows

43523256

In [14]:
# features = ['family_id','sub_department_id','department_id','univers_id','product_nature_id','sports']
# Pairwise features: 0/1 flags for matching family, dsm_code and product nature,
# plus the signed price gap (new model price minus reference model price).
ref_model_base = (
    ref_model_base
    .withColumn(
        'is_same_family',
        F.expr('case when family_id = family_id_ref then 1 else 0 end as is_same_family'),
    )
    .withColumn(
        'is_same_dsm',
        F.expr('case when dsm_code = dsm_code_ref then 1 else 0 end as is_same_dsm'),
    )
    .withColumn(
        'is_same_product_nature',
        F.expr('case when product_nature_id = product_nature_id_ref then 1 else 0 end as is_same_product_nature'),
    )
    .withColumn(
        'price_difference',
        ref_model_base.average_price - ref_model_base.average_price_ref,
    )
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [52]:
# Smallest price_difference over all pairs. The value shown in the saved output equals the
# lower bound hard-coded in the cell that computes price_difference_norm.
ref_model_base.agg(F.min('price_difference')).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------------+
|min(price_difference)|
+---------------------+
|   -13650.96939190476|
+---------------------+

In [53]:
# Largest price_difference over all pairs.
# NOTE(review): the value shown in the saved output (25997.014545454545) differs from the upper
# bound 25998.000000000004 hard-coded in the cell that computes price_difference_norm — confirm
# which one is intended.
ref_model_base.agg(F.max('price_difference')).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------------+
|max(price_difference)|
+---------------------+
|   25997.014545454545|
+---------------------+

In [15]:
# Min-max scaling of price_difference with fixed bounds.
# NOTE(review): PRICE_DIFF_MIN equals the minimum displayed earlier in this notebook, but
# PRICE_DIFF_MAX does not equal the displayed maximum (25997.014545454545) — confirm where
# the upper bound comes from before changing it.
PRICE_DIFF_MIN = -13650.96939190476
PRICE_DIFF_MAX = 25998.000000000004
ref_model_base = ref_model_base.withColumn(
    'price_difference_norm',
    (ref_model_base.price_difference - PRICE_DIFF_MIN) / (PRICE_DIFF_MAX - PRICE_DIFF_MIN),
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# Weights of the similarity score (presumably coefficients of a fitted linear model — TODO confirm):
#   dsm_code            1.622797071815100
#   family_id           0.606589832402213
#   price_difference    0.010834530380563
#   product_nature_id   0.121562762967326
#   intercept          -0.143325296539284   (listed with the others but not added to the score)
W_DSM_CODE = 1.622797071815100
W_FAMILY_ID = 0.606589832402213
W_PRICE_DIFFERENCE = 0.010834530380563
W_PRODUCT_NATURE_ID = 0.121562762967326

ref_model_base = ref_model_base.withColumn(
    'similarity',
    (
        ref_model_base.is_same_dsm * W_DSM_CODE
        + ref_model_base.is_same_family * W_FAMILY_ID
        + ref_model_base.price_difference_norm * W_PRICE_DIFFERENCE
        + ref_model_base.is_same_product_nature * W_PRODUCT_NATURE_ID
    ),
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [17]:
# Preview two rows to check the feature and similarity columns added above.
ref_model_base.show(2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------+---------+-----------------+--------+------------------+---------+-------------+------------+---------------------+-----------------+--------------+-----------+----------------------+------------------+---------------------+--------------------+
|model_id|family_id|product_nature_id|dsm_code|     average_price|ref_model|family_id_ref|dsm_code_ref|product_nature_id_ref|average_price_ref|is_same_family|is_same_dsm|is_same_product_nature|  price_difference|price_difference_norm|          similarity|
+--------+---------+-----------------+--------+------------------+---------+-------------+------------+---------------------+-----------------+--------------+-----------+----------------------+------------------+---------------------+--------------------+
|  960734|     8607|            25032|  307300|179.81955445583893|  8369958|        11504|      146051|                25111| 7.96458341362338|             0|          0|                     0|171.85497104221554|  0.3486301050177919

In [18]:
# Row count of the pair table after the feature columns were added.
# NOTE(review): no cache()/persist() call is visible in this notebook, so this presumably
# recomputes the whole join — confirm the check is still needed.
ref_model_base.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

43523256

In [19]:
# Save to S3: keep only the pair ids and their similarity score.
# The destination reuses bucket_refined instead of re-spelling the bucket and prefix; the
# resulting URI is unchanged:
# s3://fcst-workspace/forecast-cn/fcst-refined-demand-forecast-dev/global/ref_model_base_simple_v2
spark_write_parquet_s3(
    ref_model_base.select('model_id', 'ref_model', 'similarity'),
    bucket_refined,
    'global/ref_model_base_simple_v2',
    repartition=10,
    mode='overwrite',
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…