In [3]:
import sys

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType 
import pyspark.sql.functions as F
#from pyspark.sql.functions import desc

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
spark = SparkSession.builder \
    .appName("data_refining_part_2") \
    .config("spark.sql.broadcastTimeout", "36000")\
    .getOrCreate()
spark.version

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

u'2.4.3'

## Load raw data

In [5]:
actual_sales_schema = StructType([
    StructField('week_id', IntegerType()),
    StructField('date', DateType()),
    StructField('model', IntegerType()),
    StructField('y', IntegerType())])

actual_sales = spark.read.csv('s3://fcst-workspace/qlik/data/raw/actual_sales.csv.gz',
                              schema=actual_sales_schema, sep='|', header=True)
actual_sales.cache()
actual_sales.printSchema()
actual_sales.show(1)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- week_id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- model: integer (nullable = true)
 |-- y: integer (nullable = true)

+-------+----------+-----+---+
|week_id|      date|model|  y|
+-------+----------+-----+---+
| 201611|2016-03-13|    1|  1|
+-------+----------+-----+---+
only showing top 1 row

In [6]:
lifestage_update_schema = StructType([
    StructField('model', IntegerType()),
    StructField('sku', IntegerType()),
    StructField('date_begin', DateType()),
    StructField('date_end', DateType()),
    StructField('lifestage', IntegerType())])

lifestage_update = spark.read.csv('s3://fcst-workspace/qlik/data/raw/lifestage_update.csv.gz',
                                  schema=lifestage_update_schema, sep='|', header=True)
lifestage_update.cache()
lifestage_update.printSchema()
lifestage_update.show(1)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- model: integer (nullable = true)
 |-- sku: integer (nullable = true)
 |-- date_begin: date (nullable = true)
 |-- date_end: date (nullable = true)
 |-- lifestage: integer (nullable = true)

+-----+------+----------+----------+---------+
|model|   sku|date_begin|  date_end|lifestage|
+-----+------+----------+----------+---------+
|15691|629892|2018-10-01|2100-12-31|        1|
+-----+------+----------+----------+---------+
only showing top 1 row

In [7]:
model_info = spark.read.csv('s3://fcst-workspace/qlik/data/raw/model_info.csv.gz',
                            inferSchema=True, sep='|', header=True)
model_info.cache()
model_info.printSchema()
model_info.show(1)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- model: integer (nullable = true)
 |-- model_label: string (nullable = true)
 |-- family: integer (nullable = true)
 |-- family_label: string (nullable = true)
 |-- sub_department: integer (nullable = true)
 |-- sub_department_label: string (nullable = true)
 |-- department: integer (nullable = true)
 |-- department_label: string (nullable = true)
 |-- univers: integer (nullable = true)
 |-- univers_label: string (nullable = true)
 |-- product_nature: integer (nullable = true)
 |-- product_nature_label: string (nullable = true)
 |-- category_label: string (nullable = true)
 |-- range_level: integer (nullable = true)

+-----+-----------+------+-------------+--------------+--------------------+----------+----------------+-------+-------------+--------------+--------------------+--------------+-----------+
|model|model_label|family| family_label|sub_department|sub_department_label|department|department_label|univers|univers_label|product_nature|product_nature_label|category_label

## Delete incomplete weeks ==> A gérer dans la partie 1 ?
- To be sure to have complete weeks (Sunday --> Saturday) regardless raw data extraction date, the first and last week of sales are deleted

In [8]:
#actual_sales.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
min_max_week_id = actual_sales.select(F.min('week_id').alias('min'), 
                                      F.max('week_id').alias('max'))

actual_sales = actual_sales \
    .join(min_max_week_id, 
          on=(actual_sales.week_id > min_max_week_id.min) & (actual_sales.week_id < min_max_week_id.max),
          how='inner') \
    .select('week_id', 'date', 'model', 'y') \
    .orderBy('model', 'date') 

actual_sales.cache()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[week_id: int, date: date, model: int, y: int]

In [10]:
#actual_sales.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Format life stages values
/!\ life stage values are only historized since September 10, 2018

##### 1) Keep only usefull life stage values: models in actual sales

In [11]:
# lifestage_update.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
lifestage_update = lifestage_update.join(actual_sales.select('model').drop_duplicates(), 
                                         on='model', how='inner')

lifestage_update.cache()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[model: int, sku: int, date_begin: date, date_end: date, lifestage: int]

In [13]:
# lifestage_update.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

##### 2) Sku life stage updates ==> sku life stage by week

In [14]:
# Calculates all possible date/sku combinations associated with a life stage update
first_lifestage_date = lifestage_update.select(F.min('date_begin').alias('first_date'))

all_lifestage_date = actual_sales \
    .join(first_lifestage_date,
          on=actual_sales.date >= first_lifestage_date.first_date,
          how='inner') \
    .select('date') \
    .drop_duplicates() \
    .orderBy('date') 


all_lifestage_sku = lifestage_update.select('sku').drop_duplicates().orderBy('sku')

date_sku = all_lifestage_date.crossJoin(all_lifestage_sku)

# Add corresponding models
date_sku = date_sku.join(lifestage_update.select('sku', 'model').drop_duplicates(), 
                         on='sku', 
                         how='inner')

# Calculate lifestage by date
sku_lifestage = date_sku.join(lifestage_update, on=['model', 'sku'], how='left')
sku_lifestage = sku_lifestage \
    .filter((sku_lifestage.date >= sku_lifestage.date_begin) &
            (sku_lifestage.date <= sku_lifestage.date_end)) \
    .drop('date_begin', 'date_end')

# The previous filter removes combinations that do not match the update dates.
# But sometimes the update dates do not cover all periods, 
# which causes some dates to disappear, even during the model's activity periods.
# To avoid this problem, we must merge again with all combinations to be sure 
# not to lose anything.
sku_lifestage = date_sku.join(sku_lifestage, on=['date', 'model', 'sku'], how='left')

sku_lifestage.cache()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[date: date, model: int, sku: int, lifestage: int]

In [15]:
#sku_lifestage.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

##### 3) Sku life stage ==> model life stage
- In order to aggregate at the model level, we decided to take the minimum life stage value of the SKUs that compose it
- If no life stage is filled in, we take the last known value (if exists)

In [16]:
model_lifestage = sku_lifestage \
    .groupby(['date', 'model']) \
    .agg(F.min('lifestage').alias('lifestage'))

model_lifestage.cache()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[date: date, model: int, lifestage: int]

In [17]:
#model_lifestage.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [18]:
#print('NB Null before:', model_lifestage.count() - model_lifestage.na.drop().count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [19]:
# This is a ffil by group in pyspark ==> OMG
window = Window.partitionBy('model')\
               .orderBy('date')\
               .rowsBetween(-sys.maxsize, 0)

ffilled_lifestage = F.last(model_lifestage['lifestage'], ignorenulls=True).over(window)

model_lifestage = model_lifestage.withColumn('lifestage', ffilled_lifestage)

model_lifestage.cache()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[date: date, model: int, lifestage: int]

In [20]:
#model_lifestage.take(2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [21]:
#print('NB Null after:', model_lifestage.count() - model_lifestage.na.drop().count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

##### 4) Deal with zombie models ==> TO DO
If the life stage changes from active (1) to inactive (2 or more) and then back active, we consider the model is a zombie.  
This new life may have a different sales behaviour than the previous one, so it's better to pretend that the latter is the only one that never existed.  

For example, if the life stage looks like this: **1 1 1 3 3 3 1 1 1**, we only keep that: **3 1 1 1**.  
Note that we still keep the last inactive value (here 3) before life stages 1 in order to avoid that the model life stages are considered incomplete in the following session.

In [22]:
# print('NB row before: ', model_lifestage.count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [23]:
model_lifestage = model_lifestage.withColumn('lifestage_shift', F.lag(model_lifestage['lifestage']).over(Window.partitionBy("model").orderBy(F.desc('date'))))

model_lifestage = model_lifestage.withColumn('diff_shift', model_lifestage['lifestage'] - model_lifestage['lifestage_shift'])

df_cut_date = model_lifestage.filter(F.col('diff_shift')>0)

df_cut_date = df_cut_date.groupBy('model')\
                         .agg(F.max('date').alias('cut_date'))

ml = model_lifestage.join(df_cut_date, on=['model'], how='left')

ml = ml.withColumn('cut_date',F.when(F.col('cut_date').isNull(),F.to_date(F.lit('1993-04-15'),'yyyy-MM-dd')).otherwise(F.col('cut_date')))

model_lifestage = ml.where(ml.date >= ml.cut_date)\
                    .select(['date', 'model', 'lifestage'])

model_lifestage.cache()
# model_lifestage.orderBy(['model', 'date', 'lifestage'], ascending=True).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[date: date, model: int, lifestage: int]

In [24]:
# print('NB row after: ', model_lifestage.count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [25]:
# model_lifestage.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Match sales and life stages & rebuild incomplete life stages

##### 1) Complete sales
- Fill missing quantities by 0

In [26]:
# test 1 model

# all_sales_model = actual_sales.select('model').where(actual_sales.model == 1).orderBy(['model'], ascending=True).distinct()
all_sales_model = actual_sales.select('model').orderBy(['model'], ascending=True).distinct()

all_sales_date = actual_sales.select('date')\
                             .orderBy(['date'], ascending=True).distinct()


date_model = all_sales_model.crossJoin(all_sales_date)
date_model.cache()


ac = actual_sales.select(actual_sales.date, actual_sales.week_id)\
                 .drop_duplicates()
                 
         
dm = date_model.drop_duplicates()


date_model = dm.join(ac,['date'] , how='inner')
date_model.cache()

  
complete_ts = date_model.join(actual_sales, ['date', 'model', 'week_id'], how='left')



complete_ts = complete_ts.fillna(0, subset=['y'])
complete_ts.cache()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[date: date, model: int, week_id: int, y: int]

In [27]:
# 259 comme sur python pour le model 1

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

##### 2) Add model life stage by week

In [28]:
# complete_ts = pd.merge(complete_ts, model_lifestage, how='left')

complete_ts = complete_ts.join(model_lifestage, ['date', 'model'], how='left')
complete_ts.cache()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[date: date, model: int, week_id: int, y: int, lifestage: int]

##### 3) Rebuild incomplete life stages
/!\ Reminder: the life stage values are only historized since September 10, 2018
- If the life stage value is 1 at the first historized date 
- And we observe sales in the previous and consecutive weeks
- Then we fill the life stage values of these weeks with 1 as well

In [29]:
# find models respecting the first condition

first_lifestage = complete_ts.where(complete_ts.lifestage.isNotNull())


first_lifestage = first_lifestage.orderBy(['model', 'date'], ascending=True)\
                                 .groupBy('model').agg(F.first(complete_ts['date']).alias('date'), F.first(complete_ts['lifestage']).alias('lifestage'))


first_lifestage = first_lifestage.where(first_lifestage.lifestage == 1)\
                                 .select(first_lifestage.model, first_lifestage.date.alias('first_lifestage_date'))

first_lifestage.cache()


# Create the mask (rows to be completed) for theses models
complete_ts = complete_ts.withColumn('id', F.monotonically_increasing_id())
mask = complete_ts.withColumn('id', F.monotonically_increasing_id())

# keep only models respecting the first condition
mask = mask.join(first_lifestage, ['model'], how='inner')
mask.cache()
complete_ts.cache()



# Look only before the first historized lifestage date
mask = mask.where(mask.date <= mask.first_lifestage_date)

mask = mask.withColumn('cumsum_y', F.sum('y').over(Window.partitionBy('model').orderBy(F.col('date').desc())))\
           .withColumn('lag_cumsum_y', F.lag('cumsum_y').over(Window.partitionBy('model').orderBy(F.col('date').desc())))\
           .fillna(0, subset=['lag_cumsum_y'])\
           .withColumn('is_active', F.col('cumsum_y') > F.col('lag_cumsum_y'))


ts_start_date = mask.where(mask.is_active == False)\
                    .groupBy('model').agg(F.first('date').alias('start_date'))

mask = mask.join(ts_start_date, 'model', how='left')

# Case model start date unknown (older than first week recorded here)
# ==> fill by an old date
mask = mask.withColumn('start_date',F.when(F.col('start_date').isNull(),F.to_date(F.lit('1993-04-15'),'yyyy-MM-dd')).otherwise(F.col('start_date')))\
     
mask = mask.withColumn('start_date',F.when(F.col('start_date').isNull(),F.to_date(F.lit('1993-04-15'),'yyyy-MM-dd')).otherwise(F.col('start_date')))\
           .withColumn('is_model_start', F.col('date') > F.col('start_date'))\
           .withColumn('to_fill', (F.col('is_active')) & (F.col('is_model_start')) & (F.col('lifestage').isNull()))


mask = mask.where(mask.to_fill == True).select(mask.id, mask.to_fill)


mask.cache()

# Fill the eligible rows under all conditions
#complete_ts.loc[mask['index'], 'lifestage'] = 1

complete_ts = complete_ts.join(mask, 'id', how='left')
complete_ts = complete_ts.withColumn('lifestage', F.when(F.col('to_fill')==True, F.lit(1)).otherwise(F.col('lifestage')))
complete_ts.cache()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[id: bigint, date: date, model: int, week_id: int, y: int, lifestage: int, to_fill: boolean]

In [30]:
complete_ts.groupby('lifestage').count().orderBy('count').show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------+--------+
|lifestage|   count|
+---------+--------+
|        0|      89|
|        2|   47408|
|        3|  182502|
|        7|  430085|
|        8|  827983|
|        1| 2723262|
|        4| 4829419|
|     null|37121858|
+---------+--------+

In [32]:
complete_ts.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

46162606

In [None]:
# lifestage1 : wtf
# python: 2723285
# spark:  2723262

## Create active sales data set

##### 1) Keep in memory first sales dates by model

In [34]:
model_start_date = actual_sales.groupBy('model').agg(F.first(complete_ts['date']).alias('first_date'))
model_start_date.cache()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[model: int, first_date: date]

##### 2) Construct active sales
- Filtered on active life stage 
- After the first actual sales date
- Padded with zeros (already done in complete sales)

In [45]:
print('Nb rows before:', complete_ts.count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

('Nb rows before:', 46162606)

In [46]:
active_sales = complete_ts.where(complete_ts.lifestage == 1)

active_sales = active_sales.join(model_start_date,'model' , how='inner')\
                           
active_sales = active_sales.where(active_sales.date >= active_sales.first_date)\
                           .drop('lifestage', 'first_date', 'to_fill')

active_sales.cache()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[model: int, id: bigint, date: date, week_id: int, y: int]

In [47]:
print('Nb rows after: ', active_sales.count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

('Nb rows after: ', 2708815)

In [44]:
active_sales.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- model: integer (nullable = true)
 |-- id: long (nullable = false)
 |-- date: date (nullable = true)
 |-- week_id: integer (nullable = true)
 |-- y: integer (nullable = true)

## Clean model info

In [None]:
"""
model_info.loc[model_info['category_label'] == 'SOUS RAYON POUB', 'category_label'] = np.nan
model_info.fillna('UNKNOWN', inplace=True)
model_info.category_label.unique()
"""

In [None]:
"""
# Due to a discrepant seasonal behaviour between LOW SOCKS and HIGH SOCKS, we chose to split
# the product nature 'SOCKS' into two different product natures 'LOW SOCKS' and 'HIGH SOCKS'

def change_socks_nature(row):
    if (row['product_nature_label']=='SOCKS') & (' LOW' in str(row['model_label'])):
        return 'LOW SOCKS'
    if (row['product_nature_label']=='SOCKS') & (' MID' in str(row['model_label'])):
        return 'MID SOCKS'
    if (row['product_nature_label']=='SOCKS') & (' HIGH' in str(row['model_label'])):
        return 'HIGH SOCKS'
    else:
        return row['product_nature_label']
    
model_info['product_nature_label'] = model_info.apply(change_socks_nature, axis=1)

model_info['product_nature'] = pd.factorize(model_info['product_nature_label'])[0] + 1
"""

## Export datasets

In [None]:
"""
ut.write_csv_S3(model_info, cf.bucket, cf.s3_path_clean_data + 'model_info.csv')
ut.write_csv_S3(actual_sales, cf.bucket, cf.s3_path_clean_data + 'actual_sales.csv')
ut.write_csv_S3(active_sales, cf.bucket, cf.s3_path_clean_data + 'active_sales.csv')
"""

In [None]:
spark.stop()