# Feature Generation

## Load Clean Data

In [0]:
# `F` is used by every aggregation cell below but is not imported anywhere in
# the visible notebook; import it here so a fresh kernel can run top to bottom.
from pyspark.sql import functions as F

# Load the cleaned, flattened transactions, mark them for caching (several
# cells below scan this frame), and expose them to Spark SQL under the same name.
flat_transactions = spark.read.parquet('gs://h-and-m-tx/clean/flat-tx').cache()
flat_transactions.createOrReplaceTempView('flat_transactions')

In [0]:
# NOTE(review): disabled leftover; neither `order_details` nor the lowercase
# `f` alias is referenced by any active cell visible in this notebook.
# order_details_ = spark.table('order_details').cache()
# prior_order_details = order_details_.filter(f.expr("eval_set='prior'"))

In [0]:
# Look-back windows in days; each one is compared against `days_prior_to_last_order`.
prior_days = [30, 90, 180, 360, 720, 1080]

# One distinct-count expression per (field, window) pair. Rows outside the
# window are mapped to NULL, which countDistinct ignores.
aggregations = [
    F.countDistinct(
        F.expr(
            'CASE WHEN (days_prior_to_last_order <= {0}) THEN {1} ELSE NULL END'
            .format(prior_day, column)
        )
    ).alias('global_cnt_distinct_{1}_last_{0}_days'.format(prior_day, column))
    for column in ['order_number', 'customer_id', 'article_id', 'product_group_name', 'graphical_appearance_name']
    for prior_day in prior_days
]

# No grouping key: the whole transaction table collapses to a single row.
global_metrics = flat_transactions.agg(*aggregations)

# show results
display(global_metrics)

In [0]:
# Persist the single-row global metrics. Overwrite mode matches the other
# metric writes in this notebook and lets the cell be re-run without failing
# because the output path already exists.
global_metrics.write.parquet('gs://h-and-m-tx/clean/global_metrics', mode='overwrite')

In [0]:
# Per-product metrics: distinct orders and customers seen for each article
# within every look-back window in `prior_days`, plus the article's most
# recently assigned category attributes.
aggregations = []

# distinct count metrics
for column in ['order_number', 'customer_id']:
    for prior_day in prior_days:
        aggregations += [
            F.countDistinct(
                F.expr('CASE WHEN (days_prior_to_last_order <= {0}) THEN {1} ELSE NULL END'.format(prior_day, column)))
            .alias('product_cnt_distinct_{1}_last_{0}_days'.format(prior_day, column))]

# occurrence count metrics (disabled)
# for column in ['reordered', 1]:
#     for prior_day in prior_days:
#         aggregations += [
#             F.sum(
#                 F.expr('CASE WHEN (days_prior_to_last_order <= {0}) THEN {1} ELSE NULL END'.format(prior_day, column)))
#             .alias('product_sum_{1}_last_{0}_days'.format(prior_day, column))]

# get last assigned 'product_group_name', 'graphical_appearance_name' for each product
#
# A window with ORDER BY but no frame clause defaults to a frame that ends at
# the current row, so LAST() would return each row's own value rather than the
# article's latest one. An article whose attributes changed would then survive
# distinct() as several rows and duplicate its metrics in the join below.
# The explicit unbounded frame makes every row of an article see the value from
# the final row in order_number order (ties on order_number are broken arbitrarily).
latest_per_article = (
    'OVER(PARTITION BY article_id ORDER BY order_number '
    'ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)'
)
product_cat = (
    flat_transactions
    .select('article_id', 'product_group_name', 'graphical_appearance_name', 'order_number')
    .withColumn('product_group_name', F.expr('LAST(product_group_name) ' + latest_per_article))
    .withColumn('graphical_appearance_name', F.expr('LAST(graphical_appearance_name) ' + latest_per_article))
    .select('article_id', 'product_group_name', 'graphical_appearance_name')
    .distinct()
)

# execute metric definitions and attach the category attributes
product_metrics = (
    flat_transactions
    .groupBy('article_id')
    .agg(*aggregations)
    .join(product_cat, on='article_id')
)

# show results
display(product_metrics)

In [0]:
# Persist the per-product metrics, replacing any previous output at this path.
product_metrics.write.mode('overwrite').parquet('gs://h-and-m-tx/clean/product_metrics')

In [0]:
# Per-customer metrics: distinct orders, articles, product groups and graphical
# appearances seen for each customer within every look-back window.
# Rows outside a window are mapped to NULL, which countDistinct ignores.
aggregations = [
    F.countDistinct(
        F.expr('CASE WHEN (days_prior_to_last_order <= {0}) THEN {1} ELSE NULL END'.format(prior_day, column))
    ).alias('user_cnt_distinct_{1}_last_{0}_days'.format(prior_day, column))
    for column in ['order_number', 'article_id', 'product_group_name','graphical_appearance_name']
    for prior_day in prior_days
]

# occurrence count metrics (disabled)
# for column in ['reordered', 1]:
#   for prior_day in prior_days:

#     aggregations += [
#       f.sum(
#         f.expr(
#           'CASE WHEN (days_prior_to_last_order <= {0}) THEN {1} ELSE NULL END'.format(prior_day, column))
#         ).alias('user_sum_{1}_last_{0}_days'.format(prior_day, column))]

# one output row per customer
user_metrics = flat_transactions.groupBy('customer_id').agg(*aggregations)

# show results
display(user_metrics)

In [0]:
# Persist the per-customer metrics, replacing any previous output at this path.
user_metrics.write.mode('overwrite').parquet('gs://h-and-m-tx/clean/user_metrics/')

### Feature Generation

In [0]:
# Re-read the three persisted metric tables so that feature generation works
# from the stored parquet output rather than the in-memory query plans.
global_metrics, user_metrics, product_metrics = (
    spark.read.parquet(path)
    for path in (
        'gs://h-and-m-tx/clean/global_metrics',
        'gs://h-and-m-tx/clean/user_metrics',
        'gs://h-and-m-tx/clean/product_metrics',
    )
)

In [0]:
# Product features: each product's share of the global distinct customers and
# distinct orders, for every look-back window in `prior_days`.
product_feature_definitions = []
for prior_day in prior_days:
    # distinct users associated with a product within some number of prior days
    product_feature_definitions += [
        F.expr('product_cnt_distinct_customer_id_last_{0}_days/global_cnt_distinct_customer_id_last_{0}_days as product_shr_distinct_customers_last_{0}_days'
               .format(prior_day))]

    # distinct orders associated with a product within some number of prior days
    product_feature_definitions += [
        F.expr('product_cnt_distinct_order_number_last_{0}_days/global_cnt_distinct_order_number_last_{0}_days as product_shr_distinct_orders_last_{0}_days'
               .format(prior_day))]

    # product reorders within some number of prior days (disabled)
    #product_feature_definitions += [
    #    f.expr('product_sum_reordered_last_{0}_days/product_sum_1_last_{0}_days as product_shr_reordered_last_{0}_days'
    #           .format(prior_day))]

# execute features
product_features = (
    product_metrics
    # global_metrics is produced by an ungrouped aggregation, so it holds one
    # row. An explicit crossJoin states that intent and avoids the "implicit
    # cartesian product" error that a condition-less join() raises when
    # spark.sql.crossJoin.enabled is false.
    .crossJoin(global_metrics)
    .select(
        'article_id',
        'product_group_name',
        'graphical_appearance_name',
        *product_feature_definitions
    )
).na.fill(0) # fill any missing numeric values with 0s

# show results
#display(product_features)

In [0]:
# Persist the product features. Overwrite mode matches the other writes in
# this notebook and lets the cell be re-run without failing because the
# output path already exists.
product_features.write.parquet('gs://h-and-m-tx/features/product_features', mode='overwrite')

In [0]:
orders = spark.read.parquet('gs://h-and-m-tx/clean/orders')

# Approximate per-customer medians of these per-order quantities.
median_cols = ['lines_per_order', 'days_since_prior_order']
approx_median_stmt = [
    F.expr(f'percentile_approx({name}, 0.5)').alias(f'user_med_{name}')
    for name in median_cols
]

user_order_features = (
    orders
    # step 1: collapse to one row per (order, customer)
    .groupBy('order_number', 'customer_id')
    .agg(
        F.first('days_since_prior_order').alias('days_since_prior_order'),
        F.count('*').alias('lines_per_order'),
    )
    # step 2: take the approximate median of each quantity across a customer's orders
    .groupBy('customer_id')
    .agg(*approx_median_stmt)
    .na.fill(0)
)

# user_order_features = (
#     orders.select(
#         'customer_id',
#         'days_since_prior_order',
#         'cnt_products_per_order')
# )

# Extra derived features and the helper columns to drop afterwards.
# Both lists are empty while the reorder-share features below stay disabled.
user_feature_definitions = []
user_drop_columns = []

# for prior_day in prior_days:
#     user_feature_definitions += [f.expr('user_sum_reordered_last_{0}_days/user_sum_1_last_{0}_days as user_shr_reordered_last_{0}_days'
#                                         .format(prior_day))]
#     user_drop_columns += ['user_sum_reordered_last_{0}_days'.format(prior_day)]
#     user_drop_columns += ['user_sum_1_last_{0}_days'.format(prior_day)]

# Final user feature set: windowed metrics joined with the per-order medians.
user_features = (
    user_metrics
    .join(user_order_features, on=['customer_id'])
    .select(F.expr('*'), *user_feature_definitions)
    .drop(*user_drop_columns)
    .na.fill(0)
)

#display(user_features)

In [0]:
# Persist the user features. Overwrite mode matches the other writes in this
# notebook and lets the cell be re-run without failing because the output
# path already exists.
user_features.write.parquet('gs://h-and-m-tx/features/user_features', mode='overwrite')

In [0]:
# Generate labels: every distinct (customer, article) pair observed in the
# transactions gets label 1 and a generated row id.

labels = (
    flat_transactions
    .select('customer_id', 'article_id')
    .distinct()
    .select(
        '*',
        F.lit(1).alias('label'),
        F.monotonically_increasing_id().alias('id'),
    )
)

# Disabled variant that also kept unobserved pairs with label 0:
# labels = (
#   flat_transactions
#     .select('customer_id','article_id')
#     .distinct()
#     .join(train_labels, on=['user_id','product_id'], how='fullouter') # preserve all user-product combinations observed in either period
#     .withColumn('label',f.expr('coalesce(label,0)'))
#     .select('user_id','product_id','label')
#     .withColumn('id', f.monotonically_increasing_id())
#   )

# (
#   labels
#     .write
#     .format('delta')
#     .mode('overwrite')
#     .option('overwriteSchema','true')
#     .saveAsTable('labels')
#   )

#display(labels.limit(100))

In [0]:
# Persist the labels, replacing any previous output at this path.
labels.write.mode('overwrite').parquet('gs://h-and-m-tx/features/labels')

In [0]:
# retrieve features and labels from storage
product_features = spark.read.parquet('gs://h-and-m-tx/features/product_features')
user_features = spark.read.parquet('gs://h-and-m-tx/features/user_features')
# Read the persisted labels instead of reusing the lazy in-memory plan.
# monotonically_increasing_id() is non-deterministic, so re-evaluating that
# plan may assign different `id` values than the ones already written out.
# Reading the stored copy keeps ids consistent with the labels output, and
# lets this cell run on a fresh kernel like the two reads above.
labels = spark.read.parquet('gs://h-and-m-tx/features/labels')

# assemble full feature set (inner joins: pairs without product or user features are dropped)
labeled_features = (
    labels
    .join(product_features, on='article_id')
    .join(user_features, on='customer_id')
)

# display results
display(labeled_features)

In [0]:
# Persist the assembled training set. Overwrite mode matches the other writes
# in this notebook and lets the cell be re-run without failing because the
# output path already exists.
labeled_features.write.parquet('gs://h-and-m-tx/features/labeled_features', mode='overwrite')

## Label-encode categoricals and sample customers to reduce the data size (BigQuery SQL)
```

-- Export a label-encoded, customer-sampled copy of labeled_features as parquet.
EXPORT DATA
    OPTIONS (
        uri = 'gs://h-and-m-tx/features/label-bq/*.parquet',
        format = 'PARQUET',
        overwrite = false
    )
AS
WITH
-- integer code for each distinct customer_id, assigned in ascending order
ref_customer AS (
    SELECT
        customer_id,
        DENSE_RANK() OVER (ORDER BY customer_id ASC) AS customer_id_labeled
    FROM `h_and_m.labeled_features`
    GROUP BY customer_id
),
-- integer code for each distinct article_id, assigned in ascending order
ref_article AS (
    SELECT
        article_id,
        DENSE_RANK() OVER (ORDER BY article_id ASC) AS article_id_labeled
    FROM `h_and_m.labeled_features`
    GROUP BY article_id
),
-- integer code for each distinct product_group_name, assigned in ascending order
ref_product_group_name AS (
    SELECT
        product_group_name,
        DENSE_RANK() OVER (ORDER BY product_group_name ASC) AS product_group_name_labeled
    FROM `h_and_m.labeled_features`
    GROUP BY product_group_name
),
/* https://stackoverflow.com/questions/23375456/random-sampling-in-google-bigquery */
-- random sample of customers: each row is kept with probability 50000 / (row count)
customer_sample AS (
    SELECT customer_id
    FROM `h_and_m.customers`
    WHERE RAND() < 50000 / (SELECT COUNT(*) FROM `h_and_m.customers`)
)

-- attach the integer codes, then keep only rows for the sampled customers
SELECT *
FROM `h_and_m.labeled_features`
LEFT JOIN ref_customer USING (customer_id)
LEFT JOIN ref_article USING (article_id)
LEFT JOIN ref_product_group_name USING (product_group_name)
INNER JOIN customer_sample USING (customer_id)
```