# Feature Generation

In [0]:
from pyspark.sql.functions import col
from pyspark.sql import functions as F

## Load Clean Data

In [0]:
# Read the flattened transactions from parquet, mark the DataFrame as cached, and
# register it as the Spark SQL temp view `flat_transactions`.
flat_transactions = spark.read.parquet('gs://h-and-m-tx/clean/flat-tx').cache()
flat_transactions.createOrReplaceTempView('flat_transactions')

In [0]:
# NOTE(review): the two lines below are disabled code. They reference an
# `order_details` table and a lowercase `f` alias, neither of which is defined
# in the visible part of this notebook (functions are imported as `F`).
# order_details_ = spark.table('order_details').cache()
# prior_order_details = order_details_.filter(f.expr("eval_set='prior'"))

In [0]:
# look-back windows in days; each is compared against days_prior_to_last_order
prior_days = [30, 90, 180, 360, 720, 1080]

# fields whose distinct values are counted over the whole dataset
global_fields = ['order_number', 'customer_id', 'article_id', 'product_group_name', 'graphical_appearance_name']

# One COUNT(DISTINCT ...) per (field, window). Rows outside the window map to
# NULL, which COUNT(DISTINCT) ignores.
aggregations = [
    F.countDistinct(
        F.expr(f'CASE WHEN (days_prior_to_last_order <= {window}) THEN {field} ELSE NULL END')
    ).alias(f'global_cnt_distinct_{field}_last_{window}_days')
    for field in global_fields
    for window in prior_days
]

# aggregating without a groupBy yields a single row of global metrics
global_metrics = flat_transactions.agg(*aggregations)

# show results
display(global_metrics)

global_cnt_distinct_order_number_last_30_days,global_cnt_distinct_order_number_last_90_days,global_cnt_distinct_order_number_last_180_days,global_cnt_distinct_order_number_last_360_days,global_cnt_distinct_order_number_last_720_days,global_cnt_distinct_order_number_last_1080_days,global_cnt_distinct_customer_id_last_30_days,global_cnt_distinct_customer_id_last_90_days,global_cnt_distinct_customer_id_last_180_days,global_cnt_distinct_customer_id_last_360_days,global_cnt_distinct_customer_id_last_720_days,global_cnt_distinct_customer_id_last_1080_days,global_cnt_distinct_article_id_last_30_days,global_cnt_distinct_article_id_last_90_days,global_cnt_distinct_article_id_last_180_days,global_cnt_distinct_article_id_last_360_days,global_cnt_distinct_article_id_last_720_days,global_cnt_distinct_article_id_last_1080_days,global_cnt_distinct_product_group_name_last_30_days,global_cnt_distinct_product_group_name_last_90_days,global_cnt_distinct_product_group_name_last_180_days,global_cnt_distinct_product_group_name_last_360_days,global_cnt_distinct_product_group_name_last_720_days,global_cnt_distinct_product_group_name_last_1080_days,global_cnt_distinct_graphical_appearance_name_last_30_days,global_cnt_distinct_graphical_appearance_name_last_90_days,global_cnt_distinct_graphical_appearance_name_last_180_days,global_cnt_distinct_graphical_appearance_name_last_360_days,global_cnt_distinct_graphical_appearance_name_last_720_days,global_cnt_distinct_graphical_appearance_name_last_1080_days
1891945,2742000,3725343,5779159,9061239,9080179,1362281,1362281,1362281,1362281,1362281,1362281,91187,93571,95974,99157,104487,104547,19,19,19,19,19,19,30,30,30,30,30,30


In [0]:
# Persist the global metrics. mode='overwrite' keeps this cell re-runnable: with
# the default save mode the write raises once the target path already exists.
global_metrics.write.parquet('gs://h-and-m-tx/clean/global_metrics', mode='overwrite')

In [0]:
# Per-article metrics: distinct orders and distinct customers per look-back
# window, joined with one product_group_name / graphical_appearance_name per article.
aggregations = []

# distinct count metrics
for column in ['order_number', 'customer_id']:
    for prior_day in prior_days:
        aggregations += [
            F.countDistinct(
                F.expr('CASE WHEN (days_prior_to_last_order <= {0}) THEN {1} ELSE NULL END'.format(prior_day, column)))
            .alias('product_cnt_distinct_{1}_last_{0}_days'.format(prior_day, column))]

# occurrence count metrics
# for column in ['reordered', 1]:
#     for prior_day in prior_days:
#         aggregations += [
#             F.sum(
#                 F.expr('CASE WHEN (days_prior_to_last_order <= {0}) THEN {1} ELSE NULL END'.format(prior_day, column)))
#             .alias('product_sum_{1}_last_{0}_days'.format(prior_day, column))]

# get last assigned 'product_group_name', 'graphical_appearance_name' for each product
# (last by order_number within the article).
# The frame is spelled out on purpose: with ORDER BY and no frame clause the
# window defaults to RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, so LAST()
# would return the *current* row's value. If an article's attributes ever
# differed between orders, distinct() would then keep several rows per article
# and the join below would duplicate that article's metrics.
last_value_window = (
    'OVER(PARTITION BY article_id ORDER BY order_number '
    'ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)'
)
product_cat = (
    flat_transactions
    .select('article_id', 'product_group_name', 'graphical_appearance_name', 'order_number')
    .withColumn('product_group_name', F.expr('LAST(product_group_name) ' + last_value_window))
    .withColumn('graphical_appearance_name', F.expr('LAST(graphical_appearance_name) ' + last_value_window))
    .select('article_id', 'product_group_name', 'graphical_appearance_name')
    .distinct()
)

# execute metric definitions (inner join on article_id adds the two category columns)
product_metrics = (
    flat_transactions
    .groupBy('article_id')
      .agg(*aggregations)
    .join(product_cat, on='article_id')
  )

# show results
display(product_metrics)

article_id,product_cnt_distinct_order_number_last_30_days,product_cnt_distinct_order_number_last_90_days,product_cnt_distinct_order_number_last_180_days,product_cnt_distinct_order_number_last_360_days,product_cnt_distinct_order_number_last_720_days,product_cnt_distinct_order_number_last_1080_days,product_cnt_distinct_customer_id_last_30_days,product_cnt_distinct_customer_id_last_90_days,product_cnt_distinct_customer_id_last_180_days,product_cnt_distinct_customer_id_last_360_days,product_cnt_distinct_customer_id_last_720_days,product_cnt_distinct_customer_id_last_1080_days,product_group_name,graphical_appearance_name
118458038,10,12,16,23,95,97,10,12,16,23,91,93,Garment Lower body,Melange
120129014,263,340,444,665,964,964,259,333,430,641,924,924,Garment Lower body,Melange
147339034,12,16,18,22,48,48,12,16,18,22,48,48,Garment Upper body,All over pattern
156227002,111,159,235,461,781,784,111,156,231,452,752,754,Socks & Tights,Solid
162074069,1,1,1,2,5,5,1,1,1,2,5,5,Socks & Tights,All over pattern
189691051,1,1,1,1,1,1,1,1,1,1,1,1,Garment Upper body,Solid
194242050,1,1,1,1,5,5,1,1,1,1,5,5,Garment Lower body,Stripe
194270044,1,1,1,1,11,11,1,1,1,1,11,11,Garment Upper body,Solid
212766045,1,1,1,2,23,24,1,1,1,2,20,21,Garment Lower body,Solid
213691080,6,7,9,17,97,98,6,7,9,17,97,98,Accessories,Solid


In [0]:
# Persist the per-article metrics, replacing any previous output at this path.
product_metrics.write.mode('overwrite').parquet('gs://h-and-m-tx/clean/product_metrics')

In [0]:
# Per-customer metrics: for every (field, look-back window) pair, count the
# distinct values of the field among the customer's rows inside the window.
user_fields = ['order_number', 'article_id', 'product_group_name', 'graphical_appearance_name']

# distinct count metrics
aggregations = [
    F.countDistinct(
        F.expr(f'CASE WHEN (days_prior_to_last_order <= {window}) THEN {field} ELSE NULL END')
    ).alias(f'user_cnt_distinct_{field}_last_{window}_days')
    for field in user_fields
    for window in prior_days
]

# occurrence count metrics
# for column in ['reordered', 1]:
#   for prior_day in prior_days:

#     aggregations += [
#       f.sum(
#         f.expr(
#           'CASE WHEN (days_prior_to_last_order <= {0}) THEN {1} ELSE NULL END'.format(prior_day, column))
#         ).alias('user_sum_{1}_last_{0}_days'.format(prior_day, column))]

# execute metric definitions: one output row per customer_id
user_metrics = flat_transactions.groupBy('customer_id').agg(*aggregations)

# show results
display(user_metrics)

customer_id,user_cnt_distinct_order_number_last_30_days,user_cnt_distinct_order_number_last_90_days,user_cnt_distinct_order_number_last_180_days,user_cnt_distinct_order_number_last_360_days,user_cnt_distinct_order_number_last_720_days,user_cnt_distinct_order_number_last_1080_days,user_cnt_distinct_article_id_last_30_days,user_cnt_distinct_article_id_last_90_days,user_cnt_distinct_article_id_last_180_days,user_cnt_distinct_article_id_last_360_days,user_cnt_distinct_article_id_last_720_days,user_cnt_distinct_article_id_last_1080_days,user_cnt_distinct_product_group_name_last_30_days,user_cnt_distinct_product_group_name_last_90_days,user_cnt_distinct_product_group_name_last_180_days,user_cnt_distinct_product_group_name_last_360_days,user_cnt_distinct_product_group_name_last_720_days,user_cnt_distinct_product_group_name_last_1080_days,user_cnt_distinct_graphical_appearance_name_last_30_days,user_cnt_distinct_graphical_appearance_name_last_90_days,user_cnt_distinct_graphical_appearance_name_last_180_days,user_cnt_distinct_graphical_appearance_name_last_360_days,user_cnt_distinct_graphical_appearance_name_last_720_days,user_cnt_distinct_graphical_appearance_name_last_1080_days
0003e56a4332b2503e34640be92031ad48f1280ee6e3a7f6b7b94664383facdc,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
0011a72ff279179723ef5f8785cce80c968237b6381ce90af2ba9f3f8f8396c7,2,2,2,2,2,2,10,10,10,10,10,10,4,4,4,4,4,4,4,4,4,4,4,4
0038bf2b66fdc1de4e25bce86c50d6dbdc3a0c3551c354d74d976b477b97ea65,2,4,7,12,27,27,4,7,13,23,59,59,3,3,3,5,8,8,3,3,7,7,10,10
004d7b2d16b8dae54dba523ebdc08b8b167998f1ed4002a92f8df25d9ec6bb78,1,1,1,4,4,4,3,3,3,11,11,11,1,1,1,3,3,3,3,3,3,6,6,6
005ddabf9bc77f963ba39fa3c577b8902ae77558bd0ea26462ffdd6f57f09b83,2,5,6,20,31,31,3,9,11,37,52,52,1,3,4,8,8,8,2,4,4,9,10,10
006ae0656ded2215de94e105a90d1c7d08b182399aba7d49f35564a34f2ea113,1,2,3,6,11,11,2,4,6,15,22,22,2,2,3,5,6,6,1,2,3,8,9,9
007ebdd8e50f1b32f3e22bdf2f2d36c5492356ab2d4b15d95ff3acc9e6efa477,1,2,3,7,7,7,4,5,6,13,13,13,2,2,2,3,3,3,3,3,3,6,6,6
008588b203fab392d6e801db90fc90f526708539fc9c9fd959700bb3e0cf65d5,1,1,1,1,1,1,6,6,6,6,6,6,1,1,1,1,1,1,3,3,3,3,3,3
0086f22a4967559a6f799d0b05e25f16dc2a34014806c6f56beb16079b7c8d3d,1,1,1,1,10,10,4,4,4,4,30,30,3,3,3,3,6,6,1,1,1,1,9,9
00a206498f5bf6893905ee1f91a0eb127e7be6844e36f3bb59df8823ade1098d,1,1,1,3,6,6,1,1,1,10,16,16,1,1,1,3,4,4,1,1,1,3,4,4


In [0]:
# Persist the per-customer metrics, replacing any previous output at this path.
user_metrics.write.mode('overwrite').parquet('gs://h-and-m-tx/clean/user_metrics/')

### Feature Generation

In [0]:
# Reload the three persisted metric tables from parquet.
global_metrics, user_metrics, product_metrics = [
    spark.read.parquet(metrics_path)
    for metrics_path in (
        'gs://h-and-m-tx/clean/global_metrics',
        'gs://h-and-m-tx/clean/user_metrics',
        'gs://h-and-m-tx/clean/product_metrics',
    )
]

In [0]:
# calculate product specific features: each per-product distinct count is divided
# by the global distinct count for the same look-back window, giving a share
product_feature_definitions = []
for prior_day in prior_days:
    # distinct users associated with a product within some number of prior days
    product_feature_definitions += [
        F.expr('product_cnt_distinct_customer_id_last_{0}_days/global_cnt_distinct_customer_id_last_{0}_days as product_shr_distinct_customers_last_{0}_days'
               .format(prior_day))]

    # distinct orders associated with a product within some number of prior days
    product_feature_definitions += [
        F.expr('product_cnt_distinct_order_number_last_{0}_days/global_cnt_distinct_order_number_last_{0}_days as product_shr_distinct_orders_last_{0}_days'
               .format(prior_day))]

    # product reorders within some number of prior days
    #product_feature_definitions += [
    #    f.expr('product_sum_reordered_last_{0}_days/product_sum_1_last_{0}_days as product_shr_reordered_last_{0}_days'
    #           .format(prior_day))]

# execute features
product_features = (
    product_metrics
    # Explicit cross join to the single-row global metrics. A condition-less
    # .join() is an implicit cartesian product, which Spark 2.x rejects unless
    # spark.sql.crossJoin.enabled is set; crossJoin() states the intent directly.
    .crossJoin(global_metrics)
    .select(
        'article_id',
        'product_group_name',
        'graphical_appearance_name',
        *product_feature_definitions
      )
  ).na.fill(0) # fill missing numeric values with 0s (string columns are not touched by a numeric fill)

# show results
#display(product_features)

In [0]:
# Persist the product features. mode='overwrite' keeps this cell re-runnable: with
# the default save mode the write raises once the target path already exists.
product_features.write.parquet('gs://h-and-m-tx/features/product_features', mode='overwrite')

In [0]:
orders = spark.read.parquet('gs://h-and-m-tx/clean/orders')

# per-order quantities whose approximate median is taken for each customer
median_cols = ['lines_per_order', 'days_since_prior_order']
approx_median_stmt = [
    F.expr(f'percentile_approx({metric}, 0.5)').alias(f'user_med_{metric}')
    for metric in median_cols
]

# step 1: one row per (order_number, customer_id) group, keeping the first
# days_since_prior_order seen and counting the group's rows as lines_per_order
per_order_details = orders.groupBy('order_number', 'customer_id').agg(
    F.first('days_since_prior_order').alias('days_since_prior_order'),
    F.count('*').alias('lines_per_order'),
)

# step 2: approximate medians across each customer's orders; null medians become 0
user_order_features = (
    per_order_details
    .groupBy('customer_id')
    .agg(*approx_median_stmt)
    .na.fill(0)
)

# user_order_features = (
#     orders.select(
#         'customer_id',
#         'days_since_prior_order',
#         'cnt_products_per_order')
# )

# calculate user overall features
# (both lists are currently empty, so the select/drop below add and remove nothing)
user_feature_definitions = []
user_drop_columns = []

# for prior_day in prior_days:
#     user_feature_definitions += [f.expr('user_sum_reordered_last_{0}_days/user_sum_1_last_{0}_days as user_shr_reordered_last_{0}_days'
#                                         .format(prior_day))]
#     user_drop_columns += ['user_sum_reordered_last_{0}_days'.format(prior_day)]
#     user_drop_columns += ['user_sum_1_last_{0}_days'.format(prior_day)]

# assemble final set of user features: metrics joined with the order medians on customer_id
user_features = (
    user_metrics
    .join(user_order_features, on=['customer_id'])
    .select(F.expr('*'), *user_feature_definitions)
    .drop(*user_drop_columns)
    .na.fill(0)
)

#display(user_features)

In [0]:
# Persist the user features. mode='overwrite' keeps this cell re-runnable: with
# the default save mode the write raises once the target path already exists.
user_features.write.parquet('gs://h-and-m-tx/features/user_features', mode='overwrite')

In [0]:
# Generate labels

# Every distinct (customer_id, article_id) pair seen in the transactions becomes
# one row with the constant label 1 and a generated `id` column.
# NOTE(review): only label=1 rows are produced here; the disabled block below
# shows an earlier variant that also produced 0 labels.
observed_pairs = flat_transactions.select('customer_id', 'article_id').distinct()
labels = observed_pairs.select(
    '*',
    F.lit(1).alias('label'),
    F.monotonically_increasing_id().alias('id'),
)

# labels = (
#   flat_transactions
#     .select('customer_id','article_id')
#     .distinct()
#     .join(train_labels, on=['user_id','product_id'], how='fullouter') # preserve all user-product combinations observed in either period
#     .withColumn('label',f.expr('coalesce(label,0)'))
#     .select('user_id','product_id','label')
#     .withColumn('id', f.monotonically_increasing_id())
#   )

# (
#   labels
#     .write
#     .format('delta')
#     .mode('overwrite')
#     .option('overwriteSchema','true')
#     .saveAsTable('labels')
#   )

#display(labels.limit(100))

In [0]:
# Persist the labels, replacing any previous output at this path.
labels.write.mode('overwrite').parquet('gs://h-and-m-tx/features/labels')

In [0]:
# retrieve features and labels
product_features = spark.read.parquet('gs://h-and-m-tx/features/product_features')
user_features = spark.read.parquet('gs://h-and-m-tx/features/user_features')
# Read the persisted labels instead of reusing the in-memory DataFrame: the `id`
# column comes from monotonically_increasing_id(), which depends on partitioning
# and can differ if the plan is recomputed, so only the stored copy is guaranteed
# to carry the same ids as the labels parquet. It also removes the dependency on
# kernel state from an earlier cell.
labels = spark.read.parquet('gs://h-and-m-tx/features/labels')

# assemble full feature set (inner joins: pairs without product or user features are dropped)
labeled_features = (
  labels
  .join(product_features, on='article_id')
  .join(user_features, on='customer_id')
  )

# display results
display(labeled_features)

customer_id,article_id,label,id,product_group_name,graphical_appearance_name,product_shr_distinct_customers_last_30_days,product_shr_distinct_orders_last_30_days,product_shr_distinct_customers_last_90_days,product_shr_distinct_orders_last_90_days,product_shr_distinct_customers_last_180_days,product_shr_distinct_orders_last_180_days,product_shr_distinct_customers_last_360_days,product_shr_distinct_orders_last_360_days,product_shr_distinct_customers_last_720_days,product_shr_distinct_orders_last_720_days,product_shr_distinct_customers_last_1080_days,product_shr_distinct_orders_last_1080_days,user_cnt_distinct_order_number_last_30_days,user_cnt_distinct_order_number_last_90_days,user_cnt_distinct_order_number_last_180_days,user_cnt_distinct_order_number_last_360_days,user_cnt_distinct_order_number_last_720_days,user_cnt_distinct_order_number_last_1080_days,user_cnt_distinct_article_id_last_30_days,user_cnt_distinct_article_id_last_90_days,user_cnt_distinct_article_id_last_180_days,user_cnt_distinct_article_id_last_360_days,user_cnt_distinct_article_id_last_720_days,user_cnt_distinct_article_id_last_1080_days,user_cnt_distinct_product_group_name_last_30_days,user_cnt_distinct_product_group_name_last_90_days,user_cnt_distinct_product_group_name_last_180_days,user_cnt_distinct_product_group_name_last_360_days,user_cnt_distinct_product_group_name_last_720_days,user_cnt_distinct_product_group_name_last_1080_days,user_cnt_distinct_graphical_appearance_name_last_30_days,user_cnt_distinct_graphical_appearance_name_last_90_days,user_cnt_distinct_graphical_appearance_name_last_180_days,user_cnt_distinct_graphical_appearance_name_last_360_days,user_cnt_distinct_graphical_appearance_name_last_720_days,user_cnt_distinct_graphical_appearance_name_last_1080_days,user_med_lines_per_order,user_med_days_since_prior_order
000346516dd355b40badca0c0f5f37a318ddae31f0e0f76a3a0454eb591b6384,694966001,1,154444,Garment Upper body,Colour blocking,4.7714091292471966e-05,3.48847350213669e-05,5.505472072208303e-05,2.8081692195477756e-05,6.019316132281079e-05,2.254825931464566e-05,9.469411964198282e-05,2.2667658045054653e-05,0.0003391370796480315,5.352468906294161e-05,0.000342807394362837,5.407382387505797e-05,1,3,3,5,9,9,3,6,6,12,20,20,2,3,3,4,4,4,2,3,3,7,8,8,1,69
000346516dd355b40badca0c0f5f37a318ddae31f0e0f76a3a0454eb591b6384,745232001,1,203127,Garment Lower body,Denim,0.00046539590583734194,0.0003435617843013407,0.000685614788725674,0.00034974471188913204,0.0007861814119113458,0.0002960801193339781,0.0012559816954064,0.0003073111502902066,0.0020656531214925,0.000325452181539412,0.0020656531214925,0.0003247733332129245,1,3,3,5,9,9,3,6,6,12,20,20,2,3,3,4,4,4,2,3,3,7,8,8,1,69
000346516dd355b40badca0c0f5f37a318ddae31f0e0f76a3a0454eb591b6384,685823003,1,25769997808,Garment Full body,All over pattern,1.835157357402768e-05,1.3213914780820796e-05,2.789439183252207e-05,1.4587892049598834e-05,4.8448154235433074e-05,1.852178443703036e-05,0.00010056662318567169,2.457104917860886e-05,0.00018131354691139343,2.8031486643272517e-05,0.00018131354691139343,2.7973016831496383e-05,1,3,3,5,9,9,3,6,6,12,20,20,2,3,3,4,4,4,2,3,3,7,8,8,1,69
000346516dd355b40badca0c0f5f37a318ddae31f0e0f76a3a0454eb591b6384,485176004,1,42949838275,Garment Lower body,Solid,3.229876949028872e-05,2.3785046605477432e-05,4.330971363470532e-05,2.224653537563822e-05,6.0927224265771895e-05,2.308512263166103e-05,0.00011451381910193272,2.751265365773809e-05,0.0004705343464380697,7.327916193359429e-05,0.0004705343464380697,7.312631171698267e-05,1,3,3,5,9,9,3,6,6,12,20,20,2,3,3,4,4,4,2,3,3,7,8,8,1,69
000346516dd355b40badca0c0f5f37a318ddae31f0e0f76a3a0454eb591b6384,766955002,1,51539811907,Garment Lower body,Check,5.799097249392747e-05,4.281308388985938e-05,7.267223135314961e-05,3.6834427425237056e-05,9.836443435678837e-05,3.650670555704535e-05,0.0002980295548422095,7.180975640227238e-05,0.0003817127303397757,5.926341861195804e-05,0.0003817127303397757,5.913980330123448e-05,1,3,3,5,9,9,3,6,6,12,20,20,2,3,3,4,4,4,2,3,3,7,8,8,1,69
000346516dd355b40badca0c0f5f37a318ddae31f0e0f76a3a0454eb591b6384,827636001,1,68719632444,Garment Full body,Solid,2.862845477548318e-05,2.061370705808044e-05,4.330971363470532e-05,2.151714077315828e-05,6.753379075242185e-05,2.469571258270715e-05,0.00017690916925362682,4.4124067186938447e-05,0.00017690916925362682,2.814184682690745e-05,0.00017690916925362682,2.808314681902196e-05,1,3,3,5,9,9,3,6,6,12,20,20,2,3,3,4,4,4,2,3,3,7,8,8,1,69
000346516dd355b40badca0c0f5f37a318ddae31f0e0f76a3a0454eb591b6384,854193002,1,68719651776,Underwear,Solid,0.0005277912559890361,0.0003890176511473642,0.0009718993364805058,0.0005032822757111597,0.0012229488629732,0.00047109756068098967,0.0012229488629732,0.000303677403580694,0.0012229488629732,0.00019368212227930415,0.0012229488629732,0.0001932781281073864,1,3,3,5,9,9,3,6,6,12,20,20,2,3,3,4,4,4,2,3,3,7,8,8,1,69
000346516dd355b40badca0c0f5f37a318ddae31f0e0f76a3a0454eb591b6384,809672001,1,85899511115,Garment Full body,All over pattern,3.890533597693868e-05,2.907061251780575e-05,5.945909837984968e-05,3.063457330415755e-05,0.00013800383327668814,5.288103672601422e-05,0.0003273920725606538,8.08076053972559e-05,0.0003626270938227869,5.727693530652927e-05,0.0003626270938227869,5.715746352577411e-05,1,3,3,5,9,9,3,6,6,12,20,20,2,3,3,4,4,4,2,3,3,7,8,8,1,69
000346516dd355b40badca0c0f5f37a318ddae31f0e0f76a3a0454eb591b6384,669708001,1,111669314712,Garment Upper body,Glittering/Metallic,0.00013213132973299927,9.566874301314256e-05,0.00018057948396843235,9.044493070751276e-05,0.00030023174367109284,0.00011113070662218216,0.0005652284660800525,0.00013479469936715705,0.0009337280634465284,0.00014291643780723584,0.0009337280634465284,0.00014261833384562132,1,3,3,5,9,9,3,6,6,12,20,20,2,3,3,4,4,4,2,3,3,7,8,8,1,69
000346516dd355b40badca0c0f5f37a318ddae31f0e0f76a3a0454eb591b6384,456163030,1,163208912804,Garment Upper body,Solid,5.578878366504414e-05,4.122741411616088e-05,6.973597958130517e-05,3.574033552151714e-05,0.00010203474907159388,3.811729550809147e-05,0.00020920793874391557,5.087245393317609e-05,0.0007347970059040683,0.00011378134932761402,0.0007347970059040683,0.00011354401713886918,1,3,3,5,9,9,3,6,6,12,20,20,2,3,3,4,4,4,2,3,3,7,8,8,1,69


In [0]:
# Persist the assembled training table. mode='overwrite' keeps this cell re-runnable:
# with the default save mode the write raises once the target path already exists.
labeled_features.write.parquet('gs://h-and-m-tx/features/labeled_features', mode='overwrite')

## Label-encode categoricals and sample customers to reduce the data size for processing
```

-- Export `h_and_m.labeled_features`, restricted to a random sample of customers
-- and extended with integer codes for customer_id, article_id and
-- product_group_name, as Parquet files.
EXPORT DATA OPTIONS (
    uri = 'gs://h-and-m-tx/features/label-bq/*.parquet',
    format = 'PARQUET',
    overwrite = false
) AS
WITH
-- integer code per distinct customer_id, assigned in ascending id order
ref_customer AS (
    SELECT
        customer_id,
        DENSE_RANK() OVER (ORDER BY customer_id) AS customer_id_labeled
    FROM (
        SELECT DISTINCT customer_id
        FROM `h_and_m.labeled_features`
    )
),
-- integer code per distinct article_id, assigned in ascending id order
ref_article AS (
    SELECT
        article_id,
        DENSE_RANK() OVER (ORDER BY article_id) AS article_id_labeled
    FROM (
        SELECT DISTINCT article_id
        FROM `h_and_m.labeled_features`
    )
),
-- integer code per distinct product_group_name, assigned in ascending name order
-- NOTE(review): graphical_appearance_name is not encoded here; confirm that is intended
ref_product_group_name AS (
    SELECT
        product_group_name,
        DENSE_RANK() OVER (ORDER BY product_group_name) AS product_group_name_labeled
    FROM (
        SELECT DISTINCT product_group_name
        FROM `h_and_m.labeled_features`
    )
),
/* https://stackoverflow.com/questions/23375456/random-sampling-in-google-bigquery */
-- each customer is kept with probability 50000 / (number of customers), so the
-- sample holds roughly 50,000 customers and differs from run to run
customer_sample AS (
    SELECT customer_id
    FROM `h_and_m.customers`
    WHERE RAND() < 50000 / (SELECT COUNT(*) FROM `h_and_m.customers`)
)

-- the inner join to customer_sample is what restricts the output to sampled customers
SELECT *
FROM `h_and_m.labeled_features`
LEFT JOIN ref_customer USING (customer_id)
LEFT JOIN ref_article USING (article_id)
LEFT JOIN ref_product_group_name USING (product_group_name)
INNER JOIN customer_sample USING (customer_id)
```