In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import time
import pprint

%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import tensorflow as tf

import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

from typing import Dict, Text

from tqdm import tqdm
tqdm.pandas()

plt.style.use('seaborn-whitegrid')

tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)



### TO DO LIST:
* ~Exclude existing candidate on scann~
* Add gender feature (article gender and user gender)
https://github.com/Wp-Zhang/H-M-Fashion-RecSys
* 

## Data Preprocessing

#### Load dataset

In [218]:
# Set seed for reproducibility
seed = 17
tf.random.set_seed(seed)
np.random.seed(seed)

In [231]:
article_df = pd.read_csv('dataset/articles.csv')
article_df['article_id'] = article_df['article_id'].astype(str)
article_df['article_id'] = article_df['article_id'].apply(lambda x: x.zfill(10))
article_df.fillna(value='', inplace=True)

customer_df = pd.read_csv('dataset/customers.csv')
train_df = pd.read_csv('dataset/transactions_train.csv')

train_df['article_id'] = train_df['article_id'].astype(str)
train_df['article_id'] = train_df['article_id'].apply(lambda x: x.zfill(10))

submission = pd.read_csv('dataset/sample_submission.csv')

#### Get top-N articles in date range

In [232]:
N = 500

In [233]:
train_df['t_dat'] = pd.to_datetime(train_df['t_dat'])

In [234]:
train_df["season"] = ""
train_df.loc[train_df.t_dat.dt.month.between(1, 3), "season"] = "spring"
train_df.loc[train_df.t_dat.dt.month.between(4, 6), "season"] = "summer"
train_df.loc[train_df.t_dat.dt.month.between(7, 9), "season"] = "fall"
train_df.loc[train_df.t_dat.dt.month.between(10, 12), "season"] = "winter"

In [235]:
train_df["year"] = train_df.t_dat.dt.year
train_df["month"] = train_df.t_dat.dt.month
train_df["season_year"] = train_df["season"] + train_df["year"].astype(str)

In [236]:
transactions_fall2018 = train_df[train_df["season_year"] == "fall2018"]
transactions_fall2019 = train_df[train_df["season_year"] == "fall2019"]
transactions_fall2020 = train_df[train_df["season_year"] == "fall2020"]

In [237]:
def get_products_count_rank(df, N):
    products_list = df.groupby("article_id").size().to_frame("count").sort_values(by="count", ascending=False).\
    reset_index().head(N).article_id.to_list()
    return products_list

In [238]:
topN_fall2020 = get_products_count_rank(transactions_fall2020, N)

#### Filter transaction by top articles

In [239]:
train_df = transactions_fall2020[transactions_fall2020.article_id.isin(topN_fall2020)]

In [240]:
train_df.customer_id.nunique()

273341

In [205]:
train_customer = train_df.customer_id.unique()

#### Gender feature

In [245]:
def set_gender_flg(x):
            female_pro_types = [
                "Bra",
                "Underwear Tights",
                "Leggings/Tights",
                "Hair clip",
                "Hair string",
                "Hair/alice band",
                "Bikini top",
                "Skirt",
                "Dress",
                "Earring",
                "Alice band",
                "Straw hat",
                "Necklace",
                "Ballerinas",
                "Blouse",
                "Beanie",
                "Giftbox",
                "Pumps",
                "Bootie",
                "Heeled sandals",
                "Nipple covers",
                "Hair ties",
                "Underwear corset",
                "Bra extender",
                "Underdress",
                "Underwear set",
                "Sarong",
                "Leg warmers",
                "Hairband",
                "Tote bag",
                "Earrings",
                "Flat shoes",
                "Heels",
                "Cap",
                "Shoulder bag",
                "Headband",
                "Baby Bib",
                "Cross-body bag",
                "Bumbag",
            ]
            x["article_gender"] = 0  # * 0 for not divided, 1 for male, 2 for female
            if x["index_group_name"] == "Ladieswear":
                x["article_gender"] = 2
            elif x["index_group_name"] == "Menswear":
                x["article_gender"] = 1
            else:
                if (
                    "boy" in x["department_name"].lower()
                    or "men" in x["department_name"].lower()
                ):
                    x["article_gender"] = 1
                if (
                    "girl" in x["department_name"].lower()
                    or "ladies" in x["department_name"].lower()
                    or x["product_type_name"] in female_pro_types
                ):
                    x["article_gender"] = 2
            return x

In [246]:
article_df = article_df.progress_apply(set_gender_flg, axis=1)

100%|█████████████████████████████████| 105542/105542 [00:47<00:00, 2229.31it/s]


In [247]:
train_df = pd.merge(
            train_df,
            article_df[["article_id", "article_gender"]],
            on="article_id",
            how="left",
        )

In [249]:
user = customer_df[['customer_id']]

In [250]:
ttl_cnt = train_df.groupby(["customer_id"]).size().reset_index(name="ttl_cnt")
gender_sale = (
    train_df.groupby(["customer_id", "article_gender"])
    .size()
    .reset_index(name="cnt")
)
gender_sale = gender_sale.merge(ttl_cnt, on=["customer_id"], how="left")
gender_sale["ratio"] = gender_sale["cnt"] / gender_sale["ttl_cnt"]
gender_sale = pd.pivot_table(
    gender_sale, values="ratio", index="customer_id", columns=["article_gender"]
)
gender_sale = gender_sale.reset_index()
gender_sale["user_gender"] = 0
gender_sale.loc[gender_sale[1] >= 0.8, "user_gender"] = 1  # * male
gender_sale.loc[gender_sale[2] >= 0.8, "user_gender"] = 2  # * female
user = user.merge(
    gender_sale[["customer_id", "user_gender"]], on="customer_id", how="left"
)
user["user_gender"] = user["user_gender"].fillna(0)

In [253]:
train_df = pd.merge(
            train_df,
            user[["customer_id", "user_gender"]],
            on="customer_id",
            how="left",
                )

In [264]:
train_df.user_gender.value_counts()

2.0    483377
0.0    216611
1.0      5122
Name: user_gender, dtype: int64

In [260]:
train_df.astype({'user_gender': 'int64'}).dtypes

t_dat               datetime64[ns]
customer_id                 object
article_id                  object
price                      float64
sales_channel_id             int64
season                      object
year                         int64
month                        int64
season_year                 object
article_gender               int64
user_gender                  int64
dtype: object

In [492]:
train_df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,season,year,month,season_year,article_gender,user_gender,prod_name
0,1593562000.0,000ce5da167c6c8dfaea6dfc4b59a5ea3217630ec36cfc...,559616013,0.021593,2,fall,2020,7,fall2020,2,2.0,Timeless Push Triangle(1)
1,1593562000.0,000ce5da167c6c8dfaea6dfc4b59a5ea3217630ec36cfc...,559616013,0.025407,2,fall,2020,7,fall2020,2,2.0,Timeless Push Triangle(1)
2,1593562000.0,000ce5da167c6c8dfaea6dfc4b59a5ea3217630ec36cfc...,566140001,0.016932,2,fall,2020,7,fall2020,2,2.0,Timeless Tie Tanga
3,1593562000.0,000fb6e772c5d0023892065e659963da90b1866035558e...,907696002,0.016932,2,fall,2020,7,fall2020,2,2.0,Jinny smock top
4,1593562000.0,00263c043933b458dfdee2816b23387b1307fc39c8e0db...,610776074,0.008458,1,fall,2020,7,fall2020,2,0.0,Tilly (1)


#### Get top-12 articles for cold-start user

In [19]:
trans_sept2020 = transactions_fall2020[transactions_fall2020.month == 9]

In [21]:
top12_sept2020 = trans_sept2020.groupby("article_id").size().to_frame("trans_count").\
sort_values(by="trans_count", ascending=False).head(12).reset_index().article_id.tolist()

In [26]:
submission = pd.read_csv('dataset/sample_submission.csv')

In [30]:
submission["prediction"] = " ".join([x for x in top12_sept2020])

In [40]:
submission.head(1)

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0751471001 0909370001 0918522001 0924243001 09...


In [32]:
submission.to_csv("top12_sept2020.csv", index=False)

#### Use unix timestamp

In [266]:
train_df['t_dat'] = train_df['t_dat'].values.astype(int) / 10**9

In [269]:
article_map = dict(article_df[['article_id', 'prod_name']].values)

In [270]:
train_df['prod_name'] = train_df.article_id.map(article_map)

In [271]:
list(train_df.loc[train_df['customer_id'] == '000ce5da167c6c8dfaea6dfc4b59a5ea3217630ec36cfcdfac7615808334aea0'].article_id)

['0559616013', '0559616013', '0566140001', '0721990006', '0559616013']

In [493]:
articles = tf.data.Dataset.from_tensor_slices(dict(article_df)).map(lambda x: {
    'article_id' : x['article_id'],
    'article_gender' : x['article_gender'],
#     'prod_name' : x['prod_name'],
})

trans = tf.data.Dataset.from_tensor_slices(dict(train_df)).map(lambda x: {
    'customer_id' : x['customer_id'],
    'user_gender' : x['user_gender'],
    'article_id' : x['article_id'],
    'article_gender' : x['article_gender'],
#     'prod_name' : x['prod_name'],
#     't_dat' : x['t_dat'],
})

#### Features Preprocessing

In [481]:
# ### Timestamp bucket
# timestamps = np.concatenate(list(trans.map(lambda x: x["t_dat"]).batch(100)))

# max_timestamp = timestamps.max()
# min_timestamp = timestamps.min()

# timestamp_buckets = np.linspace(
#     min_timestamp, max_timestamp, num=1000,
# )

In [482]:
### Get article and customer vocabulary
article_ids = articles.map(lambda x: x['article_id']).batch(1_000)
unique_article_ids = np.unique(np.concatenate(list(article_ids)))

customer_ids = trans.map(lambda x: x['customer_id']).batch(1_000)
unique_customer_ids = np.unique(np.concatenate(list(customer_ids)))

## Two-tower Model

In [494]:
class CustomerModel(tf.keras.Model):
  def __init__(self, use_timestamps):
    super().__init__()

    self._use_timestamps = use_timestamps
    
    embedding_dim = 32
    
    self.user_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
            vocabulary=unique_customer_ids, mask_token=None),
        tf.keras.layers.Embedding(len(unique_customer_ids) + 1, embedding_dim),
    ])
    
    self.gender_embedding = tf.keras.Sequential([
        tf.keras.layers.Embedding(3 + 1, embedding_dim)
    ])

    if use_timestamps:
      self.timestamp_embedding = tf.keras.Sequential([
          tf.keras.layers.Discretization(timestamp_buckets.tolist()),
          tf.keras.layers.Embedding(len(timestamp_buckets) + 1, embedding_dim),
      ])
      self.normalized_timestamp = tf.keras.layers.Normalization(
          axis=None
      )

      self.normalized_timestamp.adapt(timestamps)

  def call(self, inputs):
    if not self._use_timestamps:
        return tf.concat([
            self.user_embedding(inputs["customer_id"]),
            self.gender_embedding(inputs["user_gender"]),
        ], axis=1)
#         return self.user_embedding(inputs["customer_id"])

    return tf.concat([
        self.user_embedding(inputs["customer_id"]),
        self.timestamp_embedding(inputs["t_dat"]),
        tf.reshape(self.normalized_timestamp(inputs["t_dat"]), (-1, 1)),
    ], axis=1)

In [503]:
class ArticleModel(tf.keras.Model):

  def __init__(self):
    super().__init__()

    max_tokens = 10_000
    
    embedding_dim = 32

    self.article_id_embedding = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_article_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_article_ids) + 1, embedding_dim)
    ])
    
    self.gender_embedding = tf.keras.Sequential([
        tf.keras.layers.Embedding(3 + 1, embedding_dim)
    ])

#     self.article_vectorizer = tf.keras.layers.TextVectorization(
#         max_tokens=max_tokens)

#     self.article_text_embedding = tf.keras.Sequential([
#       self.article_vectorizer,
#       tf.keras.layers.Embedding(max_tokens, embedding_dim, mask_zero=True),
#       tf.keras.layers.GlobalAveragePooling1D(),
#     ])

#     self.article_vectorizer.adapt(articles.map(lambda x: x["prod_name"]))

  def call(self, inputs):
    return tf.concat([
        self.article_id_embedding(inputs["article_id"]),
        self.gender_embedding(inputs["article_gender"]),
#         self.article_text_embedding(inputs["prod_name"]),
    ], axis=1)

In [504]:
class HNMModel(tfrs.models.Model):

  def __init__(self, use_timestamps):
    super().__init__()
    self.query_model = tf.keras.Sequential([
      CustomerModel(use_timestamps),
      tf.keras.layers.Dense(32)
    ])
    self.candidate_model = tf.keras.Sequential([
      ArticleModel(),
      tf.keras.layers.Dense(32)
    ])

    self.task = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=articles.batch(128).map(self.candidate_model),
        ),
    )

  def compute_loss(self, features, training=False):
    query_embedding = self.query_model({
        "customer_id": features["customer_id"],
        "user_gender": features["user_gender"],
    })
    candidate_embedding = self.candidate_model({
        "article_id": features["article_id"],
        "article_gender": features["article_gender"],
#         "prod_name": features["prod_name"],
    })

    return self.task(query_embedding, candidate_embedding, compute_metrics=not training)

## Model Training

In [505]:
EPOCHS = 1
EVAL_FREQ = EPOCHS
INIT_EPOCH = 0  # set to latest epoch when resuming from checkpoint

In [506]:
train_n = int(0.8 * train_df.shape[0])
test_n = int(0.2 * train_df.shape[0])

In [507]:
train_df.customer_id.nunique()

273341

In [508]:
shuffled = trans.shuffle(train_df.shape[0], seed=seed, reshuffle_each_iteration=False)

train = shuffled.take(train_n)
test = shuffled.skip(train_n).take(test_n)

cached_train = train.shuffle(train_n).batch(2048)
cached_test = test.batch(4096).cache()

In [509]:
checkpoint_path = "checkpoints/vanilla_1_random/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
    period=EVAL_FREQ)

In [510]:
# Load model
model = HNMModel(use_timestamps=False)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

if INIT_EPOCH > 0:
    # Load trained model weights
    latest = tf.train.latest_checkpoint(checkpoint_dir)
    model.load_weights(latest)

wo_timestamp_hist = model.fit(
    cached_train,
#     validation_data=cached_test,
#     validation_freq=EVAL_FREQ,
    epochs=EPOCHS,
    initial_epoch=INIT_EPOCH,
    callbacks=[cp_callback])

Epoch 1: saving model to checkpoints/vanilla_1_random/cp-0001.ckpt


In [296]:
# result = model.evaluate(cached_test, return_dict=True)
# result

## Get Recommendation

In [409]:
# ### Simple query testing
# index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)

# identifiers = articles.batch(100).map(lambda x: x['article_id'])
# candidates = articles.batch(100).map(lambda x: model.candidate_model({
#         'article_id': x['article_id'],
#     }))

# index.index_from_dataset(tf.data.Dataset.zip((identifiers, candidates)))

# # test_query = dict(train_df[['customer_id',
# #                            't_dat',
# #                         ]].iloc[0].map(lambda x: tf.expand_dims(x, axis=0)))

# test_query = {'customer_id' : tf.constant(['ffffd9ac14e89946416d80e791d064701994755c3ab686a1eaf3458c36f52241'])}

# _, titles = index(test_query, k=12)
# print(f"Top 12 recommendations for user 40: {titles}")

Top 12 recommendations for user 40: [[b'0874754002' b'0556539001' b'0865929003' b'0903773001' b'0865917002'
  b'0278811006' b'0778534004' b'0878013001' b'0873678003' b'0751592001'
  b'0877273001' b'0863646004']]


In [511]:
identifiers = articles.batch(100).map(lambda x: x['article_id'])

candidates = articles.batch(100).map(lambda x: model.candidate_model({
        'article_id': x['article_id'],
        'article_gender' : x['article_gender'],
    }))

In [512]:
brute_force = tfrs.layers.factorized_top_k.BruteForce(model.query_model)
brute_force.index_from_dataset(tf.data.Dataset.zip((identifiers, candidates)))

scann = tfrs.layers.factorized_top_k.ScaNN(model.query_model, k=12)
scann.index_from_dataset(tf.data.Dataset.zip((identifiers, candidates)))



<tensorflow_recommenders.layers.factorized_top_k.ScaNN at 0x7fe28dc3d370>

In [532]:
def get_rec(df):
    MODE = 'bf'
    EXCLUDE = False
    COLD_START = False

    SERVING = {'bf' : brute_force, 'scann' : scann}

    query = {
        'customer_id' : tf.constant([df['customer_id']]),
        'user_gender' : tf.constant([df['user_gender']])
    }

    if COLD_START:
        if not df['in_train']:
            article_str = " ".join([str(x) for x in top12_sept2020])
        else:
            if not(EXCLUDE):
                _, recs = SERVING[MODE](query, k=12)
            else:
                _, recs = SERVING[MODE].query_with_exclusions(query, k=12, exclusions=tf.constant([known_articles]))
            article_str = " ".join([str(x) for x in recs.numpy()[0].astype(str)])
    else:
        if not(EXCLUDE):
                _, recs = SERVING[MODE](query, k=12)
        else:
            _, recs = SERVING[MODE].query_with_exclusions(query, k=12, exclusions=tf.constant([known_articles]))
        article_str = " ".join([str(x) for x in recs.numpy()[0].astype(str)])
    return article_str

#### Generate customer-known articles pairs

In [514]:
t_df = train_df[['customer_id', 'article_id']]

In [515]:
t_df.shape[0]

705110

In [516]:
known_df = t_df.groupby('customer_id')['article_id'].apply(list).reset_index(name='known_articles')

In [517]:
known_df.head()

Unnamed: 0,customer_id,known_articles
0,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[0896152002, 0730683050, 0791587015]"
1,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,[0448509014]
2,0000945f66de1a11d9447609b8b41b1bc987ba185a5496...,[0760084003]
3,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,"[0884319008, 0921226001, 0706016001, 0516859008]"
4,0000b2f1829e23b24feec422ef13df3ccedaedc85368e6...,"[0706016038, 0914441005, 0706016015]"


#### If customer in train

In [533]:
pred_df = customer_df[['customer_id']]

In [534]:
pred_df.loc[pred_df['customer_id'].isin(train_customer), 'in_train'] = True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


#### Generate submission

In [535]:
# ### Merge sample submission and known articles
# pred_df = pred_df.merge(known_df, how='left', on='customer_id')

In [536]:
pred_df = pd.merge(
            pred_df,
            user[["customer_id", "user_gender"]],
            on="customer_id",
            how="left",
                )

In [537]:
pred_df.in_train.fillna(value=False, inplace=True)

In [538]:
pred_df.head()

Unnamed: 0,customer_id,in_train,user_gender
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,False,0.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,False,0.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,False,0.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,False,0.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,True,2.0


In [539]:
pred_df.in_train.value_counts()

False    1098639
True      273341
Name: in_train, dtype: int64

In [540]:
pred_df['prediction'] = pred_df.progress_apply(get_rec, axis=1)

100%|██████████████████████████████| 1371980/1371980 [1:35:11<00:00, 240.20it/s]


In [547]:
pred_df.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0640021019 0640021011 0448509014 0914441004 08...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0640021019 0640021011 0448509014 0914441004 08...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0640021019 0640021011 0448509014 0914441004 08...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0640021019 0640021011 0448509014 0914441004 08...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0896152001 0896152002 0925404001 0910601001 08...


In [542]:
top12_sept2020

['0751471001',
 '0909370001',
 '0918522001',
 '0924243001',
 '0918292001',
 '0915526001',
 '0448509014',
 '0915529003',
 '0751471043',
 '0706016001',
 '0865799006',
 '0863595006']

In [543]:
pred_df.drop(columns=['in_train', 'user_gender'], inplace=True)

In [548]:
sub_path = 'tfrs_fall2020_top500_bf_4.csv'

In [549]:
pred_df.to_csv(sub_path, index=False)

In [553]:
msg = "'users fall 2020; top 500 articles fall 2020; brute force; feature : article_id, cust_id, user_gender, article_gender; cold-start : top12_sept2020.'"

In [556]:
!kaggle competitions submit -c h-and-m-personalized-fashion-recommendations -f {sub_path} -m {msg}

100%|████████████████████████████████████████| 258M/258M [00:28<00:00, 9.64MB/s]
Successfully submitted to H&M Personalized Fashion Recommendations