## H&M Personalized Fashion Recommendations

This notebook contains the approach taken for the 2022 H&M Personalized Fashion Recommendations Kaggle competition. 

*See the repository's README.md for instructions on how to run this notebook locally.*

Developed By **Jaileen Salazar**
_____

### Required Dependencies

In [1]:
from typing import Dict, Text

# Data Processing
import pandas as pd
import numpy as np
import csv

# Recommendation System
import tensorflow as tf
import tensorflow_recommenders as tfrs


### Constants

In [2]:
# --- Input file locations (relative paths) ---
TRANSACTIONS_PATH = '../data/transactions_train.csv'
CUSTOMER_PATH = '../data/customers.csv'
ARTICLES_PATH = '../data/articles.csv'
TEST_PATH = '../data/sample_submission.csv'

# --- Column names per file ---
TRANSACTIONS_HEADERS = [
    't_dat',
    'customer_id',
    'article_id',
    'price',
    'sales_channel_id',
]
ARTICLE_META_HEADERS = [
    'article_id',
    'product_code',
    'prod_name',
    'product_type_no',
    'product_type_name',
    'product_group_name',
    'graphical_appearance_no',
    'graphical_appearance_name',
    'colour_group_code',
    'colour_group_name',
    'perceived_colour_value_id',
    'perceived_colour_value_name',
    'perceived_colour_master_id',
    'perceived_colour_master_name',
    'department_no',
    'department_name',
    'index_code',
    'index_name',
    'index_group_no',
    'index_group_name',
    'section_no',
    'section_name',
    'garment_group_no',
    'garment_group_name',
    'detail_desc',
]
SUBMISSION_HEADERS = [
    'customer_id',
    'prediction',
]
CUSTOMER_META_HEADERS = [
    'customer_id',
    'fashion_news_frequency',
    'age',
    'postal_code',
]

# --- Model hyper-parameters ---
EMBEDDING_DIM = 32   # width of each embedding vector
EPOCHS = 3           # number of training passes
BATCH_SIZE = 100     # batch size used when batching the article candidates

### Utility Methods

In [3]:
def parse_data(filepath, headers=None):
    """
        Read a CSV file into a dataframe, filling customer defaults when applicable.

        Args:
            filepath: Path of the CSV file; its first row is used as the header.
            headers: Optional list of column names to load (forwarded to
                ``usecols``). ``None`` loads every column.

        Returns:
            The loaded dataframe. When ``filepath`` equals ``CUSTOMER_PATH``,
            missing values in ``fashion_news_frequency``, ``age`` and
            ``postal_code`` are replaced with ``'NONE'``, ``99`` and ``0``
            respectively, for whichever of those columns were loaded.
    """
    df_data = pd.read_csv(filepath, sep=',', header=0, usecols=headers)
    if filepath == CUSTOMER_PATH:
        customer_defaults = {
            'fashion_news_frequency': 'NONE',
            'age': 99,
            'postal_code': 0,
        }
        for column, default in customer_defaults.items():
            # `headers` may restrict the loaded columns; only fill the ones
            # that are present instead of raising KeyError on the others.
            if column in df_data.columns:
                df_data[column] = df_data[column].fillna(default)

    return df_data

def save_results(ids, predictions, filename, headers):
    """
        Write a header row followed by one (id, prediction) row per pair to a csv file.

        Args:
            ids: Iterable of values for the first column.
            predictions: Iterable of values for the second column; paired
                positionally with ``ids`` (pairing stops at the shorter one).
            filename: Destination path; the file is overwritten.
            headers: Sequence written as the first row.
    """
    paired_rows = zip(ids, predictions)
    with open(filename, 'w', encoding='UTF8', newline='') as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(headers)
        for row in paired_rows:
            csv_out.writerow(row)

### Fashion Recommendations Class

In [None]:
class FashionRecommendationsModel(tfrs.Model):
    """
        Retrieval model that embeds customers and articles with the supplied
        sub-models and delegates loss/metric computation to a task layer.
    """
    def __init__(self, user_model, product_model, retrieval_task=None):
        """
            Args:
                user_model: Model applied to the ``customer_id`` feature.
                product_model: Model applied to the ``article_id`` feature.
                retrieval_task: Optional task layer used to compute the loss.
                    When omitted, the module-level ``task`` object is used,
                    which must already be defined at construction time.
        """
        super().__init__()
        self.user_model: tf.keras.Model = user_model
        self.product_model: tf.keras.Model = product_model
        # Prefer an explicitly supplied task; fall back to the notebook-level
        # `task` so existing two-argument calls keep working.
        self.task: tf.keras.layers.Layer = retrieval_task if retrieval_task is not None else task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """
            Compute the task loss for one batch.

            Args:
                features: Mapping that provides the ``"customer_id"`` and
                    ``"article_id"`` entries for the batch.
                training: Accepted for interface compatibility; not used here.

            Returns:
                The value returned by the task layer for the embedded pairs.
        """
        # We pick out the user features and pass them into the user model.
        user_embeddings = self.user_model(features["customer_id"])
        # And pick out the product features and pass them into the product model,
        # getting embeddings back.
        positive_product_embeddings = self.product_model(features["article_id"])

        # The task computes the loss and the metrics.
        return self.task(user_embeddings, positive_product_embeddings)

### Parse and Prepare Data

In [None]:
# Transactions: for training time purposes only a recent window is kept.
# (Last transaction date is Sep 21 2020.)
transactions = parse_data(filepath=TRANSACTIONS_PATH, headers=TRANSACTIONS_HEADERS)
is_recent = transactions['t_dat'] >= '2020-09-01'
transactions = transactions[is_recent]

# Articles: a missing description becomes an empty string.
products = parse_data(filepath=ARTICLES_PATH, headers=ARTICLE_META_HEADERS)
products = products.assign(detail_desc=products['detail_desc'].fillna(''))

# Customers: every column is loaded.
# NOTE(review): CUSTOMER_META_HEADERS is defined but not passed here - confirm
# whether the extra columns are needed.
users = parse_data(filepath=CUSTOMER_PATH)

# Customer ids that predictions must be generated for.
test_ids = parse_data(filepath=TEST_PATH, headers=['customer_id'])

### Create training and testing datasets

In [None]:
# Chronological split on t_dat:
#   training -> rows dated up to and including 2020-09-14
#   testing  -> rows dated after 2020-09-14
split_date = '2020-09-14'
id_columns = ['customer_id', 'article_id']

transactions = transactions.sort_values(by=['t_dat'])
train_ds = transactions[transactions['t_dat'] <= split_date]
test_ds = transactions[transactions['t_dat'] > split_date]
print('Training Length: ', len(train_ds))
print('Testing Length: ', len(test_ds))

# Only the id columns, cast to strings, are fed to the model.
train_ids = train_ds[id_columns].astype(str)
test_pairs = test_ds[id_columns].astype(str)
train_ds = tf.data.Dataset.from_tensor_slices(dict(train_ids))
test_ds = tf.data.Dataset.from_tensor_slices(dict(test_pairs))

# De-duplicated id vocabularies for the lookup layers
user_ids = np.unique(users[['customer_id']].to_numpy())
product_ids = np.unique(products[['article_id']].to_numpy()).astype(str)

# Dataset of article ids (as strings) used as retrieval candidates
article_id_frame = products[['article_id']].astype(str)
products_ds = tf.data.Dataset.from_tensor_slices(dict(article_id_frame))
product_map = products_ds.map(lambda row: row['article_id'])

### Define Models

In [None]:
# User tower: map each customer id string to an integer index, then to an
# embedding vector. The embedding table has one extra row to account for
# unknown tokens.
user_lookup = tf.keras.layers.StringLookup(vocabulary=user_ids, mask_token=None)
user_embedding = tf.keras.layers.Embedding(len(user_ids) + 1, EMBEDDING_DIM)
user_model = tf.keras.Sequential([user_lookup, user_embedding])

# Product tower: same structure, keyed on article id strings.
product_lookup = tf.keras.layers.StringLookup(vocabulary=product_ids, mask_token=None)
product_embedding = tf.keras.layers.Embedding(len(product_ids) + 1, EMBEDDING_DIM)
product_model = tf.keras.Sequential([product_lookup, product_embedding])

### Define Metrics

In [None]:
# Metrics: the affinity score the model gives a positive (customer, article)
# pair is compared with its scores for all other candidate articles; the more
# often the positive pair ranks above the other candidates, the more accurate
# the model.
candidate_embeddings = product_map.batch(BATCH_SIZE).map(product_model)
metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)

# Loss: the retrieval task, evaluated with the metrics above
task = tfrs.tasks.Retrieval(metrics=metrics)

### Build Model

In [None]:
# Assemble the model from the two towers, then attach the optimizer.
rec_model = FashionRecommendationsModel(user_model, product_model)
adagrad = tf.keras.optimizers.Adagrad(learning_rate=0.1)
rec_model.compile(optimizer=adagrad)

### Fit and Evaluate Model

In [None]:
# Shuffle, batch and cache the training pairs; the held-out pairs are only
# batched and cached.
shuffled_train = train_ds.shuffle(100_000)
cached_train = shuffled_train.batch(8192).cache()
cached_test = test_ds.batch(4096).cache()

rec_model.fit(cached_train, epochs=EPOCHS)
# Last expression of the cell: the evaluation metrics dict is displayed.
rec_model.evaluate(cached_test, return_dict=True)

### Generate Predictions

In [None]:
# ScaNN used for its increased performance with large datasets.
# The index is built once and recommends out of every article in `products`:
# each batch of article ids is paired with the embeddings produced for it by
# the trained product model.
article_id_batches = product_map.batch(BATCH_SIZE)
retrieval_index = tfrs.layers.factorized_top_k.ScaNN(rec_model.user_model, k=12).index_from_dataset(
  tf.data.Dataset.zip((article_id_batches, article_id_batches.map(rec_model.product_model)))
)

# Get recommendations (the top 12 article ids for every submission customer)
_, raw_preds = retrieval_index(test_ids.customer_id.values)
product_predications = raw_preds.numpy().astype(str)
# Due to type conversions, product ids leading 0 were removed, need to be readded for submission
product_predications_formatted = [[y.zfill(10) for y in x] for x in product_predications]
# One space-separated string of article ids per customer
product_predications_series = pd.Series(map(' '.join, product_predications_formatted))
save_results(test_ids.customer_id.values, product_predications_series, filename='submission.csv', headers=SUBMISSION_HEADERS)
