# DataLab Cup 4: Recommender Systems

Team name: 窩不知道誒

Team members: 112501533 黃思誠 112065527 劉承瑋

## Prepare environment

In [11]:
import os
import pickle
import random
from dataclasses import dataclass
from pathlib import Path

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import numpy as np
import pandas as pd
import tensorflow as tf
from evaluation.environment import TestingEnvironment, TrainingEnvironment
from sentence_transformers import SentenceTransformer
from tensorflow import keras
from tensorflow.keras.initializers import RandomNormal
from tensorflow.keras.layers import Add, Dot, Embedding, Flatten, Input
from tensorflow.keras.regularizers import L2
from tqdm.auto import tqdm, trange


In [12]:
# Check GPU: enable on-demand memory allocation and pin TensorFlow to one device.
gpus = tf.config.experimental.list_physical_devices("GPU")
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Use the first GPU that TensorFlow can see. CUDA_VISIBLE_DEVICES is set
        # to "1" before TensorFlow is imported, so index 0 here is presumably
        # physical GPU 1 -- confirm on the target machine.
        tf.config.experimental.set_visible_devices(gpus[0], "GPU")
        logical_gpus = tf.config.experimental.list_logical_devices("GPU")
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)


1 Physical GPUs, 1 Logical GPUs


In [36]:
# Fixed parameters
@dataclass
class ConstParams:
    # Values are read through class-attribute access (e.g. ``ConstParams.N_ITEMS``);
    # the visible code never instantiates this class.
    N_TRAIN_USERS: int = 1000  # passed as the user count when FunkSVD is built
    N_TEST_USERS: int = 2000  # not referenced in the visible code; presumably the test-environment user count
    N_ITEMS: int = 209527  # item count for FunkSVD and exclusive upper bound for negative sampling
    HORIZON: int = 2000  # not referenced in the visible code -- TODO confirm meaning
    TEST_EPISODES: int = 5  # number of repetitions of the testing loop
    SLATE_SIZE: int = 5  # number of items recommended per environment step


@dataclass
class HParams:
    # Tunable hyper-parameters, read through class-attribute access.
    # NOTE: NUM_EPOSIDES and N_NEGTIVES are misspelled but are referenced under
    # these exact names elsewhere in the file, so they must not be renamed here alone.
    EMBED_SIZE: int = 128  # number of latent factors passed to FunkSVD
    BATCH_SIZE: int = 128  # batch size handed to the dataset generator
    RANDOM_STATE: int = 42  # seed given to ``random.seed``
    NUM_EPOCHS: int = 1  # passes over the dataset inside each ``train`` call
    NUM_EPOSIDES: int = 5  # train-then-explore rounds in the training loop
    N_NEGTIVES: int = 8  # random negative items drawn per positive batch / per click update


@dataclass
class Paths:
    # File-system locations used by the notebook, read through class-attribute access.
    USER_DATA: Path = Path("./dataset/user_data.json")  # JSON-lines user file (read with lines=True)
    ITEM_DATA: Path = Path("./dataset/item_data.json")  # JSON-lines item file (read with lines=True)
    OUTPUT: Path = Path("./output/output.csv")  # per-user average score written by the testing cell
    CHECKPOINT_DIR: Path = Path("./checkpoint")  # root directory for model checkpoints
    TOKEN_PATH: Path = Path("./dataset/item_token.pkl")  # passed to DataManager, which does not read it
    EMBEDDING_PATH: Path = Path("./dataset/item_to_embedding.pkl")  # pickled item sentence embeddings
    USER_DATA_PLUS: Path = Path("./dataset/user_data_plus.pkl")  # user histories extended with collected clicks
    SIMILARITY_PATH: Path = Path("./dataset/similarity_items.pkl")  # pickled item-to-similar-items cache


In [14]:
# Seed Python's `random` module, which drives the history/slate sampling.
# NOTE: only `random` is seeded here; TensorFlow's random ops are not.
random.seed(HParams.RANDOM_STATE)

# Create every working directory up front. `parents=True` builds nested paths in
# a single call and `exist_ok=True` makes re-running the cell a no-op, so no
# separate existence check (and no check-then-create race) is needed.
for _directory in (
    "./dataset",
    "./checkpoint",
    "./output",
    "./checkpoint/FunkSVD",
    "./checkpoint/FunkSVD/best",
):
    Path(_directory).mkdir(parents=True, exist_ok=True)


## Data preprocess

### Create text embeddings

由於後續有使用 Content-based 的推薦系統，因此我們先將 item_data 中的 headline、short-description 相連並使用 Sentence transformer 將其轉換為 Embeddings。

In [15]:
# Build the item sentence-embedding table once and cache it on disk; later runs
# find the pickle and skip this cell's work entirely.
if not Paths.EMBEDDING_PATH.exists():
    embedder = SentenceTransformer("paraphrase-MiniLM-L6-v2")
    df_item = pd.read_json(Paths.ITEM_DATA, lines=True)
    # One text per item: the headline followed by the short description.
    sentences = df_item["headline"] + " " + df_item["short_description"]

    # Encode every sentence in one batched call instead of one model call per
    # row. `encode` on a list returns a 2-D array with one row per sentence,
    # which becomes one DataFrame row per item (row position == item index).
    embeddings = pd.DataFrame(embedder.encode(sentences.tolist()))
    embeddings.to_pickle(Paths.EMBEDDING_PATH)


### Define dataset generator, data manager and history manager

**Data manager:**
- 由於資料量過少，收集更多 User click 的資料
- 每次都重新計算 Cosine similarity 太花時間，因此每計算一次就紀錄該 Item 對其他 Item 的 Cosine similarity 排序（前 100 名）來提昇後續訓練速度
- 在 Function 之間傳輸各 Dataframe

**History:**
- 用於紀錄每一 Epoch Test 過程中的 User click 資料，會在每次 Epoch 之後 Reset

**Dataset generator:**
- 將前面處理的 Data 轉換成 Tensorflow 的 Dataset


In [16]:
class DataManager:
    """Hold the positive (user, item) pairs, item embeddings and a similarity cache.

    Attributes:
        num_users: Number of rows in the user file.
        num_items: Number of rows in the item file.
        pairs: Set of row tuples from the exploded user table; presumably
            ``(user_id, item_id)`` when the file has only the ``user_id`` and
            ``history`` columns -- verify against the data file.
        item_to_embedding: Object unpickled from ``embedding_path``.
        pos_item_sets: List where entry ``i`` is the set of items in the
            ``history`` of the ``i``-th user row. It is indexed with ``user_id``,
            so it assumes user ids equal row positions -- TODO confirm.
        similarity_items: Cache mapping an item id to the first 100 ids of a
            ranking supplied through ``add_top100_items``.
    """

    def __init__(self, user_path, item_path, token_path, embedding_path):
        """Load the user and item tables and the pickled embeddings.

        Args:
            user_path: JSON-lines file with a ``history`` column of item lists.
            item_path: JSON-lines item file; only its row count is used.
            token_path: Unused; kept so existing callers keep working.
            embedding_path: Pickle holding the item embedding table.
        """
        df_user = pd.read_json(user_path, lines=True)
        df_item = pd.read_json(item_path, lines=True)

        self.num_users = len(df_user)
        self.num_items = len(df_item)

        # One tuple per (row, history element) after exploding the history lists.
        self.pairs = set(df_user.explode("history").itertuples(index=False, name=None))
        self.item_to_embedding = pd.read_pickle(embedding_path)

        self.pos_item_sets = df_user["history"].apply(set).to_list()
        self.similarity_items = {}

    def add(self, user_id, item_id):
        """Record that ``user_id`` clicked ``item_id``."""
        self.pairs.add((user_id, item_id))
        self.pos_item_sets[user_id].add(item_id)

    def remove(self, user_id, item_id):
        """Forget a recorded click.

        Raises:
            KeyError: If ``(user_id, item_id)`` is not in ``pairs``.
        """
        self.pairs.remove((user_id, item_id))
        self.pos_item_sets[user_id].discard(item_id)

    def get_sequences(self):
        """Return the recorded pairs as a list (order is arbitrary)."""
        return list(self.pairs)

    def save(self, user_plus_path, similarity_path):
        """Pickle the per-user item sets and the similarity cache.

        Args:
            user_plus_path: Destination of the user/history DataFrame pickle.
            similarity_path: Destination of the similarity cache pickle
                (``str`` or ``Path``).
        """
        df_user = pd.DataFrame({
            "user_id": range(self.num_users),
            "history": self.pos_item_sets,
        })
        df_user.to_pickle(user_plus_path)

        # Wrap in Path() so plain strings are accepted, matching the
        # Path(...) wrapping that load() applies before its existence checks.
        with Path(similarity_path).open("wb") as f:
            pickle.dump(self.similarity_items, f)

    def load(self, user_plus_path, similarity_path):
        """Restore state written by ``save``; a missing file is skipped silently.

        Args:
            user_plus_path: Pickle written by ``save`` with the user histories.
            similarity_path: Pickle written by ``save`` with the similarity cache.
        """
        if Path(user_plus_path).exists():
            df_user = pd.read_pickle(user_plus_path)
            self.pairs = set(
                df_user.explode("history").itertuples(index=False, name=None)
            )
            # Normalise to sets (as __init__ does) so add()/remove() keep working
            # even if the pickle stored the histories as lists.
            self.pos_item_sets = df_user["history"].apply(set).to_list()

        similarity_file = Path(similarity_path)
        if similarity_file.exists():
            # NOTE(security): pickle can execute arbitrary code on load; only
            # point this at files produced by save().
            with similarity_file.open("rb") as f:
                self.similarity_items = pickle.load(f)

    def add_top100_items(self, item_id, sort_item_ids):
        """Cache the first 100 entries of ``sort_item_ids`` under ``item_id``."""
        self.similarity_items[item_id] = sort_item_ids[:100]


class History:
    """Per-user click histories with a working copy that can be reset.

    Attributes:
        init_histories: Series indexed by ``user_id`` holding each user's
            baseline item list.
        curr_histories: Working copy that ``add`` appends to.
    """

    def __init__(self, user_path):
        """Load baseline histories from a JSON-lines user file.

        Args:
            user_path: File with ``user_id`` and ``history`` columns.
        """
        df_user = pd.read_json(user_path, lines=True)
        self.init_histories = df_user.set_index("user_id")["history"]
        self.curr_histories = self._snapshot()

    def _snapshot(self):
        """Return a copy of the baseline whose per-user lists are new objects."""
        # Series.copy() duplicates the Series but keeps references to the same
        # list objects, so appending to the working copy would also change the
        # baseline and reset() could never undo a click. Rebuild each list.
        return self.init_histories.apply(list)

    def reset(self):
        """Discard every click added since construction or the last reset."""
        self.curr_histories = self._snapshot()

    def add(self, user_id, item_id):
        """Append ``item_id`` to the working history of ``user_id``."""
        self.curr_histories.loc[user_id].append(item_id)

    def get(self, user_id):
        """Return the working history list of ``user_id``."""
        return self.curr_histories.loc[user_id]

    def update_init(self, sequence):
        """Replace the baseline with histories grouped from ``(user_id, item)`` pairs.

        The working copy is left untouched until ``reset`` is called.

        Args:
            sequence: Iterable of two-element ``(user_id, item_id)`` records.
        """
        self.init_histories = (
            pd.DataFrame(sequence, columns=["user_id", "history"])
            .groupby("user_id")["history"]
            .apply(list)
        )


class LabelDatasetGenerator:
    """Turn positive (user, item) pairs into a shuffled, batched tf.data pipeline."""

    def __init__(self, user_item_pairs):
        """Store the pairs in ``df_seq`` and tag every row with label 1."""
        frame = pd.DataFrame(user_item_pairs, columns=["user_id", "item_id"])
        frame["label"] = 1
        self.df_seq = frame

    def __call__(self, batch_size):
        """Build the dataset of ``(user_id, item_id, label)`` batches.

        Incomplete trailing batches are dropped; the shuffle buffer holds ten
        batches' worth of rows.
        """
        # One integer tensor per column, in (user, item, label) order.
        columns = tuple(
            tf.convert_to_tensor(self.df_seq[field].to_numpy(dtype=int))
            for field in ("user_id", "item_id", "label")
        )
        pipeline = tf.data.Dataset.from_tensor_slices(columns)
        pipeline = pipeline.shuffle(buffer_size=batch_size * 10)
        pipeline = pipeline.batch(
            batch_size, drop_remainder=True, num_parallel_calls=tf.data.AUTOTUNE
        )
        return pipeline.prefetch(tf.data.AUTOTUNE)


## Define model

我們這次分別嘗試了數種推薦系統：
- FunkSVD
- NeuMF
- Factorization Machine
- Content-based

不過最後是採用了 Content-based + FunkSVD 兩個相對簡單的推薦系統的組合。其中 FunkSVD 我們加上了 User bias 及 Item bias 來提昇預測準確度。而 Content-based 就是通過 Item 之間的 Embedding 的 Cosine similarity 來找最接近的 Item。

In [17]:
class FunkSVD(tf.keras.Model):
    """
    Simplified Funk-SVD recommender model with user and item bias terms.

    The score for a (user, item) pair is the dot product of their latent
    vectors plus a per-user and a per-item bias. No activation is applied, so
    the output is a raw score.
    """

    def __init__(self, num_factors, num_users, num_items, l2_lambda=0.1, **kwargs):
        """
        Constructor of the model

        Args:
            num_factors: Size of the user and item latent vectors.
            num_users: Number of rows in the user embedding tables.
            num_items: Number of rows in the item embedding tables.
            l2_lambda: L2 regularisation factor applied to every embedding table.
            **kwargs: Forwarded to ``tf.keras.Model``.
        """
        super().__init__(**kwargs)

        self.num_users = num_users
        self.num_items = num_items

        # Input: one integer id per sample for each of user and item
        user_id = Input(shape=(1,), dtype=tf.int32)
        item_id = Input(shape=(1,), dtype=tf.int32)

        # Embedding: latent vectors looked up by id
        vec_user = Embedding(
            num_users,
            num_factors,
            embeddings_initializer=RandomNormal(),
            embeddings_regularizer=L2(l2_lambda),
        )(user_id)
        vec_item = Embedding(
            num_items,
            num_factors,
            embeddings_initializer=RandomNormal(),
            embeddings_regularizer=L2(l2_lambda),
        )(item_id)
        # Dot product of the user and item vectors. Add() receives a single
        # tensor here, and the reduce_sum runs over what appears to be an axis
        # of size 1, so both look like pass-throughs -- TODO confirm.
        embeddings = Add()([
            tf.reduce_sum(Dot(axes=2)([vec_user, vec_item]), axis=2, keepdims=True),
        ])

        # Bias: one learned scalar per user and per item
        b_user = Embedding(
            num_users,
            1,
            embeddings_initializer=RandomNormal(),
            embeddings_regularizer=L2(l2_lambda),
        )(user_id)
        b_item = Embedding(
            num_items,
            1,
            embeddings_initializer=RandomNormal(),
            embeddings_regularizer=L2(l2_lambda),
        )(item_id)
        biases = Add()([
            b_user,
            b_item,
        ])

        # Output: interaction term plus biases, flattened
        output = Add()([embeddings, biases])
        output = Flatten()(output)

        # The functional graph is wrapped in an inner Keras model that call() delegates to.
        self.model = keras.Model(
            inputs=(
                user_id,
                item_id,
            ),
            outputs=output,
        )

    @tf.function
    def call(self, inputs) -> tf.Tensor:
        """Score the ``(user_ids, item_ids)`` pair of tensors in ``inputs``."""
        return self.model(inputs)

    @tf.function
    def train_step(self, inputs: tuple) -> tf.Tensor:
        """Run one optimisation step and return the loss.

        Args:
            inputs: ``(user_ids, item_ids, y_trues)``.

        Uses ``self.loss`` and ``self.optimizer``, so the model has to be
        compiled before this is called.
        """
        user_ids, item_ids, y_trues = inputs

        # compute loss
        with tf.GradientTape() as tape:
            y_preds = self.call((user_ids, item_ids))
            loss = self.loss(y_trues, y_preds)

        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        return loss

    def get_topk(self, user_id, k=5) -> list:
        """Return the ids of the ``k`` highest-scoring items for ``user_id``.

        Every one of the ``num_items`` items is scored on each call; the result
        is a plain Python list, best item first.
        """
        # Pair the same user with every item id 0..num_items-1.
        user_ids = tf.repeat(tf.constant(user_id), self.num_items)
        item_ids = tf.range(self.num_items)
        rank_list = tf.squeeze(self.call((user_ids, item_ids)))
        return tf.math.top_k(rank_list, k=k).indices.numpy().tolist()


def get_content_topk(data_manager, clicked_id, k=2, choose_self=True):
    """Return the ``k`` items whose embeddings are most similar to ``clicked_id``.

    Args:
        data_manager: Object providing ``similarity_items`` (cache dict),
            ``item_to_embedding`` (2-D table, one row per item) and
            ``add_top100_items``.
        clicked_id: Row position of the reference item in the embedding table.
        k: Number of item ids to return.
        choose_self: When False, the first entry of the ranking is skipped; the
            first entry is presumably the reference item itself.

    Returns:
        A list of item ids, most similar first. Cached rankings keep only the
        first 100 ids, so a cache hit can yield fewer than ``k`` ids if
        ``k`` (plus the skipped entry) exceeds 100.
    """
    start = 0 if choose_self else 1
    if clicked_id in data_manager.similarity_items:
        return data_manager.similarity_items[clicked_id][start : start + k]

    item_to_embedding = data_manager.item_to_embedding
    # Take the sizes from the table itself instead of assuming a fixed
    # embedding width, so a different sentence encoder works unchanged.
    n_items, embed_dim = item_to_embedding.shape
    # The Keras cosine-similarity *loss* returns the negated similarity, so an
    # ascending argsort places the most similar items first.
    scores = tf.losses.CosineSimilarity(reduction="none")(
        tf.repeat(
            tf.constant(item_to_embedding.iloc[clicked_id], shape=(1, embed_dim)),
            n_items,
            axis=0,
        ),
        tf.constant(item_to_embedding),
    )

    sort_items = tf.argsort(scores).numpy().tolist()

    # Remember the head of the ranking so the next lookup skips the computation.
    data_manager.add_top100_items(clicked_id, sort_items)
    return sort_items[start : start + k]


## Define train, explore, update function

我們將訓練過程分成兩個部份：
1. Train: 
    - 每次 Explore 前會進行數個 Epoch 的 Training，用以讓模型再學習目前全部資料（原本 + 收集）
2. Explore: 
    - 取得 FunkSVD top-2 及 Content-based top-3 的 Item 作為 Slate，其中 Content-based 的方式會是通過隨機選取過去 User click 過的 Item 來作為搜尋的基準
    - 如果 User click 那就紀錄並通過 Clicked item 去 Update model

In [18]:
def train(model, dataset, n_neg=14):
    """Train ``model`` on ``dataset`` for ``HParams.NUM_EPOCHS`` epochs.

    Each batch of positives is followed by ``n_neg`` batches that pair the same
    users with uniformly sampled item ids and a label of zero.

    Args:
        model: Object exposing ``train_step((user_ids, item_ids, labels))``.
        dataset: Iterable of ``(user_ids, pos_item_ids, labels)`` batches.
        n_neg: Number of negative batches per positive batch.

    Returns:
        ``(model, mean_loss)`` where ``mean_loss`` averages the per-epoch losses.
    """
    per_epoch = []

    progress = trange(HParams.NUM_EPOCHS, desc="Training", ncols=0)
    for _ in progress:
        per_batch = []

        for user_ids, pos_item_ids, labels in dataset:
            n_rows = len(user_ids)

            # Positive step first; its loss opens the list for this batch.
            step_losses = [model.train_step((user_ids, pos_item_ids, labels))]

            # One row of random item ids per negative step.
            sampled = tf.random.uniform(
                shape=(n_neg, n_rows),
                minval=0,
                maxval=ConstParams.N_ITEMS,
                dtype=tf.int32,
            )
            for negative_row in sampled:
                negative_batch = (
                    tf.constant(user_ids),
                    tf.constant(negative_row),
                    tf.zeros(n_rows),
                )
                step_losses.append(model.train_step(negative_batch))

            per_batch.append(tf.reduce_mean(step_losses).numpy())

        per_epoch.append(np.mean(per_batch))
        progress.set_postfix({"loss": per_epoch[-1]})

    mean_loss = np.mean(per_epoch)
    progress.set_postfix({"loss": mean_loss}, refresh=True)

    return model, mean_loss


def update(model, user_id, clicked_id):
    """Apply one online update for a click: one positive step, one negative step.

    Args:
        model: Object exposing ``train_step((user_ids, item_ids, labels))``.
        user_id: Id of the user who clicked.
        clicked_id: Id of the clicked item.

    Returns:
        The same ``model`` object.
    """
    n_neg = HParams.N_NEGTIVES

    # Positive sample: the observed click with label 1.
    positive_batch = (
        tf.convert_to_tensor([[user_id]]),
        tf.convert_to_tensor([[clicked_id]]),
        tf.ones(1),
    )
    model.train_step(positive_batch)

    # Negative samples: uniformly drawn item ids for the same user, label 0.
    sampled_items = tf.random.uniform(
        shape=(n_neg,),
        minval=0,
        maxval=ConstParams.N_ITEMS,
        dtype=tf.int32,
    )
    negative_batch = (
        tf.repeat(user_id, n_neg),
        sampled_items,
        tf.zeros(n_neg),
    )
    model.train_step(negative_batch)

    return model


# Explore pipeline
def explore(env, model, data_manager, slate_size=5):
    """Interact with ``env`` until it has no next state, learning from clicks.

    For each state a slate is built from the model's top-2 items and the 3
    items most similar to a randomly chosen item from the user's positive set.
    Duplicates are removed and the slate is topped up with random picks from
    the model's top-10. Each click is added to ``data_manager`` and used for an
    immediate model update.

    Args:
        env: Environment with ``has_next_state``, ``get_state`` and
            ``get_response``.
        model: Recommender exposing ``get_topk`` (and usable with ``update``).
        data_manager: Holder of ``pos_item_sets`` and the similarity cache.
        slate_size: Number of distinct items to send to the environment.

    Returns:
        ``(model, hit_count)`` where ``hit_count`` is the number of clicks.
    """
    hit_count = 0
    # Context manager so the progress bar is always closed, matching how the
    # testing loop uses tqdm.
    with tqdm(desc="Explore") as pbar:
        while env.has_next_state():
            user_id = env.get_state()
            random_pos_item_id = random.choice(
                tuple(data_manager.pos_item_sets[user_id])
            )
            coll_slate = model.get_topk(user_id, 2)
            cont_slate = get_content_topk(data_manager, random_pos_item_id, 3, False)
            # np.unique removes overlap between the two sources (and sorts the ids).
            slate = np.unique(coll_slate + cont_slate).tolist()

            if len(slate) < slate_size:
                # The model does not change while the slate is being filled, so
                # score the candidates once instead of once per retry.
                candidates = model.get_topk(user_id, 10)
                while len(slate) < slate_size:
                    slate = np.unique(
                        slate + random.sample(candidates, slate_size - len(slate))
                    ).tolist()

            clicked_id, _ = env.get_response(slate)

            # -1 signals that nothing in the slate was clicked.
            if clicked_id != -1:
                hit_count += 1
                data_manager.add(user_id, clicked_id)
                model = update(model, user_id, clicked_id)

            pbar.update(1)
            pbar.set_postfix({"#click": hit_count})

    return model, hit_count


## Training

Model 的 Optimizer 我們選擇使用 AdamW 並加上其 EMA 的功能來提昇表現，而 Loss function 則是使用 Binary Focal Cross-entropy。其中 Label smoothing 及 Apply class balancing 都會讓 Model 的表現更好。

除此之外，為了避免只拿 Positive sample 訓練可能會表現不好的問題，每訓練一筆 Positive sample 都會從所有 Item 中隨機抽幾個作為 Negative samples 來平衡。

*結果只是示意，非最終訓練過程

In [35]:
# Build the data manager from the raw user/item files, then overlay any
# previously saved click data and similarity cache (missing files are skipped).
data_manager = DataManager(
    Paths.USER_DATA, Paths.ITEM_DATA, Paths.TOKEN_PATH, Paths.EMBEDDING_PATH
)
data_manager.load(Paths.USER_DATA_PLUS, Paths.SIMILARITY_PATH)

# The model is sized for ConstParams.N_TRAIN_USERS users and N_ITEMS items.
model = FunkSVD(
    HParams.EMBED_SIZE,
    ConstParams.N_TRAIN_USERS,
    ConstParams.N_ITEMS,
    l2_lambda=0.005,
)
# from_logits=True because FunkSVD outputs a raw score with no activation.
model.compile(
    optimizer=tf.keras.optimizers.AdamW(
        learning_rate=0.0005, weight_decay=0.0004, use_ema=True
    ),
    loss=tf.keras.losses.BinaryFocalCrossentropy(
        apply_class_balancing=True, from_logits=True, label_smoothing=0.5
    ),
)

# Both managers wrap the same Checkpoint object: one keeps the five most recent
# saves, the other keeps only the best-scoring one. They presumably share the
# checkpoint's save counter, which would explain non-consecutive ckpt numbers.
checkpoint = tf.train.Checkpoint(model=model)
ckpt_manager = tf.train.CheckpointManager(
    checkpoint, Paths.CHECKPOINT_DIR / "FunkSVD", max_to_keep=5
)
best_manager = tf.train.CheckpointManager(
    checkpoint, Paths.CHECKPOINT_DIR / "FunkSVD/best", max_to_keep=1
)


In [20]:
# Highest average environment score seen so far; a round must beat it strictly
# for the "best" checkpoint to be rewritten.
best_score = 0
for i in range(HParams.NUM_EPOSIDES):
    print("=" * 5 + f" Eposide {i + 1}/{HParams.NUM_EPOSIDES} " + "=" * 5)

    # Initialize: fresh environment, and a dataset rebuilt from every pair
    # collected so far (original data plus clicks from earlier rounds)
    env = TrainingEnvironment()
    dataset_generator = LabelDatasetGenerator(data_manager.get_sequences())

    # Train
    dataset = dataset_generator(HParams.BATCH_SIZE)
    model, _ = train(model, dataset, HParams.N_NEGTIVES)

    # Explore and update: new clicks are added to data_manager inside explore()
    model, _ = explore(env, model, data_manager, ConstParams.SLATE_SIZE)
    score = np.mean(env.get_score())
    print(f"Avg. Score: {score:.6f}")

    # Save the rolling checkpoint and the collected click data / similarity cache
    ckpt_manager.save()
    data_manager.save(Paths.USER_DATA_PLUS, Paths.SIMILARITY_PATH)

    # Save best model
    if score > best_score:
        best_score = score
        best_manager.save()
        print(f"Best model saved at {best_manager.latest_checkpoint}.")


===== Eposide 1/5 =====


Training: 100% 1/1 [00:18<00:00, 18.49s/it, loss=0.102]
Explore: 7045it [00:46, 151.14it/s, #click=1392]


Avg. Score: 0.003523
Best model saved at checkpoint/FunkSVD/best/ckpt-2.
===== Eposide 2/5 =====


Training: 100% 1/1 [00:17<00:00, 17.11s/it, loss=0.0969]
Explore: 7016it [00:44, 158.79it/s, #click=1354]


Avg. Score: 0.003508
===== Eposide 3/5 =====


Training: 100% 1/1 [00:17<00:00, 17.07s/it, loss=0.0934]
Explore: 6934it [00:44, 156.58it/s, #click=1293]


Avg. Score: 0.003467
===== Eposide 4/5 =====


Training: 100% 1/1 [00:16<00:00, 16.63s/it, loss=0.0904]
Explore: 6928it [00:41, 165.44it/s, #click=1297]


Avg. Score: 0.003464
===== Eposide 5/5 =====


Training: 100% 1/1 [00:16<00:00, 16.67s/it, loss=0.0876]
Explore: 7077it [00:45, 155.43it/s, #click=1402]


Avg. Score: 0.003539
Best model saved at checkpoint/FunkSVD/best/ckpt-7.


## Testing

Test 整體的流程跟 Train 很像，只是不會每個 Epoch 之間去 Train，只會在 Explore 的過程中去 Update。而每個 Epoch 後都會重置 Model weight 及紀錄的 History。

In [37]:
# Directory holding the checkpoint that is evaluated. Note this is
# "FunkSVD/best_backup", not the "FunkSVD/best" directory written by the
# training loop in this file.
best_ckpt_dir = Paths.CHECKPOINT_DIR / "FunkSVD/best_backup"

test_env = TestingEnvironment()
scores = []

# NOTE(review): `model` was built for ConstParams.N_TRAIN_USERS users while
# ConstParams.N_TEST_USERS is larger -- confirm that the restored model covers
# every user id the testing environment can return.

# Repeat the testing process for 5 times
for epoch in range(ConstParams.TEST_EPISODES):
    # Load the model weights at the beginning of each testing episode.
    # The message is printed before the restore call actually runs.
    print(f"Model restored from {tf.train.latest_checkpoint(best_ckpt_dir)}.")
    checkpoint = tf.train.Checkpoint(model=model)
    checkpoint.restore(tf.train.latest_checkpoint(best_ckpt_dir))
    # Fresh click history, re-read from the user file for every episode.
    history = History(Paths.USER_DATA)
    clicked_count = 0

    # Start the testing process
    with tqdm(desc="Testing") as pbar:
        # Run as long as there exist some active users
        while test_env.has_next_state():
            # Get the current user id
            cur_user = test_env.get_state()

            # Recommendation policy: model top-2 plus the 3 items most similar
            # to a random item from the user's history, de-duplicated.
            random_pos_item_id = random.choice(
                np.unique(history.get(cur_user)).tolist()
            )
            coll_slate = model.get_topk(cur_user, 2)
            cont_slate = get_content_topk(data_manager, random_pos_item_id, 3, False)
            slate = np.unique(coll_slate + cont_slate).tolist()

            # Top up with random picks from the model's top-10 until the slate
            # holds SLATE_SIZE distinct items.
            while len(slate) < ConstParams.SLATE_SIZE:
                slate = np.unique(
                    slate
                    + random.sample(
                        model.get_topk(cur_user, 10),
                        ConstParams.SLATE_SIZE - len(slate),
                    )
                ).tolist()

            # Get the response of the slate from the environment
            clicked_id, _in_environment = test_env.get_response(slate)

            # Online update: -1 means no click; otherwise record the click and
            # take one update step on the model.
            if clicked_id != -1:
                clicked_count += 1
                history.add(cur_user, clicked_id)
                model = update(model, cur_user, clicked_id)
                pbar.set_postfix({"#click": clicked_count})

            # Update the progress indicator
            pbar.update(1)

    # Record the score of this testing episode
    scores.append(test_env.get_score())

    # Reset the testing environment
    test_env.reset()

    # Reset the model weights at the end of each testing episode. The next
    # episode restores the checkpoint and rebuilds `history` again at its
    # start, so these two calls only matter after the final episode.
    checkpoint.restore(tf.train.latest_checkpoint(best_ckpt_dir))
    history.reset()

# Calculate the average scores: zip(*scores) regroups the per-episode score
# sequences position by position, giving one average per position.
avg_scores = [np.average(score) for score in zip(*scores)]

# Generate a DataFrame to output the result in a .csv file; the user_id column
# is the position of the score in avg_scores.
df_result = pd.DataFrame(
    [[user_id, avg_score] for user_id, avg_score in enumerate(avg_scores)],
    columns=["user_id", "avg_score"],
)
df_result.to_csv(Paths.OUTPUT, index=False)


Model restored from checkpoint/FunkSVD/best_backup/ckpt-88.


Testing: 16990it [02:18, 122.92it/s, #click=4998]


Model restored from checkpoint/FunkSVD/best_backup/ckpt-88.


Testing: 16988it [02:14, 126.47it/s, #click=4990]


Model restored from checkpoint/FunkSVD/best_backup/ckpt-88.


Testing: 16907it [02:02, 137.72it/s, #click=4929]


Model restored from checkpoint/FunkSVD/best_backup/ckpt-88.


Testing: 16919it [02:07, 133.00it/s, #click=4928]


Model restored from checkpoint/FunkSVD/best_backup/ckpt-88.


Testing: 17092it [02:08, 132.78it/s, #click=5066]


## Conclusion

相比前面三次競賽，我們在這次競賽感覺比較抓不太到方向，即使到最後的表現離 70 分線依然有一段距離。即使有嘗試了許多的方式，但大多連 60 分的門檻都摸不到，最後回到簡單的方式反而通過了。而在最後一個小時持續嘗試時才發現 Focal loss 中的 label smoothing 的影響很大，但已經來不及再 Train 一個 Model 出來了。

不過本次競賽的過程中也學習到很多東西，各種推薦系統的架構，對於一個推薦系統重要的是什麼...等。所以即使分數表現不如預期也不至於太過沮喪。畢竟這緊湊的一個學期總算結束了。