In this notebook, I will create and test a reward simulator inspired by the original article. I'll start by implementing individual functions and then refine them into production-ready code.

In [None]:
import os
import numpy as np
import pandas as pd
from google.colab import drive
from collections import Counter
from tqdm import tqdm
import pickle

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
project_dir = '/content/drive/MyDrive/ML/Reinforcement Learning/Final project/MIND'

In [None]:
behaviors_train_path = os.path.join(project_dir, 'MINDsmall_train/behaviors.tsv')

behaviors_train = pd.read_csv(behaviors_train_path, sep='\t', header=None, names=["impression_id", "user_id", "time", "history", "impressions"])

## Clean up impressions and merge users with duplicate histories and several sessions into a single impression list

In [None]:
behaviors_train.shape

(156965, 5)

In [None]:
# Keep only rows that have a click history; rebinding instead of mutating in place
behaviors_train = behaviors_train.dropna(subset='history')

In [None]:
behaviors_train["time"] = pd.to_datetime(behaviors_train["time"])
behaviors_train = behaviors_train.sort_values(by=["user_id", "time"]).reset_index(drop=True)

In [None]:
behaviors_train.head()

Unnamed: 0,impression_id,user_id,time,history,impressions
0,107691,U100,2019-11-12 07:34:12,N20121 N33998 N45954 N55743 N50095 N18870 N534...,N61235-0 N54489-0 N42597-0 N7800-1 N61408-0 N2...
1,86767,U1000,2019-11-13 23:16:18,N29641 N1789 N41244,N35273-0 N7618-0 N55281-0 N4021-0 N16148-0 N38...
2,46640,U1000,2019-11-14 01:00:56,N29641 N1789 N41244,N16148-0 N43628-0 N13094-0 N32774-0 N57034-0 N...
3,74345,U1000,2019-11-14 22:37:21,N29641 N1789 N41244,N14478-0 N58264-0 N7494-0 N46917-0 N22257-0 N1...
4,155923,U10001,2019-11-11 05:30:21,N47937 N51706 N56447 N61319 N27644 N18030 N272...,N35729-0 N56598-0 N48759-0 N49685-0 N33632-0 N...


In [None]:
# Process impressions (remove duplicates present in history)
def clean_impressions(row):
    """Return the row's impressions string without items already in its history.

    Both ``row["history"]`` and ``row["impressions"]`` are whitespace-separated
    strings; every impression token has the form ``<article_id>-<click>``.
    The surviving tokens keep their original order and click labels.
    """
    already_read = set(row["history"].split())

    kept_tokens = []
    for token in row["impressions"].split():
        article_id = token.split('-')[0]
        if article_id not in already_read:
            kept_tokens.append(token)

    return " ".join(kept_tokens)

In [None]:
behaviors_train["impressions"] = behaviors_train.apply(clean_impressions, axis=1)

In [None]:
# Collapse every user's sessions into one row: rows are already sorted by time,
# so joining the impression strings keeps them in chronological order.
merged_behaviors = (
    behaviors_train
    .groupby("user_id")
    .agg(
        history=("history", "first"),  # identical across a user's sessions, so the first one is enough
        impressions=("impressions", lambda sessions: " ".join(sessions)),  # one long impression list
    )
    .reset_index()
)

In [None]:
merged_behaviors.head()

Unnamed: 0,user_id,history,impressions
0,U100,N20121 N33998 N45954 N55743 N50095 N18870 N534...,N61235-0 N54489-0 N42597-0 N7800-1 N61408-0 N2...
1,U1000,N29641 N1789 N41244,N35273-0 N7618-0 N55281-0 N4021-0 N16148-0 N38...
2,U10001,N47937 N51706 N56447 N61319 N27644 N18030 N272...,N35729-0 N56598-0 N48759-0 N49685-0 N33632-0 N...
3,U10003,N39074 N26619 N31431 N1282 N61052 N41668 N5083...,N35729-0 N57090-1 N6693-0 N13801-0 N55689-1 N3...
4,U10008,N23912 N30148 N46754 N33117 N9619 N36526 N4449...,N24272-0 N8016-0 N43595-0 N732-0 N8929-0 N3567...


In [None]:
# Save for future use
# save_path = os.path.join(project_dir, 'processed_data/merged_behaviors_train.csv')
# merged_behaviors.to_csv(save_path, index=False)

Do the same for dev set.

In [None]:
behaviors_dev_path = os.path.join(project_dir, 'MINDsmall_dev/behaviors.tsv')

# Load, drop users without history, parse timestamps and order sessions per user
behaviors_dev = (
    pd.read_csv(
        behaviors_dev_path,
        sep='\t',
        header=None,
        names=["impression_id", "user_id", "time", "history", "impressions"],
    )
    .dropna(subset='history')
    .assign(time=lambda frame: pd.to_datetime(frame["time"]))
    .sort_values(by=["user_id", "time"])
    .reset_index(drop=True)
)

# Remove impressions that already appear in the user's history
behaviors_dev = behaviors_dev.assign(impressions=behaviors_dev.apply(clean_impressions, axis=1))

# One row per user: shared history plus all sessions' impressions joined together
merged_behaviors_dev = (
    behaviors_dev
    .groupby("user_id")
    .agg(
        history=("history", "first"),
        impressions=("impressions", lambda sessions: " ".join(sessions)),
    )
    .reset_index()
)

# merged_behaviors_dev.to_csv(os.path.join(project_dir, 'processed_data/merged_behaviors_dev.csv'), index=False)

## Generate (state, action, reward) pairs

For each user, we keep only the last MAX_HISTORY items of the history as the initial state; users with fewer than MIN_HISTORY history items or fewer than K impressions are skipped. We then split the impressions list into consecutive, non-overlapping windows of K items (leftover impressions that do not fill a whole window are dropped). Each window is one action (a set of K recommended items), and its reward is a vector with one entry per item: 1 if the item was clicked, 0 otherwise. Clicked items are appended to the user's history, which is again truncated to the last MAX_HISTORY items, before the next window is processed. This process produces a dataset of (state, action, reward) records containing: the user's history (state) before each step, the K recommended items (action) at each step, and the per-item click rewards.

In [None]:
K = 4  # number of impressions grouped into one action (window size used by the loop below)
MIN_HISTORY = 3  # users whose history has fewer items than this are skipped
MAX_HISTORY = 12  # only the most recent MAX_HISTORY history items are kept as the state

# Accumulates one dict per (state, action, reward) record
state_action_reward_list = []

In [None]:
# Start from an empty list so that re-running this cell does not append duplicate records
state_action_reward_list = []

for _, row in tqdm(merged_behaviors.iterrows(), total=len(merged_behaviors), desc="Processing sessions"):

    history_str, impressions_str = row["history"], row["impressions"]

    # A missing value (e.g. NaN produced when an empty field is read back from CSV)
    # cannot yield any window, so skip the row instead of failing on .split()
    if not isinstance(history_str, str) or not isinstance(impressions_str, str):
        continue

    history = history_str.split()

    # Each token is "<article_id>-<click>"; split on the last dash and convert the click to int
    impressions = []
    for imp in impressions_str.split():
        article_id, click = imp.rsplit('-', 1)
        impressions.append((article_id, int(click)))

    # Skip if history is too short or impressions are too few
    if len(history) < MIN_HISTORY or len(impressions) < K:
        continue

    # Truncate history to the last MAX_HISTORY articles
    history = history[-MAX_HISTORY:]

    # Walk through the impressions in non-overlapping windows of K items;
    # a trailing remainder shorter than K is dropped
    for i in range(0, len(impressions) - K + 1, K):
        action_window = impressions[i:i+K]
        action = [article_id for article_id, _ in action_window]
        reward = [click for _, click in action_window]

        # Copy the history: it is mutated below and the record must keep the pre-click state
        state_action_reward_list.append({
            "state": history.copy(),
            "action": action,
            "reward": reward
        })

        # Update history with clicked articles while maintaining size limit
        clicked_articles = [article_id for article_id, click in action_window if click == 1]
        history.extend(clicked_articles)
        history = history[-MAX_HISTORY:]  # Truncate history to the last MAX_HISTORY articles

Processing sessions: 100%|██████████| 49108/49108 [00:18<00:00, 2633.44it/s]


In [None]:
state_action_reward_df = pd.DataFrame(state_action_reward_list)

In [None]:
state_action_reward_df['reward'] = state_action_reward_df['reward'].apply(tuple)

In [None]:
# Sanity check: state, action, reward pairs for user U1000

state_action_reward_df.loc[26:76, :]

Unnamed: 0,state,action,reward
26,"[N29641, N1789, N41244]","[N35273, N7618, N55281, N4021]","(0, 0, 0, 0)"
27,"[N29641, N1789, N41244]","[N16148, N3841, N54752, N61787]","(0, 0, 0, 0)"
28,"[N29641, N1789, N41244]","[N1012, N57034, N55976, N59138]","(0, 0, 0, 0)"
29,"[N29641, N1789, N41244]","[N27869, N21128, N5048, N28767]","(0, 0, 0, 0)"
30,"[N29641, N1789, N41244]","[N48875, N49712, N4642, N16282]","(0, 0, 0, 0)"
31,"[N29641, N1789, N41244]","[N18522, N13094, N7121, N37194]","(0, 0, 0, 0)"
32,"[N29641, N1789, N41244]","[N33831, N36252, N14436, N64174]","(0, 0, 0, 0)"
33,"[N29641, N1789, N41244]","[N41698, N53875, N8015, N23272]","(0, 1, 0, 0)"
34,"[N29641, N1789, N41244, N53875]","[N35387, N16844, N55949, N39399]","(0, 0, 0, 0)"
35,"[N29641, N1789, N41244, N53875]","[N3957, N32774, N36261, N3031]","(0, 0, 0, 0)"


In [None]:
# Another sanity check to ensure the correct state and action lengths and reward combinations

state_action_reward_df_copy = state_action_reward_df.copy()

In [None]:
state_action_reward_df_copy['state_len'] = state_action_reward_df_copy['state'].apply(lambda x: len(x))
state_action_reward_df_copy['action_len'] = state_action_reward_df_copy['action'].apply(lambda x: len(x))

print(f"State lengths: {sorted(state_action_reward_df_copy['state_len'].unique())}")
print(f"Action lengths: {state_action_reward_df_copy['action_len'].unique()}")

State lengths: [3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
Action lengths: [4]


In [None]:
print(f"Unique rewards combinations: {state_action_reward_df_copy['reward'].nunique()}")

Unique rewards combinations: 16


In [None]:
state_action_reward_df_copy['reward'].value_counts()

Unnamed: 0_level_0,count
reward,Unnamed: 1_level_1
"(0, 0, 0, 0)",1158622
"(0, 1, 0, 0)",48269
"(1, 0, 0, 0)",47988
"(0, 0, 1, 0)",47905
"(0, 0, 0, 1)",47240
"(1, 0, 0, 1)",2154
"(0, 1, 0, 1)",2044
"(1, 0, 1, 0)",2007
"(0, 1, 1, 0)",1802
"(0, 0, 1, 1)",1668


## Calculate state and action vector representations for each (state, action, reward) pair

Below is a demonstration of how to convert tuples of item IDs from each history and action combination into an averaged embedding using the model we trained earlier, and how to perform a similarity search for a new history and action combination to determine the most probable reward.

In [None]:
# Load the embeddings obtained in the item_embeddings.ipynb research

embeddings_path = os.path.join(project_dir, 'news_train_fasttext_embeddings.pkl')

# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load a file that was produced by a trusted source.
with open(embeddings_path, 'rb') as f:
    item_emb = pickle.load(f)

In [None]:
# Scale every embedding to unit length, then index the vectors by news_id for fast lookup

item_emb['fasttext_embedding'] = item_emb['fasttext_embedding'].map(lambda vec: vec / np.linalg.norm(vec))  # normalize before indexing
embedding_dict = {
    news_id: vec
    for news_id, vec in zip(item_emb['news_id'], item_emb['fasttext_embedding'])
}

In [None]:
def get_average_state_action_embeddings(df, embedding_dict):
    """Average the item embeddings of every state and every action in ``df``.

    Args:
        df: DataFrame with 'state' and 'action' columns (iterables of article IDs)
            and a 'reward' column.
        embedding_dict: Mapping from article ID to its embedding vector.

    Returns:
        DataFrame with columns 'state_embedding', 'action_embedding' and 'reward'.
        Article IDs missing from ``embedding_dict`` are ignored, and a row whose
        state or action has no known ID at all is dropped, so the result can be
        shorter than ``df``.
    """
    columns = ['state_embedding', 'action_embedding', 'reward']
    embeddings = []

    # Passing total lets tqdm show a real progress bar (iterrows has no length)
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing state-action pairs"):

        # Look up embeddings, ignoring article IDs we have no vector for
        state_embeddings = [embedding_dict[i] for i in row['state'] if i in embedding_dict]
        action_embeddings = [embedding_dict[i] for i in row['action'] if i in embedding_dict]

        # Averaging an empty list would give NaN and break later matrix stacking
        if not state_embeddings or not action_embeddings:
            continue

        embeddings.append({
            'state_embedding': np.mean(np.array(state_embeddings), axis=0),
            'action_embedding': np.mean(np.array(action_embeddings), axis=0),
            'reward': row['reward']
        })

    # Explicit columns keep the schema stable even when no row survives
    return pd.DataFrame(embeddings, columns=columns)

In [None]:
# Example usage
state_action_embeddings_reward_df = get_average_state_action_embeddings(state_action_reward_df, embedding_dict)

Processing state-action pairs: 1361705it [03:05, 7346.96it/s]


In [None]:
state_action_embeddings_reward_df.head()

Unnamed: 0,state_embedding,action_embedding,reward
0,"[0.02388275, -0.023278613, -0.12967333, -0.250...","[-0.0012808153, -0.019162979, -0.057180144, -0...","(0, 0, 0, 1)"
1,"[0.016010033, -0.024824375, -0.12313172, -0.24...","[0.02143856, 0.058424987, -0.08302912, -0.2425...","(0, 0, 0, 0)"
2,"[0.016010033, -0.024824375, -0.12313172, -0.24...","[0.01122797, -0.059371784, -0.10260101, -0.194...","(0, 0, 0, 0)"
3,"[0.016010033, -0.024824375, -0.12313172, -0.24...","[0.024843303, -0.04548509, -0.0837238, -0.1957...","(0, 0, 0, 0)"
4,"[0.016010033, -0.024824375, -0.12313172, -0.24...","[0.035845738, 0.04544969, -0.10997892, -0.1940...","(0, 0, 0, 0)"


In [None]:
def compute_mean_embeddings(state_article_ids, action_article_ids, embedding_dict):
    """
    Average the item embeddings of one state and one action.

    Args:
        state_article_ids: List of article IDs that make up the state
        action_article_ids: List of article IDs that make up the action
        embedding_dict: Dictionary mapping article IDs to embeddings

    Returns:
        A ``(mean_state, mean_action)`` tuple: the element-wise mean of the
        state's embeddings and of the action's embeddings.

    Raises:
        KeyError: if any article ID is absent from ``embedding_dict``.
    """
    def _mean_of(article_ids):
        # Element-wise mean over the looked-up vectors
        vectors = [embedding_dict[article_id] for article_id in article_ids]
        return np.mean(vectors, axis=0)

    mean_state = _mean_of(state_article_ids)
    mean_action = _mean_of(action_article_ids)
    return mean_state, mean_action


# Score one query state-action pair against every stored pair by cosine similarity
def find_closest_match(state_action_pair, state_embeddings_df, action_embeddings_df, averaged_state_action_embeddings_df, embedding_dict, alpha=0.2):
    """
    Return the reward of the stored state-action pair most similar to the query.

    Similarity is a weighted sum of two cosine similarities: query state vs.
    stored states (weight ``alpha``) and query action vs. stored actions
    (weight ``1 - alpha``).

    Args:
        state_action_pair: A tuple (state_article_ids, action_article_ids)
        state_embeddings_df: Precomputed state embeddings, one row per stored pair
        action_embeddings_df: Precomputed action embeddings, one row per stored pair
        averaged_state_action_embeddings_df: DataFrame whose 'reward' column is
            aligned row-by-row with the two embedding matrices
        embedding_dict: Dictionary mapping article IDs to embeddings
        alpha: Weight of the state similarity (default 0.2)
    Returns:
        reward: The 'reward' value of the best-scoring row
    """
    query_state_ids, query_action_ids = state_action_pair
    query_state, query_action = compute_mean_embeddings(query_state_ids, query_action_ids, embedding_dict)

    def _cosine_to_rows(query, matrix):
        # Cosine similarity between one vector and every row of a matrix
        return np.dot(query, matrix.T) / (np.linalg.norm(query) * np.linalg.norm(matrix, axis=1))

    state_similarity = _cosine_to_rows(query_state, state_embeddings_df)
    action_similarity = _cosine_to_rows(query_action, action_embeddings_df)

    # alpha weights the state term, the remainder goes to the action term
    blended = alpha * state_similarity + (1 - alpha) * action_similarity

    best_row = np.argmax(blended)
    return averaged_state_action_embeddings_df.iloc[best_row]['reward']

In [None]:
# Precompute state and action embeddings from the dataframe
state_embeddings_df = np.array(state_action_embeddings_reward_df['state_embedding'].values.tolist())
action_embeddings_df = np.array(state_action_embeddings_reward_df['action_embedding'].values.tolist())

In [None]:
# Check on the existing combinations to ensure everything works fine

# (0, 1, 0, 0)

state_action_pair = (['N29641', 'N1789', 'N41244'], ['N41698', 'N53875', 'N8015', 'N23272'])
reward = find_closest_match(state_action_pair, state_embeddings_df, action_embeddings_df, state_action_embeddings_reward_df, embedding_dict)

print(f"Closest reward: {reward}")


# (1, 1, 1, 1)

state_action_pair = (['N21086', 'N14130', 'N60374', 'N59852', 'N41178',
                      'N49279', 'N57614', 'N48657', 'N11830', 'N29128',
                      'N41220', 'N26706'],
                       ['N61185', 'N61022', 'N1026', 'N7328'])
reward = find_closest_match(state_action_pair, state_embeddings_df, action_embeddings_df, state_action_embeddings_reward_df, embedding_dict)

print(f"Closest reward: {reward}")

Closest reward: (0, 1, 0, 0)
Closest reward: (1, 1, 1, 1)


## Wrap it up into a single class

In production, we will use an enhanced version that supports multiple types of embeddings.

In [None]:
class StateActionRewardProcessor:
    """Nearest-neighbour reward lookup over logged (state, action, reward) records.

    The constructor turns merged user behaviors into records, averages the item
    embeddings of every state and action, and stores the results as matrices.
    ``get_reward`` then returns the reward of the stored record whose embeddings
    are most similar (weighted cosine similarity) to a query.
    """

    def __init__(self, behaviors_df, embeddings_path, K=4, MIN_HISTORY=3, MAX_HISTORY=12, alpha=0.2):
        """
        Initializes the StateActionRewardProcessor by:
        - Preprocessing behaviors_df into (state, action, reward) records
        - Loading embeddings and computing mean state/action embeddings
        - Precomputing the matrices and row norms used for reward retrieval

        Args:
            behaviors_df (pd.DataFrame): DataFrame with user merged behaviors (history, impressions).
            embeddings_path (str): Path to precomputed fastText embeddings.
            K (int): Number of recommended items per action.
            MIN_HISTORY (int): Minimum required history length.
            MAX_HISTORY (int): Maximum history length for a state.
            alpha (float): Weight of the state similarity; the action similarity gets 1 - alpha.
        """
        self.K = K
        self.MIN_HISTORY = MIN_HISTORY
        self.MAX_HISTORY = MAX_HISTORY
        self.alpha = alpha

        # Load embeddings
        self.embedding_dict = self._load_embeddings(embeddings_path)

        # Preprocess behaviors dataset
        self.state_action_reward_df = self._process_behaviors(behaviors_df)

        # Compute average state and action embeddings
        self.averaged_state_action_embeddings_df = self._compute_average_embeddings()

        # Store precomputed embeddings for fast similarity lookup
        self.state_embeddings = np.array(self.averaged_state_action_embeddings_df['state_embedding'].tolist())
        self.action_embeddings = np.array(self.averaged_state_action_embeddings_df['action_embedding'].tolist())

        # The stored matrices never change after construction, so their row norms
        # are computed once here instead of on every get_reward call.
        self._state_norms = self._row_norms(self.state_embeddings)
        self._action_norms = self._row_norms(self.action_embeddings)

    @staticmethod
    def _row_norms(matrix):
        """Returns the L2 norm of every row; an empty (non 2-D) matrix gives an empty array."""
        if matrix.ndim != 2:
            return np.zeros(0)
        return np.linalg.norm(matrix, axis=1)

    def _load_embeddings(self, embeddings_path):
        """Loads item embeddings from a pickle file and normalizes them to unit length.

        Returns a dict mapping each 'news_id' to its normalized 'fasttext_embedding'.
        """
        # NOTE(review): pickle.load can execute arbitrary code while deserializing;
        # embeddings_path must point to a trusted file.
        with open(embeddings_path, 'rb') as f:
            item_emb = pickle.load(f)

        item_emb['fasttext_embedding'] = item_emb['fasttext_embedding'].apply(lambda x: x / np.linalg.norm(x))
        return dict(zip(item_emb['news_id'], item_emb['fasttext_embedding']))

    def _process_behaviors(self, behaviors_df):
        """Processes user behaviors to extract (state, action, reward) records.

        Each user's impressions are cut into non-overlapping windows of K items
        (a trailing remainder shorter than K is dropped). The state is the user's
        last MAX_HISTORY history items before the window; the reward is the list
        of click labels of the window. Clicked items are appended to the history.

        Returns a DataFrame with columns 'state', 'action' and 'reward'.
        """
        state_action_reward_list = []

        for _, row in tqdm(behaviors_df.iterrows(), total=len(behaviors_df), desc="Processing sessions"):

            history_str, impressions_str = row["history"], row["impressions"]

            # A missing value (e.g. NaN produced when an empty field is read back
            # from CSV) cannot yield a window; skip it instead of failing on .split().
            if not isinstance(history_str, str) or not isinstance(impressions_str, str):
                continue

            history = history_str.split()

            # Each token is "<article_id>-<click>"; split on the last dash only
            impressions = []
            for token in impressions_str.split():
                article_id, click = token.rsplit('-', 1)
                impressions.append((article_id, int(click)))

            if len(history) < self.MIN_HISTORY or len(impressions) < self.K:
                continue

            history = history[-self.MAX_HISTORY:]

            for i in range(0, len(impressions) - self.K + 1, self.K):
                action_window = impressions[i:i+self.K]
                action = [article_id for article_id, _ in action_window]
                reward = [click for _, click in action_window]

                # Copy the history: it is mutated below and the record must keep the pre-click state
                state_action_reward_list.append({
                    "state": history.copy(),
                    "action": action,
                    "reward": reward
                })

                # Update history with clicked articles while maintaining size limit
                clicked_articles = [article_id for article_id, click in action_window if click == 1]
                history.extend(clicked_articles)
                history = history[-self.MAX_HISTORY:]

        # Explicit columns keep the schema stable even when no record was produced
        return pd.DataFrame(state_action_reward_list, columns=["state", "action", "reward"])

    def _compute_average_embeddings(self):
        """Computes average state and action embeddings for each record.

        Article IDs without an embedding are ignored. A record whose state or
        action has no known ID at all is dropped, because averaging nothing
        would give NaN and make the stacked embedding matrices ragged.

        Returns a DataFrame with columns 'state_embedding', 'action_embedding', 'reward'.
        """
        embeddings = []
        records = self.state_action_reward_df

        for _, row in tqdm(records.iterrows(), total=len(records), desc="Computing state-action embeddings"):

            state_embeddings = [self.embedding_dict[i] for i in row['state'] if i in self.embedding_dict]
            action_embeddings = [self.embedding_dict[i] for i in row['action'] if i in self.embedding_dict]

            if not state_embeddings or not action_embeddings:
                continue

            embeddings.append({
                'state_embedding': np.mean(np.array(state_embeddings), axis=0),
                'action_embedding': np.mean(np.array(action_embeddings), axis=0),
                'reward': row['reward']
            })

        return pd.DataFrame(embeddings, columns=['state_embedding', 'action_embedding', 'reward'])

    def get_reward(self, state_embeddings, action_embeddings):
        """
        Finds the closest matching state-action pair and returns its reward.

        Args:
            state_embeddings: mean embedding of state_article_ids for which to find a similar entry
            action_embeddings: mean embedding of action_article_ids for which to find a similar entry

        Returns:
            reward (list of int): Click labels (one per recommended item) of the most
            similar stored pair, or None when either query embedding is None or no
            pairs are stored.
        """

        if state_embeddings is None or action_embeddings is None:
            return None

        # Nothing to compare against
        if len(self.averaged_state_action_embeddings_df) == 0:
            return None

        cosine_state = np.dot(state_embeddings, self.state_embeddings.T) / (np.linalg.norm(state_embeddings) * self._state_norms)
        cosine_action = np.dot(action_embeddings, self.action_embeddings.T) / (np.linalg.norm(action_embeddings) * self._action_norms)

        # alpha weights the state similarity, 1 - alpha the action similarity
        combined_similarity = self.alpha * cosine_state + (1 - self.alpha) * cosine_action
        max_index = np.argmax(combined_similarity)

        return self.averaged_state_action_embeddings_df.iloc[max_index]['reward']

## Test

In [None]:
# Separate function to process a single state_action_pair

def compute_mean_embeddings(state_action_pair, action_article_ids=None, embeddings=None):
    """Computes mean embeddings for given state and action article IDs.

    This definition replaces an earlier three-argument function of the same
    name in this notebook, so both call forms are accepted:

    - ``compute_mean_embeddings((state_ids, action_ids))``
    - ``compute_mean_embeddings(state_ids, action_ids, embedding_lookup)``

    Args:
        state_action_pair: Either a ``(state_article_ids, action_article_ids)``
            tuple, or just the state article IDs when ``action_article_ids``
            is given separately.
        action_article_ids: Action article IDs for the three-argument form.
        embeddings: Mapping from article ID to embedding. Defaults to the
            notebook-level ``embedding_dict``.

    Returns:
        ``(mean_state, mean_action)``. Article IDs missing from the mapping are
        ignored; if the state or the action has no known ID, ``(None, None)``.
    """
    if action_article_ids is None:
        state_article_ids, action_article_ids = state_action_pair
    else:
        state_article_ids = state_action_pair

    # Fall back to the global lookup only when no mapping was passed explicitly
    lookup = embedding_dict if embeddings is None else embeddings

    state_embeddings = [lookup[i] for i in state_article_ids if i in lookup]
    action_embeddings = [lookup[i] for i in action_article_ids if i in lookup]

    if len(state_embeddings) == 0 or len(action_embeddings) == 0:
        return None, None

    return np.mean(np.array(state_embeddings), axis=0), np.mean(np.array(action_embeddings), axis=0)

In [None]:
merged_behaviors_path = os.path.join(project_dir, 'merged_behaviors_train.csv')
merged_behaviors = pd.read_csv(merged_behaviors_path)

In [None]:
embeddings_path = os.path.join(project_dir, 'news_train_fasttext_embeddings.pkl')

processor = StateActionRewardProcessor(merged_behaviors, embeddings_path)

Processing sessions: 100%|██████████| 49108/49108 [00:16<00:00, 2928.02it/s]
Computing state-action embeddings: 1361705it [03:21, 6756.84it/s]


In [None]:
# (0, 1, 0, 0)

state_action_pair = (['N29641', 'N1789', 'N41244'], ['N41698', 'N53875', 'N8015', 'N23272'])
state_embedding, action_embedding = compute_mean_embeddings(state_action_pair)
reward = processor.get_reward(state_embedding, action_embedding)
print("Reward:", reward)

# (1, 1, 1, 1)

state_action_pair = (['N21086', 'N14130', 'N60374', 'N59852', 'N41178',
                      'N49279', 'N57614', 'N48657', 'N11830', 'N29128',
                      'N41220', 'N26706'],
                       ['N61185', 'N61022', 'N1026', 'N7328'])
state_embedding, action_embedding = compute_mean_embeddings(state_action_pair)
reward = processor.get_reward(state_embedding, action_embedding)
print("Reward:", reward)

Reward: [0, 1, 0, 0]
Reward: [1, 1, 1, 1]
