Reference: https://github.com/shashist/recsys-rl/tree/master

#### Setup

In [None]:
!pip install pytorch_ranger

Collecting pytorch_ranger
  Downloading pytorch_ranger-0.1.1-py3-none-any.whl (14 kB)
Installing collected packages: pytorch_ranger
Successfully installed pytorch_ranger-0.1.1


In [None]:
from google.colab import drive
import warnings
import sys

import pickle
import pandas as pd
import collections
import scipy.sparse as sp
import numpy as np
import torch
from pytorch_ranger import Ranger
import tqdm

In [None]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides useful diagnostics (e.g. shape-broadcast
# or deprecation warnings) — consider narrowing it.
warnings.filterwarnings('ignore')
# Add the shared project folder to the module search path.
# NOTE(review): presumably Google Drive is already mounted at /content/drive — confirm.
sys.path.append('/content/drive/Shareddrives/CMPE260/')

In [None]:
# Execute the shared helper notebook inside this kernel's namespace.
# NOTE(review): names used below without a local definition (filter_data, preprocess_data,
# process_user_features, get_user_features, Dataset, hit_metric, dcg_metric) are presumably
# defined there — confirm.
%run /content/drive/Shareddrives/CMPE260/utils.ipynb

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Seed both random number generators used in this notebook: NumPy (np.random.choice
# is used for negative sampling and for replay-buffer sampling) and torch (weight
# initialisation). Seeding torch alone leaves runs non-reproducible.
SEED = 101
np.random.seed(SEED)
torch.manual_seed(SEED)

cpu


<torch._C.Generator at 0x7e0d8a37f750>

In [None]:
# Hyper-parameters and output locations shared by the cells below
params = dict(
    # data / model sizes
    batch_size=8,
    embedding_dim=16,
    hidden_dim=16,
    N=5,  # number of latest positive interactions remembered per user

    # learning rate and weight decay, one pair per optimizer
    value_lr=1e-2,
    value_decay=1e-4,
    policy_lr=1e-3,
    policy_decay=1e-6,
    state_rep_lr=1e-3,
    state_rep_decay=1e-3,

    # log directories, one per experiment variant
    log_w_features_dir='/content/drive/Shareddrives/CMPE260/logs_w_features/',
    log_base_dir='/content/drive/Shareddrives/CMPE260/logs_base/',
    log_w_gaussian_dir='/content/drive/Shareddrives/CMPE260/logs_w_gaussian/',
    log_w_ou_dir='/content/drive/Shareddrives/CMPE260/logs_w_ou/',

    # temporal-difference target settings
    gamma=0.9,
    min_value=-10,
    max_value=10,
    soft_tau=1e-2,

    # replay buffer capacity
    buffer_size=10000,
)

#### Load data

In [None]:
# Load the book-ratings table from the shared drive.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably a saved index —
# confirm whether downstream helpers expect it.
df = pd.read_csv('/content/drive/Shareddrives/CMPE260/data/df.csv')
# Preview the first rows
df.head()

Unnamed: 0,isbn,title,author,user_id,rating,best_seller
0,2005018,Clara Callan,Richard Bruce Wright,8,5.0,False
1,2005018,Clara Callan,Richard Bruce Wright,11676,8.0,False
2,2005018,Clara Callan,Richard Bruce Wright,67544,8.0,False
3,2005018,Clara Callan,Richard Bruce Wright,116866,9.0,False
4,2005018,Clara Callan,Richard Bruce Wright,123629,9.0,False


In [None]:
# Load the per-user feature table, using the first CSV column as the index
user_features_df = pd.read_csv('/content/drive/Shareddrives/CMPE260/data/User_features.csv', index_col=0)
# Preview the first rows
user_features_df.head()

Unnamed: 0,user_id,Location,age,city,state,country
2032,2033,"omaha, nebraska, usa",27.0,omaha,nebraska,usa
2275,2276,"niskayuna, new york, usa",46.0,niskayuna,new york,usa
4016,4017,"new orleans, louisiana, usa",48.0,new orleans,louisiana,usa
4384,4385,"albq, new mexico, usa",33.0,albq,new mexico,usa
6250,6251,"wahiawa, hawaii, usa",32.0,wahiawa,hawaii,usa


In [None]:
# Cast the user ids to int (overwrites the column on df in place)
df['user_id'] = df['user_id'].astype(int)

In [None]:
# NOTE(review): filter_data is not defined in this notebook (presumably it comes from
# utils.ipynb). Later cells read 'user', 'book' and 'best_seller' columns from its result — confirm.
filtered_df = filter_data(df)

In [None]:
# Count the non-null isbn entries per user ...
g = filtered_df.groupby(['user'])['isbn'].count()
# ... and summarise the distribution of those counts
g.describe()

count    251.000000
mean     169.486056
std       84.064611
min      100.000000
25%      115.000000
50%      137.000000
75%      185.500000
max      490.000000
Name: isbn, dtype: float64

In [None]:
# Shows: (rows, columns), number of distinct user ids, number of distinct ISBNs.
# Per the output below: 251 users (each with 100-500 rated books) and 30361 unique ISBNs.
filtered_df.shape, filtered_df.user_id.nunique(), filtered_df.isbn.nunique() # there are 251 users who have rated 100-500 books and the unique isbn are 30361

((42541, 8), 251, 30361)

In [None]:
# Split the filtered ratings into train/test pairs and interaction matrices.
# NOTE(review): preprocess_data is defined outside this notebook; the meaning of each
# returned value is inferred from its name and from how later cells use it — confirm.
(train_data, train_matrix, test_data, test_matrix, num_of_users, num_of_books, apt_users) = preprocess_data(filtered_df)

In [None]:
# Shapes of the train and test tables (two columns each, per the output below)
train_data.shape, test_data.shape # user-book df which tells us which user has rated which book

((38287, 2), (4254, 2))

In [None]:
# Sanity check: user/book counts, both matrix shapes and the number of users in apt_users
num_of_users, num_of_books, train_matrix.shape, test_matrix.shape, len(apt_users)

(252, 30362, (252, 30362), (252, 30362), 52)

In [None]:
# users which we will train on - these include users who have rated 137-185 books
# for u in apt_users:
#   print(f'user= {u}, books rated= {train_matrix[u].sum()}')

user= 1, books rated= 166.0
user= 5, books rated= 141.0
user= 11, books rated= 151.0
user= 12, books rated= 145.0
user= 14, books rated= 142.0
user= 21, books rated= 137.0
user= 22, books rated= 144.0
user= 25, books rated= 168.0
user= 27, books rated= 138.0
user= 32, books rated= 141.0
user= 36, books rated= 172.0
user= 38, books rated= 144.0
user= 43, books rated= 167.0
user= 52, books rated= 154.0
user= 54, books rated= 145.0
user= 63, books rated= 138.0
user= 67, books rated= 162.0
user= 70, books rated= 139.0
user= 75, books rated= 137.0
user= 78, books rated= 170.0
user= 82, books rated= 167.0
user= 83, books rated= 171.0
user= 93, books rated= 175.0
user= 99, books rated= 149.0
user= 105, books rated= 163.0
user= 124, books rated= 175.0
user= 125, books rated= 150.0
user= 132, books rated= 146.0
user= 134, books rated= 173.0
user= 136, books rated= 157.0
user= 137, books rated= 175.0
user= 141, books rated= 141.0
user= 143, books rated= 156.0
user= 144, books rated= 147.0
user= 

In [None]:
# Replace the raw user feature table with its processed version and obtain the two
# vocabulary sizes that are later passed to the country and age embedding tables.
# NOTE(review): process_user_features is defined outside this notebook — confirm what it encodes.
user_features_df, num_of_countries, num_of_age_buckets = process_user_features(user_features_df, filtered_df)

In [None]:
# Data loaders used for evaluation

# Quick loader: keep only the test rows whose first column equals 234 (a single user)
test_pairs = np.array(test_data)
single_user_rows = test_pairs[test_pairs[:, 0] == 234]
test_dataset = Dataset(single_user_rows, num_of_books, test_matrix)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=params['batch_size'])

# Full loader: the complete test set in its stored order, used to pick the best policy
full_dataset = Dataset(np.array(test_data), num_of_books, test_matrix)
full_loader = torch.utils.data.DataLoader(
    full_dataset,
    batch_size=params['batch_size'],
    shuffle=False,
)

#### Implementation

In [None]:
class Environment_w_Features():
    """Simulated recommendation environment that serves one user per episode.

    The environment keeps, for every user, a memory row of the last ``params['N']``
    books that earned a reward. A recommendation is rewarded when the user has an
    entry for that book in the interaction matrix or when the book is flagged as a
    best seller in ``filtered_df``.

    Relies on notebook-level names: ``num_of_books``, ``num_of_users``, ``params``,
    ``filtered_df``, ``user_features_df``, ``get_user_features`` and ``device``.
    """

    def __init__(self, user_book_matrix):
        """Store the interaction matrix and create an all-padding memory.

        Args:
            user_book_matrix: user x book interaction matrix; entries > 0 mark rated books.
        """
        self.matrix = user_book_matrix
        self.book_count = num_of_books
        # every slot starts as num_of_books, the id used as padding index by the book embeddings
        self.memory = np.ones([num_of_users, params['N']]) * num_of_books

    def reset(self, user):
        """Start a new episode for ``user``.

        Returns:
            Tuple of tensors ``(user, country, age, memory)`` where memory is the
            user's current memory row with shape ``(1, N)``.
        """
        self.user = user
        self.viewed_books = []

        # get user features
        country, age = get_user_features(self.user, user_features_df)

        # related books - books that the user has rated
        self.related_books = np.argwhere(self.matrix[self.user] > 0)[:, 1]
        self.num_related_books = len(self.related_books)
        # non-related books - sampled (with replacement) from the books the user has not rated
        self.nonrelated_books = np.random.choice(list(set(range(self.book_count)) - set(self.related_books)), self.num_related_books)
        # rated books at even positions and non-rated books at odd positions
        self.available_books = np.zeros(self.num_related_books * 2)
        self.available_books[::2] = self.related_books
        self.available_books[1::2] = self.nonrelated_books

        return torch.tensor([self.user]), torch.tensor([country]), torch.tensor([age]), torch.tensor(self.memory[[self.user], :])

    def step(self, action, action_emb=None, replay_buffer=None):
        """Apply one recommendation and optionally record the transition.

        Args:
            action: tensor whose first element is the recommended book id.
            action_emb: action embedding from the actor; its first row is stored in
                the replay buffer. Required when ``replay_buffer`` is given.
            replay_buffer: optional buffer exposing
                ``push(user, memory, action, reward, next_user, next_memory, done)``.

        Returns:
            Tuple of tensors ``(user, memory, reward, done)`` on ``device``.
        """
        initial_user = self.user
        # fancy indexing returns a copy, so this snapshot is not affected by the memory update below
        initial_memory = self.memory[[initial_user], :]

        # reward is 1.0 when the recommended book was rated by the user or is a best seller
        book = action[0].detach().cpu().numpy()
        book_id = book.tolist()
        best = filtered_df[filtered_df.book.isin([book_id])]
        reward = float((book in self.related_books) or ((best['best_seller'] == True).any()))

        # update viewed books
        self.viewed_books.append(book)

        # on reward, drop the oldest remembered book and append the new one
        if reward:
            self.memory[self.user] = list(self.memory[self.user][1:]) + [book_id]

        # the episode ends once the user has viewed as many books as they rated
        done = int(len(self.viewed_books) == len(self.related_books))

        if replay_buffer is not None:
            replay_buffer.push(
                torch.tensor([initial_user]).to(device),
                torch.tensor(initial_memory).to(device),
                action_emb[0].detach().cpu().numpy(),
                torch.tensor([reward]).to(device),
                torch.tensor([self.user]).to(device),
                torch.tensor(self.memory[[self.user], :]).to(device),
                # store the real terminal flag (previously the reward was pushed here);
                # kept as float so it can be used directly in (1 - done)
                torch.tensor([float(done)]).to(device)
            )

        return (
            torch.tensor([self.user]).to(device),
            torch.tensor(self.memory[[self.user], :]).to(device),
            torch.tensor([reward]).to(device),
            torch.tensor([done]).to(device)
        )

In [None]:
# policy network that generates action based on state s
class Actor(torch.nn.Module):
  """Policy network: turns a state vector into an action embedding.

  The input is expected to have ``embedding_dim * 3`` features and the output has
  ``embedding_dim`` features.
  """

  def __init__(self, embedding_dim, hidden_dim):
    super().__init__()
    input_layer = torch.nn.Linear(embedding_dim * 3, hidden_dim)
    output_layer = torch.nn.Linear(hidden_dim, embedding_dim)
    self.layers = torch.nn.Sequential(input_layer, torch.nn.ReLU(), output_layer)
    self.initialize()

  def initialize(self):
    """Re-draw the weights of every linear layer from a standard normal distribution."""
    linear_layers = (module for module in self.layers if isinstance(module, torch.nn.Linear))
    for linear in linear_layers:
      torch.nn.init.normal_(linear.weight)

  def forward(self, state):
    """Return the action embedding for ``state``."""
    return self.layers(state)

  def get_action(self, user, memory, state_representation, action_emb, books, return_scores=False):
    """Pick the candidate book whose embedding scores highest against ``action_emb``.

    Args:
      user, memory: moved to ``device`` but otherwise unused here.
      state_representation: object exposing ``book_embeddings``.
      action_emb: action embedding with a single row.
      books: 1-D tensor of candidate book ids.
      return_scores: when true, also return the per-candidate scores.

    Returns:
      ``books[argmax]`` or ``(scores, books[argmax])`` when ``return_scores`` is set.
    """
    user, memory, books = user.to(device), memory.to(device), books.to(device)

    # score = dot product between every candidate book embedding and the action embedding
    candidate_embeddings = state_representation.book_embeddings(books)
    scores = torch.bmm(candidate_embeddings.unsqueeze(0), action_emb.T.unsqueeze(0)).squeeze(0)
    best_book = books[scores.argmax(0)]

    if not return_scores:
      return best_book
    return scores, best_book

In [None]:
# network that determines the true Q-value function
class Critic(torch.nn.Module):
  """Value network: estimates Q(state, action) as a single number per row."""

  def __init__(self, state_representation_dim, action_emb_dim, hidden_dim):
    super().__init__()
    joint_dim = state_representation_dim + action_emb_dim
    hidden_layer = torch.nn.Linear(joint_dim, hidden_dim)
    value_head = torch.nn.Linear(hidden_dim, 1)
    self.layers = torch.nn.Sequential(hidden_layer, torch.nn.ReLU(), value_head)
    self.initialize()

  def initialize(self):
    """Re-draw the weights of every linear layer from a standard normal distribution."""
    linear_layers = (module for module in self.layers if isinstance(module, torch.nn.Linear))
    for linear in linear_layers:
      torch.nn.init.normal_(linear.weight)

  def forward(self, state, action):
    """Concatenate state and action along dim 1 and return the value, shape (batch, 1)."""
    joint_input = torch.cat([state.to(device), action.to(device)], 1)
    return self.layers(joint_input)

In [None]:
# models user-item interaction
class State_Representation_w_Features(torch.nn.Module):
  """Builds the state vector from user features and the user's book memory.

  The user embedding is the element-wise product of a country embedding and an age
  embedding. The memory of ``params['N']`` book ids is embedded and collapsed to one
  vector by a 1x1 convolution over the memory slots. The output concatenates
  (user, user * memory, memory) and so has ``embedding_dim * 3`` features.
  """

  def __init__(self, num_of_countries, num_of_age_buckets, num_of_books, embedding_dim, hidden_dim):
    super().__init__()
    padding_book = num_of_books  # extra id reserved for empty memory slots
    self.country_embeddings = torch.nn.Embedding(num_embeddings=num_of_countries, embedding_dim=embedding_dim)
    self.age_embeddings = torch.nn.Embedding(num_embeddings=num_of_age_buckets, embedding_dim=embedding_dim)
    self.book_embeddings = torch.nn.Embedding(num_of_books + 1, embedding_dim, padding_idx=padding_book)
    self.drr = torch.nn.Conv1d(in_channels=params['N'], out_channels=1, kernel_size=1)
    self.initialize()

  def initialize(self):
    """Initialise the embeddings with small normal noise and the convolution uniformly."""
    for table in (self.country_embeddings, self.age_embeddings, self.book_embeddings):
      torch.nn.init.normal_(table.weight, std=0.01)
    # the padding row must stay zero after the re-initialisation above
    self.book_embeddings.weight.data[-1].zero_()
    torch.nn.init.uniform_(self.drr.weight)
    self.drr.bias.data.zero_()

  def forward(self, country, age, memory):
    """Return the state vector of shape (batch, embedding_dim * 3)."""
    country_ids = country.to(device).long()
    age_ids = age.to(device).long()
    book_ids = memory.to(device).long()

    user_embedding = self.country_embeddings(country_ids) * self.age_embeddings(age_ids)
    remembered_books = self.book_embeddings(book_ids)
    drr = self.drr(remembered_books).squeeze(1)

    return torch.cat((user_embedding, user_embedding * drr, drr), 1)

In [None]:
class ReplayBuffer_w_Features(object):
  """Fixed-capacity ring buffer of transitions.

  Each stored transition is the tuple
  ``(user, memory, action, reward, next_user, next_memory, done)``.
  """

  def __init__(self, capacity):
    self.capacity = capacity
    self.buffer = []
    self.pos = 0  # slot that the next push overwrites once the buffer is full

  def push(self, user, memory, action, reward, next_user, next_memory, done):
    """Store one transition, overwriting the oldest entry when the buffer is full."""
    transition = (user, memory, action, reward, next_user, next_memory, done)
    if len(self.buffer) >= self.capacity:
      self.buffer[self.pos] = transition
    else:
      self.buffer.append(transition)
    self.pos = (self.pos + 1) % self.capacity

  def sample(self, batch_size):
    """Draw ``batch_size`` transitions uniformly at random, with replacement.

    User ids are replaced by their (country, age) features, looked up through
    ``get_user_features``.

    Returns:
      ``(country, age, memory, action, reward, next_country, next_age, next_memory, done)``
      where the feature and memory entries are numpy arrays and action, reward and
      done are tuples of the stored values.
    """
    def features_of(users):
      looked_up = [get_user_features(user.item(), user_features_df) for user in users]
      countries = np.concatenate([np.array([feature[0]]) for feature in looked_up])
      ages = np.concatenate([np.array([feature[1]]) for feature in looked_up])
      return countries, ages

    def stacked(tensors):
      return np.concatenate([item.cpu().numpy() for item in tensors])

    chosen = np.random.choice(len(self.buffer), batch_size)
    columns = list(zip(*[self.buffer[idx] for idx in chosen]))

    country, age = features_of(columns[0])
    memory = stacked(columns[1])
    action, reward = columns[2], columns[3]
    next_country, next_age = features_of(columns[4])
    next_memory = stacked(columns[5])
    done = columns[6]

    return country, age, memory, action, reward, next_country, next_age, next_memory, done

  def __len__(self):
    """Number of transitions currently stored."""
    return len(self.buffer)

#### Initializations

In [None]:
# Online networks. The critic's state input has embedding_dim * 3 features, matching the
# three concatenated parts returned by the state representation, and its action input has
# embedding_dim features, matching the actor output.
# NOTE: construction order determines how the torch RNG is consumed, so keep it stable
# when comparing seeded runs.
state_representation = State_Representation_w_Features(num_of_countries, num_of_age_buckets, num_of_books, params['embedding_dim'], params['hidden_dim']).to(device)
policy_network = Actor(params['embedding_dim'], params['hidden_dim']).to(device)
value_network = Critic(params['embedding_dim'] * 3, params['embedding_dim'], params['hidden_dim']).to(device)

# Experience replay storage holding at most params['buffer_size'] transitions
replay_buffer = ReplayBuffer_w_Features(params['buffer_size'])

# Target networks: same architectures as the online actor and critic
target_policy_network = Actor(params['embedding_dim'], params['hidden_dim']).to(device)
target_value_network  = Critic(params['embedding_dim'] * 3, params['embedding_dim'], params['hidden_dim']).to(device)

In [None]:
# Start each target network as an exact copy of its online counterpart
# (actor -> target actor, critic -> target critic).
target_policy_network.load_state_dict(policy_network.state_dict())
target_value_network.load_state_dict(value_network.state_dict())

In [None]:
# Critic loss: mean squared error between predicted and target values
value_criterion = torch.nn.MSELoss()

# One Ranger optimizer per trainable module, each with its own learning rate and weight
# decay from params. The target networks get no optimizer.
value_optimizer = Ranger(value_network.parameters(), lr=params['value_lr'], weight_decay=params['value_decay'])
policy_optimizer = Ranger(policy_network.parameters(), lr=params['policy_lr'], weight_decay=params['policy_decay'])
state_representation_optimizer = Ranger(state_representation.parameters(), lr=params['state_rep_lr'], weight_decay=params['state_rep_decay'])

#### Helper functions

In [None]:
def evaluation(network, state, training_env_memory, loader):
  """Evaluate a policy on ``loader`` and return its mean hit and DCG scores.

  Args:
    network: actor exposing ``__call__`` (state -> action embedding) and ``get_action``.
    state: state-representation module; it builds the state vector and provides the
      book embeddings used for scoring.
    training_env_memory: per-user memory array of the training environment. It is
      copied, so evaluation never modifies the training memory.
    loader: iterable of batches with ``'user'``, ``'book'`` and ``'label'`` entries.

  Returns:
    ``(mean_hit, mean_dcg)`` over all batches of ``loader``.
  """
  hits = []
  dcgs = []

  state = state.to(device)

  # set up a test environment that starts from the training memory
  test_env = Environment_w_Features(test_matrix)
  test_env.memory = training_env_memory.copy()
  first_user = int((next(iter(loader))['user']).detach().cpu().numpy()[0])
  user, country, age, memory = test_env.reset(first_user)

  # pure inference: no autograd graph is needed
  with torch.no_grad():
    for batch in loader:
      # action embedding from the state representation passed in by the caller
      # (previously the notebook-level state_representation was used here)
      action_emb = network(state(country, age, memory)).to(device)
      # obtain the recommendation (action) along with a score for each candidate book
      scores, action = network.get_action(
          batch['user'],
          torch.tensor(test_env.memory[batch['user'].detach().cpu().numpy().astype(int), :]),
          state,
          action_emb,
          batch['book'].long(),
          return_scores=True
      )
      user, memory, reward, done = test_env.step(action)

      # top-5 candidates; a batch with fewer than 5 candidates would make topk(5) raise
      k = min(5, scores.shape[0])
      _, ind = scores[:, 0].topk(k)
      predictions = torch.take(batch['book'].to(device), ind).cpu().numpy().tolist()

      actual = batch['book'].tolist()
      label = batch['label'].tolist()

      hits.append(hit_metric(predictions, actual, label, filtered_df))
      dcgs.append(dcg_metric(predictions, actual, label, filtered_df))

  return np.mean(hits), np.mean(dcgs)

In [None]:
def update(batch_size, gamma, min_value, max_value, soft_tau):
  """Run one actor-critic update step on a batch sampled from the replay buffer.

  Uses the notebook-level networks, optimizers and ``replay_buffer``.

  Args:
    batch_size: number of transitions to sample.
    gamma: discount factor applied to the target value.
    min_value, max_value: clamp range for the temporal-difference target.
    soft_tau: interpolation factor for the soft target-network update.
  """
  # sample random transitions from the replay buffer
  country, age, memory, action, reward, next_country, next_age, next_memory, done = replay_buffer.sample(batch_size)

  country = torch.FloatTensor(country).to(device)
  age = torch.FloatTensor(age).to(device)
  memory = torch.FloatTensor(memory).to(device)
  action = torch.FloatTensor(action).to(device)
  # reward and done must be column vectors: the critic outputs shape (batch, 1), and a
  # (batch,) operand would silently broadcast the target to (batch, batch)
  reward = torch.FloatTensor(reward).to(device).reshape(-1, 1)
  next_country = torch.FloatTensor(next_country).to(device)
  next_age = torch.FloatTensor(next_age).to(device)
  next_memory = torch.FloatTensor(next_memory).to(device)
  done = torch.FloatTensor(done).to(device).reshape(-1, 1)

  # policy loss: the actor is trained to maximise the critic's value of its own action
  state = state_representation(country, age, memory)
  policy_loss = value_network(state, policy_network(state))
  policy_loss = -policy_loss.mean()

  # temporal-difference target computed with the target networks
  next_state = state_representation(next_country, next_age, next_memory)
  next_action = target_policy_network(next_state)
  target_value = target_value_network(next_state, next_action.detach())
  expected_value = reward + (1.0 - done) * gamma * target_value
  expected_value = torch.clamp(expected_value, min_value, max_value)

  value = value_network(state, action)
  value_loss = value_criterion(value, expected_value.detach())

  # backpropagation through the actor and critic; the state representation collects
  # gradients from both losses and is stepped last
  state_representation_optimizer.zero_grad()
  policy_optimizer.zero_grad()
  policy_loss.backward(retain_graph=True)
  policy_optimizer.step()
  # clears the critic gradients produced by the policy loss
  value_optimizer.zero_grad()
  value_loss.backward(retain_graph=True)
  value_optimizer.step()
  state_representation_optimizer.step()

  # soft update: target <- (1 - soft_tau) * target + soft_tau * online
  for target_param, param in zip(target_value_network.parameters(), value_network.parameters()):
    target_param.data.copy_(target_param.data * (1.0 - soft_tau) + param.data * soft_tau)

  for target_param, param in zip(target_policy_network.parameters(), policy_network.parameters()):
    target_param.data.copy_(target_param.data * (1.0 - soft_tau) + param.data * soft_tau)