# DDPG
> Experiment list: Interleaving + Diff / Interleaving + Diff + Param Noise

In [1]:
import sys
# caution: path[0] is reserved for script path (or '' in REPL)
sys.path.insert(1, '../../interleaving')

import interleaving as interleaver

In [2]:
import pandas as pd
import numpy as np
import itertools
from tqdm.notebook import tqdm
import pickle

import torch
import torch.utils.data as data
from torch.utils.data import DataLoader
import torch.utils.data.sampler as sampler
import torch.nn as nn
import torch.nn.functional as F

from matplotlib import pyplot as plt
from collections import defaultdict
import warnings
from sklearn.preprocessing import LabelEncoder
import random
from datetime import datetime
import pytz
import line_profiler
from functools import reduce

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Fix Random Seeds

In [3]:
def same_seeds(seed):
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  
  np.random.seed(seed)  
  torch.backends.cudnn.benchmark = False
  torch.backends.cudnn.deterministic = True

same_seeds(0)

## Prepare Data

### Available Files

||Title|File Name|
|---|-----|---------|
|order|Stream group by streamers|`streamer_stream_dict.pkl`|
||Stream list|`stream_list.pkl`|
|Context|Final user context|`user_context.pkl`|
||Streamer context|`streamer.pkl`|
||Item context|`item_pca_df.pkl`|
|Aux|stream: items list|`stream_item_dict.pkl`|
||stream: users list|`stream_users_dict.pkl`|
||item list the consumer actually bought|`user_bought_dict.pkl`|
|Explore-Diff|PCA based diff vector|`diff_vectors_pca.pkl`|
||SVD based diff vector|`diff_vectors_svd.pkl`|
||VAE based diff vector|`diff_vectors_vae.pkl`|
|Threshold|VAE reconstruction error df|`vae_recons_df.pkl`|

In [4]:
# STREAMER_STREAM_DICT = pd.read_pickle('../../data/streamer_stream_dict.pkl')
STREAM_LIST = pd.read_pickle('../../data/stream_list.pkl')
USER_CONTEXT = pd.read_pickle('../../data/user_context.pkl')
# STREAMER = pd.read_pickle('../../data/streamer.pkl')
ITEM_PCA_DF = pd.read_pickle('../../data/item_pca_df.pkl')
STREAM_ITEM_DICT = pd.read_pickle('../../data/stream_item_dict.pkl')
USER_BOUGHT_DICT = pd.read_pickle('../../data/user_bought_dict.pkl')
STREAM_USER_DICT = pd.read_pickle('../../data/stream_users_dict.pkl')
# Current used diff
ITEM_DIFF = pd.read_pickle('../../data/item_diff_vectors.pkl')
STREAM_ORDER_CNT = pd.read_pickle('../../data/stream_order_cnt_list.pkl')

In [5]:
DIFF_PCA = pd.read_pickle('../../data/diff_vectors_pca.pkl')
# DIFF_SVD = pd.read_pickle('../../data/diff_vectors_svd.pkl')
# DIFF_VAE = pd.read_pickle('../../data/diff_vectors_vae.pkl')
# VAE_RECONS_DF = pd.read_pickle('../../data/vae_recons_df.pkl')
# VAE_RECONS64_DF = pd.read_pickle('../../data/vae_recons_64_df.pkl')
# MUL150_PCA = pd.read_pickle('../../data/mul150_pca_diff.pkl')
# MUL150_VAE = pd.read_pickle('../../data/mul150_vae_diff.pkl')
# MUL64_PCA = pd.read_pickle('../../data/mul64_pca_diff.pkl')
# MUL64_VAE = pd.read_pickle('../../data/mul64_vae_diff.pkl')

---

## Replay

In [6]:
class RingBuffer(object):
    def __init__(self, maxlen):
        self.maxlen = maxlen
        self.start = 0
        self.length = 0
        self.data = [None for _ in range(maxlen)]

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        if idx < 0 or idx >= self.length:
            raise KeyError()
        return self.data[(self.start + idx) % self.maxlen]

    def append(self, v):
        if self.length < self.maxlen:
            # We have space, simply increase the length.
            self.length += 1
        elif self.length == self.maxlen:
            # No space, "remove" the first item.
            self.start = (self.start + 1) % self.maxlen
        else:
            # This should never happen.
            raise RuntimeError()
        self.data[(self.start + self.length - 1) % self.maxlen] = v

class ReplayBuffer:
  def __init__(self, limit=10000):
    self.limit = limit
    # init 4 RingBuffers: states/actions/rewards/ongoing
    self.states_buffer = RingBuffer(limit)
    self.next_states_buffer = RingBuffer(limit)
    self.actions_buffer = RingBuffer(limit)
    self.rewards_buffer = RingBuffer(limit)
    self.ongoings_buffer = RingBuffer(limit)
    
  def __len__(self):
    assert len(self.states_buffer) == len(self.next_states_buffer) == len(self.actions_buffer) == \
           len(self.rewards_buffer) == len(self.ongoings_buffer)
    return len(self.states_buffer)

  def remember(self, state, actions, reward, next_state, ongoing):
    assert len(self.states_buffer) == len(self.next_states_buffer) == len(self.actions_buffer) == \
           len(self.rewards_buffer) == len(self.ongoings_buffer)
    # append each element separately in different list
    self.states_buffer.append(state)
    self.actions_buffer.append(actions)
    self.rewards_buffer.append(reward)
    self.next_states_buffer.append(next_state)
    self.ongoings_buffer.append(ongoing)
    
  def get_batch(self, batch_size=100):
    assert len(self.states_buffer) == len(self.next_states_buffer) == len(self.actions_buffer) == \
           len(self.rewards_buffer) == len(self.ongoings_buffer)
    # same for sample_and_split
    states0_batch = []
    states1_batch = []
    actions_batch = []
    rewards_batch = []
    ongoings_batch = []
    batch_indexs = np.random.randint(0, len(self.states_buffer), size=min(len(self.states_buffer), batch_size))
    for i in batch_indexs:
      states0_batch.append(USER_CONTEXT.xs(self.states_buffer[i]))
      states1_batch.append(USER_CONTEXT.xs(self.next_states_buffer[i]))
      actions_batch.append(self.actions_buffer[i])
      rewards_batch.append(self.rewards_buffer[i])
      ongoings_batch.append(self.ongoings_buffer[i])
    assert len(states0_batch) == len(states1_batch) == len(actions_batch) == \
           len(rewards_batch) == len(ongoings_batch)
    return np.array(states0_batch), np.array(actions_batch), np.array(rewards_batch), np.array(states1_batch), np.array(ongoings_batch)

## Random Process

In [7]:
class RandomProcess(object):
    def reset_states(self):
        pass

class AnnealedGaussianProcess(RandomProcess):
    def __init__(self, mu, sigma, sigma_min, n_steps_annealing):
        self.mu = mu
        self.sigma = sigma
        self.n_steps = 0

        if sigma_min is not None:
            self.m = -float(sigma - sigma_min) / float(n_steps_annealing)
            self.c = sigma
            self.sigma_min = sigma_min
        else:
            self.m = 0.
            self.c = sigma
            self.sigma_min = sigma

    @property
    def current_sigma(self):
        sigma = max(self.sigma_min, self.m * float(self.n_steps) + self.c)
        return sigma


# Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
class OrnsteinUhlenbeckProcess(AnnealedGaussianProcess):
    def __init__(self, theta, mu=0., sigma=1., dt=1e-2, x0=None, size=1, sigma_min=None, n_steps_annealing=1000):
        super(OrnsteinUhlenbeckProcess, self).__init__(mu=mu, sigma=sigma, sigma_min=sigma_min, n_steps_annealing=n_steps_annealing)
        self.theta = theta
        self.mu = mu
        self.dt = dt
        self.x0 = x0
        self.size = size
        self.reset_states()

    def sample(self):
        x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.current_sigma * np.sqrt(self.dt) * np.random.normal(size=self.size)
        self.x_prev = x
        self.n_steps += 1
        return x

    def reset_states(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros(self.size)

## Models

In [8]:
def fanin_init(size, fanin=None):
    fanin = fanin or size[0]
    v = 1. / np.sqrt(fanin)
    return torch.Tensor(size).uniform_(-v, v)

class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, mode, hidden1=400, hidden2=300, init_w=3e-3):
        super(Actor, self).__init__()
        self.mode = mode
        self.fc1 = nn.Linear(state_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, action_dim)
        self.norm1 = nn.LayerNorm(hidden1)
        self.norm2 = nn.LayerNorm(hidden2)
        self.norm3 = nn.LayerNorm(action_dim)
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        self.init_weights(init_w)
    
    def init_weights(self, init_w):
        self.fc1.weight.data = fanin_init(self.fc1.weight.data.size())
        self.fc2.weight.data = fanin_init(self.fc2.weight.data.size())
        self.fc3.weight.data.uniform_(-init_w, init_w)
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.norm1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.norm2(out)
        out = self.relu(out)
        out = self.fc3(out)
        out = self.norm3(out)
        out = self.tanh(out)
        return out

    def add_parameter_noise(self, scalar=.1):
        self.fc1.weight.data += torch.randn_like(self.fc1.weight.data) * scalar
        self.fc2.weight.data += torch.randn_like(self.fc2.weight.data) * scalar
        self.fc3.weight.data += torch.randn_like(self.fc3.weight.data) * scalar
        
    def noised_add_disturb(self, weight=.1):
        for curr_param in self.parameters():
            curr_param += weight * np.random.uniform(-1, 1) * curr_param
        
    def actor_add_noised(self, source_network, weight=.05):
        for curr_param in self.parameters():
            curr_param = curr_param + curr_param * weight

class Critic(nn.Module):
    def __init__(self, state_dim, action_dim, hidden1=400, hidden2=300, init_w=3e-3):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden1)
        self.fc2 = nn.Linear(hidden1+action_dim, hidden2)
        self.fc3 = nn.Linear(hidden2, 1)
        self.relu = nn.ReLU()
        self.init_weights(init_w)
    
    def init_weights(self, init_w):
        self.fc1.weight.data = fanin_init(self.fc1.weight.data.size())
        self.fc2.weight.data = fanin_init(self.fc2.weight.data.size())
        self.fc3.weight.data.uniform_(-init_w, init_w)
    
    def forward(self, xs):
        x, a = xs
        out = self.fc1(x)
        out = self.relu(out)
        # debug()
        out = self.fc2(torch.cat([out,a],1))
        out = self.relu(out)
        out = self.fc3(out)
        return out

## DDPG

In [9]:
def to_tensor(input_df):
  return torch.tensor(input_df).to(DEVICE).float()

class DDPG(object):
  def __init__(self, actor_lr, critic_lr, state_dim, action_dim, mode='baseline',
               hidden1=400, hidden2=300, diff_multiplier=1, noised_decay=.05,
               top_k=10, max_memory=10000, depsilon=50000, discount=0.99,
               tau=0.001, batch_size=100, ou_theta=0.15, ou_mu=0.0, ou_sigma=0.2, 
               param_noise_scalar=.05, param_noise_scalar_alpha=1.01, desired_distance=.7,
               major_update_interval=500, noise_type='disturb_noise'):
    # State & Action dimension
    self.state_dim = state_dim
    self.action_dim = action_dim
    
    # Mode settings
    # baseline
    # diff: diff_pca, diff_svd, diff_vae
    # threshold: thres_vae, thres_vdbe
    # multiplier: MUL150_PCA, MUL150_VAE, MUL64_PCA, MUL64_VAE
    # Parameter noise: PARAM_NOISE
    self.mode = mode
    self.diff_multiplier = diff_multiplier
    
    # Param Noise
    self.distances = []
    self.desired_distance = desired_distance
    self.param_noise_scalar = param_noise_scalar
    self.param_noise_scalar_alpha = param_noise_scalar_alpha

    # Init Actor
    self.actor = Actor(state_dim, action_dim, mode, hidden1, hidden2)
    self.actor_noised = Actor(state_dim, action_dim, mode, hidden1, hidden2)
    self.actor_target = Actor(state_dim, action_dim, mode, hidden1, hidden2)
    self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=actor_lr)
    # Init Critic
    self.critic = Critic(state_dim, action_dim, hidden1, hidden2)
    self.critic_target = Critic(state_dim, action_dim, hidden1, hidden2)
    self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=critic_lr)

    # Memory
    self.buffer = ReplayBuffer(limit=max_memory)

    # Exploration
    self.epsilon = 1.0
    self.depsilon = 1.0 / depsilon
    self.random_process = OrnsteinUhlenbeckProcess(size=action_dim, theta=ou_theta, mu=ou_mu, sigma=ou_sigma)

    # Reward related
    self.top_k = top_k
    
    # Loss Function
    self.loss_fn = nn.MSELoss()
    
    # Hyperparameters
    self.discount = discount
    self.tau = tau
    self.batch_size = batch_size
    self.noised_decay = noised_decay
    self.major_update_interval = major_update_interval
    self.noise_type = noise_type

    # Network to cuda
    self.actor.to(DEVICE)
    self.actor_noised.to(DEVICE)
    self.critic.to(DEVICE)
    self.actor_target.to(DEVICE)
    self.critic_target.to(DEVICE)
    
  def random_action(self):
    action = np.random.uniform(-1.,1.,self.action_dim).astype('float32')
    action_list = ITEM_PCA_DF.iloc[self.curr_stream_items].apply(lambda x: np.dot(x, action), axis=1).nlargest(self.top_k).index
    self.action_list = action_list
    return action
    
  def generate_latent_action(self, state, decay_epsilon=True, vdbe_epsilon=False):
    action = self.actor(to_tensor(state))
    '''
    Param Noise
    '''
    with torch.no_grad():
      self.actor_noised.load_state_dict(self.actor.state_dict().copy())
      if self.noise_type == 'disturb_noise':
        self.actor_noised.noised_add_disturb(self.param_noise_scalar)
      elif self.noise_type == 'param_noise':
        self.actor_noised.add_parameter_noise(self.param_noise_scalar)
      action_noised = self.actor_noised(to_tensor(state))

        # distance = torch.sqrt(torch.mean(torch.square(action - action_noised))).cpu().detach().numpy()
        # if distance < self.desired_distance:
        #     self.param_noise_scalar *= self.param_noise_scalar_alpha
        # else:
        #     self.param_noise_scalar /= self.param_noise_scalar_alpha
        # action = action_noised

      
    '''
    Add the diff vector to action
    '''
    current_diff = globals()[self.mode.upper()]
    # if state.name in current_diff.index:
      # for mode = [diff_pca, diff_svd, diff_vae, MUL150_VAE, MUL64_VAE, ITEM_DIFF]
      # action += to_tensor(current_diff.loc[state.name]) * self.diff_multiplier * max(self.epsilon, 0)
      # action_noised += to_tensor(current_diff.loc[state.name]) * self.diff_multiplier * max(self.epsilon, 0)

    # clamping
    action = torch.clamp(action, -1., 1.).cpu().detach().numpy()
    action_noised = torch.clamp(action_noised, -1., 1.).cpu().detach().numpy()
    
    action_list = ITEM_PCA_DF.iloc[self.curr_stream_items].apply(lambda x: np.dot(x, action), axis=1).nlargest(self.top_k).index
    action_noised_list = ITEM_PCA_DF.iloc[self.curr_stream_items].apply(lambda x: np.dot(x, action_noised), axis=1).nlargest(self.top_k).index
    self.interleaver = interleaver.Probabilistic([action_list, action_noised_list])
    # self.actions equals to rank in interleaving doc
    
    self.action_list = self.interleaver.interleave()
    
    if decay_epsilon:
      self.epsilon -= self.depsilon
        
    return action
  
  def get_actions_rewards(self):
    true_list = USER_BOUGHT_DICT[self.curr_user]
    hit_list, rewarded_items = [], set()
    for a in self.action_list:
      if a in true_list:
        hit_list.append(1)
        rewarded_items.add(a)
      else: hit_list.append(0)
    res = sum(hit_list)
    self.clicks = np.where(np.array(hit_list) == 1)[0]
    return res, rewarded_items
    
  def update_policy(self, major_update=True):
    # Calculate Interleaving results
    if len(self.clicks) > 0:
      result = self.interleaver.evaluate(self.action_list, self.clicks)
      # naive vs noised
      if len(result) > 0 and result[0][1] == 0: # means noised won
        self.actor.actor_add_noised(self.actor_noised, weight=self.noised_decay)
        if self.noised_decay == .05:
          self.noised_decay -= self.depsilon
      
    if not major_update: return 0.0, 0.0
  
    # Sample batch
    state_batch, action_batch, reward_batch, \
    next_state_batch, ongoing_batch = self.buffer.get_batch(self.batch_size)
        
    # Prepare for the target q batch
    next_q_values = self.critic_target([
        to_tensor(next_state_batch).detach(),
        self.actor_target(to_tensor(next_state_batch).detach()),
    ])
    
    # 如果 ongoing 為 True(1) 時，乘以 next_q 會有值
    # 但是 ongoing 為 False(0) 時，乘以 next_q 會等於 0
    target_q_batch = to_tensor(reward_batch).view(next_q_values.shape) + \
                       self.discount*to_tensor(ongoing_batch).view(next_q_values.shape)*next_q_values.detach()

    # Critic update
    self.critic_optim.zero_grad()

    q_batch = self.critic([ to_tensor(state_batch), to_tensor(action_batch) ])

    value_loss = self.loss_fn(q_batch, target_q_batch)
    value_loss.backward()
    self.critic_optim.step()

    # Actor update
    self.actor_optim.zero_grad()

    policy_loss = -self.critic([
        to_tensor(state_batch),
        self.actor(to_tensor(state_batch))
    ])

    policy_loss = policy_loss.mean()
    policy_loss.backward()
    self.actor_optim.step()

    # Target update
    self._soft_update(self.actor_target, self.actor)
    self._soft_update(self.critic_target, self.critic)
    return value_loss, policy_loss
    
  def _soft_update(self, target, source):
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - self.tau) + param.data * self.tau
        )

## Main Method

In [10]:
def train(agent, reward_records=[], avg_rewards=[], hit_ratios=[], regret_list=[], coverage_list=[], diversity_list=[], warmup=100, ep_len=4000):
  agent.actor.train(True)
  agent.critic.train(True)
  batch_value_loss, batch_policy_loss = 0., 0.
  step_value_loss, step_policy_loss = 0., 0.
  step = 0
  # self.epsilon.clear()

  # ------------------- Episode (State) -------------------------------
  for stream in tqdm(STREAM_LIST[:3300]):
    # self.__user_episode_context()
    users = STREAM_USER_DICT[stream]

    agent.curr_stream = stream
    agent.curr_stream_items = STREAM_ITEM_DICT[stream]
    batch_value_loss, batch_policy_loss = 0., 0.
    stream_reward = 0
    false_positive_cnt = 0
    diversity_prod_set = set()
    coverage_rewarded_items = set()

    # ----------------- Runs (User under stream) ---------------------
    for i, user in enumerate(users):
      # update step
      step += 1
      
      # state & ongoing
      agent.curr_user = user
      state = USER_CONTEXT.loc[(user, stream)]
      ongoing = i != len(users) - 1

      # --------------- Actor net choose actions ----------------------
      # make actor to choose action
      # action = latent vector
      if step <= warmup:
        action = agent.random_action()
      else:
        action = agent.generate_latent_action(state)
          
      # --------------- Get next state & info to store ---------------
      # Generate reward
      reward, curr_rewarded_items = agent.get_actions_rewards()

      # next_state: next user's state
      next_user = users[i + 1] if i + 1 < len(users) else None
      
      # update metrics
      reward_records.append(reward)
      stream_reward += reward
      false_positive_cnt += (10 - reward)
      coverage_rewarded_items = coverage_rewarded_items | curr_rewarded_items
      diversity_prod_set = diversity_prod_set | set(agent.action_list)
      
      # Remember only when reward > 0 or the memory is short
      if reward > 0 or len(agent.buffer) < 10:
        storing_state = (user, stream)
        storing_next_state = (next_user, stream) if next_user else storing_state
        agent.buffer.remember(storing_state, action, reward, storing_next_state, ongoing)

      # --------------- Train on Actor and Critic --------------------
      # store pre-training value for td_error
      # old_Q = self.q_value()
      if step > warmup :
        major_update = step < 120000 or (step % agent.major_update_interval == 0)
        step_value_loss, step_policy_loss = agent.update_policy(major_update)
      # store post-training value for td_error
      # new_Q = self.q_value()
      batch_value_loss += step_value_loss
      batch_policy_loss += step_policy_loss

      # --------------- Update with TD error -------------------------
      # self.epsilon.update_at_step(self.asid, [(new_Q - old_Q), self.learn_step_counter], 1/len(self.stream_items))

    # --------------- Inspecting result at step -------------------------
    ep_avg_reward = round(sum(reward_records)/step, 2)
    avg_rewards.append(ep_avg_reward)
    
    ground_truth = STREAM_ORDER_CNT.iloc[stream]
    # hit ratio
    hr10 = stream_reward / ground_truth
    hit_ratios.append(hr10)
    # regret
    regret = false_positive_cnt / len(users)
    regret_list.append(regret)
    # coverage
    coverage = len(coverage_rewarded_items) / ground_truth
    coverage_list.append(coverage)
    # diversity
    diversity = len(diversity_prod_set) / len(STREAM_ITEM_DICT[stream])
    diversity_list.append(diversity)
    
    print(f'Stream: {stream}, user_count: {len(users)}, avg_reward@stream: {round(stream_reward/len(users), 2)}, avg_reward: {ep_avg_reward}, \
hr10: {round(hr10, 3)}, regret: {round(regret, 3)}, coverage: {round(coverage, 3)}, diversity: {round(diversity, 3)}, \
batch_value_loss: {round(float(batch_value_loss)/len(users), 3)}')
    
    # if len(reward_records) >= 247581: break

In [15]:
pd.set_option('mode.chained_assignment', None)

actor_lr = 1.0e-3
critic_lr = 1.0e-3
state_dim = USER_CONTEXT.shape[1]
action_dim = ITEM_PCA_DF.shape[1]
hidden1 = 400
hidden2 = 300
diff_multiplier = 1.049391
noised_decay = 0.5 # decay: 0.5 / not_decay: 1
top_k = 10
max_memory = 10000
depsilon = 50000
discount = 0.99
tau = 0.001
batch_size = 100
ou_theta = 0.15
ou_mu = 0.0
ou_sigma = 0.2
param_noise_scalar = 0.05
param_noise_scalar_alpha = 1.01
desired_distance = 0.7
major_update_interval = 1 # major: 500 / not_major: 1
noise_type = 'disturb_noise' # baseline: 'disturb_noise' / 'param_noise'

mode = 'diff_pca'

warnings.simplefilter(action='ignore', category=FutureWarning)

## Execute

In [None]:
%%time
ddpg_agent = DDPG(actor_lr, critic_lr, state_dim, action_dim, mode, hidden1, hidden2, diff_multiplier, noised_decay,
                  top_k, max_memory, depsilon, discount, tau, batch_size, ou_theta, ou_mu, ou_sigma, 
                  param_noise_scalar, param_noise_scalar_alpha, desired_distance, major_update_interval, noise_type)
reward_records = []
avg_rewards = []
hit_ratios, regret_list, coverage_list, diversity_list = [], [], [], []
train(ddpg_agent, reward_records, avg_rewards, hit_ratios, regret_list, coverage_list, diversity_list)

  0%|          | 0/3300 [00:00<?, ?it/s]

Stream: 0, user_count: 56, avg_reward@stream: 0.46, avg_reward: 0.46, hr10: 0.347, regret: 9.536, coverage: 0.253, diversity: 1.0, batch_value_loss: 0.0
Stream: 1, user_count: 534, avg_reward@stream: 0.41, avg_reward: 0.41, hr10: 0.131, regret: 9.592, coverage: 0.032, diversity: 1.0, batch_value_loss: 0.06
Stream: 2, user_count: 56, avg_reward@stream: 1.46, avg_reward: 0.5, hr10: 0.812, regret: 8.536, coverage: 0.119, diversity: 0.75, batch_value_loss: 0.154
Stream: 3, user_count: 29, avg_reward@stream: 1.17, avg_reward: 0.53, hr10: 0.739, regret: 8.828, coverage: 0.217, diversity: 0.714, batch_value_loss: 0.134
Stream: 4, user_count: 104, avg_reward@stream: 0.25, avg_reward: 0.5, hr10: 0.171, regret: 9.75, coverage: 0.092, diversity: 0.355, batch_value_loss: 0.053
Stream: 5, user_count: 83, avg_reward@stream: 0.81, avg_reward: 0.53, hr10: 0.744, regret: 9.193, coverage: 0.133, diversity: 0.875, batch_value_loss: 0.037
Stream: 6, user_count: 37, avg_reward@stream: 0.38, avg_reward: 0.5

In [17]:
mode

'diff_pca'

---
## Save Result & Model

In [18]:
diff_items = {'mode': f'interleaving', 
              'reward_records': reward_records, 
              'avg_rewards': avg_rewards,
              'hit_ratios': hit_ratios,
              'regret_list': regret_list,
              'coverage_list': coverage_list,
              'diversity_list': diversity_list}

with open(f'results/interleaving.pkl', 'wb') as handle:
    pickle.dump(diff_items, handle, protocol=pickle.HIGHEST_PROTOCOL)

torch.save(ddpg_agent, f'models/interleaving.pth')

model = torch.load(f'models/interleaving.pth')

model.depsilon == ddpg_agent.depsilon

True

---
## profiling

In [12]:
%load_ext line_profiler

In [13]:
ddpg1 = DDPG(actor_lr, critic_lr, state_dim, action_dim, mode, hidden1, hidden2, diff_multiplier,
                  top_k, max_memory, depsilon, discount, tau, batch_size, ou_theta, ou_mu, ou_sigma, 
                  param_noise_scalar, param_noise_scalar_alpha, desired_distance)
reward_record1 = []
avg_reward1 = []

In [14]:
%lprun -f train train(ddpg1, reward_record1, avg_reward1, ep_len=3)

  0%|          | 0/3 [00:00<?, ?it/s]

Stream: 0, user_count: 56, sum_rewards: 25,avg_reward@stream: 0.45, avg_reward: 0.45, batch_value_loss: 0.0, batch_policy_loss: 0.0 | reward_len: 29
<class 'numpy.ndarray'> 1 [] 0
2 [(0, 1)] 1 (0, 1)
1 [(0, 1)] 1 (0, 1)
<class 'numpy.ndarray'> 3 [] 0
1 [(0, 1)] 1 (0, 1)
1 [(0, 1)] 1 (0, 1)
1 [(0, 1)] 1 (0, 1)
2 [(0, 1)] 1 (0, 1)
1 [(0, 1)] 1 (0, 1)
3 [(0, 1)] 1 (0, 1)
1 [(0, 1)] 1 (0, 1)
2 [(0, 1)] 1 (0, 1)
1 [(1, 0)] 1 (1, 0)
interleave_add_noised
2 [(0, 1)] 1 (0, 1)
1 [(0, 1)] 1 (0, 1)
2 [(0, 1)] 1 (0, 1)
1 [(0, 1)] 1 (0, 1)
<class 'numpy.ndarray'> 1 [] 0
1 [(0, 1)] 1 (0, 1)
2 [(0, 1)] 1 (0, 1)
1 [(0, 1)] 1 (0, 1)
1 [(0, 1)] 1 (0, 1)
1 [(0, 1)] 1 (0, 1)
2 [(0, 1)] 1 (0, 1)
1 [(0, 1)] 1 (0, 1)
1 [(0, 1)] 1 (0, 1)
1 [(0, 1)] 1 (0, 1)
1 [(0, 1)] 1 (0, 1)
2 [(0, 1)] 1 (0, 1)
2 [(0, 1)] 1 (0, 1)
2 [(0, 1)] 1 (0, 1)
1 [(0, 1)] 1 (0, 1)
2 [(0, 1)] 1 (0, 1)
1 [(0, 1)] 1 (0, 1)
1 [(0, 1)] 1 (0, 1)
2 [(0, 1)] 1 (0, 1)
1 [(0, 1)] 1 (0, 1)
1 [(0, 1)] 1 (0, 1)
1 [(0, 1)] 1 (0, 1)
1 [(0, 1)] 1 (0,

Timer unit: 1e-09 s

Total time: 43.1675 s
File: <ipython-input-10-8a7078876512>
Function: train at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def train(agent, reward_records=[], avg_rewards=[], warmup=100, ep_len=4000):
     2         1      41698.0  41698.0      0.0    agent.actor.train(True)
     3         1      22399.0  22399.0      0.0    agent.critic.train(True)
     4         1        179.0    179.0      0.0    batch_value_loss, batch_policy_loss = 0., 0.
     5         1        147.0    147.0      0.0    step_value_loss, step_policy_loss = 0., 0.
     6         1        204.0    204.0      0.0    step = 0
     7                                             # self.epsilon.clear()
     8                                           
     9                                             # ------------------- Episode (State) -------------------------------
    10         3   27740407.0 9246802.3      0.1    for 

1 [(0, 1)] 1 (0, 1)
3 [(0, 1)] 1 (0, 1)
Stream: 2, user_count: 56, sum_rewards: 370,avg_reward@stream: 1.27, avg_reward: 0.57, batch_value_loss: 0.091, batch_policy_loss: -2.08 | reward_len: 279


In [48]:
ddpg1.clicks.size

2

In [15]:
%lprun -f ddpg1.interleaver.compute_scores ddpg1.interleaver.compute_scores(ddpg1.action_list, ddpg1.clicks)

Timer unit: 1e-09 s

Total time: 0.0585661 s
File: ../../interleaving/interleaving/probabilistic.py
Function: compute_scores at line 138

Line #      Hits         Time  Per Hit   % Time  Line Contents
   138                                               @classmethod
   139                                               def compute_scores(cls, ranking, clicks, tau=3.0, n=10**4):
   140                                                   '''
   141                                                   ranking: an instance of Ranking
   142                                                   clicks: a list of indices clicked by a user
   143                                           
   144                                                   Return a list of scores of each ranker.
   145                                                   '''
   146         1        999.0    999.0      0.0          L = ranking
   147         1       8790.0   8790.0      0.0          C = {ranking[index] for index in cl

In [131]:
%lprun -f ddpg1.buffer.get_batch ddpg1.buffer.get_batch(ddpg1.batch_size)

Timer unit: 1e-09 s

Total time: 0.0471625 s
File: <ipython-input-120-a0bda88d93f5>
Function: get_batch at line 53

Line #      Hits         Time  Per Hit   % Time  Line Contents
    53                                             def get_batch(self, batch_size=100):
    54         1       3766.0   3766.0      0.0      assert len(self.states_buffer) == len(self.next_states_buffer) == len(self.actions_buffer) == \
    55         1        876.0    876.0      0.0             len(self.rewards_buffer) == len(self.ongoings_buffer)
    56                                               # same for sample_and_split
    57         1        198.0    198.0      0.0      states0_batch = []
    58         1        145.0    145.0      0.0      states1_batch = []
    59         1        131.0    131.0      0.0      actions_batch = []
    60         1        131.0    131.0      0.0      rewards_batch = []
    61         1        137.0    137.0      0.0      ongoings_batch = []
    62         1      56276.

In [15]:
a = [1, 2, 3, 4, 5] # Ranking 1
b = [4, 3, 5, 1, 2] # Ranking 2
method = interleaving.Probabilistic([a, b])
ranking = method.interleave()
ranking

[1, 2, 4, 3, 5]

In [16]:
clicks = np.uint32([1, 3]).astype(int).tolist()
result = method.evaluate(ranking, clicks)
result

compute
{0: 1.505927205808695, 1: 0.49407279419130523}


[(0, 1)]

In [17]:
type(clicks[0])

int

In [18]:
clicks

[1, 3]

In [19]:
clicks = [1, 3]
result = interleaving.Probabilistic.evaluate(ranking, clicks)
result

compute
{0: 1.505927205808695, 1: 0.49407279419130523}


[(0, 1)]

In [27]:
a = set([1, 2, 3])
b = set([1, 2, 4, 5, 3, 2])

In [31]:
a = a | b

In [32]:
a

{1, 2, 3, 4, 5}