# Main Training

In [1]:
import pandas as pd
import numpy as np
import itertools
from tqdm.notebook import tqdm
import pickle

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from matplotlib import pyplot as plt
import warnings
from sklearn.preprocessing import LabelEncoder
import random
from datetime import datetime
import pytz

In [2]:
# Pre-computed artefacts, loaded from pickle files in the working directory.
# NOTE: unpickling can execute arbitrary code - only load files you produced yourself.

# DataFrame of per-(user, stream) context features; its index has a level named 'asid'.
CONTEXT_REPS = pd.read_pickle('final_context.pkl')
# Mapping: stream id -> items offered in that stream.
STREAM_ITEM_DICT = pd.read_pickle('stream_item_dict.pkl')
# DataFrame of item feature vectors, looked up by item via .loc.
# (file name suggests PCA-reduced BERT embeddings - TODO confirm)
BERT_BY_IDX_DF = pd.read_pickle('bert_by_idx_pca.pkl')
# Mapping: user id (asid) -> items that user bought.
BOUGHT_DICT = pd.read_pickle('bought_dict.pkl')

In [3]:
# Sanity check: sizes of the four loaded artefacts (displayed as the cell output).
CONTEXT_REPS.shape, len(STREAM_ITEM_DICT), BERT_BY_IDX_DF.shape, len(BOUGHT_DICT)

((1397141, 181), 7701, (162189, 160), 79207)

In [4]:
# One entry per user. get_level_values('asid') yields one value per row of
# CONTEXT_REPS (i.e. per user/stream pair), so without unique() the same user
# appears many times: sampling "without replacement" from that list could
# still pick a user twice and would favour users with many streams.
USER_LIST = CONTEXT_REPS.index.get_level_values('asid').unique().tolist()

In [5]:
'''
retrieve full state -> should be exported to pickle
'''
# Cold-start state: the '50%' row of CONTEXT_REPS.describe(), i.e. the
# per-column median of the context features. get_full_state() returns a copy
# of it when a user has no earlier stream to take a state from.
# NOTE(review): describe() computes every summary statistic just to read one
# row; the resulting Series is named '50%'.
USER_ALL_STREAM_INIT = CONTEXT_REPS.describe().loc['50%']

def get_full_state(asid, user_all_streams, stream_list, i):
  """Return the user-side state to use for the stream at position ``i``.

  The state is the context row of the *previous* stream, ``stream_list[i - 1]``.
  For the first stream (``i == 0``) there is no previous row, so a copy of the
  cold-start vector ``USER_ALL_STREAM_INIT`` is returned instead.

  asid: user id (currently not used by this function)
  user_all_streams: the user's context rows, looked up with ``.loc``
  stream_list: ordered stream labels indexing ``user_all_streams``
  i: position of the current stream within ``stream_list``
  return: pd.Series with the user part of the state
  """
  # Cold start (first stream the user attends). The CE paper initialises the
  # user-interest part with a random vector.
  # TODO init with random values
  previous_pos = i - 1
  if previous_pos == -1:
    return USER_ALL_STREAM_INIT.copy()

  # Every other position simply reuses the row of the preceding stream.
  return user_all_streams.loc[stream_list[previous_pos]]

In [6]:
# Labels of the 160 item-feature columns in a model-input row: 'i0' ... 'i159'.
LB_ITEMS = [f'i{x}' for x in range(160)]
# Labels of the user/context feature columns, taken from CONTEXT_REPS.
INPUT_DF_COL__USR = CONTEXT_REPS.columns.to_list()
# Full model-input column order: context columns first, then item columns.
INPUT_DF_COL = INPUT_DF_COL__USR + LB_ITEMS

def get_input(input_state):
  """Convert a state into the model-input format.

  Builds one row per item of the state's stream: the context values of
  ``input_state`` are repeated on every row and the item's feature vector is
  written into the ``LB_ITEMS`` columns.

  input_state: pd.Series of context features. ``input_state.name[1]`` is used
    as the key into STREAM_ITEM_DICT.
    NOTE(review): this presumes the Series name is an index tuple whose second
    element is the stream id - confirm against the callers.
  return: float32 DataFrame with columns INPUT_DF_COL and a 0..n-1 index, in
    the same order as the stream's item list
  """
  # Items of the stream and their feature vectors
  item_list = STREAM_ITEM_DICT[input_state.name[1]]
  item_feat = BERT_BY_IDX_DF.loc[item_list].reset_index(drop=True)

  # Repeat the context row once per item. DataFrame.append (used here before)
  # was removed in pandas 2.0, so the repeated block is built directly.
  context_rows = pd.DataFrame(
      np.tile(input_state.to_numpy(), (len(item_list), 1)),
      columns=input_state.index,
  )
  # Lay the columns out as INPUT_DF_COL; the item columns start out as NaN.
  stream_item_feat = context_rows.reindex(columns=INPUT_DF_COL)

  # Fill the item columns (paired with item_feat's columns by position)
  stream_item_feat[LB_ITEMS] = item_feat
  return stream_item_feat.astype('float32')


In [7]:
def gen_exist_series(A, B):
  """Flag, for every element of B, whether it also occurs in A.

  A: container supporting the ``in`` operator
  B: list of elements; also becomes the index of the result
  return: pd.Series of 0/1 flags indexed by B

  example:
    A: [1, 2, 4, 5]
    B: [1, 2, 3, 4, 5, 6, 7]
    return: Series([1, 1, 0, 1, 1, 0, 0], index=[1, 2, 3, 4, 5, 6, 7])
  """
  flags = []
  for candidate in B:
    flags.append(1 if candidate in A else 0)
  return pd.Series(flags, index=B)

In [8]:
def r(a, b):
  """Comparison function for the reward: bitwise AND of the two flags.

  As called from get_reward, ``a`` is the "user bought this item" flag and
  ``b`` the "item was recommended" flag. A stricter rule that would punish
  bought-but-not-recommended items (a == 1, b == 0) is deliberately not
  applied here.
  """
  return a & b


def get_reward(asid, stream, action_ids):
  """Per-item reward for one recommendation step.

  The reward is taken against ALL purchases recorded for the user in
  BOUGHT_DICT, not only those of the given stream (``stream`` is not used).

  asid: user id, key into BOUGHT_DICT
  stream: stream id (unused)
  action_ids: pd.Series of 0/1 recommendation flags indexed by item id
    (the original author was unsure whether this is a Series or a list; the
    code requires ``.index`` and ``.values``)
  return: pd.Series of rewards with the same index as ``action_ids``
  """
  candidate_items = action_ids.index
  bought_flags = gen_exist_series(BOUGHT_DICT[asid], candidate_items)

  rewards = list(map(r, bought_flags.values, action_ids.values))
  return pd.Series(rewards, index=candidate_items)

In [9]:
def model_predict_top10(model, input_state):
  """Return the ids of the (up to) 10 items the model scores highest.

  model: object whose ``predict`` returns one score per input row as a 2-D
    array (first column is used)
  input_state: pd.Series state, passed to get_input()
  return: numpy array of item ids, best score first
  """
  # One model-input row per item of the stream (already float32)
  full_input = get_input(input_state)

  # get_input() emits its rows in the order of this list, so a row position
  # maps straight back to an item id. The previous code read an 'i_item_id'
  # column that get_input() never creates, which raised a KeyError.
  # NOTE(review): uses the same stream lookup as get_input - confirm that
  # input_state.name[1] really is the stream id.
  item_list = STREAM_ITEM_DICT[input_state.name[1]]

  # Score every row and keep the positions of the 10 largest scores
  scores = pd.Series(np.asarray(model.predict(full_input))[:, 0])
  top_rows = scores.nlargest(10).index.to_numpy()
  return np.asarray(item_list)[top_rows]

## Replay

In [10]:
### Collecting Training Data
import pandas as pd
import numpy as np

class ReplayBuffer:
  """Bounded store of experiences plus the sampling routine that turns them
  into a training batch.

  Each memory entry is a list
    [interest_score, [state, action, reward, next_state], game_over]
  where game_over tells whether the episode ended at that step.
  ``interest_score`` is stored but not used by get_batch().
  """

  def __init__(self, max_memory=100000, discount=.9):
    """
    max_memory: the maximum number of experiences we want to store
    discount: the discount factor for future experience
    """
    self.max_memory = max_memory
    self.memory = list()
    self.discount = discount

  def remember(self, interest_score, states, game_over):
    """Save one experience; drop the oldest one when over capacity."""
    self.memory.append([interest_score, states, game_over])
    if len(self.memory) > self.max_memory:
      del self.memory[0]

  def get_batch(self, model, batch_size=10):
    """Sample experiences and build (inputs, targets) for model training.

    Up to ``batch_size`` experiences are drawn at random (with replacement).
    Every sampled experience contributes one input row per item of its state
    (see get_input) and one target value per row:
      * terminal step: the observed reward of each item;
      * otherwise: the model's current prediction (so the row yields no
        gradient), except for the recommended items (action == 1), whose
        target is  r + discount * max Q(s', a').

    model: object with a ``predict`` method returning one value per input row
    batch_size: number of experiences to draw
    return: (inputs, targets) - a float32 DataFrame with columns INPUT_DF_COL
      and a single-column (label 0) float32 DataFrame with matching rows.
      Both are empty when nothing can be sampled.
    raises ValueError: when an experience's action/reward vector does not have
      one entry per input row of its state.
    """
    len_memory = len(self.memory)
    sample_size = min(len_memory, batch_size)
    if sample_size <= 0:
      # Nothing stored (or nothing requested): return empty frames instead of
      # failing inside np.random.randint.
      return pd.DataFrame(columns=INPUT_DF_COL), pd.DataFrame(columns=[0])

    # Collect the pieces and concatenate once at the end (growing a DataFrame
    # with concat inside the loop is quadratic).
    input_frames = []
    target_chunks = []

    # We draw states to learn from randomly
    for idx in np.random.randint(0, len_memory, size=sample_size):
      # Load one transition <s, a, r, s'> from memory
      state_t, action_t, reward_t, state_tp1 = self.memory[idx][1]
      game_over = self.memory[idx][2]

      # One input row per item of the state's stream
      state_input = get_input(state_t).astype('float32')

      # Actions/rewards are indexed by item id while the input rows are
      # numbered 0..n-1, so they are matched by position, not by label.
      rewards = np.asarray(reward_t, dtype='float32')
      chosen = np.asarray(action_t) == 1
      if len(rewards) != len(state_input) or len(chosen) != len(state_input):
        raise ValueError(
            f'experience {idx}: {len(state_input)} input rows but '
            f'{len(chosen)} actions / {len(rewards)} rewards')

      if game_over:
        # If the game ended, the reward is the final reward
        target = rewards
      else:
        # Start from the model's own predictions; these rows are not affected
        # by training (their loss is 0).
        target = np.array(model.predict(state_input), dtype='float32').reshape(-1)

        # max over ALL items of the next state (previously only the first
        # row's prediction was considered)
        next_input = get_input(state_tp1)
        q_next = float(np.max(model.predict(next_input))) if len(next_input) else 0.0

        # r + gamma * max Q(s', a') for the recommended items
        target[chosen] = rewards[chosen] + self.discount * q_next

      input_frames.append(state_input)
      target_chunks.append(target)

    inputs = pd.concat(input_frames, axis=0)
    targets = pd.DataFrame({0: np.concatenate(target_chunks)}, index=inputs.index).astype('float32')
    return inputs, targets

## Train

In [16]:
def train(model, exp_replay, epochs, batch_size, num_episode=1000, verbose=1, reward_set='strict', hist=None, c_hist=None, rec_list=None):
  """Run the training loop: epochs x sampled users x every stream of the user.

  For each stream of each sampled user, 10 items (or all items if the stream
  has 10 or fewer) are recommended - at random with probability epsilon,
  otherwise from the model - the reward is computed, the experience is stored
  in ``exp_replay`` and the model is trained on one sampled batch.

  model: model exposing ``predict`` and ``train_on_batch``
  exp_replay: buffer exposing ``remember`` and ``get_batch``
  epochs: number of epochs
  batch_size: number of experiences sampled per training step
  num_episode: number of users (episodes) sampled per epoch
  verbose: print a summary line per epoch when > 0
  reward_set: currently unused
  hist, c_hist, rec_list: optional lists that receive, per epoch, the win
    count, the cumulative win count and the number of recommendations.
    Fresh lists are created when omitted (the former ``[]`` defaults were
    shared between calls).
  return: (hist, c_hist, rec_list)

  Two leftover debugging ``break`` statements were removed: they stopped the
  loop after the first stream of each user and after the first epoch, so
  ``epochs`` had no effect and only one history entry was ever recorded.
  """
  hist = [] if hist is None else hist
  c_hist = [] if c_hist is None else c_hist
  rec_list = [] if rec_list is None else rec_list

  # Cumulative win counter over all epochs
  c_win_cnt = 0

  for e in range(epochs):
    rec_cnt = 0
    win_cnt = 0
    loss = 0.
    # TODO/MAIN: Apply user preference changes as epsilon
    # epsilon for exploration - dependent inversely on the training epoch.
    # It stays >= 1 (pure exploration) for the first 16 epochs.
    epsilon = 4 / ((e + 1) ** (1 / 2))

    # Each sampled user represents one episode
    episodes = np.random.choice(USER_LIST, num_episode, replace=False)

    print(f'Epoch {e} started.   Time: {datetime.now(pytz.timezone("Asia/Taipei")).strftime("%H:%M:%S")}')
    # ------------------- Episode (User) -------------------------------
    for asid in episodes:
      # get [episode data, stream list, final stream] by asid
      user_all_streams = CONTEXT_REPS.xs(asid, level="asid")
      stream_list = user_all_streams.index
      final_stream = max(stream_list)

      # ----------------- Runs (User x All_Stream) ---------------------
      for i, stream in enumerate(stream_list):
        game_over = stream == final_stream

        # The state comes from the previous stream and is used to predict
        # what will be bought in this one.
        current_state = get_full_state(asid, user_all_streams, stream_list, i)
        stream_items = STREAM_ITEM_DICT[stream]

        # --------------- Explore/Exploit Section ----------------------
        if np.random.rand() <= epsilon:
          # Explore: randomly pick 10 items of the stream (all if <= 10)
          sample_actions = random.sample(stream_items, 10) if len(stream_items) > 10 else stream_items
          action_ids = gen_exist_series(sample_actions, stream_items)
        else:
          # Exploit: choose the actions from the model's prediction
          pred_actions = model_predict_top10(model, current_state)
          action_ids = gen_exist_series(pred_actions, stream_items)

        # --------------- Get next state & info to store ---------------
        reward = get_reward(asid, stream, action_ids)
        next_state = get_full_state(asid, user_all_streams, stream_list, i+1) if not game_over else []

        rec_cnt += 1
        if sum(reward) > 0:
          c_win_cnt += 1
          win_cnt += 1

        # --------------- Calculating Interest Changes -----------------
        # NOTE(review): calculate_interest_change is not defined in any cell
        # shown in this notebook; it must exist before train() is called.
        interest_score = calculate_interest_change(user_all_streams, stream_list, i)

        # --------------- Store Experience -----------------------------
        exp_replay.remember(interest_score,
                            [current_state, action_ids, reward, next_state],
                            game_over)

        # --------------- Load batch of experiences --------------------
        inputs, targets = exp_replay.get_batch(model, batch_size=batch_size)
        # train model on experiences
        batch_loss = model.train_on_batch(inputs, targets)
        loss += batch_loss

    if verbose > 0:
      print(f'Epoch: {e}/{epochs} | Loss {loss} | Win count {win_cnt} | Rec count {rec_cnt} | Time {datetime.now(pytz.timezone("Asia/Taipei")).strftime("%H:%M:%S")}')

    # Track win history to later check if our model is improving over time.
    hist.append(win_cnt)
    c_hist.append(c_win_cnt)
    rec_list.append(rec_cnt)

  return hist, c_hist, rec_list

In [12]:
# Silence pandas' SettingWithCopy (chained assignment) warning for this notebook.
pd.set_option('mode.chained_assignment', None)

# parameters
MAX_MEMORY = 1000  # Maximum number of experiences we are storing
BATCH_SIZE = 2  # Number of experiences we use for training per batch
EPOCH = 50  # Number of training epochs passed to train()
TOTAL_ACTIONS = 1 # probability of ordering (width of the model's output layer)
NUM_EPISODE = 100  # Number of users (episodes) sampled per epoch
HIDDEN_SIZE = 512  # Units in each hidden Dense layer


# Hide FutureWarning messages (e.g. pandas deprecation notices).
warnings.simplefilter(action='ignore', category=FutureWarning)

### Main Method

In [17]:
exp_replay = ReplayBuffer(max_memory=MAX_MEMORY)

# Our model's architecture parameters.
# The network is fed the rows built by get_input(), whose columns are exactly
# INPUT_DF_COL, so the input width is derived from that list. The previous
# hard-coded 473 did not match it.
input_size = len(INPUT_DF_COL)

# Setting up the model with keras: two hidden layers and one output value
# (the score of a single context/item row).
model = keras.Sequential()
model.add(Dense(HIDDEN_SIZE, input_shape=(input_size,), activation='relu'))
model.add(Dense(HIDDEN_SIZE, activation='tanh'))
model.add(Dense(TOTAL_ACTIONS))
model.compile(Adam(learning_rate=.000001), "mse")

# Per-epoch histories, filled in place by train()
hist = []
c_hist = []
rec_list = []

# Training the model
train(model,
      exp_replay,
      epochs=EPOCH,
      batch_size=BATCH_SIZE,
      num_episode=NUM_EPISODE,
      verbose=1,
      reward_set='strict',
      hist=hist,
      c_hist=c_hist,
      rec_list=rec_list)

# Plot against the number of epochs actually recorded, so the cell also works
# when fewer than EPOCH epochs were run.
plt.plot(range(len(hist)), hist)
plt.xlabel('epoch')
plt.ylabel('win count')
plt.title('Wins per epoch');

Epoch 0 started.   Time: 15:27:30


NameError: name 'interest_score' is not defined