# Running

# DQN Implementation

In [1]:
from preprocessing import USER_STREAM_CONTEXT, ITEM_DF, ITEM_STREAM_DF, REAL_BOUGHT_DF, LAST_BOUGHT_STREAM, LB_ITEMS, USER_LIST, LB_CE, STREAM_LIST
from utils import gen_exist_series, get_full_state, get_reward, calculate_interest_change, get_input, model_predict_top10, INPUT_DF_COL
from replay import ReplayBuffer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from matplotlib import pyplot as plt
import warnings
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import random
import itertools
from datetime import datetime
import pytz

In [2]:
# Report which GPU TensorFlow will use, if any.
# Query the device once; gpu_device_name() returns an empty string when no
# GPU is visible, so the result doubles as the truth test.
gpu_device = tf.test.gpu_device_name()
if gpu_device:
    # Build the message without a backslash continuation inside the literal,
    # which used to leak the source indentation into the printed text.
    print('Default GPU Device: {}'.format(gpu_device))
else:
    print("Please install GPU version of TF")

Default GPU Device:      /device:GPU:0


## Train DQN model
* Input: `user_df` 253, `item_df` 231(BERT: 768), interact (?), `reward` 1
* Output: recommend a list of items
* Methods Needed
    * Environment Function
    * Choose Action
    * Store Transition
    * Learn

In [3]:
### Training Process

def train(model, exp_replay, epochs, batch_size, num_episode=1000, verbose=1, reward_set='strict'):
  """Train the recommendation model with epsilon-greedy exploration and experience replay.

  Each epoch samples users from USER_LIST (one user = one episode) and walks
  through every stream that user has context for. For each stream a list of
  items is either sampled at random (explore) or predicted by the model
  (exploit), the reward is looked up, the transition is stored in
  ``exp_replay`` and the model is fitted on one replayed batch.

  Args:
    model: Model exposing ``train_on_batch(inputs, targets)``; it is also
      handed to ``model_predict_top10`` and ``exp_replay.get_batch``.
    exp_replay: Replay buffer exposing ``remember(...)`` and
      ``get_batch(model, batch_size=...)``.
    epochs: Number of passes; each pass draws a fresh sample of users.
    batch_size: Batch size requested from ``exp_replay.get_batch``.
    num_episode: Number of users sampled per epoch. Clamped to
      ``len(USER_LIST)`` so an oversized value no longer raises ValueError.
    verbose: When > 0, print a summary line at the end of each epoch.
    reward_set: Currently unused; kept for interface compatibility.

  Returns:
    list: ``win_cnt`` recorded at the end of each epoch. The counter is never
    reset between epochs, so the values are cumulative.
  """
  # Resolve the timezone once instead of on every log line.
  taipei_tz = pytz.timezone("Asia/Taipei")

  def _clock():
    # Wall-clock time (HH:MM:SS) in Asia/Taipei for progress logging.
    return datetime.now(taipei_tz).strftime("%H:%M:%S")

  total_episodes = len(USER_LIST)
  # random.sample raises ValueError when asked for more elements than the
  # population holds (the default num_episode=1000 may exceed the user count).
  episodes_per_epoch = min(num_episode, total_episodes)

  # Running count of runs with a positive total reward (cumulative over epochs).
  win_cnt = 0
  win_hist = []

  for e in range(epochs):
    loss = 0.
    # TODO/MAIN: Apply user preference changes as epsilon
    # Exploration rate decays with the inverse square root of the epoch.
    # It stays >= 1 while e <= 15, so those epochs always explore.
    epsilon = 4 / ((e + 1) ** (1 / 2))

    # Each sampled index into USER_LIST is one episode (one user).
    episodes = random.sample(range(total_episodes), episodes_per_epoch)

    print(f'Epoch {e} started.   Time: {_clock()}')
    # ------------------- Episode (User) -------------------------------
    for user_episode in episodes:
      # Fetch the episode data by the user's phone number.
      user_phone = USER_LIST[user_episode]
      user_all_streams = USER_STREAM_CONTEXT.xs(user_phone, level="聯絡電話")
      stream_list = user_all_streams.index
      final_stream = LAST_BOUGHT_STREAM.loc[user_phone, '場次']

      # ----------------- Runs (User x All_Stream) ---------------------
      for i, stream in enumerate(stream_list):
        # NOTE(review): the loop keeps going after game_over, and on the last
        # stream `i + 1` is requested unless that stream equals final_stream —
        # confirm get_full_state tolerates this.
        game_over = stream == final_stream

        # Full state: current_state = user_stream + item_stream.
        # Use the previous stream's record to predict the items that will be
        # bought in the next live stream.
        current_state = get_full_state(user_all_streams, stream_list, i)
        stream_items = current_state['cand_item']

        # --------------- Explore/Exploit Section ----------------------
        if np.random.rand() <= epsilon:
          # Explore: pick 10 random candidates, or all of them when the
          # stream offers 10 or fewer.
          sample_actions = random.sample(stream_items, 10) if len(stream_items) > 10 else stream_items
          action_ids = gen_exist_series(sample_actions, stream_items)
        else:
          # Exploit: take the model's predicted items.
          pred_actions = model_predict_top10(model, current_state)
          action_ids = gen_exist_series(pred_actions, stream_items)

        # --------------- Get next state & info to store ---------------
        reward = get_reward(user_phone, stream, action_ids)
        next_state = get_full_state(user_all_streams, stream_list, i+1) if not game_over else []

        # A run counts as a win when its total reward is positive.
        if sum(reward) > 0:
          win_cnt += 1

        # --------------- Calculating Interest Changes -----------------
        interest_score = calculate_interest_change(user_all_streams, stream_list, i)

        # --------------- Store Experience -----------------------------
        exp_replay.remember(interest_score,
                            [current_state, action_ids, reward, next_state],
                            game_over)

        # --------------- Load batch of experiences --------------------
        inputs, targets = exp_replay.get_batch(model, batch_size=batch_size)
        # Fit the model on the replayed batch and accumulate the epoch loss.
        batch_loss = model.train_on_batch(inputs, targets)
        loss += batch_loss

    if verbose > 0:
      print(f'Epoch: {e}/{epochs} | Loss {loss} | Win count {win_cnt} | Time {_clock()}')

    # Record the cumulative win count so progress can be inspected per epoch.
    win_hist.append(win_cnt)
  return win_hist

In [6]:
# --- Notebook-wide switches -------------------------------------------------
# Hide FutureWarning messages.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Turn off pandas' chained-assignment check.
pd.options.mode.chained_assignment = None

# --- Hyper-parameters -------------------------------------------------------
EPOCH = 50            # number of training epochs
NUM_EPISODE = 100     # episodes drawn per epoch
BATCH_SIZE = 5        # experiences replayed per training batch
MAX_MEMORY = 1000     # capacity of the replay memory
HIDDEN_SIZE = 512     # hidden layer size
TOTAL_ACTIONS = 1     # probability of ordering

### Main Method

In [None]:
# Replay memory holding at most MAX_MEMORY experiences.
exp_replay = ReplayBuffer(max_memory=MAX_MEMORY)

# Width of the model's input vector.
# NOTE(review): presumably the length of the feature vector the replay
# buffer feeds to the model — confirm against get_input / INPUT_DF_COL.
input_size = 473

# Two hidden layers (relu, then tanh) followed by a linear output layer with
# TOTAL_ACTIONS units, trained with Adam on mean squared error.
model = keras.Sequential([
    Dense(HIDDEN_SIZE, input_shape=(input_size,), activation='relu'),
    Dense(HIDDEN_SIZE, activation='tanh'),
    Dense(TOTAL_ACTIONS),
])
model.compile(optimizer=Adam(learning_rate=.000001), loss="mse")

# Run training and plot the win history it returns, one point per epoch.
train_kwargs = dict(
    epochs=EPOCH,
    batch_size=BATCH_SIZE,
    num_episode=NUM_EPISODE,
    verbose=1,
    reward_set='strict',
)
hist = train(model, exp_replay, **train_kwargs)
plt.plot(range(EPOCH), hist)

Epoch 0 started.   Time: 15:59:35


In [None]:
# Plot the win history returned by train(), one point per epoch.
# The x-axis is derived from len(hist) rather than a hard-coded 50, so the
# plot still works when the number of epochs changes.
plt.plot(range(len(hist)), hist)
plt.title('Cumulated hit at each epoch(buy > 25)')
plt.xlabel('Epoch')
plt.ylabel('Cumulated hit count')
plt.show()

In [None]:
# Turn the cumulative win history into per-epoch hit counts.
# hist_ holds the epoch-to-epoch differences: hist[k + 1] - hist[k].
hist_ = [later - earlier for earlier, later in zip(hist, hist[1:])]
# The first epoch has no predecessor, so its count is hist[0] itself.
# This replaces a hard-coded 105 that was only valid for one past run.
# Slicing with [:1] keeps the expression valid for an empty history.
result = hist[:1] + hist_

In [None]:
# Plot the hits gained in each individual epoch.
# The x-axis follows len(result) instead of a hard-coded 50, so it always
# matches the data.
plt.plot(range(len(result)), result)
plt.title('Hit at each epoch(buy > 25)')
plt.xlabel('Epoch')
plt.ylabel('Hit count')
plt.show()

In [None]:
# Display the per-epoch hit counts computed above.
result

In [None]:
# NOTE(review): appears to be the 50 per-epoch hit counts (`result`) pasted from an earlier run — confirm, and consider moving it to markdown.
[105, 110, 109, 106, 134, 109, 117, 126, 102, 147, 98, 123, 131, 108, 106, 109, 87, 92, 105, 88, 128, 108, 64, 116, 79, 85, 95, 74, 89, 89, 69, 81, 71, 88, 81, 91, 96, 70, 94, 63, 98, 49, 66, 71, 70, 52, 59, 93, 48, 54]


## Experiment Result
When `buy_threshold` = 9, the average `win_cnt` over the first 3 epochs is 77 (80/??/232).

In [None]:
# Display the win history returned by train().
hist