In [None]:
from google.colab import drive

drive.mount('/content/drive')

In [None]:
!pip uninstall tensorflow -y
!pip install -q tensorflow==2.11.0

In [None]:
# https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2
!pip install -U -q sentence-transformers

In [4]:
import pandas as pd
import numpy as np
import re
import random

import tensorflow as tf
from tensorflow.keras import models, Model, layers, regularizers

import time
from datetime import datetime
from dateutil.relativedelta import relativedelta
import math
import gc

In [5]:
import warnings
warnings.filterwarnings("ignore")

# 讀入資料

In [6]:
def read_raw_data():
  """Assemble the order-level dataset and save it as 'raw_data.csv'.

  Reads the order extract, the member table and the concert category table
  from Google Drive, derives the date-based columns, joins the three tables
  and adds the positive target column `y = 1`.

  Returns:
    The name of the CSV file the merged data was written to.
  """
  def days_to_group(days):
    # Same day: 0; 1-7 days: 1; 8-14: 2; 15-30: 3; 31-60: 4; more than 60: 5.
    # (Group 6 is reserved for "not purchased" rows created later.)
    if days == 0:
      return 0
    if days <= 7:
      return 1
    if days <= 14:
      return 2
    if days <= 30:
      return 3
    if days <= 60:
      return 4
    return 5

  orders = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/extract_data_221123.csv')

  # Parse the date-related columns (yyyy-mm-dd hh:mm:ss)
  orders['order_time'] = pd.DatetimeIndex(orders['訂單日期'])
  orders['first_order'] = pd.DatetimeIndex(orders['first_order'])
  orders['concert_time'] = pd.DatetimeIndex(orders['concert_time'])

  # Derived columns: day of week of the concert, and how many days before
  # the concert the order was placed (raw and bucketed)
  orders['dow'] = orders['concert_time'].dt.day_name()
  orders['order_concert'] = (orders['concert_time'] - orders['order_time']).dt.days
  orders['order_concert_group'] = orders['order_concert'].apply(days_to_group)

  orders = orders.rename(columns = {'訂購人會員代碼' : 'userID',
                                    '場次代碼' : 'itemID'})

  ## ===== user data ===== ##
  # 81 users have no member record.
  # Age and gender are not used by the final model; only `bd` is carried along.
  members = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/raw_member_data_210827.csv')
  members['bd'] = pd.DatetimeIndex(members['生日'])
  orders = orders.merge(members, left_on = 'userID', right_on = '會員代碼')

  ## ===== category ===== ##
  # Concert category: domestic/international flag and genre.
  # Solo instrumental / solo vocal are merged into one class, and the small
  # genres are folded into the "other music programmes" class.
  merged_labels = {'獨奏' : '獨唱獨奏',
                   '獨唱' : '獨唱獨奏',
                   '大陸音樂團體' : '其他音樂節目',
                   '打擊樂' : '其他音樂節目',
                   '歌劇' : '其他音樂節目',
                   '爵士樂' : '其他音樂節目',
                   '管絃樂' : '其他音樂節目'}
  cat = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/category.csv')
  cat['category'] = cat['category'].replace(merged_labels)
  cat = cat.rename(columns = {'場次代碼' : 'itemID'})

  orders = orders.merge(cat, on = 'itemID')

  # Target column: every real order is a positive example
  orders['y'] = 1

  # Persist to disk so the frame does not have to stay in memory
  file_name = 'raw_data.csv'
  orders.to_csv(file_name, index = False, encoding='utf8')

  return file_name

## dictionaries

In [11]:
# 各類別變數字典
def dic_generate(raw_data_file_name):
  """Build per-concert lookup dictionaries from the merged raw data file.

  Args:
    raw_data_file_name: path of the CSV written by `read_raw_data`.

  Returns:
    (dow_dict, inter_dict, cat_dict, concert_time_dict). The first three map
    itemID -> [value]; concert_time_dict maps itemID -> {'concert_time': value}.
  """
  frame = pd.read_csv(raw_data_file_name)
  for date_col in ('bd', 'concert_time'):
    frame[date_col] = pd.DatetimeIndex(frame[date_col])

  def per_item(column, orient):
    # One column per itemID after the transpose, so to_dict is keyed by itemID
    pairs = frame[['itemID', column]].drop_duplicates()
    return pairs.set_index('itemID').T.to_dict(orient)

  dow_dict = per_item('dow', 'list')
  inter_dict = per_item('inter', 'list')
  cat_dict = per_item('category', 'list')
  concert_time_dict = per_item('concert_time', 'dict')

  return dow_dict, inter_dict, cat_dict, concert_time_dict


In [13]:
# Build raw_data.csv from the source files
raw_data_file_name = read_raw_data()
# Build the per-item lookup dictionaries
dow_dict, inter_dict, cat_dict, concert_time_dict = dic_generate(raw_data_file_name)

## title embedding

In [14]:
# https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2

from sentence_transformers import SentenceTransformer

def title_embedding(raw_data_file_name):
  """Encode every distinct (title, itemID) pair with a multilingual sentence encoder.

  Args:
    raw_data_file_name: path of the CSV written by `read_raw_data`.

  Returns:
    (title_enc, title, title_enc_dict): the encoder output for the titles,
    the frame of distinct titles with their `title_idx` row number, and a
    dictionary mapping itemID -> [title_idx].
  """
  frame = pd.read_csv(raw_data_file_name)
  frame['title'] = frame['產品名稱']

  title = frame[['title', 'itemID']].drop_duplicates().reset_index(drop = True)

  encoder = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')
  title_enc = encoder.encode(title['title'])

  # title_idx is the row position of each title inside title_enc
  title['title_idx'] = range(len(title))
  title_enc_dict = {item_id : [row] for item_id, row in zip(title['itemID'], title['title_idx'])}

  return title_enc, title, title_enc_dict

In [None]:
# Encode the concert titles and build the itemID -> title_idx lookup
title_enc, title, title_enc_dict = title_embedding(raw_data_file_name)

# train set

## expand

製作「未購買」紀錄：

挑選訓練時間前18個月內下架的節目，為訓練集內的使用者製作「未購買」的資料。每位使用者的「未購買」資料筆數上限設為訓練集筆數的1/500；若可選節目數不足此上限，則全部保留。

In [17]:
def train_filter(train_set):
  """Keep only rows whose item AND user both appear more than once.

  Removing single-purchase users can leave an item with a single row (and
  vice versa), so the two filters are repeated until a full pass removes
  nothing. An empty frame is returned unchanged instead of raising.

  Args:
    train_set: DataFrame with at least the columns 'itemID' and 'userID'.

  Returns:
    The filtered DataFrame (a subset of the input rows, original index kept).
  """
  while True:
    rows_before = len(train_set)

    # Drop items that occur only once
    item_count = train_set['itemID'].value_counts()
    train_set = train_set[train_set['itemID'].isin(item_count.index[item_count > 1])]

    # Drop users that occur only once among the remaining rows
    user_count = train_set['userID'].value_counts()
    train_set = train_set[train_set['userID'].isin(user_count.index[user_count > 1])]

    # Fixed point reached: this pass removed nothing, so every remaining
    # item and user has more than one row
    if len(train_set) == rows_before:
      return train_set

In [18]:
# Start of the first training window; train_expand(d) moves the cutoff forward by d * 15 days.
t = datetime(2019, 1, 1)

def train_expand(d):
  """Build the training set for window `d`, including sampled "not purchased" rows.

  Reads the file named by the global `raw_data_file_name`, keeps the orders
  placed before the cutoff `t + 15 * d days`, filters them with
  `train_filter`, and adds negative examples (y = 0, order_concert_group = 6):
  for every user, at most len(train_set) / 500 concerts that took place in the
  18 months before the cutoff and that the user did not buy. The combined
  data is shuffled and written to 'train.csv'.

  Args:
    d: index of the training window.

  Returns:
    (train_time, train_path, avail_event, recent_group_dict):
      train_time: the cutoff datetime of this window.
      train_path: path of the CSV holding the training rows.
      avail_event: itemIDs in the training set whose concert is after the
        cutoff and whose `first_order` is on or before the cutoff.
      recent_group_dict: userID -> [recency group of the user's latest order].
  """
  raw_data = pd.read_csv(raw_data_file_name)
  # Parse the date-related columns (yyyy-mm-dd hh:mm:ss)
  raw_data['order_time'] = pd.DatetimeIndex(raw_data['訂單日期'])
  raw_data['first_order'] = pd.DatetimeIndex(raw_data['first_order'])
  raw_data['concert_time'] = pd.DatetimeIndex(raw_data['concert_time'])

  train_time = t + relativedelta(days = d*15)
  train_set = raw_data[raw_data['order_time'] < train_time]

  # Keep only users/items with more than one record (removes roughly 20,000 rows).
  # Copy so the column assignments below do not write into a slice of raw_data.
  train_set = train_filter(train_set).copy()

  # Days since each user's most recent order
  recent = train_set[['userID', 'order_time']].sort_values(by = 'order_time', ascending = False).drop_duplicates(subset='userID')
  recent['order_time'] = (train_time - recent['order_time']).dt.days

  # Recency groups
  # 0: 0~7 / 1: 8~14 / 2: 15~21 / 3: 22~31/ 4: 31~60 / 5: 61~90 / 6: 91~180 / 7: 181~365 / 8: 366~730 / 9: 731~1095 / 10: 1096+
  recent['recent_group'] = recent['order_time'].apply(lambda x : 0 if x <=7 else
                                                      (1 if x <= 14 else
                                                      2 if x <= 21 else
                                                      3 if x <= 31 else
                                                      4 if x <= 60 else
                                                      5 if x <= 90 else
                                                      6 if x <= 180 else
                                                      7 if x <= 365 else
                                                      8 if x <= 730 else
                                                      9 if x <= 1095 else
                                                      10))
  recent_group_dict = recent[['userID', 'recent_group']].set_index('userID').T.to_dict('list')

  # Days between the order and the concert, bucketed
  # same day: 0; 1~7 days: 1; 8~14: 2; 15~30: 3; 31~60: 4; more than 60: 5; not purchased: 6
  train_set['order_concert'] = (train_set['concert_time'] - train_set['order_time']).dt.days
  train_set['order_concert_group'] = train_set['order_concert'].apply(lambda x : 0 if x == 0 else
                                                                      1 if x <= 7 else
                                                                      2 if x <= 14 else
                                                                      3 if x <= 30 else
                                                                      4 if x <= 60 else 5)

  train_set = train_set.drop_duplicates(subset = ['userID', 'itemID'])

  # Concerts from the training set that are still on sale after train_time
  avail_event = train_set[(train_set['concert_time'] > train_time) & (train_set['first_order'] <= train_time)]['itemID'].unique()

  # Negative examples are drawn from concerts held in the last 18 months
  train_expand_time = train_time - relativedelta(months = 18)

  expand_users = train_set['userID'].unique()
  expand_items = list(train_set[(train_set['concert_time'] < train_time) & (train_set['concert_time'] >= train_expand_time)]['itemID'].unique())

  sample_size = int(len(train_set)/500)

  # Items each user already bought, computed once instead of scanning
  # train_set again for every user
  bought_by_user = train_set.groupby('userID')['itemID'].agg(set).to_dict()

  total_expand = []
  for u in expand_users:
    bought = bought_by_user.get(u, set())
    candidates = [i for i in expand_items if i not in bought]

    # Sample when there are more candidates than sample_size, otherwise keep
    # them all. `candidates` is always a list: random.sample requires a
    # sequence (sets are rejected on Python 3.11+), and a list can be
    # extended regardless of which branch was taken.
    if len(candidates) > sample_size:
      candidates = random.sample(candidates, sample_size)

    total_expand.extend((u, i) for i in candidates)

  # Assemble the negative rows
  expand_train = pd.DataFrame(total_expand, columns = ['userID', 'itemID'])
  expand_train['dow'] = expand_train['itemID'].apply(lambda x: dow_dict[x][0])
  expand_train['inter'] = expand_train['itemID'].apply(lambda x: inter_dict[x][0])
  expand_train['category'] = expand_train['itemID'].apply(lambda x: cat_dict[x][0])
  expand_train['order_concert_group'] = 6
  expand_train['y'] = 0

  # Positive + negative rows
  train_set = pd.concat([train_set, expand_train])

  # Attach each user's recency group to every row
  train_set['recent_group'] = train_set['userID'].apply(lambda x : recent_group_dict[x][0])

  # Keep only the columns the model needs
  train_set = train_set[['userID', 'itemID', 'dow', 'inter',
                         'category', 'recent_group', 'order_concert_group',
                         'y']]

  train_set = train_set.sample(frac = 1).reset_index(drop = True)

  train_path = 'train.csv'
  train_set.to_csv(train_path, index = False)

  return train_time, train_path, avail_event, recent_group_dict

## 編流水號

In [19]:
def make_idx(file_name):
  """Assign consecutive integer indices to users and items and store them in the file.

  The CSV at `file_name` is read, gains the columns `user_idx` and
  `item_idx`, and is written back to the same path.

  Args:
    file_name: path of the training CSV produced by `train_expand`.

  Returns:
    (user_id_idx, event_id_idx, feature_dict): userID -> [user_idx],
    itemID -> [item_idx], and the distinct values of every categorical
    column (used as vocabularies by the model).
  """
  train_data = pd.read_csv(file_name)

  def serial_numbers(id_column, idx_column):
    # Distinct ids in sorted order, numbered 0..n-1
    ids = train_data[id_column].drop_duplicates().sort_values().to_frame()
    ids[idx_column] = range(0, len(ids))
    return ids.set_index(id_column).T.to_dict('list')

  event_id_idx = serial_numbers('itemID', 'item_idx')
  user_id_idx = serial_numbers('userID', 'user_idx')

  # Attach the indices to the training rows
  train_data['user_idx'] = train_data['userID'].map(lambda key : user_id_idx[key][0])
  train_data['item_idx'] = train_data['itemID'].map(lambda key : event_id_idx[key][0])

  # Vocabularies of the categorical columns
  vocab_columns = ('userID', 'itemID', 'dow', 'inter', 'category',
                   'recent_group', 'order_concert_group')
  feature_dict = {column : train_data[column].unique() for column in vocab_columns}

  train_data.to_csv(file_name, index = False)

  return user_id_idx, event_id_idx, feature_dict

## tensorflow dataset

In [20]:
def data_interaction(train_path):
  """Turn the training CSV into a tf.data dataset of (model inputs, label) pairs.

  Rows are grouped by every id/feature column and `y` is summed, so repeated
  orders by one user for one concert collapse into a single row unless they
  fall into different `order_concert_group` buckets.

  Args:
    train_path: path of the training CSV (already indexed by `make_idx`).

  Returns:
    A zipped tf.data.Dataset yielding (feature dict, label). The feature
    dict is keyed by the model's input layer names plus 'userID' and 'itemID'.
  """
  key_columns = ['userID', 'user_idx', 'itemID', 'item_idx',
                 'dow', 'inter', 'category', 'recent_group', 'order_concert_group']
  grouped = pd.read_csv(train_path).groupby(key_columns)['y'].sum().reset_index()

  labels = tf.data.Dataset.from_tensor_slices(grouped['y'])

  # Title encodings, aligned row by row with `grouped`
  title_rows = grouped['itemID'].apply(lambda x : title_enc_dict[x][0])
  title_ds = tf.data.Dataset.from_tensor_slices({'title_enc' : title_enc[title_rows]})

  # One array per column, e.g. {'userID': array([...]), 'itemID': array([...]), ...}
  columns = {name: np.array(value) for name, value in grouped.items()}
  features = tf.data.Dataset.from_tensor_slices(columns)

  def to_model_inputs(record, title_record):
    # Keys are the names of the model's input layers
    return {
        'userID' : record['userID'],
        'itemID' : record['itemID'],
        'users_in' : record['user_idx'],
        'recent_group_in' : record['recent_group'],
        'order_concert_group_in' : record['order_concert_group'],
        'items_in' : record['item_idx'],
        'dows_in' : record['dow'],
        'inter_in' : record['inter'],
        'cat_in' : record['category'],
        'title_in' : title_record['title_enc']
    }

  # Pair every feature row with its title encoding, then with its label
  features = tf.data.Dataset.zip((features, title_ds)).map(to_model_inputs)

  return tf.data.Dataset.zip((features, labels))

# 模型架構

### create_model

In [21]:
def create_model():
  """Build the hybrid collaborative-filtering / feed-forward recommendation model.

  Reads the module-level `feature_dict` to size the vocabularies. Every
  categorical feature gets an embedding whose width is half the number of
  distinct values (integer division). Two branches are built:

  * cf: element-wise product of a user embedding and an item embedding of
    equal width;
  * nn: all feature embeddings plus a Dense projection of the 512-d title
    encoding, concatenated and passed through Dense layers 256-128-64-16.

  The branches are concatenated and fed to a 2-unit softmax layer.

  NOTE(review): `int(len(...)/2)` is 0 when a feature has a single distinct
  value; presumably Embedding/Reshape cannot be built with width 0 - confirm.

  Returns:
    An uncompiled Keras Model named "Neural_CollaborativeFiltering".
  """
  users_in = layers.Input(name="users_in", shape=(1,))
  items_in = layers.Input(name="items_in", shape=(1,))
  dows_in = layers.Input(name='dows_in', shape=(1,), dtype=tf.string)
  inter_in = layers.Input(name='inter_in', shape=(1,), dtype=tf.string)
  cat_in = layers.Input(name='cat_in', shape=(1,), dtype=tf.string)
  recent_group_in = layers.Input(name='recent_group_in', shape=(1,))
  order_concert_group_in = layers.Input(name='order_concert_group_in', shape=(1,))
  title_in = layers.Input(name='title_in', shape=(512,))

  # Embedding widths: half the vocabulary size of each feature
  user_embedding = int(len(feature_dict['userID'])/2)
  item_embedding = int(len(feature_dict['itemID'])/2)
  dow_embedding =  int(len(feature_dict['dow'])/2)
  inter_embedding = int(len(feature_dict['inter'])/2)
  cat_embedding = int(len(feature_dict['category'])/2)
  recent_embedding = int(len(feature_dict['recent_group'])/2)
  order_embedding = int(len(feature_dict['order_concert_group'])/2)

  # The CF branch multiplies user and item vectors, so both use the same width
  cf_emb = min(user_embedding, item_embedding)

  drop_out = 0.2
  l2 = 0.02

  ##  ===================== ##
  # feature embedding (inputs of the nn branch)

  users_emb = layers.Embedding(name='users_emb', input_dim=len(feature_dict['userID'])+1,
                               output_dim=user_embedding)(users_in)
  users_emb = layers.Reshape(target_shape=(user_embedding,))(users_emb)

  items_emb = layers.Embedding(name='items_emb', input_dim=len(feature_dict['itemID'])+1,
                               output_dim=item_embedding)(items_in)
  items_emb = layers.Reshape(target_shape=(item_embedding, ))(items_emb)

  # cf embedding (separate user/item tables used only by the cf branch)
  users_emb_cf = layers.Embedding(name="users_emb_cf", input_dim=len(feature_dict['userID'])+1,
                               output_dim=cf_emb)(users_in)
  users_emb_cf = layers.Reshape(target_shape = (cf_emb,))(users_emb_cf)

  items_emb_cf = layers.Embedding(name="items_emb_cf", input_dim=len(feature_dict['itemID'])+1,
                               output_dim=cf_emb)(items_in)
  items_emb_cf = layers.Reshape(target_shape = (cf_emb,))(items_emb_cf)

  ## dow embedding: string -> vocabulary index -> embedding
  dows_sl= layers.StringLookup(vocabulary=feature_dict['dow'], mask_token=None, name='dow_sl')
  dows_emb = dows_sl(dows_in)
  dows_emb = layers.Embedding(name='dows_emb', input_dim=len(feature_dict['dow'])+1,
                              output_dim=dow_embedding)(dows_emb)
  dows_emb = layers.Reshape(target_shape = (dow_embedding,))(dows_emb)

  ## inter embedding
  inters_sl= layers.StringLookup(vocabulary=feature_dict['inter'], name='inters_sl')
  inters_emb = inters_sl(inter_in)
  inters_emb = layers.Embedding(name='inters_emb', input_dim=len(feature_dict['inter'])+1,
                                output_dim=inter_embedding)(inters_emb)
  inters_emb = layers.Reshape(target_shape = (inter_embedding,))(inters_emb)

  ## category embedding
  cats_sl= layers.StringLookup(vocabulary=feature_dict['category'], name='cats_sl')
  cats_emb = cats_sl(cat_in)
  cats_emb = layers.Embedding(name='cats_emb', input_dim=len(feature_dict['category'])+1,
                              output_dim=cat_embedding)(cats_emb)
  cats_emb = layers.Reshape(target_shape = (cat_embedding,))(cats_emb)

  ## recent group: integer -> vocabulary index -> embedding
  recent_il = layers.IntegerLookup(vocabulary=feature_dict['recent_group'])
  recent_emb = recent_il(recent_group_in)
  recent_emb = layers.Embedding(name='recent_emb', input_dim=len(feature_dict['recent_group'])+1,
                               output_dim=recent_embedding)(recent_emb)
  recent_emb = layers.Reshape(target_shape = (recent_embedding,))(recent_emb)

  ## order_concert_group
  order_il = layers.IntegerLookup(vocabulary=feature_dict['order_concert_group'])
  order_emb = order_il(order_concert_group_in)
  order_emb = layers.Embedding(name='order_emb', input_dim=len(feature_dict['order_concert_group'])+1,
                               output_dim=order_embedding)(order_emb)
  order_emb = layers.Reshape(target_shape = (order_embedding,))(order_emb)

  # title_enc: linear projection of the 512-d sentence encoding
  title_emb = layers.Dense(512)(title_in)

  ##  ===================== ##
  # cf
  # Element-wise product of the user and item CF embeddings. The result keeps
  # cf_emb dimensions (it is not summed into a scalar dot product).
  cf_xx = tf.math.multiply(users_emb_cf, items_emb_cf)

  ##  ===================== ##
  # nn
  feature_embs = tf.concat([users_emb, items_emb, order_emb,
                            cats_emb, dows_emb,
                            recent_emb, inters_emb,
                            title_emb], axis = 1)

  nn_layer = layers.Dense(256, activation = 'relu',
                          kernel_regularizer=regularizers.L2(l2))(feature_embs)
  nn_layer = layers.Dropout(drop_out)(nn_layer)
  nn_layer = layers.Dense(128, activation='relu',
                          kernel_regularizer=regularizers.L2(l2))(nn_layer)
  nn_layer = layers.Dropout(drop_out)(nn_layer)
  nn_layer = layers.Dense(64, activation='relu',
                          kernel_regularizer=regularizers.L2(l2))(nn_layer)
  nn_layer = layers.Dense(16, activation='relu',
                          kernel_regularizer=regularizers.L2(l2))(nn_layer)

  ##  ===================== ##
  # concat everything
  # Join the outputs of the cf branch and the nn branch
  final_nn = tf.concat([cf_xx, nn_layer], axis = 1)

  # Two-class softmax: index 0 = not purchased, index 1 = purchased (labels y = 0 / 1)
  y_out = layers.Dense(units=2, activation = 'softmax', name='y_out')(final_nn)
  model = models.Model(inputs=[users_in, items_in,
                               dows_in,  inter_in, cat_in, recent_group_in,
                               order_concert_group_in, title_in],
                      outputs=y_out, name="Neural_CollaborativeFiltering")

  return model


# 建立 / 讀入 df

In [None]:
# Per-user evaluation results for recommendation lists of length 10 and 5

file_name_10 = 'df_10.csv'
file_name_5 = 'df_5.csv'

RESULT_COLUMNS = ['userID', 'rec_items', 'actual_event', 'actual_order', 'precision', 'recall', 'map', 'd']

def _load_or_create_results(path, label):
  """Read the result file at `path`, creating it with only a header row if absent.

  Only a missing or completely empty file triggers creation. Any other read
  error is raised, so an existing result file is never silently overwritten
  with an empty one.

  Args:
    path: CSV path of the result file.
    label: name used in the progress message ('df_10' or 'df_5').

  Returns:
    The DataFrame read from, or newly written to, `path`.
  """
  try:
    results = pd.read_csv(path)
    print('read in ' + label + ' file')
  except (FileNotFoundError, pd.errors.EmptyDataError):
    results = pd.DataFrame(columns = RESULT_COLUMNS)
    results.to_csv(path, index = False)
    print('create ' + label + ' file')
  return results

df_10 = _load_or_create_results(file_name_10, 'df_10')
df_5 = _load_or_create_results(file_name_5, 'df_5')


# 參數設定

In [23]:
# Model parameters
# NOTE(review): the main loop batches the datasets with .batch(1000) and also
# passes batch_size to model.fit; Keras documents that batch_size should not
# be specified for dataset inputs - confirm which batch size is intended.
epochs = 50
batch_size = 100
lr = 0.0005

# Lengths of the two recommendation lists that are evaluated
rec_list_10 = 10
rec_list_5 = 5

# callbacks
# Stop training once val_loss has not improved for 5 consecutive epochs
early_stopping = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss',
                                                  patience = 5,
                                                  restore_best_weights = True
                                                  )


# main loop

In [None]:
# Column layout of the per-user result files
RESULT_COLUMNS = ['userID', 'rec_items', 'actual_event', 'actual_order', 'precision', 'recall', 'map', 'd']


def _order_concert_bucket(days):
  """Map the days between the cutoff and the concert to an order_concert_group id.

  same day: 0; 1~7 days: 1; 8~14: 2; 15~30: 3; 31~60: 4; otherwise: 5
  """
  if days == 0:
    return 0
  for group, upper in enumerate((7, 14, 30, 60), start = 1):
    if days <= upper:
      return group
  return 5


def _build_candidates(avail_event, item_id_idx, train_time):
  """Build the user-independent part of the candidate list for one window.

  Uses the module-level lookups `title`, `title_enc`, `concert_time_dict`,
  `dow_dict`, `inter_dict` and `cat_dict`.

  Returns:
    (candidates, candidate_enc): one row per candidate concert with the item
    features the model needs, and the title encodings aligned with those rows.
  """
  candidates = pd.DataFrame(avail_event, columns = ['itemID'])
  candidates['item_idx'] = candidates['itemID'].apply(lambda x : item_id_idx[x][0])
  candidates = candidates.merge(title[['title', 'itemID']], how = 'left', on = ['itemID'])
  candidates['concert_time'] = candidates['itemID'].apply(lambda x : concert_time_dict[x]['concert_time'])
  candidates['dow'] = candidates['itemID'].apply(lambda x : dow_dict[x][0])
  candidates['inter'] = candidates['itemID'].apply(lambda x : inter_dict[x][0])
  candidates['category'] = candidates['itemID'].apply(lambda x : cat_dict[x][0])

  # A recommendation is made at train_time, so that is the "order" date
  candidates['order_concert'] = (candidates['concert_time'] - train_time).dt.days
  candidates['order_concert_group'] = candidates['order_concert'].apply(_order_concert_bucket)

  title_rows = candidates.merge(title, on = 'itemID', how = 'left')['title_idx']
  return candidates, title_enc[title_rows]


def _topk_row(ranked, k, n_actual, shared):
  """Compute precision, recall and average precision for the top-k list.

  Args:
    ranked: candidates sorted by score with a 0/1 `hit` column.
    k: length of the recommendation list; precision is always divided by k.
    n_actual: number of distinct titles the user actually bought.
    shared: result fields that are identical for every k.

  Returns:
    A dict with one value per column of RESULT_COLUMNS.
  """
  top = ranked.head(k)
  hits = top['hit'].to_numpy()
  # Use the real list length so that fewer than k candidates does not fail
  ranks = np.arange(1, len(top) + 1)
  n_hits = hits.sum()

  row = dict(shared)
  row.update({'rec_items' : list(top['itemID']),
              'precision' : n_hits / k,
              'recall' : n_hits / n_actual,  # titles are unique, so at most one hit per title
              'map' : ((hits.cumsum() / ranks) * hits).sum() / n_actual})
  return row


# Run the 23 training windows in order
for d in range(23):
  print('d=', d)
  train_time, train_path, avail_event, recent_group_dict = train_expand(d)

  user_id_idx, item_id_idx, feature_dict = make_idx(train_path)

  interactions = data_interaction(train_path)

  # shuffle & split
  shuffled = interactions.shuffle(100_000, reshuffle_each_iteration=False)
  del interactions

  # validation_split cannot be used with a dataset in fit,
  # so split beforehand: 0.9 for training, the rest for validation
  train_len = int(len(shuffled)*0.9)
  val_len = len(shuffled) - train_len

  train = shuffled.take(train_len)
  val = shuffled.skip(train_len).take(val_len)

  cached_train = train.batch(1000).cache()
  cached_val = val.batch(1000).cache()

  # Training log of this window
  log_file_name = '/content/drive/MyDrive/Colab Notebooks/final_model/train_' + str(d) + '.log'
  csv_logger = tf.keras.callbacks.CSVLogger(log_file_name)

  model = create_model()

  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
                loss= tf.keras.losses.SparseCategoricalCrossentropy(),
                metrics= tf.keras.metrics.SparseCategoricalAccuracy())

  # NOTE(review): the datasets are already batched (1000 rows) above; Keras
  # documents that batch_size should not be specified for dataset inputs, so
  # batch_size/shuffle here presumably have no effect - confirm the intent.
  model.fit(cached_train, validation_data = cached_val, shuffle=True,
            epochs=epochs, batch_size=batch_size,
            callbacks=[early_stopping, csv_logger])

  del shuffled, train, val, cached_train, cached_val
  gc.collect()

  ### =============== ###
  # Prediction
  # Test rows: orders placed on/after train_time by users seen in training,
  # for concerts that were already on sale in the training set
  raw_data = pd.read_csv(raw_data_file_name)

  # Parse the date-related columns (yyyy-mm-dd hh:mm:ss)
  raw_data['order_time'] = pd.DatetimeIndex(raw_data['訂單日期'])
  raw_data['first_order'] = pd.DatetimeIndex(raw_data['first_order'])
  raw_data['concert_time'] = pd.DatetimeIndex(raw_data['concert_time'])
  raw_data['title'] = raw_data['產品名稱']

  test = raw_data[raw_data['order_time'] >= train_time]
  # Copy so the index columns below are not written into a slice of raw_data
  test = test[(test['userID'].isin(feature_dict['userID'])) & (test['itemID'].isin(avail_event))].copy()
  del raw_data
  gc.collect()

  test['user_idx'] = test['userID'].apply(lambda x : user_id_idx[x][0])
  test['item_idx'] = test['itemID'].apply(lambda x : item_id_idx[x][0])

  # user_idx values to evaluate
  test_user = list(test['user_idx'].unique())

  # Reverse lookup user_idx -> userID, built once per window
  idx_to_user = {idx[0] : user_key for user_key, idx in user_id_idx.items()}

  rows_10 = []
  rows_5 = []

  # The candidate concerts and their title encodings are identical for every
  # user of this window; only build them when there is somebody to evaluate
  if test_user:
    candidates, candidate_enc = _build_candidates(avail_event, item_id_idx, train_time)

  for user in test_user:
    userID = idx_to_user[user]

    rec_items = candidates.copy()
    rec_items['user_idx'] = user
    rec_items['userID'] = userID
    rec_items['recent_group'] = recent_group_dict[userID][0]

    # Predict; column 1 of the softmax output is the purchase probability
    preds = model.predict([rec_items['user_idx'], rec_items['item_idx'], rec_items['dow'],
                          rec_items['inter'], rec_items['category'], rec_items['recent_group'],
                          rec_items['order_concert_group'],
                          candidate_enc], verbose = 0)
    rec_items['pred'] = preds[:, 1]
    rec_items = rec_items.sort_values(by = 'pred', ascending = False).reset_index(drop = True)

    # What the user actually bought
    actual_event = test[test['user_idx'] == user].drop_duplicates(subset='itemID')
    actual_event_id = list(actual_event['itemID'])
    actual_event_name = list(set(actual_event['title']))

    # Hits are matched by title, so keep only the best-ranked row per title
    rec_items = rec_items.drop_duplicates(subset = 'title').reset_index(drop = True)
    rec_items['hit'] = rec_items['title'].isin(actual_event_name).astype(int)

    # 1-based rank of every purchased title in the full candidate ranking
    actual_order = list(rec_items[rec_items['hit'] == 1].index+1)

    # userID is stored as a one-element array to match the existing file format
    shared = {'userID' : rec_items['userID'].unique(),
              'actual_event' : actual_event_id, 'actual_order' : actual_order,
              'd' : d}
    rows_10.append(_topk_row(rec_items, rec_list_10, len(actual_event_name), shared))
    rows_5.append(_topk_row(rec_items, rec_list_5, len(actual_event_name), shared))

  # Append the results of this window to the result files (no header)
  pd.DataFrame(rows_10, columns = RESULT_COLUMNS).to_csv(file_name_10, mode = 'a', index = False, header = False)
  pd.DataFrame(rows_5, columns = RESULT_COLUMNS).to_csv(file_name_5, mode = 'a', index = False, header = False)

  # Free the model and the Keras graph state before the next window
  del model
  tf.keras.backend.clear_session()
  gc.collect()

def _report(header, path):
  """Print the mean precision, recall and average precision stored in `path`.

  Returns the DataFrame read from `path`.
  """
  results = pd.read_csv(path)
  print(header)
  for label, column in (('mean precision', 'precision'),
                        ('mean recall', 'recall'),
                        ('mean average precision', 'map')):
    print(label, results[column].mean())
  return results

df_10 = _report('n = 10', file_name_10)
print()

df_5 = _report('n = 5', file_name_5)


# end session

In [None]:
# Release the Colab runtime once everything above has finished
# (presumably to stop consuming compute after a long unattended run - confirm)
from google.colab import runtime
runtime.unassign()