# Load Data

In [1]:
from google.colab import drive

# Mount Google Drive so the CSV files under /content/drive/MyDrive can be read by the cells below.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta

In [3]:
## ===== order data ===== ##
raw_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/extract_data_221123.csv')

# Parse the date columns as datetimes: (new column, source column).
for parsed_col, source_col in (('order_time', '訂單日期'),
                               ('last_time', 'last_order'),
                               ('concert_time', 'concert_time'),
                               ('first_order', 'first_order')):
  raw_data[parsed_col] = pd.DatetimeIndex(raw_data[source_col])

# Short ASCII names for the two key columns used by every later cell.
key_columns = {'訂購人會員代碼' : 'userID',
               '場次代碼' : 'itemID'}
raw_data = raw_data.rename(columns = key_columns)

## ===== user data  ==== ##
user_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/raw_member_data_210827.csv')
user_data['bd'] = pd.DatetimeIndex(user_data['生日'])

# merge() is called without `how`, i.e. pandas' default inner join.
# NOTE(review): orders whose userID has no member row are therefore dropped - confirm this is intended.
raw_data = raw_data.merge(user_data, left_on = 'userID', right_on = '會員代碼')

# Age is the difference of calendar years only (month/day are ignored).
order_year = raw_data['order_time'].dt.year
birth_year = raw_data['bd'].dt.year
raw_data['age'] = order_year - birth_year

# Everything that is not exactly '女' (including missing values) is labelled 'M'.
raw_data['gender'] = raw_data['性別'].apply(lambda value : 'F' if value == '女' else 'M')

# Age groups 0..3 from right-closed bins: [0, 18], (18, 30], (30, 65], (65, 200].
# NOTE(review): the original comment described the groups as "<=18; 19-29; 30-65; 65+",
# but with these bin edges age 30 lands in group 1 - confirm which one is intended.
age_bin_edges = [0, 18, 30, 65, 200]
age_bin_labels = [0, 1, 2, 3]
raw_data['age_group'] = pd.cut(raw_data['age'],
                              bins = age_bin_edges,
                              labels = age_bin_labels,
                              include_lowest = True)

# Unary feedback: every order row counts as one positive interaction.
raw_data['rating'] = 1

In [4]:
# Lookup table itemID -> one-element list holding the product name
# (later cells read it as title_dict[item_id][0]).
item_titles = raw_data[['itemID', '產品名稱']].drop_duplicates()
title_dict = item_titles.set_index('itemID').T.to_dict('list')

# train & test

In [5]:
def train_filter(train_set):
  """Keep only rows whose item AND user both appear more than once.

  Removing single-purchase users can push an item back down to a single
  row (and vice versa), so the two filters are repeated until a full
  pass removes nothing.

  Args:
    train_set: DataFrame with at least 'itemID' and 'userID' columns.

  Returns:
    The filtered DataFrame (original index kept). It may be empty when
    no row satisfies both constraints.
  """
  while True:
    rows_before = len(train_set)

    # Drop items that occur only once.
    item_count = train_set.groupby('itemID').size()
    kept_items = item_count[item_count > 1].index
    train_set = train_set[train_set['itemID'].isin(kept_items)]

    # Drop users that occur only once in what is left.
    user_count = train_set.groupby('userID').size()
    kept_users = user_count[user_count > 1].index
    train_set = train_set[train_set['userID'].isin(kept_users)]

    # Fixed point reached: this pass removed no rows, so every remaining
    # item and user has more than one row (or the frame is empty).
    if len(train_set) == rows_before:
      return train_set

In [6]:
t = datetime(2019, 1, 1) # initial end date of the training window (the cutoff used for d = 0)

def train_test(d):
  """Split raw_data into a training and a test set at the d-th cutoff.

  The cutoff is t shifted forward by d * 15 days. Orders placed before
  the cutoff (after train_filter) form the training set; orders placed
  on or after it form the test set, restricted to users seen in training
  and to events that were still on sale at the cutoff.

  Args:
    d: split index; each step moves the cutoff by 15 days.

  Returns:
    (train, test, avail_event): both frames are sorted by userID;
    avail_event is the array of itemIDs on sale at the cutoff.
  """
  cutoff = t + relativedelta(days = d*15)

  ordered_before = raw_data['order_time'] < cutoff
  train = train_filter(raw_data[ordered_before]).sort_values(by = 'userID')

  # Users that have history in the training set.
  train_user = train['userID'].unique()

  # Events already on sale at the cutoff (first order seen) whose concert is still ahead.
  still_on_sale = (train['concert_time'] > cutoff) & (train['first_order'] <= cutoff)
  avail_event = train.loc[still_on_sale, 'itemID'].unique()

  # Test set: later orders by known users for events that were available at the cutoff.
  test = raw_data[raw_data['order_time'] >= cutoff]
  known_user = test['userID'].isin(train_user)
  known_event = test['itemID'].isin(avail_event)
  test = test[known_user & known_event].sort_values(by = 'userID')

  return train, test, avail_event

# cosine similarity

In [7]:
# item-item
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

# 製作 item-item cosine similarity matrix
#from post: https://medium.com/radon-dev/item-item-collaborative-filtering-with-binary-or-unary-data-e8f0b465b2c3
def calculate_similarity(train):
  """Build the user-item matrix and the item-item cosine similarity.

  Args:
    train: DataFrame with 'userID', 'itemID' and 'rating' columns.

  Returns:
    (user_item, sim): user_item has one row per user and one column per
    item, with 0 where the user has no row for the item; sim is a square
    DataFrame of column-wise cosine similarities, indexed and labelled
    by itemID.
  """
  # Only presence/absence matters for now (unary feedback).
  user_item = train.pivot_table(values='rating',
                               index='userID',
                               columns='itemID',
                               fill_value=0)

  # Rows are users and columns are items; transpose so each row is an item
  # and cosine_similarity compares items with each other.
  item_user = sparse.csr_matrix(user_item).transpose()
  item_ids = user_item.columns
  sim = pd.DataFrame(data=cosine_similarity(item_user), index=item_ids, columns=item_ids)

  return user_item, sim
#end from post

# 讀入df

In [8]:
file_name_10 = 'df_10.csv'
file_name_5 = 'df_5.csv'

# Column layout shared by both per-user result files.
RESULT_COLUMNS = ['userID', 'rec_items', 'actual_event', 'actual_order', 'precision', 'recall', 'map', 'd']

def load_or_create_results(path, label):
  """Read the result CSV at `path`, creating it with a header row if needed.

  The file is (re)created only when it does not exist or is completely
  empty. Any other failure (permissions, malformed content, interrupt)
  is raised instead of silently overwriting previously saved results.

  Args:
    path: CSV file to read or create.
    label: name used in the progress message.

  Returns:
    The loaded DataFrame, or an empty DataFrame with RESULT_COLUMNS.
  """
  try:
    results = pd.read_csv(path)
    print('read in ' + label + ' file')
  except (FileNotFoundError, pd.errors.EmptyDataError):
    results = pd.DataFrame(columns = RESULT_COLUMNS)
    results.to_csv(path, index = False)
    print('create ' + label + ' file')
  return results

df_10 = load_or_create_results(file_name_10, 'df_10')
df_5 = load_or_create_results(file_name_5, 'df_5')


create df_10 file
create df_5 file


# main loop

In [9]:
#recommendation list

import numpy as np
import time

# Recommendation list lengths (the two top-k cutoffs that are evaluated)
rec_list_10 = 10
rec_list_5 = 5

def evaluate_top_k(rec_item, k, n_relevant):
  """Score the first k rows of a ranked recommendation table.

  Args:
    rec_item: DataFrame sorted best-first, with a 0/1 'hit' column.
    k: nominal list length. Precision is always divided by k, even when
      fewer than k candidates exist.
    n_relevant: number of distinct titles the user actually bought;
      denominator of recall and average precision.

  Returns:
    (top_k, precision, recall, average_precision), where top_k is the
    first k rows of rec_item.
  """
  top_k = rec_item.head(k)
  hits = top_k['hit']
  hit_total = hits.sum()

  # Ranks follow the real number of rows: a fixed 1..k list does not fit
  # the frame when fewer than k candidates are available.
  rank = np.arange(1, len(top_k) + 1)
  precision_at_rank = hits.cumsum() / rank
  average_precision = (precision_at_rank * hits).sum() / n_relevant

  return top_k, hit_total / k, hit_total / n_relevant, average_precision


def main_loop(d):
  """Evaluate item-based collaborative filtering for split d.

  For every user in the test set of split d, scores all events that were
  on sale at the cutoff, ranks them, and appends one result row per user
  to file_name_10 (top rec_list_10) and file_name_5 (top rec_list_5).

  Args:
    d: split index passed to train_test; also stored in the 'd' column.

  Returns:
    None. Results are written to the two CSV files only.
  """
  result_columns = ['userID', 'rec_items', 'actual_event', 'actual_order', 'precision', 'recall', 'map', 'd']

  # Training set, test set and the events on sale at the cutoff.
  train, test, avail_event = train_test(d)
  test_user = test['userID'].unique()

  # User-item matrix and item-item similarity.
  user_item, item_sim = calculate_similarity(train)

  # score = (item-item similarity x the user's item ratings) / that item's total similarity.
  # Ratings are unary (0/1), so the numerator is the summed similarity between an
  # item and the items the user bought. The denominator is computed once here.
  item_sim_sum = item_sim.sum(axis = 1)

  for u in test_user:
    score_item = item_sim.dot(user_item.loc[u]) # numerator
    score_item = pd.DataFrame(score_item.values / item_sim_sum.values, index = score_item.index, columns = [u])

    # Candidates are limited to the events that were on sale at the cutoff.
    rec_item = score_item.loc[avail_event].reset_index()
    rec_item['title'] = rec_item['itemID'].apply(lambda x : title_dict[x][0])

    # Events (and their distinct titles) the user bought in the test period.
    user_event_id = test[test['userID'] == u]['itemID'].unique()
    user_event_name = list(set([title_dict[i][0] for i in user_event_id]))
    n_relevant = len(user_event_name)

    # Hits are decided by product title, so sessions that share a title are collapsed.
    # NOTE(review): duplicates are dropped BEFORE sorting, so the session kept for a
    # title is the first one in avail_event order, not necessarily the best-scoring one.
    rec_item = rec_item.drop_duplicates(subset = 'title').sort_values(by = u, ascending = False).reset_index(drop = True)
    rec_item['hit'] = rec_item['title'].apply(lambda x : 1 if x in user_event_name else 0)

    # 1-based rank of every purchased title in the full ranking.
    actual_order = list(rec_item[rec_item['title'].isin(user_event_name)].index+1)

    for path, k in ((file_name_10, rec_list_10), (file_name_5, rec_list_5)):
      top_k, precision, recall, average_precision = evaluate_top_k(rec_item, k, n_relevant)

      row = {'userID' : u, 'rec_items' : list(top_k['itemID']),
             'actual_event' : user_event_id, 'actual_order' : actual_order,
             'precision' : precision, 'recall' : recall, 'map' : average_precision, 'd' : d}

      # A one-row frame is appended directly, because DataFrame.append no longer exists in pandas 2.x.
      # Writing after every user keeps partial results if the session is interrupted.
      pd.DataFrame([row], columns = result_columns).to_csv(path, mode = 'a', index = False, header = False)


#parallel

In [None]:
# Evaluate 23 consecutive splits (d = 0..22); train_test moves the cutoff 15 days per step.
n_splits = 23
for split_index in range(n_splits):
  main_loop(split_index)

In [None]:
# (printed label, result column) for each metric, in print order.
summary_metrics = (('mean precision', 'precision'),
                   ('mean recall', 'recall'),
                   ('mean average precision', 'map'))

def print_summary(k, results):
  """Print the mean of every metric column of `results` for list length k."""
  print(f'n = {k}')
  for label, column in summary_metrics:
    print(label, results[column].mean())

# Re-read the files so the rows appended by main_loop are included.
df_10 = pd.read_csv(file_name_10)
print_summary(10, df_10)
print()

df_5 = pd.read_csv(file_name_5)
print_summary(5, df_5)

In [None]:
# Mean recall per split index d: top-10 results first, then top-5, separated by a blank line.
for position, results in enumerate((df_10, df_5)):
  if position > 0:
    print()
  print(results.groupby('d')['recall'].mean())

In [None]:
from google.colab import runtime
# Unassign the Colab runtime once every cell above has finished.
# NOTE(review): presumably this also discards files written to the working directory
# (e.g. the result CSVs) - confirm they are saved elsewhere first.
runtime.unassign()