## 載入套件

In [1]:
from __future__ import unicode_literals, print_function, division
import numpy as np
import sys
import math
import csv
import gzip
import pickle
from sklearn.neighbors import NearestNeighbors

import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

## 參數設置

In [2]:
activate_codes_num = -1
next_k_step = 1
training_chunk = 0
test_chunk = 1

num_nearest_neighbors = 300
within_decay_rate = 0.9
group_decay_rate = 0.7
group_size = 7
topk = 10

## 生成字典與計算最終向量維度



In [3]:
def generate_dictionary_BA(files, attributes_list):
    dictionary_table = {}
    counter_table = {}

    # attributes_list 僅有 MATERIAL_NUMBER
    for attr in attributes_list:
        dictionary = {}
        dictionary_table[attr] = dictionary
        counter_table[attr] = 0

    csv.field_size_limit(128) # 設置 CSV 讀取的最大長度

    for filename in files:
        count = 0 #用於追蹤當前文件中處理的行數
        with open(filename, 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='|') # 指定逗號（,）為字段分隔符，並將豎線（|）作為引號字符
            for row in reader:
                if count == 0: # 跳過第一行（標題行）:
                    count += 1
                    continue
                key = attributes_list[0] # 取出第一個屬性，即 MATERIAL_NUMBER
                if row[3] not in dictionary_table[key]: # 如果 row[3] 的值還沒有在字典中，則將其添加到字典中，並賦予一個索引
                    dictionary_table[key][row[3]] = counter_table[key]
                    counter_table[key] = counter_table[key] + 1 # 更新 counter_table 中對應屬性的計數
                    count += 1

    total = 0
    for key in counter_table.keys():
        total = total + counter_table[key]

    print('# dimensions of final vector: ' + str(total) + ' | '+str(count-1))

    return dictionary_table, total, counter_table


## 從 CSV 文件中讀取數據並處理，將其轉換為特定格式的數據結構

**目的：在歷史與未來中，處理每位用戶的所有購物籃，並於前後加入 -1 標記表示開始與結束**

```
# data_chunk[0]["2"] :[[-1], array([50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,67]), ... array([ 52, 106, 142, 143, 144, 145, 146, 147]), [-1]]
# data_chunk[1]["2"]:[[-1], array([   5,    9,   50,  125,  145, 1357]), [-1]]
```



In [4]:
# 從 CSV 文件中讀取數據並處理，將其轉換為特定格式的數據結構
def read_claim2vector_embedding_file_no_vector(files): #
    attributes_list = ['MATERIAL_NUMBER'] # 一個包含要處理的屬性名稱的列表(存了商品編號）
    print('----- Start Dictionary Generation -----')
    dictionary_table, num_dim, counter_table = generate_dictionary_BA(files, attributes_list) # 生成字典並返回三個值 #files是./data/TaFang_history_NB.csv 跟 ./data/TaFang_future_NB.csv
    print('----- Finish Dictionary Generation -----')
    usr_attr = 'CUSTOMER_ID'
    ord_attr = 'ORDER_NUMBER'
    freq_max = 200
    data_chunk = [] # 用於儲存處理後數據的列表
    day_gap_counter = []
    claims_counter = 0
    num_claim = 0

    # 迭代處理 files 列表中的每個文件，先重新命名 history 之後再 future
    for file_id in range(len(files)):
        count = 0
        data_chunk.append({})
        filename = files[file_id] # 打開文件
        with open(filename, 'r') as csvfile:
            reader = csv.DictReader(csvfile) # 讀取 CSV 文件

            # 用於在下面的循環中跟踪上一行的數據
            last_pid_date = '*'
            last_pid = '-1'
            last_days = -1
            # 2 more elements in the end for start and end states
            feature_vector = []

            # 對 CSV 文件中的每一行進行迭代
            for row in reader:
                cur_pid_date = row[usr_attr] + '_' + row[ord_attr] # 從當前行生成一個唯一的標識符 (1_14)
                cur_pid = row[usr_attr] # 並獲取用戶 ID (1)

                # 如果當前用戶 ID 不等於上一個用戶 ID，則在 data_chunk 中為這個新用戶創建一個新條目
                if cur_pid != last_pid:
                    # start state
                    tmp = [-1]
                    data_chunk[file_id][cur_pid] = []
                    data_chunk[file_id][cur_pid].append(tmp)

                # 如果當前的 cur_pid_date 不等於 last_pid_date，則對 feature_vector 進行排序並將其添加到 data_chunk 中的相應位置，然後重置 feature_vector
                if cur_pid_date not in last_pid_date:
                    if last_pid_date not in '*' and last_pid not in '-1':
                        sorted_feature_vector = np.sort(feature_vector)
                        data_chunk[file_id][last_pid].append(sorted_feature_vector)
                        if len(sorted_feature_vector) > 0:
                            count = count + 1
                    feature_vector = []

                    claims_counter = 0
                # 如果當前用戶 ID 不等於上一個用戶 ID，則在 data_chunk 中的上一個用戶 ID 下添加一個結束標記
                if cur_pid != last_pid:
                    # end state
                    if last_pid not in '-1':

                        tmp = [-1]
                        data_chunk[file_id][last_pid].append(tmp)
                        #print(data_chunk[file_id][last_pid])

                key = attributes_list[0]
                within_idx = dictionary_table[key][row[key]]
                previous_idx = 0

                for j in range(attributes_list.index(key)):
                    previous_idx = previous_idx + counter_table[attributes_list[j]]
                idx = within_idx + previous_idx

                if idx not in feature_vector:
                    feature_vector.append(idx)

                # 更新 last_pid_date 和 last_pid 值
                last_pid_date = cur_pid_date
                last_pid = cur_pid
                #last_days = cur_days
                if file_id == 1:
                    claims_counter = claims_counter + 1

            # 在循環結束後，將最後一個 feature_vector 添加到 data_chunk 中
            if last_pid_date not in '*' and last_pid not in '-1':
                data_chunk[file_id][last_pid].append(np.sort(feature_vector))

    # #儲存 data_chunk 資料用於查看
    # def save_top_20_to_txt(data_chunk, file_index, file_name):
    #   with open(file_name, 'w') as f:
    #       for key, value in list(data_chunk[file_index].items())[:20]:
    #           f.write(f"{key}: {value}\n")
    # # save_top_20_to_txt(data_chunk, 0, 'data/data_chunk_0_top20.txt')
    # save_top_20_to_txt(data_chunk, 1, 'data/data_chunk_1_top20.txt')

    # 正確答案
    #np.savez('data/answer.npz', **data_chunk[1])
    #print("data_chunk[1]:", data_chunk[1])
    print("num_dim+2 :", num_dim + 2)
    return data_chunk, num_dim + 2 # 返回處理後的數據

## 分割資料集

In [5]:
def partition_the_data_validate(data_chunk, key_set, next_k_step):
    # key_set = ['1', '2', '3', '4', '5', ...'13972'] （即 Future 中所有用戶的 ID）
    # next_k_step = 1
    print('----- Start Splitting Data Set -----')

    filtered_key_set = [] # 用於儲存經過篩選後的 key
    past_chunk = 0
    future_chunk = 1

    for key in key_set:
        if len(data_chunk[past_chunk][key]) <= 3:
            continue
        if len(data_chunk[future_chunk][key]) < 2 + next_k_step:
            continue
        filtered_key_set.append(key)

    training_key_set = filtered_key_set[0:int(4 / 5 * len(filtered_key_set)*0.9)]
    validation_key_set = filtered_key_set[int(4 / 5 * len(filtered_key_set)*0.9):int(4 / 5 * len(filtered_key_set))]
    test_key_set = filtered_key_set[int(4 / 5 * len(filtered_key_set)):]

    print('Number of training instances: ' + str(len(training_key_set)))
    print('Number of validation instances: ' + str(len(validation_key_set)))
    print('Number of test instances: ' + str(len(test_key_set)))

    # print('Training Key Set: ',training_key_set)

    print("training_key_set[0]: ", training_key_set[0])
    print("validation_key_set[0]: ",validation_key_set[0])
    print("test_key_set[0]: ",test_key_set[0])
    print('----- Finish splitting the data set -----')

    return training_key_set, validation_key_set, test_key_set

## 分群計算，獲得所有群組的平均向量

**將一系列的歷史數據向量按給定的群組大小分組，並計算每個群組中的向量的平均值**

假設 his_list 長度為 10，group_size 為 3，\
est_num_vec_each_block = 10 / 3 ≈ 3.33（每個群組中平均應有大約 3.33 個向量）\
base_num_vec_each_block = 3.33 向下取整 = 3（每個群組至少應有 3 個向量）\
residual = 3.33 - 3 = 0.33 （差異）\
num_vec_has_extra_vec = 1（有 1 個群組將比其他群組多一個向量）

In [6]:
def group_history_list(his_list,group_size):
    grouped_vec_list = []
    if len(his_list) < group_size:
        #sum = np.zeros(len(his_list[0]))
        for j in range(len(his_list)):
            grouped_vec_list.append(his_list[j])

        return grouped_vec_list, len(his_list)
    else:
        est_num_vec_each_block = len(his_list)/group_size
        base_num_vec_each_block = int(np.floor(len(his_list)/group_size))
        residual = est_num_vec_each_block - base_num_vec_each_block

        num_vec_has_extra_vec = int(np.round(residual * group_size))

        if residual == 0:
            for i in range(group_size):
                if len(his_list)<1:
                    print('len(his_list)<1')
                sum = np.zeros(len(his_list[0]))
                for j in range(base_num_vec_each_block):
                    if i*base_num_vec_each_block+j >= len(his_list):
                        print('i*num_vec_each_block+j')
                    sum += his_list[i*base_num_vec_each_block+j]
                grouped_vec_list.append(sum/base_num_vec_each_block)
        else:

            for i in range(group_size - num_vec_has_extra_vec):
                sum = np.zeros(len(his_list[0]))
                for j in range(base_num_vec_each_block):
                    if i*base_num_vec_each_block+j >= len(his_list):
                        print('i*base_num_vec_each_block+j')
                    sum += his_list[i*base_num_vec_each_block+j]
                    last_idx = i * base_num_vec_each_block + j
                grouped_vec_list.append(sum/base_num_vec_each_block)

            est_num = int(np.ceil(est_num_vec_each_block))
            start_group_idx = group_size - num_vec_has_extra_vec

            if len(his_list) - start_group_idx*base_num_vec_each_block >= est_num_vec_each_block:
                for i in range(start_group_idx,group_size):
                    sum = np.zeros(len(his_list[0]))
                    for j in range(est_num):
                        # if residual+(i-1)*est_num_vec_each_block+j >= len(his_list):
                        #     print('residual+(i-1)*num_vec_each_block+j')
                        #     print('len(his_list)')
                        iidxx = last_idx + 1+(i-start_group_idx)*est_num+j
                        if  iidxx >= len(his_list) or iidxx<0:
                            print('last_idx + 1+(i-start_group_idx)*est_num+j')
                        sum += his_list[iidxx]
                    grouped_vec_list.append(sum/est_num)
        return grouped_vec_list, group_size

## 整合所有群組的平均向量，獲得用戶歷史向量


In [7]:
def temporal_decay_sum_history(data_set, key_set, output_size,group_size,within_decay_rate,group_decay_rate):

    print('Temporal Decay Sum History ...')

    sum_history = {}

    for key in key_set:

        vec_list = data_set[key]
        num_vec = len(vec_list) - 2
        his_list = []

        for idx in range(1,num_vec+1):
            his_vec = np.zeros(output_size)
            decayed_val = np.power(within_decay_rate,num_vec-idx)
            for ele in vec_list[idx]:
                his_vec[ele] = decayed_val
            his_list.append(his_vec) # 包含了用戶所有交易的時間衰減向量

        grouped_list,real_group_size = group_history_list(his_list,group_size)
        his_vec = np.zeros(output_size)
        for idx in range(real_group_size):
            decayed_val = np.power(group_decay_rate, group_size - 1 - idx)
            if idx>=len(grouped_list):
                print( 'idx: '+ str(idx))
                print('len(grouped_list): ' + str(len(grouped_list)))
            his_vec += grouped_list[idx]*decayed_val
        sum_history[key] = his_vec/real_group_size
    return sum_history

## 取得鄰居向量

測試集、訓練集、驗證集分開處理

In [8]:
# data_chunk、測試集用戶向量、測試集、訓練集用戶向量、訓練集、鄰居索引

def test_get_neighbors_vectors(data_chunk, temporal_decay_sum_history_test,test_key_set,temporal_decay_sum_history_training,training_key_set,index):

    print("（Test）Get the neighbor vector of each user ...")
    # history_dict = {}  # 用於保存 test_history 和 sum_training_history 的關聯
    user_neighbor_set = []  # 初始化一個空列表來存儲數據
    target_set = []

    for test_key_id in range(len(test_key_set)):
        test_key = test_key_set[test_key_id]
        test_history = temporal_decay_sum_history_test[test_key] # 真向量
        #target_set.append(data_chunk[test_chunk][test_key_set[test_key_id]]) # [[-1], array([ 270,  669, 1653, 1886, 1987, 4718, 6677]), [-1]]
        #target_set.append((test_key, data_chunk[test_chunk][test_key][1]))

        answer = data_chunk[test_chunk][test_key][1]
        answer_vector = np.zeros(len(test_history))

        # 將目標變量中的元素對應的位置設為 1
        for ii in answer:
            answer_vector[ii] = 1
        answer_tensor = torch.from_numpy(answer_vector)
        target_set.append((test_key, data_chunk[test_chunk][test_key][1],answer_tensor))


        sum_training_history = np.zeros(len(test_history))

        for indecis in index[test_key_id]:
            training_key = training_key_set[indecis]
            sum_training_history += temporal_decay_sum_history_training[training_key]

        sum_training_history = sum_training_history/len(index[test_key_id]) #真向量
        user_neighbor_set.append((test_key, test_history, sum_training_history)) # 特定用戶向量與其對應的鄰居向量，Ex. ([0.5, 0.2, 0.8], [0.3, 0.37, 0.37])

    for i, item in enumerate(user_neighbor_set[:20]):
        print(f"Item {i}: {item}")
        print()

    for i, item in enumerate(target_set[:20]):
        print(f"Target Item {i}: {item}")
        print()

    with gzip.GzipFile('preprocessing-data/TaFeng_test_user_and_neighbor_set.gz', 'wb') as fp:
      pickle.dump(user_neighbor_set, fp)
      print("The test set user vectors and neighbor vectors have been saved !")

    with gzip.GzipFile('preprocessing-data/TaFeng_test_answer.gz', 'wb') as fp:
      pickle.dump(target_set, fp)
      print("The test set answer have been saved !")

    return user_neighbor_set

In [9]:
def validation_get_neighbors_vectors(data_chunk, temporal_decay_sum_history_validation,validation_key_set,temporal_decay_sum_history_training,training_key_set,index):

    print("(Validation) Get the neighbor vector of each user ...")
    # history_dict = {}  # 用於保存 test_history 和 sum_training_history 的關聯
    user_neighbor_set = []  # 初始化一個空列表來存儲數據
    target_set = []

    for validation_key_id in range(len(validation_key_set)):
        validation_key = validation_key_set[validation_key_id]
        validation_history = temporal_decay_sum_history_validation[validation_key] # 真向量
        #target_set.append(validation_key, data_chunk[test_chunk][validation_key_set[validation_key_id]][1]) # 正確答案[[-1], array([ 270,  669, 1653, 1886, 1987, 4718, 6677]), [-1]]
        #target_set.append((validation_key, data_chunk[test_chunk][training_key_set[validation_key_id]][1]))

        answer = data_chunk[test_chunk][validation_key][1]
        answer_vector = np.zeros(len(validation_history))

        # 將目標變量中的元素對應的位置設為 1
        for ii in answer:
            answer_vector[ii] = 1
        answer_tensor = torch.from_numpy(answer_vector)
        target_set.append((validation_key, data_chunk[test_chunk][validation_key][1],answer_tensor))

        sum_training_history = np.zeros(len(validation_history))

        for indecis in index[validation_key_id]:
            training_key = training_key_set[indecis]
            sum_training_history += temporal_decay_sum_history_training[training_key]

        sum_training_history = sum_training_history/len(index[validation_key_id]) #真向量
        user_neighbor_set.append((validation_key, validation_history, sum_training_history)) # 特定用戶向量與其對應的鄰居向量，Ex. ([0.5, 0.2, 0.8], [0.3, 0.37, 0.37])

    # 打印 user_neighbor_set 的前 20 个条目
    for i, item in enumerate(user_neighbor_set[:20]):
        print(f"User Neighbor Item {i}: {item}")
        print()

    # 打印 target_set 的前 20 个条目
    for i, item in enumerate(target_set[:20]):
        print(f"Target Item {i}: {item}")
        print()

    with gzip.GzipFile('preprocessing-data/TaFeng_validation_user_and_neighbor_set.gz', 'wb') as fp:
      pickle.dump(user_neighbor_set, fp)
      print("The validation set user vectors and neighbor vectors have been saved !")

    with gzip.GzipFile('preprocessing-data/TaFeng_validation_answer.gz', 'wb') as fp:
      pickle.dump(target_set, fp)
      print("The validation set answer have been saved !")

    return user_neighbor_set

In [10]:
def training_get_neighbors_vectors(data_chunk,temporal_decay_sum_history_training,training_key_set,index):

    print("(Training) Get the neighbor vector of each user ...")
    # history_dict = {}  # 用於保存 test_history 和 sum_training_history 的關聯
    user_neighbor_set = []  # 初始化一個空列表來存儲數據
    target_set = []

    for training_key_id in range(len(training_key_set)):
        training_key = training_key_set[training_key_id]
        training_history = temporal_decay_sum_history_training[training_key] # 真向量
        #target_set.append((training_key, data_chunk[test_chunk][training_key_set[training_key_id]][1]))
        #target_set.append((training_key, data_chunk[test_chunk][training_key][1]))

        answer = data_chunk[test_chunk][training_key][1]
        answer_vector = np.zeros(len(training_history))

        # 將目標變量中的元素對應的位置設為 1
        for ii in answer:
            answer_vector[ii] = 1
        answer_tensor = torch.from_numpy(answer_vector)
        target_set.append((training_key, data_chunk[test_chunk][training_key][1],answer_tensor))

        sum_training_history = np.zeros(len(training_history))

        for indecis in index[training_key_id]:
            training_key2 = training_key_set[indecis]
            sum_training_history += temporal_decay_sum_history_training[training_key2]

        sum_training_history = sum_training_history/len(index[training_key_id]) #真向量
        user_neighbor_set.append((training_key, training_history, sum_training_history)) # 特定用戶向量與其對應的鄰居向量，Ex. ([0.5, 0.2, 0.8], [0.3, 0.37, 0.37])

    for i, item in enumerate(user_neighbor_set[:20]):
      print(f"Item {i}: {item}")
      print()

    for i, item in enumerate(target_set[:20]):
      print(f"Target Item {i}: {item}")
      print()

    with gzip.GzipFile('preprocessing-data/TaFeng_training_user_and_neighbor_set.gz', 'wb') as fp:
      pickle.dump(user_neighbor_set, fp)
      print("The Training set user vectors and neighbor vectors have been saved !")

    with gzip.GzipFile('preprocessing-data/TaFeng_training_answer.gz', 'wb') as fp:
      pickle.dump(target_set, fp)
      print("The Training set answer have been saved !")

    return user_neighbor_set

## KNN 尋找相似鄰居

對數據集中的數據行最近鄰居搜索，返回每個測試點的最近鄰居索引（indices）

In [11]:
def KNN(query_set, target_set, k):

    print("Start Looking for Neighbors ...")

    history_mat = []
    for key in target_set.keys():
        history_mat.append(target_set[key])
    test_mat = []
    query_keys = list(query_set.keys())  # 保存查詢鍵列表
    for key in query_set.keys():
        test_mat.append(query_set[key])

    nbrs = NearestNeighbors(n_neighbors=k, algorithm='brute').fit(history_mat)
    distances, indices = nbrs.kneighbors(test_mat)

    print(len(test_mat))
    #print(indices[0]) #[[3305 2654 ... 696][1313 ... 5668 453]]
    #print('Index :',indices[0][0])
    #print('Distances:',distances)
    print("Finish Looking for Neighbors !")

    return indices

In [12]:
def KNN_training(query_set, target_set, k):

    print("Start Looking for Neighbors ...")
    keys_list = list(target_set.keys())
    #print(keys_list)

    history_mat = []
    for key in target_set.keys(): # key 是 User ID
        history_mat.append(target_set[key])
    test_mat = []
    query_keys = list(query_set.keys())  # 保存查詢鍵列表
    for key in query_set.keys():
        test_mat.append(query_set[key])

    nbrs = NearestNeighbors(n_neighbors=k+1, algorithm='brute').fit(history_mat)
    distances, indices = nbrs.kneighbors(test_mat)
    all_filtered_indices = []  # 用于存储所有查询点的过滤后的邻居索引

    for i, key in enumerate(query_keys):
        index_to_remove = keys_list.index(key)  # 获取要删除的索引
        filtered_indices = [index for index in indices[i] if index != index_to_remove]
        all_filtered_indices.append(filtered_indices)

    print( all_filtered_indices[0])
    print('Index :',all_filtered_indices[0][0])

    print("Finish Looking for Neighbors !")

    return indices

## 取得用戶向量與對應的鄰居向量

In [13]:
def get_user_neighbor_vectors(data_chunk, training_key_set, test_key_set, validation_key_set, num_nearest_neighbors,temporal_decay_sum_history_test, temporal_decay_sum_history_training,temporal_decay_sum_history_validation):

    # 獲得每位用戶的鄰居索引
    index_test = KNN(temporal_decay_sum_history_test, temporal_decay_sum_history_training,num_nearest_neighbors)
    index_validation = KNN(temporal_decay_sum_history_validation, temporal_decay_sum_history_training,num_nearest_neighbors)
    index_training = KNN_training(temporal_decay_sum_history_training, temporal_decay_sum_history_training,num_nearest_neighbors)

    # 取得每位用戶與鄰居向量
    test_user_neighbor_set = test_get_neighbors_vectors(data_chunk,temporal_decay_sum_history_test, test_key_set, temporal_decay_sum_history_training, training_key_set, index_test)
    validation_user_neighbor_set = validation_get_neighbors_vectors(data_chunk,temporal_decay_sum_history_validation, validation_key_set, temporal_decay_sum_history_training, training_key_set, index_validation)
    training_user_neighbor_set = training_get_neighbors_vectors(data_chunk, temporal_decay_sum_history_training, training_key_set, index_training)

    status = "OK"
    return status

# 評估指標

In [14]:
def get_precision_recall_Fscore(groundtruth,pred):
    a = groundtruth
    b = pred
    correct = 0
    truth = 0
    positive = 0

    for idx in range(len(a)):
        if a[idx] == 1:
            truth += 1
            if b[idx] == 1:
                correct += 1
        if b[idx] == 1:
            positive += 1

    flag = 0
    if 0 == positive:
        precision = 0
        flag = 1
        #print('postivie is 0')
    else:
        precision = correct/positive
    if 0 == truth:
        recall = 0
        flag = 1
        #print('recall is 0')
    else:
        recall = correct/truth

    if flag == 0 and precision + recall > 0:
        F = 2*precision*recall/(precision+recall)
    else:
        F = 0
    return precision, recall, F, correct

def get_HT(groundtruth, pred_rank_list,k):
    count = 0
    for pred in pred_rank_list:
        if count >= k:
            break
        if groundtruth[pred] == 1:
            return 1
        count += 1

    return 0

def get_NDCG1(groundtruth, pred_rank_list,k):
    count = 0
    dcg = 0
    for pred in pred_rank_list:
        if count >= k:
            break
        if groundtruth[pred] == 1:
            dcg += (1)/math.log2(count+1+1)
        count += 1
    idcg = 0
    num_real_item = np.sum(groundtruth)
    num_item = int(num_real_item)
    for i in range(num_item):
        idcg += (1) / math.log2(i + 1 + 1)
    ndcg = dcg / idcg
    return ndcg

# 主程式

In [15]:
files = ['./cleaned_dataset/TaFeng_history.csv', './cleaned_dataset/TaFeng_future.csv'] # 歷史與未來的兩個檔案

# 讀取和處理兩個檔案
# data_chunk 長度為 2，其保存在歷史與未來中，對於每位用戶的所有購物籃。[0] 即為 history，data_chunk[1] 即為 future
data_chunk, input_size = read_claim2vector_embedding_file_no_vector(files)
training_key_set, validation_key_set, test_key_set = partition_the_data_validate(data_chunk, list(data_chunk[test_chunk]), 1) # 將數據分為訓練、驗證和測試集

# 取得用戶向量
temporal_decay_sum_history_training = temporal_decay_sum_history(data_chunk[training_chunk],training_key_set, input_size,group_size, within_decay_rate,group_decay_rate)
temporal_decay_sum_history_test = temporal_decay_sum_history(data_chunk[training_chunk],test_key_set, input_size,group_size, within_decay_rate,group_decay_rate)
temporal_decay_sum_history_validation = temporal_decay_sum_history(data_chunk[training_chunk],validation_key_set, input_size,group_size, within_decay_rate,group_decay_rate)

# # 取得所有用戶向量與其對應的鄰居向量
status = get_user_neighbor_vectors(data_chunk, training_key_set, test_key_set,validation_key_set,num_nearest_neighbors,temporal_decay_sum_history_test, temporal_decay_sum_history_training,temporal_decay_sum_history_validation)

print(status)

----- Start Dictionary Generation -----
# dimensions of final vector: 12085 | 2
----- Finish Dictionary Generation -----
num_dim+2 : 12087
----- Start Splitting Data Set -----
Number of training instances: 10059
Number of validation instances: 1117
Number of test instances: 2795
training_key_set[0]:  1069
validation_key_set[0]:  1858204
test_key_set[0]:  1974492
----- Finish splitting the data set -----
Temporal Decay Sum History ...
Temporal Decay Sum History ...
Temporal Decay Sum History ...
Start Looking for Neighbors ...
2795
Finish Looking for Neighbors !
Start Looking for Neighbors ...
1117
Finish Looking for Neighbors !
Start Looking for Neighbors ...
[2346, 9895, 8336, 8693, 3156, 2033, 9877, 9934, 7369, 1063, 7266, 131, 9854, 742, 6687, 9853, 9922, 8283, 9883, 1196, 2351, 7985, 9892, 1577, 9940, 3911, 9800, 2894, 9930, 9033, 9842, 7557, 6484, 4925, 2213, 1964, 6610, 3386, 7818, 5420, 9900, 243, 9885, 9929, 9927, 2519, 8385, 3113, 1416, 7551, 5246, 5310, 9928, 9905, 9882, 9856