## 1. 載入套件

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm # 進度條
import os

from gensim.models import Word2Vec
import multiprocessing as mp
print('Number of CPU cores:', mp.cpu_count())

Number of CPU cores: 24


## 2. 讀取資料

#### Ta Feng Dataset

In [2]:
df = pd.read_csv('data/TaFeng_clean.csv') # "TaFeng_clean" or "Dunnhumby_clean"
df

Unnamed: 0,CUSTOMER_ID,PRODUCT_ID,TRANSACTION_DT,CART_ID,new_item_id
0,1069,9556439880610,2000-11-13,0,0
1,1069,4710176008699,2000-11-13,0,1
2,1069,4710320224661,2001-01-21,1,2
3,1069,4710022101208,2001-01-21,1,3
4,1069,4712603661644,2001-01-21,1,4
...,...,...,...,...,...
588020,20002000,20513184,2001-01-20,93899,1379
588021,20002000,4714541091071,2001-01-20,93899,7079
588022,20002000,4710018008634,2001-01-20,93899,2660
588023,20002000,4710085120680,2001-02-05,93900,309


## 3. Item2Vec 方法

### 使用 gensim 的 Word2vec (將Window sizes設為很大)
https://github.com/ikatsov/tensor-house/blob/master/recommendations/item2vec.ipynb

In [3]:
# 建立一個 cart list(相當於句子)，其中每個 cart 還包含此購物車中的所有 item ID (相當於單詞) list
def make_item_corpus(df_list):  # df_list: 購物車,新項目id 串列
    print('建立項目語料庫...')
    item_list = [] # 暫存單個購物車的商品 ID
    cart_list = [] # 儲存所有的購物車
    new_cart_id = df_list[0][0]
    for (CART_ID, ITEM_ID) in tqdm(df_list):
        if new_cart_id != CART_ID: # 表示開始了一個新的購物車
            cart_list.append(item_list) # 將當前購物車的商品 ID 列表添加到 cart_list
            item_list = [] # 為下一個購物車的商品 ID 做準備
            new_cart_id = CART_ID # 更新為當前處理的購物車 ID
        item_list.append(ITEM_ID) # 添加當前商品 ID
    cart_list.append(item_list)
    print(cart_list[:10])
    return cart_list

In [4]:
def word2vec(cart_list, DATASET_NAME, TRAIN_ITEM_MODEL):
#     TRAIN_ITEM_MODEL = True   # True - create a new model, False - load a previosuly created model
    LOGGING_ELABLED = True # 啟用日誌記錄
    MODEL_DIR = 'data/item2vec_models' #  定義模型將要保存的目錄

    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)

    # 設置了日誌記錄的格式和級別
    if LOGGING_ELABLED:
        import logging
        logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

    # 設定 Word2Vec 模型的嵌入向量維度為 16
    WORD_DIM = 64
    # 生成模型文件的保存路徑和文件名
    model_filename = f'data/item2vec_models/item2vec_{DATASET_NAME}.{WORD_DIM}d-testttttt.model'

    # 創建一個新的 Word2Vec 模型實例
    # sentences=cart_list: 模型訓練數據，即購物車列表，其中每個購物車被視為一個「句子」，購物車中的商品 ID 是「句子」中的「單詞」。
    # window: 窗口大小，是指在每個目標詞周圍考慮的詞的數量。較大的窗口大小會考慮更多的詞，但也可能導致訓練變慢。
    # vector_size：向量維度，即每個詞將被轉換成多大的向量
    # sg：選擇訓練算法。1＝ 使用 Skip-gram 算法，0＝ 使用 CBOW 算法。Skip-gram 通常在處理較大的數據集時效果更好
    # hs=0: 表示不使用分層 softmax 優化訓練
    # negative：設定負採樣數量，Ex. 5 表示每個目標詞將與 5 個「負」詞一起被用於訓練
    # ns_exponent：控制負採樣過程中詞頻的影響，數值越低，低頻詞被選為「負詞」的機率就越高
    # workers：設定訓練過程要使用的線程數，是基於系統的 CPU 核心數量
    # min_count：設定詞頻閾值，只有出現次數大於或等於 min_count 的詞才會被考慮在內

    if TRAIN_ITEM_MODEL:
        model = Word2Vec(sentences=cart_list,
                    window=500,
                    vector_size=WORD_DIM,
                    sg=1,
                    hs=0,
                    negative=5,
                    ns_exponent=0.75,
                    workers=4, # mp.cpu_count()
                    min_count=1
                    )

        model.save(model_filename)
        print(f'Model saved to [{model_filename}]')

    else:
        model = Word2Vec.load(model_filename)
        print(f'Model loaded from [{model_filename}]')
    return model

#### Ta Feng Dataset

In [5]:
# 購物車,新項目id 串列
df_list = df[['CART_ID','new_item_id']].values.tolist() # Tafeng: new_item_id / Dunnhumby: NEW_ITEM_ID
df_list[:10]

[[0, 0],
 [0, 1],
 [1, 2],
 [1, 3],
 [1, 4],
 [2, 5],
 [2, 1],
 [2, 6],
 [2, 7],
 [2, 8]]

In [6]:
cart_list = make_item_corpus(df_list)
model = word2vec( cart_list, "TaFeng", True ) # "TaFeng" or "Dunnhumby"

建立項目語料庫...


  0%|          | 0/588025 [00:00<?, ?it/s]

[[0, 1], [2, 3, 4], [5, 1, 6, 7, 8], [9], [10, 11], [10, 12, 13], [14, 15, 16, 17, 18, 19], [20, 21, 22, 23, 24, 25], [26, 27, 28, 29, 30, 31, 32, 33, 34], [32, 35]]


INFO - 16:56:49: collecting all words and their counts
INFO - 16:56:49: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 16:56:49: PROGRESS: at sentence #10000, processed 60798 words, keeping 9560 word types
INFO - 16:56:49: PROGRESS: at sentence #20000, processed 121502 words, keeping 11211 word types
INFO - 16:56:49: PROGRESS: at sentence #30000, processed 181121 words, keeping 11741 word types
INFO - 16:56:49: PROGRESS: at sentence #40000, processed 251162 words, keeping 11980 word types
INFO - 16:56:49: PROGRESS: at sentence #50000, processed 331634 words, keeping 12058 word types
INFO - 16:56:49: PROGRESS: at sentence #60000, processed 399666 words, keeping 12076 word types
INFO - 16:56:49: PROGRESS: at sentence #70000, processed 454638 words, keeping 12081 word types
INFO - 16:56:49: PROGRESS: at sentence #80000, processed 510101 words, keeping 12085 word types
INFO - 16:56:49: PROGRESS: at sentence #90000, processed 566180 words, keeping 12085 word types


Model saved to [data/item2vec_models/item2vec_TaFeng.64d-testttttt.model]


In [7]:
# Prepare the embeddings

# 提取訓練得到的詞向量
# model.wv 是一個 KeyedVectors 實例，它儲存所有詞的向量表示
word_vectors = model.wv

# 提取模型詞彙表中所有詞的列表。key_to_index 是一個字典，它將詞映射到它們在詞向量矩陣中的索引
vocab = list(model.wv.key_to_index.keys())

# 創建一個字典，其中每個商品 ID（從 vocab）都映射到它的向量表示，這相當於從 model.wv 提取每個商品的嵌入向量
item2vector_dict = {arg:model.wv[arg] for arg in vocab}
print(item2vector_dict[1])

#以便每行代表一個商品的嵌入向量。.values 將 DataFrame 轉換為 NumPy 數組，方便進一步分析和處理。
X = pd.DataFrame(item2vector_dict).T.values

# 商品嵌入向量的維度(即矩陣的形狀), 詞彙表的大小(即有多少個不同的商品 ID), 詞彙表中的第一個商品 ID
X.shape, len(vocab), vocab[0]

[ 0.2744593  -0.06972869  0.12451126  0.15126161  0.04890731 -0.17976627
  0.11041114 -0.1475229  -0.55520064  0.08602219 -0.02034035 -0.1156313
 -0.39300516  0.11624952  0.03018883  0.20018095 -0.2252368   0.20514482
 -0.30678663  0.5383943   0.44021764  0.24018863  0.0804408  -0.38274306
 -0.34148282  0.2132978  -0.13633774  0.0169643   0.06408934 -0.16542639
  0.0288031   0.24389538 -0.14087474 -0.3469186  -0.06307347  0.3839908
 -0.01595192 -0.10058598  0.02708765  0.20208071  0.01800046  0.14465913
 -0.04738482 -0.08663592  0.22521757 -0.1462983  -0.16864687 -0.29218966
  0.10396431  0.15574908 -0.07660682 -0.08800297 -0.07759112  0.18041314
  0.17303097  0.15739022  0.13600916 -0.13677712 -0.15032443  0.26188704
 -0.16819656  0.25068197 -0.20056173  0.09130795]


((12085, 64), 12085, 172)

#### 檢查

In [8]:
# 單獨提取 key_to_index 屬性，它是一個字典，將每個商品 ID 映射到其在模型詞向量矩陣中的索引。這對於理解特定商品的位置和查找特定商品的向量很有用

model.wv.key_to_index

{172: 0,
 11: 1,
 445: 2,
 670: 3,
 510: 4,
 206: 5,
 37: 6,
 142: 7,
 780: 8,
 1011: 9,
 351: 10,
 515: 11,
 18: 12,
 198: 13,
 374: 14,
 659: 15,
 525: 16,
 657: 17,
 976: 18,
 17: 19,
 697: 20,
 30: 21,
 656: 22,
 19: 23,
 1381: 24,
 23: 25,
 382: 26,
 1000: 27,
 9: 28,
 302: 29,
 754: 30,
 79: 31,
 100: 32,
 2140: 33,
 2070: 34,
 948: 35,
 450: 36,
 544: 37,
 926: 38,
 835: 39,
 1055: 40,
 316: 41,
 2983: 42,
 547: 43,
 1148: 44,
 399: 45,
 1015: 46,
 29: 47,
 373: 48,
 1095: 49,
 493: 50,
 5: 51,
 2660: 52,
 387: 53,
 470: 54,
 425: 55,
 1292: 56,
 309: 57,
 1650: 58,
 508: 59,
 2165: 60,
 1491: 61,
 548: 62,
 504: 63,
 237: 64,
 716: 65,
 1431: 66,
 84: 67,
 380: 68,
 1510: 69,
 449: 70,
 2597: 71,
 859: 72,
 1469: 73,
 188: 74,
 529: 75,
 1016: 76,
 1773: 77,
 355: 78,
 1376: 79,
 982: 80,
 1046: 81,
 790: 82,
 761: 83,
 3205: 84,
 623: 85,
 236: 86,
 1777: 87,
 321: 88,
 526: 89,
 329: 90,
 349: 91,
 931: 92,
 1043: 93,
 503: 94,
 185: 95,
 1634: 96,
 459: 97,
 2004: 98,
 269: 

In [9]:
print(item2vector_dict[0])
print(item2vector_dict[172])

item2vector_dict

[ 0.22683981  0.22054446  0.10293441  0.36279407  0.11660517 -0.15873444
  0.3446863  -0.10191789 -0.60256255  0.22636715  0.08521863 -0.20319217
 -0.53405553  0.16914053  0.07159636 -0.10881545 -0.22665371  0.3292266
 -0.5436907   0.8927455   0.3223739   0.5118548  -0.25780022 -0.1674987
 -0.66067827  0.51831967 -0.07198093 -0.05509546  0.37069842 -0.14837071
  0.21924454  0.4758068  -0.18020092 -0.280614   -0.12698755  0.37664822
  0.01708253  0.11531627 -0.04196359 -0.10225628  0.3481526   0.04910057
 -0.3683064   0.07965107  0.17266966 -0.36050487 -0.39134645 -0.38626534
  0.31705153  0.5796713   0.0287996   0.13964634 -0.07049368  0.03839892
  0.10503736  0.34329018  0.13034636 -0.61309505 -0.06742617  0.12760326
 -0.16648823  0.06017849 -0.0867815   0.07847928]
[-0.08993561  0.2079665   0.04921354  0.11272669 -0.36242244  0.17890461
 -0.08409926 -0.22432566 -0.8681603  -0.52758145 -0.2613575  -0.5094962
 -0.3401165   0.38210964  0.08813766  0.13566989  0.5997439   0.00152849
 -0.

{172: array([-0.08993561,  0.2079665 ,  0.04921354,  0.11272669, -0.36242244,
         0.17890461, -0.08409926, -0.22432566, -0.8681603 , -0.52758145,
        -0.2613575 , -0.5094962 , -0.3401165 ,  0.38210964,  0.08813766,
         0.13566989,  0.5997439 ,  0.00152849, -0.5034547 ,  0.20006518,
         0.72072905,  0.09222884,  0.29559085, -0.6640137 , -0.18046781,
         0.06562667,  0.00848093,  0.19111738,  0.2301161 , -0.20323825,
        -0.04602247,  0.07973167, -0.16458045, -0.44253075,  0.46353146,
         0.353113  ,  0.03975028, -0.34660995,  0.42578006,  0.2365155 ,
        -0.03587336,  0.22177136, -0.20607789, -0.30815423,  0.42957106,
        -0.11816357,  0.06614228, -0.05249067,  0.27689838, -0.00212044,
         0.10492659,  0.10711379, -0.03238023,  0.0517767 ,  0.5477569 ,
        -0.04418988,  0.5267664 , -0.19244705, -0.03473813,  0.379957  ,
        -0.25628462,  0.11327576, -0.3229673 , -0.47917166], dtype=float32),
 11: array([ 0.5122984 ,  0.27866036,  0.0