In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math
import gzip
import pickle
import json
from torch.utils.data import Dataset, DataLoader
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

In [2]:
dataset = "Dunnhumby" # "Tafeng" or "Dunnhumby"
k = 30

# 隨資料集調整
batch_size = 32  # Tafeng = 64 / Dunnhumby = 32
learning_rate = 0.00001  # Tafeng = 0.0001 / Dunnhumby = 0.00001
vector_size = 3005  # Tafeng = 12087 / Dunnhumby = 3005
num_products = 3005  # Tafeng = 12087 / Dunnhumby = 3005

#固定參數設置
epochs = 80
embed_dim = 64
ffn_hidden_dim = 256
decay_rate = 0.3
dropout_rate = 0.3
num_heads = 4
num_trans_layers = 1
max_seq_length = 75


device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

In [3]:
with gzip.open(f'data/preprocessed_data/{dataset}_training_answer.gz', 'rb') as f:
    training_answers = pickle.load(f)

with gzip.open(f'data/preprocessed_data/{dataset}_validation_answer.gz', 'rb') as f:
    validation_answers = pickle.load(f)

with gzip.open(f'data/preprocessed_data/{dataset}_test_answer.gz', 'rb') as f:
    test_answers = pickle.load(f)

true_training_basket_dict = {item[0]: item[2].float() if not isinstance(item[2], torch.Tensor) else item[2].float() for item in training_answers}
true_validation_basket_dict = {item[0]: item[2].float() if not isinstance(item[2], torch.Tensor) else item[2].float() for item in validation_answers }
true_test_basket_dict = {item[0]: item[2].float() if not isinstance(item[2], torch.Tensor) else item[2].float() for item in test_answers }

training_embedding_file = f'data/{dataset}/basketembedding/training_basketembedding_{embed_dim}.pkl.gz'
training_neighbors_file = f'data/{dataset}/training_neighbors_for_dlim.json.gz'

validation_embedding_file = f'data/{dataset}/basketembedding/validation_basketembedding_{embed_dim}.pkl.gz'
validation_neighbors_file = f'data/{dataset}/validation_neighbors_for_dlim.json.gz'

test_embedding_file = f'data/{dataset}/basketembedding/test_basketembedding_{embed_dim}.pkl.gz'
test_neighbors_file = f'data/{dataset}/test_neighbors_for_dlim.json.gz'

In [4]:
class BasketDataset(Dataset):
    # 接收訓練集的嵌入向量文件路徑、鄰居信息文件路徑、真實購物籃字典以及最大序列長度作為參數
    def __init__(self, training_embedding_file, training_neighbors_file, true_training_basket_dict,max_seq_length=max_seq_length):
        with gzip.open(training_embedding_file, 'rb') as f:
            self.basket_embeddings = pickle.load(f)
        with gzip.open(training_neighbors_file, 'rb') as f:
            self.neighbors = json.load(f)
        self.true_training_basket_dict = true_training_basket_dict
        self.max_seq_length = max_seq_length
        
    # 返回數據集中的樣本數量
    def __len__(self):
        return len(self.neighbors)

    # 計算與最晚日期的差值，並返回這些差值的列表
    def calculate_relative_dates(self, transaction_dates):
        #dates = [np.datetime64(date) for date in transaction_dates] # Tafeng 要跑這行
        dates = [np.datetime64(f"{str(date)[:4]}-{str(date)[4:6]}-{str(date)[6:]}") for date in transaction_dates] # Dunnhumby 要跑這行
        max_date = max(dates) + np.timedelta64(1, 'D')
        relative_dates = [(max_date - date).astype(int) for date in dates]
        return relative_dates


    # 按索引獲取數據集中的單個樣本
    def __getitem__(self, idx):
        user_id, neighbors_ids = self.neighbors[idx]

        # 獲取指定用戶的購物籃嵌入向量和交易日期，並計算相對日期
        user_data = self.basket_embeddings.get(user_id, [])
        user_embeddings = [torch.tensor(embedding[0]) for embedding in user_data]
        user_dates = [embedding[1] for embedding in user_data]
        user_dates = self.calculate_relative_dates(user_dates)

        # 初始化用戶的嵌入向量和交易日期的填充張量。如果用戶的購物籃數據少於最大序列長度，則使用零和 -1 進行填充
        user_embeddings_padded = torch.zeros((self.max_seq_length, len(user_embeddings[0])))
        user_dates_padded = torch.full((self.max_seq_length,), -1, dtype=torch.int64)  # 使用 -1 填充日期

        # pad_sequence 是 PyTorch 中的一個函數，用於將一系列長度不一的序列填充到相同的長度。
        if user_embeddings:
            user_embeddings_tensor = pad_sequence(user_embeddings, batch_first=True) # 將不同長度的用戶購物籃嵌入向量填充到相同的長度，生成一個統一的張量 
            user_dates_tensor = torch.tensor(user_dates, dtype=torch.int64)
            user_seq_len = min(self.max_seq_length, len(user_dates)) # 實際要使用的序列長度

            user_embeddings_padded[:user_seq_len, :] = user_embeddings_tensor[:user_seq_len, :]
            user_dates_padded[:user_seq_len] = user_dates_tensor[:user_seq_len]

        # 初始化邻居嵌入向量和交易日期的填充列表
        neighbor_embeddings_padded = torch.zeros((300, self.max_seq_length, len(user_embeddings[0])))
        neighbor_dates_padded = torch.full((300, self.max_seq_length), -1, dtype=torch.int64)  # 使用 -1 填充日期

        # 填充邻居的购物篮嵌入向量和交易日期
        for i, neighbor_id in enumerate(neighbors_ids):
            n_data = self.basket_embeddings.get(neighbor_id, []) # 獲取該鄰居的購物籃數據。如果找不到對應的數據，則返回一個空列表。

            # 分別從鄰居的購物籃數據中提取嵌入向量和交易日期
            n_embeddings = [torch.tensor(embedding[0]) for embedding in n_data]
            n_dates = [embedding[1] for embedding in n_data]
            n_dates = self.calculate_relative_dates(n_dates)
            
            if n_embeddings:
                n_embeddings_tensor = pad_sequence(n_embeddings, batch_first=True)
                n_dates_tensor = torch.tensor(n_dates, dtype=torch.int64)
                seq_len = min(self.max_seq_length, len(n_dates))

                neighbor_embeddings_padded[i, :seq_len, :] = n_embeddings_tensor[:seq_len, :]
                neighbor_dates_padded[i, :seq_len] = n_dates_tensor[:seq_len]

        true_basket_vector = self.true_training_basket_dict.get(user_id, torch.zeros(vector_size))
        return user_embeddings_padded, user_dates_padded, neighbor_embeddings_padded, neighbor_dates_padded, true_basket_vector

def create_dataloader(embedding_file, neighbors_file, batch_size, true_basket_dict):
    dataset = BasketDataset(embedding_file, neighbors_file, true_basket_dict, max_seq_length=max_seq_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader

In [5]:
class TemporalAttention(nn.Module):
    def __init__(self, decay_rate, embedding_dim):
        super(TemporalAttention, self).__init__()
        self.decay_rate = nn.Parameter(torch.tensor(decay_rate))
        self.embedding_dim = embedding_dim

    def forward(self, basket_sequence, transaction_dates):
        mask = (transaction_dates != -1).float()
        decay_weights = torch.exp(-self.decay_rate * transaction_dates)
        decay_weights = decay_weights * mask
        decay_weights_sum = decay_weights.sum(1, keepdim=True)
        normalized_weights = decay_weights / decay_weights_sum
        user_embedding = torch.sum(normalized_weights.unsqueeze(-1) * basket_sequence, dim=1)
        return user_embedding

class TransformerLayer(nn.Module):
    def __init__(self, embedding_dim, num_heads, ffn_hidden_dim, dropout_rate):
        super(TransformerLayer, self).__init__()
        self.multihead_attn = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads)
        self.feed_forward = FeedForward(embedding_dim, ffn_hidden_dim, dropout_rate)
        self.layer_norm1 = nn.LayerNorm(embedding_dim)
        self.layer_norm2 = nn.LayerNorm(embedding_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, src):
        attn_output, _ = self.multihead_attn(src, src, src)
        src = self.layer_norm1(src + attn_output)
        ffn_output = self.feed_forward(src)
        src = self.layer_norm2(src + ffn_output)
        return src

class FeedForward(nn.Module):
    def __init__(self, embedding_dim, ffn_hidden_dim, dropout_rate):
        super(FeedForward, self).__init__()
        self.fc1 = nn.Linear(embedding_dim, ffn_hidden_dim)
        self.fc2 = nn.Linear(ffn_hidden_dim, embedding_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.layer_norm = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        x_ffn = self.fc2(F.relu(self.fc1(x)))
        x = self.layer_norm(x + self.dropout(x_ffn))
        return x

class MLPLayer(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MLPLayer, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [6]:
class RecommendationModel(nn.Module):
    def __init__(self, embedding_dim, num_heads, decay_rate, ffn_hidden_dim, num_products, dropout_rate, num_trans_layers=num_trans_layers):
        super(RecommendationModel, self).__init__()
        self.temporal_attention = TemporalAttention(decay_rate, embedding_dim)
        self.transformer_layers = nn.ModuleList([
            TransformerLayer(embedding_dim, num_heads, ffn_hidden_dim, dropout_rate) for _ in range(num_trans_layers)
        ])
        self.mlp = MLPLayer(embedding_dim, ffn_hidden_dim, num_products)

    def forward(self, user_basket_sequence, user_transaction_dates, neighbor_basket_sequence, neighbor_transaction_dates):
        user_embedding = self.temporal_attention(user_basket_sequence, user_transaction_dates)
        neighbor_embeddings = torch.stack([     
            self.temporal_attention(neighbor_seq, neighbor_dates)
            for neighbor_seq, neighbor_dates in zip(neighbor_basket_sequence, neighbor_transaction_dates)
        ]).transpose(0, 1)

        for layer in self.transformer_layers:
            neighbor_embeddings = layer(neighbor_embeddings)

        neighbor_embedding = neighbor_embeddings[-1]
        combined_embedding = user_embedding + neighbor_embedding
        output = self.mlp(combined_embedding.squeeze(0))

        return output

In [7]:
def calculate_topk_metrics(predictions, targets, k):
    # 将模型输出转换为 top-k 二值向量
    _, top_indices = torch.topk(predictions, k, dim=1)
    topk_binary_vector = torch.zeros_like(predictions)
    topk_binary_vector.scatter_(1, top_indices, 1)

    # 计算 true positives, false positives, false negatives
    true_positives = torch.sum(topk_binary_vector * targets, dim=1)
    false_positives = torch.sum(topk_binary_vector * (1 - targets), dim=1)
    false_negatives = torch.sum((1 - topk_binary_vector) * targets, dim=1)

    # 计算指标
    recall = torch.mean(true_positives / (true_positives + false_negatives))
    precision = torch.mean(true_positives / (true_positives + false_positives))
    f1 = 2 * (precision * recall) / (precision + recall)
    
    # 计算 Hit Ratio (HR)
    hr = torch.mean((true_positives > 0).float())

    return recall.item(), precision.item(), f1.item(), hr.item()

In [8]:
def ndcg_score(predictions, targets, k):

    # 获取 top-k 预测项的索引
    _, top_indices = torch.topk(predictions, k, dim=1)
    
    # 生成 DCG 分数
    dcg = 0.0
    for i in range(1, k + 1):
        dcg += ((2 ** targets.gather(1, top_indices[:, i - 1].view(-1, 1)) - 1) / torch.log2(torch.tensor(i + 1).float())).squeeze()

    # 生成理想的 DCG 分数 (IDCG)
    _, ideal_indices = torch.topk(targets, k, dim=1)
    idcg = 0.0
    for i in range(1, k + 1):
        idcg += ((2 ** targets.gather(1, ideal_indices[:, i - 1].view(-1, 1)) - 1) / torch.log2(torch.tensor(i + 1).float())).squeeze()

    # 处理 IDCG 为 0 的情况，防止除以零
    idcg[idcg == 0] = 1.0

    # 计算 NDCG
    ndcg = torch.mean(dcg / idcg)

    return ndcg.item()

In [9]:
training_loader = create_dataloader(training_embedding_file, training_neighbors_file, batch_size, true_training_basket_dict)
validation_loader = create_dataloader(validation_embedding_file, validation_neighbors_file, batch_size, true_validation_basket_dict)
test_loader = create_dataloader(test_embedding_file, test_neighbors_file, batch_size, true_test_basket_dict)

# 實例化模型
recommendation_model =  RecommendationModel(embedding_dim=embed_dim, num_heads=num_heads, decay_rate=decay_rate, ffn_hidden_dim=ffn_hidden_dim, num_products=num_products, dropout_rate=dropout_rate).to(device)

# 定義損失函數和優化器
loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(recommendation_model.parameters(), lr=learning_rate)

In [10]:
def validate_model(recommendation_model, validation_loader, device, loss_function, calculate_topk_metrics, ndcg_score, k):

    recommendation_model.eval()  # 设置模型为评估模式

    val_loss = 0.0
    val_metrics = {'recall': 0.0, 'precision': 0.0, 'f1': 0.0, 'hr': 0.0, 'ndcg': 0.0}

    for batch in validation_loader:
        user_embeddings, user_transaction_dates, neighbor_embeddings, neighbor_transaction_dates, true_basket_vector = batch
        user_embeddings = user_embeddings.to(device)
        user_transaction_dates = user_transaction_dates.to(device)
        neighbor_embeddings = [ne.to(device) for ne in neighbor_embeddings]
        neighbor_transaction_dates = [nt.to(device) for nt in neighbor_transaction_dates]
        true_basket_vector = true_basket_vector.to(device)

        with torch.no_grad():
            
            predicted_scores = recommendation_model(user_embeddings, user_transaction_dates, neighbor_embeddings, neighbor_transaction_dates)
            #normalized_ans = predicted_scores / torch.sum(predicted_scores)

            loss = loss_function(predicted_scores, true_basket_vector.float())
            val_loss += loss.item()
            
            recall, precision, f1, hr = calculate_topk_metrics(predicted_scores, true_basket_vector, k)
            ndcg = ndcg_score(predicted_scores, true_basket_vector, k)

            val_metrics['recall'] += recall
            val_metrics['precision'] += precision
            val_metrics['f1'] += f1
            val_metrics['hr'] += hr
            val_metrics['ndcg'] += ndcg

    avg_loss = val_loss / len(validation_loader)
    avg_metrics = {k: val_metrics[k] / len(validation_loader) for k in val_metrics}
    
    return avg_loss, avg_metrics

In [11]:
def test_model(recommendation_model, test_loader, device, loss_function, calculate_topk_metrics, ndcg_score, k):

    recommendation_model.eval()  # 设置模型为评估模式

    test_loss = 0.0
    test_metrics = {'recall': 0.0, 'precision': 0.0, 'f1': 0.0, 'hr': 0.0, 'ndcg': 0.0}

    for batch in test_loader:
        user_embeddings, user_transaction_dates, neighbor_embeddings, neighbor_transaction_dates, true_basket_vector = batch
        user_embeddings = user_embeddings.to(device)
        user_transaction_dates = user_transaction_dates.to(device)
        neighbor_embeddings = [ne.to(device) for ne in neighbor_embeddings]
        neighbor_transaction_dates = [nt.to(device) for nt in neighbor_transaction_dates]
        true_basket_vector = true_basket_vector.to(device)

        with torch.no_grad():
            
            predicted_scores = recommendation_model(user_embeddings, user_transaction_dates, neighbor_embeddings, neighbor_transaction_dates)
            #normalized_ans = predicted_scores / torch.sum(predicted_scores)
            
            loss = loss_function(predicted_scores, true_basket_vector.float())
            test_loss += loss.item()

            recall, precision, f1, hr = calculate_topk_metrics(predicted_scores, true_basket_vector, k)
            ndcg = ndcg_score(predicted_scores, true_basket_vector, k)
        
            test_metrics['recall'] += recall
            test_metrics['precision'] += precision
            test_metrics['f1'] += f1
            test_metrics['hr'] += hr
            test_metrics['ndcg'] += ndcg

    avg_loss = test_loss / len(test_loader)
    avg_metrics = {k: test_metrics[k] / len(test_loader) for k in test_metrics}

    return avg_loss, avg_metrics

In [12]:
best_val_ndcg = -float('inf') 
patience = 2
no_improvement_count = 0
best_model_state = None

for epoch in range(epochs):
    
    recommendation_model.train()
    training_progress_bar = tqdm(training_loader, desc=f'Epoch {epoch+1}/{epochs}', unit='batch')
    
    for batch in training_progress_bar:
        
        user_embeddings, user_transaction_dates, neighbor_embeddings, neighbor_transaction_dates, true_basket_vector = batch
        user_embeddings = user_embeddings.to(device)
        user_transaction_dates = user_transaction_dates.to(device)
        neighbor_embeddings = [ne.to(device) for ne in neighbor_embeddings]
        neighbor_transaction_dates = [nt.to(device) for nt in neighbor_transaction_dates]
        true_basket_vector = true_basket_vector.to(device)

        
        predicted_scores = recommendation_model(user_embeddings, user_transaction_dates, neighbor_embeddings, neighbor_transaction_dates)
        #normalized_ans = predicted_scores / torch.sum(predicted_scores)

        loss = loss_function(predicted_scores, true_basket_vector.float())  # 确保 answer_vector 是 float 类型

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        training_progress_bar.set_description(f"Epoch {epoch+1}/{epochs} Loss: {loss.item() / len(training_loader)}")

     # 在每个epoch结束后进行验证
    val_loss, val_metrics = validate_model(
        recommendation_model, validation_loader, device, loss_function, calculate_topk_metrics, ndcg_score, k)
    
    tqdm.write(f'Validation Loss: {val_loss:.4f} | Recall: {val_metrics["recall"]:.4f} | Precision: {val_metrics["precision"]:.4f} | F1 Score: {val_metrics["f1"]:.4f} | NDCG: {val_metrics["ndcg"]:.4f} | HR: {val_metrics["hr"]:.4f}')

    if val_metrics['ndcg'] > best_val_ndcg:
        best_val_ndcg = val_metrics['ndcg']
        no_improvement_count = 0
        best_model_state = {
            'recommendation_model': recommendation_model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
    else:
        no_improvement_count += 1
    
    # 如果没有改进的计数达到了 patience，则停止训练
    if no_improvement_count >= patience:
        print("Early stopping due to no improvement in validation NDCG.")
        break

if best_model_state:
    torch.save(best_model_state, 'DLIM_Best_model.pth')

# 加载最佳模型状态
best_model_state = torch.load('DLIM_Best_model.pth')
recommendation_model.load_state_dict(best_model_state['recommendation_model'])
optimizer.load_state_dict(best_model_state['optimizer'])

# 在所有训练循环结束后调用测试函数
test_loss, test_metrics = test_model(
    recommendation_model, test_loader, device, loss_function, calculate_topk_metrics, ndcg_score, k)
tqdm.write(f'Test Loss: {test_loss:.4f} | Recall: {test_metrics["recall"]:.4f} | Precision: {test_metrics["precision"]:.4f} | F1 Score: {test_metrics["f1"]:.4f} | NDCG: {test_metrics["ndcg"]:.4f} | HR: {test_metrics["hr"]:.4f}')

Epoch 1/80 Loss: 0.0019078120640817397: 100%|██████████| 289/289 [13:24<00:00,  2.78s/batch]


Validation Loss: 0.5503 | Recall: 0.0051 | Precision: 0.0019 | F1 Score: nan | NDCG: 0.0031 | HR: 0.0530


Epoch 2/80 Loss: 0.0009909687776466555: 100%|██████████| 289/289 [13:23<00:00,  2.78s/batch]


Validation Loss: 0.2896 | Recall: 0.0066 | Precision: 0.0021 | F1 Score: nan | NDCG: 0.0036 | HR: 0.0578


Epoch 3/80 Loss: 0.00045248051415677716: 100%|██████████| 289/289 [13:21<00:00,  2.77s/batch]


Validation Loss: 0.1271 | Recall: 0.0068 | Precision: 0.0021 | F1 Score: nan | NDCG: 0.0037 | HR: 0.0597


Epoch 4/80 Loss: 0.0002226193522499507: 100%|██████████| 289/289 [13:21<00:00,  2.78s/batch] 


Validation Loss: 0.0628 | Recall: 0.0380 | Precision: 0.0093 | F1 Score: nan | NDCG: 0.0330 | HR: 0.2500


Epoch 5/80 Loss: 0.00015028443664415485: 100%|██████████| 289/289 [13:21<00:00,  2.78s/batch]


Validation Loss: 0.0379 | Recall: 0.0767 | Precision: 0.0176 | F1 Score: nan | NDCG: 0.0841 | HR: 0.3930


Epoch 6/80 Loss: 0.00010542623750272506: 100%|██████████| 289/289 [13:25<00:00,  2.79s/batch]


Validation Loss: 0.0274 | Recall: 0.0959 | Precision: 0.0237 | F1 Score: 0.0374 | NDCG: 0.1078 | HR: 0.4631


Epoch 7/80 Loss: 8.575282475321351e-05: 100%|██████████| 289/289 [13:23<00:00,  2.78s/batch] 


Validation Loss: 0.0229 | Recall: 0.1165 | Precision: 0.0299 | F1 Score: 0.0465 | NDCG: 0.1198 | HR: 0.5152


Epoch 8/80 Loss: 6.510553217676684e-05: 100%|██████████| 289/289 [13:25<00:00,  2.79s/batch] 


Validation Loss: 0.0195 | Recall: 0.1374 | Precision: 0.0344 | F1 Score: 0.0544 | NDCG: 0.1356 | HR: 0.5303


Epoch 9/80 Loss: 4.839368976626842e-05: 100%|██████████| 289/289 [13:20<00:00,  2.77s/batch] 


Validation Loss: 0.0180 | Recall: 0.1572 | Precision: 0.0392 | F1 Score: 0.0623 | NDCG: 0.1404 | HR: 0.5663


Epoch 10/80 Loss: 5.898452573375306e-05: 100%|██████████| 289/289 [13:21<00:00,  2.77s/batch] 


Validation Loss: 0.0173 | Recall: 0.1683 | Precision: 0.0424 | F1 Score: 0.0672 | NDCG: 0.1494 | HR: 0.5890


Epoch 11/80 Loss: 6.653604365137622e-05: 100%|██████████| 289/289 [13:21<00:00,  2.77s/batch] 


Validation Loss: 0.0167 | Recall: 0.1810 | Precision: 0.0439 | F1 Score: 0.0695 | NDCG: 0.1610 | HR: 0.5890


Epoch 12/80 Loss: 6.682421225783735e-05: 100%|██████████| 289/289 [13:21<00:00,  2.77s/batch] 


Validation Loss: 0.0166 | Recall: 0.1612 | Precision: 0.0418 | F1 Score: 0.0658 | NDCG: 0.1434 | HR: 0.5720


Epoch 13/80 Loss: 5.606653420157911e-05: 100%|██████████| 289/289 [13:22<00:00,  2.78s/batch] 


Validation Loss: 0.0162 | Recall: 0.1664 | Precision: 0.0434 | F1 Score: 0.0684 | NDCG: 0.1465 | HR: 0.5720
Early stopping due to no improvement in validation NDCG.
Test Loss: 0.0168 | Recall: 0.1634 | Precision: 0.0433 | F1 Score: 0.0678 | NDCG: 0.1408 | HR: 0.5819
