In [16]:
import os
import json
import numpy as np
import pickle

dataset_folder = '/home/jy1559/Mar2025_Module/Datasets'
dataset_name = 'Retail_Rocket'


def _dataset_path(filename):
    """Return the path of `filename` inside <dataset_folder>/<dataset_name>."""
    return os.path.join(dataset_folder, dataset_name, filename)


# Inputs: files that are still keyed by the original item ids
INTER_FILE = _dataset_path("interactions_revised.json")
CAND_FILE = _dataset_path("candidate_sets_revised.npz")
ITEM_EMB = _dataset_path("item_embedding_normalized_revised.pickle")
ART_EMB = _dataset_path("articles_embeddings.pickle")

# Outputs: file names for the dense-id versions, plus the id mapping itself
INTER_OUT = _dataset_path("interactions_dense.json")
CAND_OUT = _dataset_path("candidate_sets_dense.npz")
ITEM_OUT = _dataset_path("item_embedding_dense.pickle")
ART_OUT = _dataset_path("articles_embeddings_dense.pickle")
MAP_FILE = _dataset_path("item_id_mapping.pkl")

In [17]:
# 1) Collect every item id that appears in the interaction file
with open(INTER_FILE, "r", encoding="utf-8") as f:
    data_json = json.load(f)

# The first field of each interaction entry is read as the item id.
item_set = {
    int(t[0])
    for sessions in data_json["data"].values()
    for sess in sessions
    for t in sess
}

print(f"Unique item count: {len(item_set):,}")

# 2) Dense ids: 0 is reserved for PAD, real items get 1..N in ascending
#    order of their original id.
ordered_ids = sorted(item_set)
dense_ids = dict(zip(ordered_ids, range(1, len(ordered_ids) + 1)))
reverse   = {idx: orig for orig, idx in dense_ids.items()}

# 3) Persist both directions of the mapping
with open(MAP_FILE, "wb") as f:
    pickle.dump({"id2idx": dense_ids, "idx2id": reverse}, f)
print(">> mapping saved :", MAP_FILE)


Unique item count: 100,822
>> mapping saved : /home/jy1559/Mar2025_Module/Datasets/Retail_Rocket/item_id_mapping.pkl


In [18]:
from copy import deepcopy

# Work on a deep copy so that data_json keeps the original item ids.
dense_json = deepcopy(data_json)
for user_sessions in dense_json["data"].values():
    for session in user_sessions:
        for event in session:
            # Replace the original item id (first field) with its dense id.
            original_id = int(event[0])
            event[0] = dense_ids[original_id]

with open(INTER_OUT, "w", encoding="utf-8") as f:
    json.dump(dense_json, f)
print(">> saved :", INTER_OUT)


>> saved : /home/jy1559/Mar2025_Module/Datasets/Retail_Rocket/interactions_dense.json


In [19]:
import numpy as np, pickle, tqdm, torch

K = 128                      # candidate-set width (same as the original file)
N_dense = len(dense_ids)     # number of real items; dense ids run 1..N_dense
pad_row = np.zeros(K, dtype=np.int32)   # not referenced in this cell; kept in case other cells use it

# 1) Load the original candidate sets.
#    np.load on an .npz returns an NpzFile that keeps the archive open, so it
#    is used as a context manager; the array itself is read into memory here.
with np.load(CAND_FILE) as cand_npz:
    old = cand_npz["candidate_tensor"]   # rows are indexed by original item id below

# 2) New matrix [N_dense+1, K]; row 0 stays all-zero and serves as PAD
new = np.zeros((N_dense+1, K), dtype=np.int32)

# 3) Fill one row per dense id.
#    The element-wise orig -> dense translator is loop-invariant, so it is
#    built once. Candidate ids that are not in dense_ids become 0 (PAD).
to_dense = np.vectorize(lambda x: dense_ids.get(int(x), 0),
                        otypes=[np.int32])

for orig_id, d_id in tqdm.tqdm(dense_ids.items(), desc="remap"):
    # Candidate list of this item, still expressed in original ids
    cand_row = old[orig_id]                    # [K] orig ids
    new[d_id] = to_dense(cand_row)

# 4) Save
np.savez_compressed(CAND_OUT, candidate_tensor=new)
print(">> saved :", CAND_OUT, new.shape)   # (N_dense+1, K)


remap: 100%|██████████| 100822/100822 [00:02<00:00, 36173.07it/s]


>> saved : /home/jy1559/Mar2025_Module/Datasets/Retail_Rocket/candidate_sets_dense.npz (100823, 128)


In [20]:
import torch
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(ITEM_EMB, "rb") as f:
    emb_dict = pickle.load(f)   # {'embedding_tensor': Tensor[N+1,384]} per the original note

orig_tensor = emb_dict["embedding_tensor"]
D = orig_tensor.shape[1]
dense_tensor = torch.zeros(len(dense_ids)+1, D)   # row 0 is the PAD zero vector

# Copy each item's embedding from its original-id row to its dense-id row.
for orig_id, dense_id in dense_ids.items():
    dense_tensor[dense_id] = orig_tensor[orig_id]

# Write through a context manager so the file is flushed and closed
# deterministically instead of whenever the handle gets garbage-collected.
with open(ITEM_OUT, "wb") as f:
    pickle.dump({"embedding_tensor": dense_tensor}, f)
print(">> saved :", ITEM_OUT, dense_tensor.shape)


>> saved : /home/jy1559/Mar2025_Module/Datasets/Retail_Rocket/item_embedding_dense.pickle torch.Size([100823, 384])


In [21]:
import torch, numpy as np, pickle

# NOTE(review): the recorded run of this cell raised FileNotFoundError for
# ART_EMB, so for this dataset the file may simply not exist.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(ART_EMB, "rb") as f:
    art = pickle.load(f)           # ndarray  shape [N+1, 384] per the original note

art_t = torch.as_tensor(art, dtype=torch.float32)   # numpy -> torch
art_dense = torch.zeros(len(dense_ids)+1, art_t.shape[1])   # row 0 stays zero (PAD)

# Copy each article embedding from its original-id row to its dense-id row.
for orig_id, dense_id in dense_ids.items():
    art_dense[dense_id] = art_t[orig_id]            # torch <- torch

# Write through a context manager so the file is flushed and closed
# deterministically instead of whenever the handle gets garbage-collected.
with open(ART_OUT, "wb") as f:
    pickle.dump({"embedding_tensor": art_dense}, f)
print(">> saved :", ART_OUT, art_dense.shape)

FileNotFoundError: [Errno 2] No such file or directory: '/home/jy1559/Mar2025_Module/Datasets/Retail_Rocket/articles_embeddings.pickle'

In [None]:
import pickle, torch
import numpy as np   # this cell calls np.load, so import it here as well

# ── 1. Spot-check one item: original row vs. remapped row ─────────────
# Uses `art` and `art_dense` left in the kernel by the article-embedding
# cell; if that cell did not complete, these names are undefined or stale.
# next(iter(dense_ids)) is the first key of dense_ids (insertion order),
# so the "random" id below is in fact deterministic.
rand_orig = next(iter(dense_ids))
rand_dense = dense_ids[rand_orig]

art_np   = art[rand_orig]                     # numpy 1D
art_t    = torch.as_tensor(art_np, dtype=torch.float32)
is_equal = torch.allclose(art_t, art_dense[rand_dense])

print("▶ PAD row (0~3)", art_dense[0,:4])
print(f"▶ random id check  orig {rand_orig} → dense {rand_dense}")
print("   vec equal? ", is_equal)

# ── 2. Check the shapes of the newly written files ───────────────────
with open(ITEM_OUT, "rb") as f:
    item_emb_dense = pickle.load(f)["embedding_tensor"]
print("\nitem_embedding_dense.pickle :", item_emb_dense.shape)   # (N_dense+1, 384)

# np.load on an .npz keeps the archive open; close it once the array is read.
with np.load(CAND_OUT) as cand_npz:
    cand_dense = cand_npz["candidate_tensor"]
print("candidate_sets_dense.npz     :", cand_dense.shape)        # (N_dense+1, K)

with open(ART_OUT, "rb") as f:
    art_dense_loaded = pickle.load(f)["embedding_tensor"]
print("articles_embeddings_dense    :", art_dense_loaded.shape)  # (N_dense+1, 384)

# ── 3. Check the sizes of the id-mapping dicts ───────────────────────
with open(MAP_FILE, "rb") as f:
    maps = pickle.load(f)
print("\nMapping sizes  id2idx:", len(maps['id2idx']), "| idx2id:", len(maps['idx2id']))


▶ PAD row (0~3) tensor([0., 0., 0., 0.])
▶ random id check  orig 7 → dense 1
   vec equal?  False

item_embedding_dense.pickle : torch.Size([100823, 384])
candidate_sets_dense.npz     : (100823, 128)

Mapping sizes  id2idx: 100822 | idx2id: 100822
