In [13]:
import os
import json
import torch
from torch.utils.data import Dataset, DataLoader, random_split

# Datasets available under the data root; index 0 ("Globo") is the one used below.
datasets = ["Globo", "LFM-BeyMS", "Retail_Rocket"]
# Data root. The SEQREC_DATASET_DIR environment variable overrides the original
# author's absolute path, so the notebook can run on other machines unchanged.
dataset_folder = os.environ.get("SEQREC_DATASET_DIR", "/home/jy1559/Mar2025_Module/Datasets")
metadata_path = os.path.join(dataset_folder, datasets[0], "item_metadata.json")
interaction_path = os.path.join(dataset_folder, datasets[0], "interactions.json")

class SeqRecDataset(Dataset):
    """Per-user dataset over a sessionized interaction log stored as JSON.

    One sample corresponds to one user and has the form
    ``{'user_id': ..., 'sessions': (session, ...)}`` where every session is a
    tuple of ``{'item_id', 'timestamp', 'additional_info'}`` dicts.
    """

    def __init__(self, interactions_path, item_metadata_path):
        """Load both JSON files fully into memory.

        Args:
            interactions_path: JSON file with top-level ``"data"`` and
                ``"index"`` entries.
            item_metadata_path: JSON file with item metadata; kept as loaded.
        """
        with open(interactions_path, 'r', encoding='utf-8') as interactions_file:
            self.interactions = json.load(interactions_file)

        with open(item_metadata_path, 'r', encoding='utf-8') as metadata_file:
            self.item_metadata = json.load(metadata_file)

        self.interaction_data = self.interactions["data"]
        self.index = self.interactions["index"]
        # Fix the user order once so that integer indices map to stable users.
        self.users = list(self.interaction_data.keys())

    def __len__(self):
        """Return the number of users."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the sample of the ``idx``-th user.

        Each raw interaction is expected to unpack into exactly three values
        (item id, timestamp, additional info).
        """
        uid = self.users[idx]
        sessions = tuple(
            tuple(
                {
                    'item_id': item_id,
                    'timestamp': ts,
                    'additional_info': extra,
                }
                for item_id, ts, extra in raw_session
            )
            for raw_session in self.interaction_data[uid]
        )
        return {'user_id': uid, 'sessions': sessions}

# Load the dataset
dataset = SeqRecDataset(interaction_path, metadata_path)

# Split users into train/valid/test (8:1:1); the test split takes whatever is
# left after rounding. The seeded generator keeps the split reproducible.
train_size = int(0.8 * len(dataset))
valid_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - valid_size
train_set, valid_set, test_set = random_split(
    dataset,
    [train_size, valid_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

# Build the DataLoaders. The batch size is defined once here (8; adjust freely)
# instead of being repeated per loader. The identity collate_fn keeps each batch
# as a plain list of per-user samples, since sessions are variable-length.
batch_size = 8
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, collate_fn=lambda x: x)
valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=False, collate_fn=lambda x: x)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, collate_fn=lambda x: x)

# Usage example: show the first training batch only
for batch in train_loader:
    print(batch)  # batch is a list holding each user's interaction sequences
    break

[{'user_id': '227210', 'sessions': (({'item_id': '156355', 'timestamp': 1507432049479, 'additional_info': ['1', '3', '4', '2', '2', '21', 2, True]}, {'item_id': '272218', 'timestamp': 1507432079479, 'additional_info': ['1', '3', '4', '2', '2', '21', 2, True]}),)}, {'user_id': '243424', 'sessions': (({'item_id': '348113', 'timestamp': 1507558442752, 'additional_info': ['1', '1', '4', '17', '5', '25', 2, True]}, {'item_id': '348608', 'timestamp': 1507558472752, 'additional_info': ['1', '1', '4', '17', '5', '25', 2, True]}),)}, {'user_id': '101246', 'sessions': (({'item_id': '199198', 'timestamp': 1506982069611, 'additional_info': ['1', '3', '4', '2', '2', '13', 3, True]}, {'item_id': '272143', 'timestamp': 1506982196564, 'additional_info': ['1', '3', '4', '2', '1', '13', 3, True]}, {'item_id': '118683', 'timestamp': 1506982226564, 'additional_info': ['1', '3', '4', '2', '1', '13', 3, True]}), ({'item_id': '159762', 'timestamp': 1507144405693, 'additional_info': ['1', '3', '4', '2', '2', 

In [14]:
# Print the "index" mapping that SeqRecDataset loaded from the interactions JSON.
print(dataset.index)

{'item_id': 0, 'timestamp': 1, 'add_info': 2, 'add_index': {'country': 0, 'deviceGroup': 1, 'environment': 2, 'os': 3, 'referrer_type': 4, 'region': 5, 'session_size': 6, 'session_start': 7}}


In [16]:
from datetime import datetime

def create_user_embedding(interaction, item_metadata):
    """Convert every interaction of one user sample into an embedding sentence.

    Args:
        interaction: user sample with ``"user_id"`` and ``"sessions"`` keys,
            where ``"sessions"`` is an iterable of sessions and each session is
            an iterable of interaction dicts.
        item_metadata: mapping passed through to ``create_embedding_sentence``.

    Returns:
        ``{"user_id": ..., "sessions": ...}`` with the same nesting as the
        input, each interaction replaced by its sentence and each level
        materialized as a tuple.
    """
    sentences_per_session = tuple(
        tuple(create_embedding_sentence(event, item_metadata) for event in session)
        for session in interaction["sessions"]
    )
    return {"user_id": interaction["user_id"], "sessions": sentences_per_session}

def create_embedding_sentence(interaction, item_metadata, add_index=None):
    """Render one interaction as a single descriptive sentence.

    Args:
        interaction: dict with the keys ``'item_id'``, ``'timestamp'`` and
            ``'additional_info'``. The timestamp is divided by 1000 before
            being interpreted as a POSIX time, i.e. it is in milliseconds.
        item_metadata: mapping ``item_id -> description``; unknown items get
            the placeholder ``"No metadata available"``.
        add_index: optional mapping ``field name -> position`` inside
            ``additional_info``. When omitted, the mapping is read from the
            notebook-level ``dataset.index["add_index"]`` (the original
            behaviour), so existing two-argument calls keep working.

    Returns:
        The parts ``"Item Info: ..."``, ``"Interaction Time: ..."`` and, when
        ``additional_info`` is non-empty, ``"Additional Info: ..."`` joined
        with ``" | "``.

    Raises:
        IndexError: if ``additional_info`` is shorter than a position listed
            in the field mapping.
    """
    item_id = interaction['item_id']
    timestamp = interaction['timestamp']
    additional_info = interaction['additional_info']

    item_info = item_metadata.get(item_id, "No metadata available")
    # datetime.fromtimestamp without a tz argument renders local time.
    readable_time = datetime.fromtimestamp(timestamp / 1000).strftime('%Y-%m-%d %H:%M:%S')
    sentence_parts = [f"Item Info: {item_info}", f"Interaction Time: {readable_time}"]

    if additional_info:
        if add_index is None:
            # Fall back to the global only when it is actually needed, so the
            # function is usable without a loaded `dataset` otherwise.
            add_index = dataset.index["add_index"]
        add_info = ", ".join(
            f"{name}: {additional_info[idx]}" for name, idx in add_index.items()
        )
        sentence_parts.append(f"Additional Info: {add_info}")

    return " | ".join(sentence_parts)

# Take one sample from the training split and show it.
sample_sequence = train_set[0]  # interaction sequence of one particular user
print(sample_sequence)
# Replace every interaction of that user with its embedding sentence.
embedding_sentence = create_user_embedding(sample_sequence, dataset.item_metadata)
print(embedding_sentence)
# These sentences can then be passed to an LLM-based embedding model.


{'user_id': '68217', 'sessions': (({'item_id': '158082', 'timestamp': 1506957463071, 'additional_info': ['1', '1', '4', '17', '2', '25', 2, True]}, {'item_id': '160974', 'timestamp': 1506957493071, 'additional_info': ['1', '1', '4', '17', '2', '25', 2, True]}), ({'item_id': '206112', 'timestamp': 1507125968312, 'additional_info': ['1', '1', '4', '17', '2', '25', 2, True]}, {'item_id': '233717', 'timestamp': 1507125998312, 'additional_info': ['1', '1', '4', '17', '2', '25', 2, True]}), ({'item_id': '293301', 'timestamp': 1507138677788, 'additional_info': ['1', '1', '4', '17', '2', '25', 2, True]}, {'item_id': '159762', 'timestamp': 1507138707788, 'additional_info': ['1', '1', '4', '17', '2', '25', 2, True]}), ({'item_id': '235616', 'timestamp': 1507658248599, 'additional_info': ['1', '1', '4', '17', '2', '25', 2, True]}, {'item_id': '156052', 'timestamp': 1507658278599, 'additional_info': ['1', '1', '4', '17', '2', '25', 2, True]}), ({'item_id': '128260', 'timestamp': 1507745215200, 'ad

In [None]:
import os
import json
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from datetime import datetime

# Datasets available under the data root; index 0 ("Globo") is the one used below.
datasets = ["Globo", "LFM-BeyMS", "Retail_Rocket"]
# Data root. The SEQREC_DATASET_DIR environment variable overrides the original
# author's absolute path, so the notebook can run on other machines unchanged.
dataset_folder = os.environ.get("SEQREC_DATASET_DIR", "/home/jy1559/Mar2025_Module/Datasets")
metadata_path = os.path.join(dataset_folder, datasets[0], "item_metadata.json")
interaction_path = os.path.join(dataset_folder, datasets[0], "interactions.json")

class SeqRecDataset(Dataset):
    """Per-user dataset that yields ready-made embedding sentences.

    One sample corresponds to one user:
    ``{'user_id': ..., 'sessions': (session, ...)}``. Every session is a tuple
    of ``{'embedding_sentence': str, 'delta_t': str}`` dicts, where ``delta_t``
    is the gap to the previous interaction of the same session in seconds,
    formatted with a trailing ``"s"`` (``"0s"`` for the first interaction).
    """

    def __init__(self, interactions_path, item_metadata_path):
        """Load the interaction log and the item metadata from JSON files.

        Args:
            interactions_path: JSON file with top-level ``"data"`` and
                ``"index"`` entries.
            item_metadata_path: JSON file with item metadata; kept as loaded.
        """
        with open(interactions_path, 'r', encoding='utf-8') as interactions_file:
            payload = json.load(interactions_file)
        self.interaction_data = payload["data"]
        self.index = payload["index"]

        with open(item_metadata_path, 'r', encoding='utf-8') as metadata_file:
            self.item_metadata = json.load(metadata_file)

        # Fix the user order once so that integer indices map to stable users.
        self.users = list(self.interaction_data.keys())

    def __len__(self):
        """Return the number of users."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the sample of the ``idx``-th user.

        Each raw interaction is expected to unpack into exactly three values
        (item id, timestamp, additional info).
        """
        uid = self.users[idx]

        converted_sessions = []
        for raw_session in self.interaction_data[uid]:
            events = []
            last_ts = None  # reset per session: the first gap is always 0

            for item_id, timestamp, add_info in raw_session:
                sentence = self._create_embedding_sentence(item_id, timestamp, add_info)

                # Gap to the previous interaction, milliseconds -> seconds.
                if last_ts is None:
                    gap_seconds = 0
                else:
                    gap_seconds = (timestamp - last_ts) / 1000
                last_ts = timestamp

                events.append({
                    'embedding_sentence': sentence,
                    'delta_t': f"{gap_seconds}s"
                })
            converted_sessions.append(tuple(events))

        return {'user_id': uid, 'sessions': tuple(converted_sessions)}

    def _create_embedding_sentence(self, item_id, timestamp, additional_info):
        """Render one interaction as a ``" | "``-separated sentence.

        The sentence holds the item description (or a placeholder for unknown
        items), the interaction time (millisecond timestamp rendered in local
        time) and, when ``additional_info`` is non-empty, every field named in
        ``self.index["add_index"]``.
        """
        description = self.item_metadata.get(item_id, "No metadata available")
        moment = datetime.fromtimestamp(timestamp / 1000)
        parts = [
            f"Item Info: {description}",
            f"Interaction Time: {moment.strftime('%Y-%m-%d %H:%M:%S')}",
        ]

        if additional_info:
            fields = ", ".join(
                f"{name}: {additional_info[pos]}"
                for name, pos in self.index["add_index"].items()
            )
            parts.append(f"Additional Info: {fields}")

        return " | ".join(parts)

# Load the dataset
dataset = SeqRecDataset(interaction_path, metadata_path)

# Split the users 8:1:1; the test split takes whatever is left after rounding.
n_users = len(dataset)
train_size = int(0.8 * n_users)
valid_size = int(0.1 * n_users)
test_size = n_users - train_size - valid_size

# A seeded generator keeps the random split reproducible across runs.
split_rng = torch.Generator().manual_seed(42)
train_set, valid_set, test_set = random_split(
    dataset, [train_size, valid_size, test_size], generator=split_rng
)

# Build the DataLoaders (example batch size). The identity collate_fn keeps
# each batch as a plain list of per-user samples.
batch_size = 8
train_loader = DataLoader(
    train_set, batch_size=batch_size, shuffle=True, collate_fn=lambda x: x
)
valid_loader = DataLoader(
    valid_set, batch_size=batch_size, shuffle=False, collate_fn=lambda x: x
)
test_loader = DataLoader(
    test_set, batch_size=batch_size, shuffle=False, collate_fn=lambda x: x
)

# Sample usage: inspect the data structure of a single user
sample_batch = next(iter(train_loader))
print(sample_batch[0])


{'user_id': '106477', 'sessions': (({'embedding_sentence': 'Item Info: Item ID: 166581, Category ID: 289, Word count: 210 created at 2017-10-03 02:02:35 | Interaction Time: 2017-10-03 08:26:34 | Additional Info: country: 1, deviceGroup: 3, environment: 4, os: 2, referrer_type: 2, region: 13, session_size: 2, session_start: True', 'delta_t': None}, {'embedding_sentence': 'Item Info: Item ID: 272143, Category ID: 399, Word count: 184 created at 2017-10-03 01:31:10 | Interaction Time: 2017-10-03 08:27:04 | Additional Info: country: 1, deviceGroup: 3, environment: 4, os: 2, referrer_type: 2, region: 13, session_size: 2, session_start: True', 'delta_t': 30.0}), ({'embedding_sentence': 'Item Info: Item ID: 271261, Category ID: 399, Word count: 205 created at 2017-10-10 03:26:44 | Interaction Time: 2017-10-10 09:18:20 | Additional Info: country: 1, deviceGroup: 3, environment: 4, os: 2, referrer_type: 2, region: 13, session_size: 2, session_start: True', 'delta_t': None}, {'embedding_sentence

In [None]:
import torch

def _delta_t_seconds(value):
    """Coerce one stored ``delta_t`` value to float seconds.

    Accepts ``None`` (treated as 0), a plain number, or a string made of a
    number with an optional trailing ``"s"`` such as ``"30.0s"``.
    """
    if value is None:
        return 0.0
    if isinstance(value, str):
        return float(value.strip().rstrip("s"))
    return float(value)


def seq_collate_fn(batch):
    """Pad a list of user samples to a common (sessions, interactions) grid.

    Args:
        batch: list of samples shaped like
            ``{'user_id': ..., 'sessions': (session, ...)}`` where each session
            is a sequence of dicts with ``'embedding_sentence'`` and
            ``'delta_t'`` keys. ``delta_t`` may be ``None``, a number, or a
            string such as ``"30.0s"``.

    Returns:
        dict with
        - ``'embedding_sentences'``: nested lists
          ``[batch][max_sessions][max_interactions]``, padded with ``"[PAD]"``;
        - ``'delta_ts'``: float32 tensor of the same grid shape, 0 on padding;
        - ``'interaction_mask'``: float32 tensor of the same grid shape,
          1 for real interactions and 0 for padding;
        - ``'session_mask'``: float32 tensor ``[batch, max_sessions]``,
          1 for real sessions and 0 for padding.
        An empty batch, or users without sessions, produce zero-sized
        dimensions instead of raising.
    """
    # default=0 keeps max() from raising on an empty batch / no sessions at all.
    max_sessions = max((len(user['sessions']) for user in batch), default=0)
    max_interactions = max(
        (len(sess) for user in batch for sess in user['sessions']), default=0
    )

    sentences_batch = []
    delta_t_batch = []
    session_mask = []        # whether a session really exists
    interaction_mask = []    # whether an interaction really exists

    for user in batch:
        user_sentences = []
        user_delta_t = []
        user_interaction_mask = []

        for session in user['sessions']:
            sess_len = len(session)
            pad_len = max_interactions - sess_len  # interaction padding

            user_sentences.append(
                [interaction['embedding_sentence'] for interaction in session]
                + ["[PAD]"] * pad_len
            )
            user_delta_t.append(
                [_delta_t_seconds(interaction['delta_t']) for interaction in session]
                + [0.0] * pad_len
            )
            user_interaction_mask.append([1] * sess_len + [0] * pad_len)

        # Session padding. Each padded row is a fresh list: `[[...]] * n` would
        # make all rows the same object, so editing one would edit them all.
        real_sessions = len(user['sessions'])
        pad_sessions = max_sessions - real_sessions
        user_sentences += [["[PAD]"] * max_interactions for _ in range(pad_sessions)]
        user_delta_t += [[0.0] * max_interactions for _ in range(pad_sessions)]
        user_interaction_mask += [[0] * max_interactions for _ in range(pad_sessions)]

        sentences_batch.append(user_sentences)
        delta_t_batch.append(user_delta_t)
        interaction_mask.append(user_interaction_mask)
        session_mask.append([1] * real_sessions + [0] * pad_sessions)

    # The reshape is a no-op for regular batches; it only pins the documented
    # rank when some dimension is zero-sized.
    grid = (len(batch), max_sessions, max_interactions)
    return {
        'embedding_sentences': sentences_batch,   # [batch_size, max_sessions, max_interactions]
        'delta_ts': torch.tensor(delta_t_batch, dtype=torch.float32).reshape(grid),
        'interaction_mask': torch.tensor(interaction_mask, dtype=torch.float32).reshape(grid),
        'session_mask': torch.tensor(session_mask, dtype=torch.float32).reshape(grid[:2]),
    }


ValueError: not enough values to unpack (expected 3, got 1)

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sequence, ignoring masked-out positions.

    Args:
        model_output: indexable whose first element holds the token
            embeddings; the mask is broadcast along their last dimension.
        attention_mask: tensor with one entry per token position; positions
            with value 0 do not contribute to the average.

    Returns:
        Tensor with the token dimension (dim 1) reduced away. The denominator
        is clamped to 1e-9, so a fully masked sequence yields zeros instead of
        a division by zero.
    """
    hidden_states = model_output[0]
    # Broadcast the mask over the embedding dimension so it can weight tokens.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * weights, 1)
    token_counts = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_sum / token_counts


# Example inputs to embed
sentences = ['This is an example sentence', 'Each sentence is converted']

# Fetch the pretrained tokenizer and encoder from the HuggingFace Hub
checkpoint = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# Tokenize all sentences as one padded, truncated batch of PyTorch tensors
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Forward pass without gradient tracking (inference only)
with torch.no_grad():
    model_output = model(**encoded_input)

# Mask-aware mean pooling, then L2-normalize every sentence vector
pooled = mean_pooling(model_output, encoded_input['attention_mask'])
sentence_embeddings = F.normalize(pooled, p=2, dim=1)

print("Sentence embeddings:")
print(sentence_embeddings)


  from .autonotebook import tqdm as notebook_tqdm


Sentence embeddings:
tensor([[ 6.7657e-02,  6.3496e-02,  4.8713e-02,  7.9305e-02,  3.7448e-02,
          2.6529e-03,  3.9375e-02, -7.0984e-03,  5.9361e-02,  3.1537e-02,
          6.0098e-02, -5.2905e-02,  4.0607e-02, -2.5931e-02,  2.9843e-02,
          1.1269e-03,  7.3515e-02, -5.0382e-02, -1.2239e-01,  2.3703e-02,
          2.9727e-02,  4.2477e-02,  2.5634e-02,  1.9952e-03, -5.6919e-02,
         -2.7160e-02, -3.2904e-02,  6.6025e-02,  1.1901e-01, -4.5879e-02,
         -7.2621e-02, -3.2584e-02,  5.2341e-02,  4.5055e-02,  8.2531e-03,
          3.6702e-02, -1.3942e-02,  6.5392e-02, -2.6427e-02,  2.0639e-04,
         -1.3664e-02, -3.6281e-02, -1.9504e-02, -2.8974e-02,  3.9427e-02,
         -8.8409e-02,  2.6243e-03,  1.3671e-02,  4.8306e-02, -3.1157e-02,
         -1.1733e-01, -5.1169e-02, -8.8529e-02, -2.1896e-02,  1.4299e-02,
          4.4417e-02, -1.3481e-02,  7.4339e-02,  2.6638e-02, -1.9876e-02,
          1.7919e-02, -1.0605e-02, -9.0426e-02,  2.1327e-02,  1.4120e-01,
         -6.4717e