In [10]:
import os
import json
from datetime import datetime

def load_json(filename):
    """Read the UTF-8 encoded JSON file at *filename* and return the parsed object."""
    with open(filename, encoding="utf-8") as handle:
        text = handle.read()
    return json.loads(text)

def save_json(data, filename):
    """Serialize *data* as JSON into *filename*.

    The file is written as UTF-8 with a two-space indent, and non-ASCII
    characters are emitted verbatim rather than as ``\\uXXXX`` escapes.
    """
    with open(filename, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=2, ensure_ascii=False)

def convert_to_unix_ms(ts):
    """Convert a numeric or string timestamp to integer Unix milliseconds.

    Accepted inputs:
      * ``int`` / ``float``: values below 1e11 are assumed to be seconds and
        are multiplied by 1000; larger values are assumed to already be
        milliseconds.
      * ``str``: tried first as ``"%Y-%m-%d %H:%M:%S"``, then with
        ``datetime.fromisoformat``; if neither matches, the text is tried as a
        plain number (e.g. ``"1500000000"``) and handled like a numeric value.

    Returns:
        The timestamp in milliseconds, or 0 when the value is empty, of an
        unsupported type, or cannot be interpreted (including NaN/inf).

    Note:
        Date strings without a UTC offset are interpreted in the machine's
        local time zone, because ``datetime.timestamp()`` treats naive
        datetimes as local time.
    """
    if isinstance(ts, str):
        text = ts.strip()
        if not text:
            return 0
        parsed = None
        try:
            parsed = datetime.strptime(text, "%Y-%m-%d %H:%M:%S")
        except ValueError:
            try:
                parsed = datetime.fromisoformat(text)
            except ValueError:
                parsed = None
        if parsed is not None:
            return int(parsed.timestamp() * 1000)
        # Not a date string: fall back to epoch values stored as text instead
        # of silently collapsing them to 0.
        try:
            ts = float(text)
        except ValueError:
            return 0

    if isinstance(ts, (int, float)):
        try:
            # Current epoch time is roughly 1e9..1e10 seconds, so anything
            # below 1e11 is taken to be seconds rather than milliseconds.
            return int(ts * 1000) if ts < 1e11 else int(ts)
        except (ValueError, OverflowError):
            # NaN and +/-inf cannot be converted to int.
            return 0

    return 0

def normalize_interaction(interaction):
    """
    Normalize one raw interaction into ``(item_id, timestamp, additional_info_dict)``.

    Recognized shapes:

    - Case A: dict with an ``"item_id"`` key, e.g.
      ``{"item_id": ..., "timestamp": ..., "additional_info": {...}}``.
      Only the nested ``"additional_info"`` dict is used as extra information
      (other top-level keys are ignored); if it is missing or not a dict the
      extra information is empty.

    - Case B: list with at least two elements, ``[item_id, timestamp, ...]``.
      The extra information is an empty dict.

    - Case C: dict with an ``"itemid"`` key. Every key other than ``"itemid"``
      and ``"timestamp"`` becomes extra information.

    The item id is always converted to ``str`` and the timestamp is passed
    through ``convert_to_unix_ms``. Any other input yields ``None``.
    """
    # Case B: positional [item_id, timestamp] record.
    if isinstance(interaction, list):
        if len(interaction) < 2:
            return None
        return str(interaction[0]), convert_to_unix_ms(interaction[1]), {}

    if not isinstance(interaction, dict):
        return None

    # Case A: "item_id" takes precedence over "itemid" when both are present.
    if "item_id" in interaction:
        identifier = str(interaction["item_id"])
        millis = convert_to_unix_ms(interaction.get("timestamp"))
        nested = interaction.get("additional_info")
        extras = nested if isinstance(nested, dict) else {}
        return identifier, millis, extras

    # Case C: flat record whose remaining keys are the extra information.
    if "itemid" in interaction:
        identifier = str(interaction["itemid"])
        millis = convert_to_unix_ms(interaction.get("timestamp"))
        extras = {}
        for name, value in interaction.items():
            if name != "itemid" and name != "timestamp":
                extras[name] = value
        return identifier, millis, extras

    return None

def gather_additional_keys(data):
    """
    Collect every extra-information key that occurs anywhere in *data*.

    data: mapping whose values are lists of sessions, each session being a
    list of interactions (possibly in mixed formats). Each interaction is run
    through ``normalize_interaction``; interactions it rejects are skipped.

    Returns the distinct keys of all extra-information dicts as a sorted list.
    """
    normalized = (
        normalize_interaction(interaction)
        for sessions in data.values()
        for session in sessions
        for interaction in session
    )
    collected = set()
    for result in normalized:
        if result is not None:
            # The third element is the extra-information dict.
            collected.update(result[2].keys())
    ordered = list(collected)
    ordered.sort()
    return ordered

def convert_interactions_compact_with_index(data):
    """
    Convert per-user, per-session interactions into a compact list form.

    data: mapping of user -> list of sessions -> list of raw interactions.

    Each interaction accepted by ``normalize_interaction`` becomes
    ``[item_id, timestamp, additional_info_list]``, where
    ``additional_info_list`` holds the values for the keys returned by
    ``gather_additional_keys(data)``, in that order (``None`` when a key is
    absent). Rejected interactions are dropped; sessions are kept even if
    they end up empty.

    Returned structure:
      {
         "index": {
             "item_id": 0,
             "timestamp": 1,
             "additional_info": 2,
             "add_index": { <key>: <position>, ... }
         },
         "data": { ... }  # user -> sessions -> compact interaction lists
      }
    """
    additional_keys = gather_additional_keys(data)

    def compact_session(session):
        # Turn one session into a list of [item_id, timestamp, values] rows.
        rows = []
        for raw in session:
            parsed = normalize_interaction(raw)
            if parsed is None:
                continue
            identifier, millis, extras = parsed
            values = [extras.get(name, None) for name in additional_keys]
            rows.append([identifier, millis, values])
        return rows

    converted = {
        user: [compact_session(session) for session in sessions]
        for user, sessions in data.items()
    }

    positions = {}
    for position, name in enumerate(additional_keys):
        positions[name] = position

    return {
        "index": {
            "item_id": 0,
            "timestamp": 1,
            "additional_info": 2,
            "add_index": positions,
        },
        "data": converted,
    }

def process_file_compact(input_filename):
    """Convert one JSON file to the compact format and write it next to the input.

    If the loaded JSON is a dict with a top-level ``"data"`` key, that value is
    converted; otherwise the loaded object itself is treated as the per-user
    mapping. The result is saved as ``<name>_revised<ext>`` and a summary line
    is printed.
    """
    loaded = load_json(input_filename)
    has_wrapper = isinstance(loaded, dict) and "data" in loaded
    payload = loaded["data"] if has_wrapper else loaded
    converted = convert_interactions_compact_with_index(payload)

    stem, suffix = os.path.splitext(input_filename)
    output_filename = "".join((stem, "_revised", suffix))
    save_json(converted, output_filename)
    print(f"Processed {input_filename} -> {output_filename}")

def main(files=None):
    """Convert each dataset file to the compact format.

    Args:
        files: A single path or a list of paths to process. When omitted, the
            three default dataset paths below are used, which is the behavior
            of calling ``main()`` with no arguments.

    Files that do not exist are reported and skipped; existing files are
    handed to ``process_file_compact``.
    """
    if files is None:
        # Default (example) dataset locations.
        files = [
            "/home/jy1559/Mar2025_Module/Datasets/Globo/user_interactions.json",
            "/home/jy1559/Mar2025_Module/Datasets/LFM-BeyMS/user_sessions.json",
            "/home/jy1559/Mar2025_Module/Datasets/Retail_Rocket/retailrocket_user_interactions_filtered.json",
        ]
    elif isinstance(files, str):
        # A bare string would otherwise be iterated character by character.
        files = [files]

    for filename in files:
        if os.path.exists(filename):
            process_file_compact(filename)
        else:
            print(f"File {filename} not found.")

# Script entry point: run the conversion only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


Processed /home/jy1559/Mar2025_Module/Datasets/Globo/user_interactions.json -> /home/jy1559/Mar2025_Module/Datasets/Globo/user_interactions_revised.json
Processed /home/jy1559/Mar2025_Module/Datasets/LFM-BeyMS/user_sessions.json -> /home/jy1559/Mar2025_Module/Datasets/LFM-BeyMS/user_sessions_revised.json
Processed /home/jy1559/Mar2025_Module/Datasets/Retail_Rocket/retailrocket_user_interactions_filtered.json -> /home/jy1559/Mar2025_Module/Datasets/Retail_Rocket/retailrocket_user_interactions_filtered_revised.json


In [1]:
# Dataset files to convert, one path per line.
files = [
    "/home/jy1559/Mar2025_Module/Datasets/Globo/user_interactions.json",
    "/home/jy1559/Mar2025_Module/Datasets/LFM-BeyMS/user_sessions.json",
    "/home/jy1559/Mar2025_Module/Datasets/Retail_Rocket/retailrocket_user_interactions_filtered.json",
]

In [None]:
import json
import sys
import os
from datetime import datetime

def parse_timestamp(ts):
    """
    Parse a timestamp value into an integer.

    - Numbers are truncated to ``int`` and returned unchanged (they are
      assumed to already be in milliseconds).
    - Strings are stripped of surrounding whitespace first. A string made up
      only of digits is converted with ``int``. Otherwise it is parsed as
      ``"%Y-%m-%d %H:%M:%S"`` and, failing that, with
      ``datetime.fromisoformat``; the result is returned in milliseconds.
      Date strings without a UTC offset are interpreted in the machine's local
      time zone, because ``datetime.timestamp()`` treats naive datetimes as
      local time.

    Raises:
        ValueError: if a string matches none of the supported formats.
        TypeError: if *ts* is neither a number nor a string.
    """
    if isinstance(ts, (int, float)):
        return int(ts)
    if not isinstance(ts, str):
        raise TypeError("Unsupported timestamp type")

    text = ts.strip()
    if text.isdigit():
        return int(text)

    try:
        parsed = datetime.strptime(text, "%Y-%m-%d %H:%M:%S")
    except ValueError:
        try:
            parsed = datetime.fromisoformat(text)
        except ValueError as err:
            # Keep the original (unstripped) value in the message and chain
            # the parser's own error for easier debugging.
            raise ValueError(f"Unrecognized timestamp format: {ts}") from err
    return int(parsed.timestamp() * 1000)

def parse_interaction(inter):
    """
    Parse one interaction into ``(item_id, timestamp, additional_info_dict)``.

    - dict input:
        * the item id comes from ``"item_id"`` or, failing that, ``"itemid"``;
        * a ``"timestamp"`` key is required and is run through ``parse_timestamp``;
        * if ``"additional_info"`` holds a dict, that dict is the extra
          information; otherwise every key except ``item_id``, ``itemid``,
          ``timestamp``, ``user_id`` and ``session_id`` is treated as extra
          information.
    - list input: ``[item_id, timestamp, ...]`` with at least two elements and
      no extra information.

    Returns ``None`` when a required field is missing or the input is neither
    a dict nor a list. The item id is always returned as ``str``.
    """
    if isinstance(inter, list):
        if len(inter) < 2:
            return None
        identifier = str(inter[0])
        return (identifier, parse_timestamp(inter[1]), {})

    if not isinstance(inter, dict):
        return None

    # Required fields: an item id under either spelling, then a timestamp.
    for id_key in ("item_id", "itemid"):
        if id_key in inter:
            identifier = str(inter[id_key])
            break
    else:
        return None
    if "timestamp" not in inter:
        return None
    moment = parse_timestamp(inter["timestamp"])

    # Prefer the nested "additional_info" dict; otherwise fall back to the
    # remaining top-level keys.
    nested = inter.get("additional_info")
    if isinstance(nested, dict):
        extras = nested
    else:
        reserved = {"item_id", "itemid", "timestamp", "user_id", "session_id"}
        extras = {name: value for name, value in inter.items() if name not in reserved}

    return (identifier, moment, extras)

def process_file(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)
    
    data = {}
    global_add_keys = set()
    
    # raw_data는 user id를 key로 하는 dict라고 가정합니다.
    for user_id, sessions in raw_data.items():
        user_id_str = str(user_id)
        data[user_id_str] = []
        if not isinstance(sessions, list):
            continue
        for session in sessions:
            if not isinstance(session, list):
                continue
            new_session = []
            for inter in session:
                parsed = parse_interaction(inter)
                if parsed is None:
                    continue
                item_id, timestamp, add_info = parsed
                global_add_keys.update(add_info.keys())
                new_session.append({
                    "item_id": item_id,
                    "timestamp": timestamp,
                    "add_info": add_info
                })
            if new_session:
                data[user_id_str].append(new_session)
    
    sorted_add_keys = sorted(global_add_keys)
    add_index = { key: idx for idx, key in enumerate(sorted_add_keys) }
    
    for user_id, sessions in data.items():
        for i, session in enumerate(sessions):
            new_session = []
            for inter in session:
                item_id = inter["item_id"]
                timestamp = inter["timestamp"]
                add_info_dict = inter["add_info"]
                add_info_list = [add_info_dict.get(key, None) for key in sorted_add_keys]
                new_session.append([item_id, timestamp, add_info_list])
            sessions[i] = new_session

    output = {
        "index": {
            "item_id": 0,
            "timestamp": 1,
            "add_info": 2,
            "add_index": add_index
        },
        "data": data
    }
    
    return output

def make_revised_filename(file_path):
    """Return *file_path* with ``_revised`` inserted before its extension."""
    return "{}_revised{}".format(*os.path.splitext(file_path))

def main(file_paths):
    """Convert one file or several files and write each result beside its source.

    file_paths: a single path string or a list of path strings.

    Each file is processed independently: any exception raised while
    converting or writing a file is reported on stdout and the loop moves on
    to the next file.
    """
    targets = [file_paths] if isinstance(file_paths, str) else file_paths

    for source_path in targets:
        try:
            converted = process_file(source_path)
            destination = make_revised_filename(source_path)
            with open(destination, "w", encoding="utf-8") as out_file:
                json.dump(converted, out_file, ensure_ascii=False, indent=2)
            print(f"파일 변환 완료: {destination}")
        except Exception as exc:
            # Best-effort batch run: report the failure and keep going.
            print(f"{source_path} 처리 중 에러 발생: {exc}")

# Convert every dataset path listed in `files`.
main(files)


파일 변환 완료: /home/jy1559/Mar2025_Module/Datasets/Globo/user_interactions_revised.json
