In [None]:
import csv
import json
from collections import defaultdict

# Path of the input events CSV passed to build_sessions_from_events when run as a script.
CSV_PATH = "dataset/events.csv"
# Path of the JSON file written by build_sessions_from_events when run as a script.
OUTPUT_JSON = "interactions.json"

def build_sessions_from_events(csv_path, output_path, session_gap_sec=3600):
    """Group per-user events from a CSV into sessions and dump them as JSON.

    The CSV must have a header row containing at least the columns
    ``user_id``, ``track_id`` and ``timestamp``; any other columns are
    ignored. Each user's events are sorted by timestamp and a new session
    is started whenever the gap to the previous event is greater than or
    equal to ``session_gap_sec``.

    Timestamps in the output are shifted so that the earliest timestamp in
    the whole file becomes 0. No unit conversion is applied, so the gap
    threshold is compared in whatever unit the ``timestamp`` column uses.
    NOTE(review): the parameter name suggests seconds -- confirm the CSV
    column is in seconds as well.

    Args:
        csv_path: Path of the events CSV to read (UTF-8).
        output_path: Path of the JSON file to write (UTF-8, overwritten).
        session_gap_sec: Minimum gap between two consecutive events of the
            same user that starts a new session. Defaults to 3600.

    Returns:
        None. The result is written to ``output_path`` and a confirmation
        line is printed.

    Raises:
        KeyError: If a required column is missing from the CSV.
        ValueError: If a ``timestamp`` value cannot be parsed as a float.
        OSError: If either file cannot be opened.
    """
    # 1) Collect (timestamp, track_id) pairs per user and track the global
    #    minimum timestamp in the same pass over the file.
    user_data = defaultdict(list)
    min_ts = float("inf")
    with open(csv_path, "r", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            ts = float(row["timestamp"])
            user_data[row["user_id"]].append((ts, row["track_id"]))
            if ts < min_ts:
                min_ts = ts

    # 2) Per user: sort chronologically, split into sessions, and convert
    #    each event to [item_id, timestamp relative to min_ts, add_info].
    result_data = {}
    for user_id, events in user_data.items():
        # list.sort is stable, so events with equal timestamps keep file order.
        events.sort(key=lambda event: event[0])
        result_data[user_id] = [
            # add_info is always an empty list for this dataset.
            [[str(track_id), ts - min_ts, []] for ts, track_id in session]
            for session in _split_into_sessions(events, session_gap_sec)
        ]

    # 3) Assemble the output document; "index" maps field names to their
    #    position inside each event triple.
    output_dict = {
        "index": {
            "item_id": 0,
            "timestamp": 1,
            "add_info": 2,
            "add_index": {}
        },
        "data": result_data
    }

    # 4) Save.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output_dict, f, indent=2)

    print(f"JSON saved to {output_path}")


def _split_into_sessions(events, session_gap_sec):
    """Split time-sorted (timestamp, track_id) pairs into a list of sessions."""
    sessions = []
    current_session = []
    prev_ts = None
    for ts, track_id in events:
        # A gap at or above the threshold closes the running session.
        if prev_ts is not None and ts - prev_ts >= session_gap_sec:
            sessions.append(current_session)
            current_session = []
        current_session.append((ts, track_id))
        prev_ts = ts

    # Flush the trailing session (absent only when there were no events).
    if current_session:
        sessions.append(current_session)
    return sessions


if __name__ == "__main__":
    # Script entry point: convert the configured CSV using a one-hour session gap.
    build_sessions_from_events(
        csv_path=CSV_PATH,
        output_path=OUTPUT_JSON,
        session_gap_sec=3600,
    )


JSON saved to interactions.json
