In [None]:
import pandas as pd
import os
    
# Registry of the raw datasets: for each dataset name, the directory holding
# it and the files to preview from that directory.
# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a single configurable data directory.
datasets = {
    "Retail_Rocket": {
        "path": '/home/jy1559/Mar2025_Module/Datasets/Retail_Rocket',
        "file_names": ['category_tree.csv', 'events.csv', 'item_properties_part1.csv', 'item_properties_part2.csv'],
    },
    "Diginetica": {
        "path": '/home/jy1559/Mar2025_Module/Datasets/Diginetica',
        "file_names": ['product-categories.csv', 'products.csv', 'train-clicks.csv', 'train-item-views.csv', 'train-purchases.csv', 'train-queries.csv'],
    },
    "LFM-BeyMS": {
        "path": '/home/jy1559/Mar2025_Module/Datasets/LFM-BeyMS/dataset',
        "file_names": ['beyms.csv', 'events.csv', 'genre_annotations.csv', 'mainstreaminess.csv', 'ms.csv', 'user_groups.csv'],
    },
    "Beauty": {
        "path": '/home/jy1559/Mar2025_Module/Datasets/Amazon',
        "file_names": ['All_Beauty.jsonl', 'meta_All_Beauty.jsonl'],
    },
    "Game": {
        "path": '/home/jy1559/Mar2025_Module/Datasets/Amazon',
        "file_names": ['Video_Games.jsonl', 'meta_Video_Games.jsonl'],
    },
}


def load_dataset_file(file_path):
    """Load one dataset file into a DataFrame, chosen by its file extension.

    Args:
        file_path: Path to a ``.csv`` file, or to a ``.json``/``.jsonl`` file
            containing one JSON object per line.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        ValueError: If the extension is not one of the supported types.
        FileNotFoundError, pandas.errors.EmptyDataError,
        pandas.errors.ParserError: Propagated from the pandas readers.
    """
    # Decide on the extension only. A substring test on the whole path would
    # misroute e.g. ".../csv_exports/reviews.jsonl" to the CSV reader.
    extension = os.path.splitext(file_path)[1].lower()
    if extension == '.csv':
        return pd.read_csv(file_path)
    if extension in ('.json', '.jsonl'):
        return pd.read_json(file_path, lines=True)
    raise ValueError(f"Unsupported file type {extension!r} for {file_path}")


# Dataset to inspect in this notebook.
dataset = datasets["LFM-BeyMS"]
directory_path = dataset['path']
file_names = dataset["file_names"]

# Preview the first rows of every file of the selected dataset.
for name in file_names:
    file_path = os.path.join(directory_path, name)
    try:
        df = load_dataset_file(file_path)
    except FileNotFoundError:
        print(f"File {name} not found in the directory {directory_path}.")
    except pd.errors.EmptyDataError:
        print(f"File {name} is empty.")
    except pd.errors.ParserError:
        print(f"Error parsing {name}. Please check the file for inconsistencies.")
    except ValueError as exc:
        # Must come after the pandas errors above, which subclass ValueError.
        # Covers unsupported extensions and unreadable JSON, so that a stale
        # frame from the previous iteration is never printed for this file.
        print(f"Could not load {name}: {exc}")
    else:
        print(f"First 5 rows of {name}:")
        print(df.head(), "\n")

First 5 rows of beyms.csv:
   user_id
0  1049656
1  1055118
2  1056935
3  1070023
4  1072752 

First 5 rows of events.csv:
    user_id  artist_id  album_id  track_id   timestamp
0  31435741         21        31        53  1370977938
1  31435741         21        31        53  1370977728
2  31435741         21        31        53  1370977518
3  31435741         21        31        53  1370977308
4  31435741         21        31        53  1370977098 

First 5 rows of genre_annotations.csv:
   Unnamed: 0  track_id                                             genres
0           1      4868  ['soul', 'pop', 'singersongwriter', 'blues', '...
1           2      2900  ['electronic', 'indiepop', 'shoegaze', 'dreamp...
2           5    572665  ['soul', 'pop', 'singersongwriter', 'blues', '...
3           6      2897  ['indierock', 'electronic', 'indiepop', 'postp...
4           7     15100  ['folk', 'indiefolk', 'banjo', 'folkrock', 'bl... 

First 5 rows of mainstreaminess.csv:
   Unnamed: 0   u

In [12]:
import pandas as pd
import ast
import json

# Input files, picked by position in the selected dataset's file list
# (adjust the indices if a different dataset is selected).
events_path, genres_path = [
    os.path.join(directory_path, dataset["file_names"][idx]) for idx in (1, 2)
]

# Load both CSV tables.
df_events = pd.read_csv(events_path)
df_genres = pd.read_csv(genres_path)

# Left-join the genre annotations onto the events by track_id, so events
# without an annotation are kept, then keep one row per distinct
# (track_id, artist_id, album_id, genres) combination.
df_merged = (
    df_events
    .merge(df_genres[['track_id', 'genres']], on='track_id', how='left')
    .drop_duplicates(subset=['track_id', 'artist_id', 'album_id', 'genres'])
)

def construct_sentence(row):
    """Describe one track row as a single sentence.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            'track_id', 'artist_id', 'album_id' and 'genres'. 'genres' is
            expected to be the string form of a Python list, or missing.

    Returns:
        A sentence listing the track, artist and album IDs followed by the
        comma-separated genres, or "No genres available" when the genres are
        missing, empty, not a list, or cannot be parsed.
    """
    fallback = "No genres available"
    try:
        raw_genres = row['genres']
        # The list may be stored as its string representation; parse it
        # safely with literal_eval. Missing values count as an empty list.
        if pd.notna(raw_genres):
            parsed = ast.literal_eval(raw_genres)
        else:
            parsed = []
        usable = isinstance(parsed, list) and bool(parsed)
        genres_text = ", ".join(parsed) if usable else fallback
    except Exception:
        # Best effort: any parsing or joining problem yields the fallback.
        genres_text = fallback

    return (f"Track ID: {row['track_id']}, "
            f"Artist ID: {row['artist_id']}, "
            f"Album ID: {row['album_id']}, "
            f"Genre: {genres_text}.")

# Build the descriptive sentence for every merged row.
df_merged['sentence'] = df_merged.apply(construct_sentence, axis=1)

# Map each track_id (as a string key) to its sentence. The sentences were
# computed just above, so reuse that column instead of calling
# construct_sentence a second time for every row. When a track_id appears on
# several rows, the last row wins, as with repeated dict assignment.
item_sentences = {
    str(track_id): sentence
    for track_id, sentence in zip(df_merged['track_id'], df_merged['sentence'])
}

# Save the mapping as a JSON file.
output_file = "item_sentences.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(item_sentences, f, ensure_ascii=False, indent=2)

print(f"Item sentences dictionary saved to '{output_file}'.")

Item sentences dictionary saved to 'item_sentences.json'.


In [11]:
import pandas as pd
from tqdm.auto import tqdm
# Events file, picked by position in the selected dataset's file list
# (adjust the index if a different dataset is selected).
events_path = os.path.join(directory_path, dataset["file_names"][1])

# Load the events, add a `datetime` column parsed from `timestamp`
# (assumed to be in seconds), and order the rows by user and then by time.
df_events = (
    pd.read_csv(events_path)
    .assign(datetime=lambda events: pd.to_datetime(events['timestamp'], unit='s'))
    .sort_values(by=['user_id', 'datetime'])
)

# Keep only what a session needs from each event: (track_id, timestamp text).
def extract_event(row):
    """Return ``(track_id, 'YYYY-MM-DD HH:MM:SS')`` for one event row.

    Args:
        row: Mapping-like object with a 'track_id' entry and a 'datetime'
            entry that supports ``strftime``.
    """
    track = row['track_id']
    played_at = row['datetime'].strftime('%Y-%m-%d %H:%M:%S')
    return (track, played_at)

# Attach the compact (track_id, timestamp text) pair to every event.
df_events['event_info'] = df_events.apply(extract_event, axis=1)

# Group the events by user and split each user's timeline into sessions:
# a gap of 3 hours or more (3 hours = 10800 seconds) starts a new session.
user_sessions = {}

three_hours = pd.Timedelta(seconds=10800)

for user_id, group in tqdm(df_events.groupby('user_id')):
    group = group.sort_values('datetime')
    sessions = []
    prev_time = None
    # Only two columns are needed, so iterate over them in lockstep instead
    # of using iterrows(), which builds a full Series object for every event.
    for current_time, event_info in zip(group['datetime'], group['event_info']):
        # The user's first event, or an event after a long enough gap,
        # opens a new session.
        if prev_time is None or (current_time - prev_time) >= three_hours:
            sessions.append([])
        sessions[-1].append(event_info)
        prev_time = current_time
    user_sessions[user_id] = sessions

import json  # imported here so this cell does not depend on another cell's import

# Example: print the sessions of the first user in the mapping.
# next(iter(...)) avoids building a list of every key and copes with an
# empty mapping instead of raising IndexError.
example_user = next(iter(user_sessions), None)
if example_user is None:
    print("No user sessions to display.")
else:
    print(f"User {example_user} sessions:")
    for i, session in enumerate(user_sessions[example_user], 1):
        print(f"  Session {i}: {session}")

# Example of persisting the result: save as JSON.
# Note that JSON stores dictionary keys as strings and tuples as arrays, so
# code that reloads this file gets string user ids and list-valued events.
with open("user_sessions.json", "w", encoding="utf-8") as f:
    json.dump(user_sessions, f, ensure_ascii=False, indent=2)

print("사용자별 세션 데이터가 'user_sessions.json' 파일로 저장되었습니다.")



User 1002693 sessions:
  Session 1: [(848037, '2013-01-22 18:13:17'), (848036, '2013-01-22 18:16:43'), (848033, '2013-01-22 18:21:24')]
  Session 2: [(847578, '2013-01-22 21:55:45'), (3831482, '2013-01-22 22:03:16'), (138332, '2013-01-22 23:09:03')]
  Session 3: [(1189664, '2013-01-23 16:25:10'), (2267814, '2013-01-23 16:27:38'), (2267813, '2013-01-23 16:31:45'), (2267812, '2013-01-23 16:35:44'), (2267811, '2013-01-23 16:38:08'), (2267810, '2013-01-23 16:40:49'), (2267809, '2013-01-23 16:43:12'), (2267804, '2013-01-23 16:48:15'), (2267803, '2013-01-23 16:52:25'), (2267832, '2013-01-23 16:56:19')]
  Session 4: [(109882, '2013-01-23 20:55:58'), (619982, '2013-01-23 20:59:36'), (2059242, '2013-01-23 21:01:51'), (2059241, '2013-01-23 21:05:49'), (64811, '2013-01-23 21:10:31'), (64810, '2013-01-23 21:11:52'), (64780, '2013-01-23 21:14:35'), (64779, '2013-01-23 21:18:00'), (33245, '2013-01-23 23:47:17'), (1595633, '2013-01-23 23:59:29'), (569954, '2013-01-24 00:05:16'), (186230, '2013-01-24 

In [14]:
import json
import statistics

from tqdm.auto import tqdm  # imported here so this cell does not depend on another cell's import

# Read the per-user sessions back from the JSON file.
with open("user_sessions.json", "r", encoding="utf-8") as f:
    user_sessions = json.load(f)

# Number of users.
num_users = len(user_sessions)

# Collect the number of sessions of each user and, across all users, the
# number of interactions in each session.
session_counts = []         # one entry per user: how many sessions
interaction_counts = []     # one entry per session: how many interactions

# Only the session lists are needed, so iterate over the values.
for sessions in tqdm(user_sessions.values()):
    session_counts.append(len(sessions))
    interaction_counts.extend(len(session) for session in sessions)

# Mean number of sessions per user; 0 when there are no users.
if num_users > 0:
    avg_sessions_per_user = sum(session_counts) / num_users
else:
    avg_sessions_per_user = 0

# Mean and sample variance of the number of interactions per session.
# Both default to 0; the variance needs at least two sessions.
avg_interactions = 0
var_interactions = 0
if interaction_counts:
    avg_interactions = statistics.mean(interaction_counts)
    if len(interaction_counts) > 1:
        var_interactions = statistics.variance(interaction_counts)

print("Number of users:", num_users)
print("Average sessions per user:", avg_sessions_per_user)
print("Average interactions per session:", avg_interactions)
print("Variance of interactions per session:", var_interactions)


100%|██████████| 4148/4148 [00:00<00:00, 15450.01it/s]


Number of users: 4148
Average sessions per user: 394.4946962391514
Average interactions per session: 10.19783067825985
Variance of interactions per session: 278.87488818760977
