In [3]:
import pickle
import numpy as np

def analyze_news_embeddings(pickle_file):
    """Load pickled embeddings and print a short summary of them.

    The unpickled object may be a ``dict`` (key -> embedding), a ``list`` of
    embeddings, or a ``numpy.ndarray`` whose first axis indexes the items.
    The summary printed to stdout contains the item count, the shape of one
    embedding, and one example embedding. Empty containers are reported with
    a count of 0 instead of raising.

    Args:
        pickle_file: Path of the pickle file to read.

    Raises:
        ValueError: If the unpickled object is not a dict, list or ndarray.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only use this on pickle files from a trusted source.
    with open(pickle_file, 'rb') as f:
        embeddings = pickle.load(f)

    total_count = None
    dimension = None
    sample_embedding = None

    # Handle each supported container type.
    if isinstance(embeddings, dict):
        total_count = len(embeddings)
        if total_count > 0:
            # Take the first item as the example.
            sample_key, sample_embedding = next(iter(embeddings.items()))
            # Convert so that .shape is available even for plain sequences.
            sample_embedding = np.array(sample_embedding)
            dimension = sample_embedding.shape
            print(f"Example key: {sample_key}")
    elif isinstance(embeddings, list):
        total_count = len(embeddings)
        if total_count > 0:
            sample_embedding = np.array(embeddings[0])
            dimension = sample_embedding.shape
    elif isinstance(embeddings, np.ndarray):
        total_count = embeddings.shape[0]
        # With 2+ axes, e.g. (num_items, dim), the per-item shape is
        # everything after the first axis.
        if len(embeddings.shape) > 1:
            dimension = embeddings.shape[1:]
        else:
            dimension = embeddings.shape
        if total_count > 0:
            sample_embedding = embeddings[0]
        # Only arrays have a .shape; printing it unconditionally crashed for
        # dict and list inputs.
        print(embeddings.shape)
    else:
        raise ValueError("Unsupported data type in pickle file.")

    print("------ News Embeddings Info ------")
    print(f"Total embeddings: {total_count}")
    print(f"Embedding dimension: {dimension}")
    print("Example embedding:")
    print(sample_embedding)

if __name__ == "__main__":
    # Script entry point: summarize the embeddings stored at the path below.
    # Replace with the actual path to the pickle file.
    pickle_file_path = "/home/jy1559/Mar2025_Module/Datasets/Globo/articles_embeddings.pickle"
    analyze_news_embeddings(pickle_file_path)


(364047, 250)
------ News Embeddings Info ------
Total embeddings: 364047
Embedding dimension: (250,)
Example embedding:
[-0.16118301 -0.95723313 -0.13794445  0.05085534  0.83005524  0.90136534
 -0.33514765 -0.55956066 -0.50060284  0.16518293  0.4284342   0.3550556
  0.87443674 -0.52888286  0.6254872   0.2689198  -0.8228351  -0.703853
 -0.62584543 -0.15285493 -0.6662412   0.04329487  0.1786375   0.04689008
  0.5945311  -0.18334764  0.19510683 -0.46763963 -0.30480695  0.35317516
  0.27818817  0.5386231  -0.37120935  0.48989806 -0.10383289  0.11917368
  0.13243659 -0.62108386 -0.45331132  0.34662652 -0.06174106 -0.7305939
 -0.38411567 -0.94075835  0.06134219  0.4825816   0.28968322 -0.62269634
 -0.05000444  0.42151213 -0.24257636  0.6687105  -0.509004   -0.46179956
  0.04390178  0.28848746  0.4498246  -0.28486234  0.916729    0.70312876
  0.85167396 -0.6272441   0.35773164  0.3901894   0.65293527  0.1036527
  0.79829276 -0.10254639 -0.2045143   0.37861153 -0.09026147 -0.2511573
  0.32598

In [10]:
import os
import json
import numpy as np
from collections import Counter

def compute_low_stats(values, range_max=10):
    """Tabulate how often each integer 1..range_max occurs in ``values``.

    Args:
        values: Sized collection of hashable values to count.
        range_max: Largest integer (inclusive) to report on.

    Returns:
        A dict mapping each integer ``i`` in ``1..range_max`` to the tuple
        ``(frequency, ratio, cumulative_frequency, cumulative_ratio)``.
        Ratios are relative to ``len(values)`` and are 0 when ``values`` is
        empty.
    """
    n_values = len(values)
    occurrences = Counter(values)

    table = {}
    running_total = 0
    for bucket in range(1, range_max + 1):
        count = occurrences.get(bucket, 0)
        running_total += count
        if n_values > 0:
            share = count / n_values
            running_share = running_total / n_values
        else:
            # Avoid dividing by zero for empty input.
            share = 0
            running_share = 0
        table[bucket] = (count, share, running_total, running_share)
    return table

def print_low_stats_table(stats, title):
    """Print a statistics dictionary as a text table.

    Args:
        stats: Dict mapping an integer value to the tuple
            ``(frequency, ratio, cumulative_frequency, cumulative_ratio)``,
            with ratios expressed as fractions (they are printed as
            percentages).
        title: Heading printed above the table.

    One row is printed for every integer from 1 up to the largest key in
    ``stats``, so the table follows whatever range the statistics were
    computed for instead of always assuming 1-10. Values missing from
    ``stats`` are shown as zeros. An empty ``stats`` still prints rows 1-10.
    """
    print(title)
    print(f"{'Value':>5} | {'Freq':>6} | {'Ratio':>7} | {'CumFreq':>7} | {'CumRatio':>8}")
    print("-" * 40)
    # Derive the last row from the data; fall back to 10 when there is none.
    last_value = max(stats, default=10)
    for i in range(1, last_value + 1):
        freq, ratio, cum_freq, cum_ratio = stats.get(i, (0, 0, 0, 0))
        print(f"{i:>5} | {freq:>6} | {ratio*100:6.2f}% | {cum_freq:>7} | {cum_ratio*100:7.2f}%")
    print("\n")

def analyze_interactions_file(interactions_file):
    """Read an interactions JSON file and collect per-user and per-session sizes.

    The file's top-level ``"data"`` entry (an empty dict when absent) is read
    as a mapping whose values are each user's collection of sessions.

    Args:
        interactions_file: Path of the UTF-8 encoded JSON file.

    Returns:
        A tuple ``(sessions_per_user, interactions_per_session)``: the first
        list holds ``len()`` of every user's session collection, the second
        holds ``len()`` of every individual session, in file order.
    """
    with open(interactions_file, 'r', encoding='utf-8') as f:
        payload = json.load(f)

    all_user_sessions = payload.get("data", {}).values()

    # Number of sessions for each user.
    sessions_per_user = list(map(len, all_user_sessions))

    # Number of interactions in each session, flattened across users.
    interactions_per_session = [
        len(one_session)
        for user_sessions in all_user_sessions
        for one_session in user_sessions
    ]

    return sessions_per_user, interactions_per_session

def main(interactions_file):
    """Print 1-10 range frequency tables for an interactions file.

    Two tables are printed: the distribution of sessions per user, then the
    distribution of interactions per session.

    Args:
        interactions_file: Path of the interactions JSON file to analyze.
    """
    per_user, per_session = analyze_interactions_file(interactions_file)

    # (banner line, values to summarize, table title) for each report.
    reports = (
        (
            "===== Statistics for Sessions per User (Range 1-10) =====",
            per_user,
            "Sessions per User",
        ),
        (
            "===== Statistics for Interactions per Session (Range 1-10) =====",
            per_session,
            "Interactions per Session",
        ),
    )
    for banner, counts, table_title in reports:
        print(banner)
        print_low_stats_table(compute_low_stats(counts, range_max=10), table_title)

if __name__ == "__main__":
    # Script entry point: print the session/interaction statistics.
    # Replace with the actual path to the interactions.json file.
    interactions_file = "/home/jy1559/Mar2025_Module/Datasets/LFM-BeyMS/interactions.json"
    main(interactions_file)


===== Statistics for Sessions per User (Range 1-10) =====
Sessions per User
Value |   Freq |   Ratio | CumFreq | CumRatio
----------------------------------------
    1 |      0 |   0.00% |       0 |    0.00%
    2 |      0 |   0.00% |       0 |    0.00%
    3 |      0 |   0.00% |       0 |    0.00%
    4 |      0 |   0.00% |       0 |    0.00%
    5 |      0 |   0.00% |       0 |    0.00%
    6 |      1 |   0.02% |       1 |    0.02%
    7 |      0 |   0.00% |       1 |    0.02%
    8 |      0 |   0.00% |       1 |    0.02%
    9 |      0 |   0.00% |       1 |    0.02%
   10 |      0 |   0.00% |       1 |    0.02%


===== Statistics for Interactions per Session (Range 1-10) =====
Interactions per Session
Value |   Freq |   Ratio | CumFreq | CumRatio
----------------------------------------
    1 | 276212 |  16.88% |  276212 |   16.88%
    2 | 188295 |  11.51% |  464507 |   28.39%
    3 | 145288 |   8.88% |  609795 |   37.27%
    4 | 118861 |   7.26% |  728656 |   44.53%
    5 |  98908