In [2]:
import pandas as pd
import os
    
# Root directory that holds every dataset referenced by this notebook.
# Kept in one place so moving the data only requires editing this line.
DATA_ROOT = '/home/jy1559/Mar2025_Module/Datasets'

# Registry of datasets: name -> directory ("path") and the files to preview ("file_names").
datasets = {
    "Retail_Rocket": {
        "path": f"{DATA_ROOT}/Retail_Rocket",
        "file_names": ['category_tree.csv', 'events.csv',
                       'item_properties_part1.csv', 'item_properties_part2.csv'],
    },
    "Diginetica": {
        "path": f"{DATA_ROOT}/Diginetica",
        "file_names": ['product-categories.csv', 'products.csv', 'train-clicks.csv',
                       'train-item-views.csv', 'train-purchases.csv', 'train-queries.csv'],
    },
    "LFM-BeyMS": {
        "path": f"{DATA_ROOT}/LFM-BeyMS/dataset",
        "file_names": ['beyms.csv', 'events.csv', 'genre_annotations.csv',
                       'mainstreaminess.csv', 'ms.csv', 'user_groups.csv'],
    },
    "Beauty": {
        "path": f"{DATA_ROOT}/Amazon",
        "file_names": ['All_Beauty.jsonl', 'meta_All_Beauty.jsonl'],
    },
    "Game": {
        "path": f"{DATA_ROOT}/Amazon",
        "file_names": ['Video_Games.jsonl', 'meta_Video_Games.jsonl'],
    },
}

# Dataset selected for the rest of the notebook.
dataset = datasets["Beauty"]
directory_path = dataset['path']
file_names = dataset["file_names"]

# Preview every file of the selected dataset. `df` is rebound on each successful
# load, so after the loop it holds the LAST file that was read successfully.
for name in file_names:
    file_path = os.path.join(directory_path, name)

    # Check existence up front so a missing file is always reported the same way.
    # NOTE(review): depending on the pandas version, read_json may treat a path
    # string that does not exist as literal JSON and raise ValueError instead of
    # FileNotFoundError -- confirm against the installed pandas.
    if not os.path.isfile(file_path):
        print(f"File {name} not found in the directory {directory_path}.")
        continue

    # Dispatch on the real file extension rather than on a substring of the
    # full path (a directory called ".../csv_dumps/" must not select read_csv).
    extension = os.path.splitext(name)[1].lower()
    try:
        if extension == '.csv':
            df = pd.read_csv(file_path)
        elif extension in ('.json', '.jsonl'):
            # lines=True: the file is read as one JSON object per line.
            df = pd.read_json(file_path, lines=True)
        else:
            # Without this branch `df` would be unbound (or stale from the
            # previous file) and the preview below would be misleading.
            print(f"Skipping {name}: unsupported file type '{extension}'.")
            continue
    except FileNotFoundError:
        print(f"File {name} not found in the directory {directory_path}.")
    except pd.errors.EmptyDataError:
        print(f"File {name} is empty.")
    except pd.errors.ParserError:
        print(f"Error parsing {name}. Please check the file for inconsistencies.")
    except ValueError as err:
        # Must come after the two pandas errors above (both subclass ValueError).
        # Malformed JSON is reported by read_json as a plain ValueError.
        print(f"Error parsing {name}: {err}")
    else:
        print(f"First 5 rows of {name}:")
        print(df.head(), "\n")
        # Only some files carry a 'parent_asin' column; indexing it blindly
        # would raise KeyError and abort the whole loop.
        if 'parent_asin' in df.columns:
            # dropna=False keeps the count equal to len(df['parent_asin'].unique()).
            print(df['parent_asin'].nunique(dropna=False), "\n")

First 5 rows of All_Beauty.jsonl:
   rating                                      title  \
0       5  Such a lovely scent but not overpowering.   
1       4     Works great but smells a little weird.   
2       5                                       Yes!   
3       1                          Synthetic feeling   
4       5                                         A+   

                                                text images        asin  \
0  This spray is really nice. It smells really go...     []  B00YQ6X8EO   
1  This product does what I need it to do, I just...     []  B081TJ8YS3   
2                          Smells good, feels great!     []  B07PNNCSP9   
3                                     Felt synthetic     []  B09JS339BZ   
4                                            Love it     []  B08BZ63GMJ   

  parent_asin                       user_id               timestamp  \
0  B00YQ6X8EO  AGKHLEW2SOWHNMFQIJGBECAF7INQ 2020-05-05 14:08:48.923   
1  B081TJ8YS3  AGKHLEW2SOWHNMFQIJGBE

In [3]:
import ast

def safe_eval(val, expected_type):
    """
    Turn a stringified Python literal into an object of the expected type.

    A string `val` is parsed with `ast.literal_eval`; any other value is used
    as it is. The result is returned only when it is an instance of
    `expected_type`. If parsing raises, or the type does not match, the
    function returns None.
    """
    try:
        if isinstance(val, str):
            candidate = ast.literal_eval(val)
        else:
            candidate = val
        type_matches = isinstance(candidate, expected_type)
    except Exception:
        # Any failure while parsing or type-checking means "no usable value".
        return None
    return candidate if type_matches else None

def construct_item_sentence(row):
    """
    Build a one-line natural-language description of an item metadata row.

    Parameters
    ----------
    row : object with a dict-style ``.get(key, default)`` method
        The fields read are 'title', 'average_rating', 'rating_number',
        'features', 'description', 'store' and 'details'. Any of them may be
        absent, None or NaN; such fields are skipped.

    Returns
    -------
    str
        The non-empty parts ("Title: ...", "Average rating: ...", "Features: ...",
        "Description: ...", "Store: ...", "Details: ...") joined by single spaces.
        An empty string when no field yields any text.
    """
    import math  # local import so the function does not rely on another cell

    def _is_missing(value):
        """Return True for None and for float NaN."""
        return value is None or (isinstance(value, float) and math.isnan(value))

    def _text_or_empty(value):
        """Return the stripped string, or '' when the value is not a string."""
        # None / NaN / other non-strings have no .strip(); calling it would raise.
        return value.strip() if isinstance(value, str) else ''

    parts = []

    # Title
    title = _text_or_empty(row.get('title', ''))
    if title:
        parts.append(f"Title: {title}.")

    # Rating information: skipped unless both numbers are actually present,
    # so a NaN never ends up in the text as "Average rating: nan".
    avg_rating = row.get('average_rating')
    rating_number = row.get('rating_number')
    if not _is_missing(avg_rating) and not _is_missing(rating_number):
        parts.append(f"Average rating: {avg_rating}, based on {rating_number} reviews.")

    # Features (a list, possibly stored as its string representation)
    features = safe_eval(row.get('features', ''), list)
    if features:
        # Join the non-empty entries into one comma-separated string.
        features_text = ", ".join(str(f).strip() for f in features if f)
        if features_text:
            parts.append(f"Features: {features_text}.")

    # Description (a list, possibly stored as its string representation)
    description = safe_eval(row.get('description', ''), list)
    if description:
        description_text = " ".join(str(d).strip() for d in description if d)
        if description_text:
            parts.append(f"Description: {description_text}.")

    # Store (plain string). NaN is truthy, so test the cleaned text, not the raw value.
    store = _text_or_empty(row.get('store', ''))
    if store:
        parts.append(f"Store: {store}.")

    # Details (a dict, possibly stored as its string representation)
    details = safe_eval(row.get('details', ''), dict)
    if details:
        # Render as "key: value" pairs, dropping entries whose value is falsy.
        details_text = ", ".join(f"{k}: {v}" for k, v in details.items() if v)
        if details_text:
            parts.append(f"Details: {details_text}.")

    # Combine all parts into a single sentence-like string.
    return " ".join(parts)

# Example: `df` is expected to be the DataFrame holding the Amazon item metadata.
# NOTE(review): `df` is whatever an earlier cell last assigned -- confirm it is
# the metadata frame (not the reviews frame) before running this cell.
# Build a dictionary keyed by parent_asin whose values are the item sentences.
item_sentences = {}
for _, row in df.iterrows():
    asin = row.get('parent_asin', None)
    if not asin:
        # Rows without a usable parent_asin are left out.
        continue
    item_sentences[asin] = construct_item_sentence(row)

# Sanity check: print the sentence of the second item that was inserted.
example_asin = list(item_sentences)[1]
print(f"ASIN: {example_asin}\nSentence: {item_sentences[example_asin]}")

ASIN: B076WQZGPM
Sentence: Title: Yes to Tomatoes Detoxifying Charcoal Cleanser (Pack of 2) with Charcoal Powder, Tomato Fruit Extract, and Gingko Biloba Leaf Extract, 5 fl. oz.. Average rating: 4.5, based on 3 reviews. Store: Yes To. Details: Item Form: Powder, Skin Type: Acne Prone, Brand: Yes To, Age Range (Description): Adult, Unit Count: 10 Fl Oz, Is Discontinued By Manufacturer: No, Item model number: SG_B076WQZGPM_US, UPC: 653801351125, Manufacturer: Yes to Tomatoes.


In [12]:
from tqdm.auto import tqdm
# Read the first file listed for the selected dataset (one JSON object per line)
# and derive three time columns from 'timestamp':
#   datetime     - parsed timestamp (unit='ms' is passed to pd.to_datetime)
#   datetime_str - human-readable form, year-month-day hour:minute:second
#   date         - calendar date only, used later as the grouping key
file_path = os.path.join(directory_path, dataset["file_names"][0])
df = pd.read_json(file_path, lines=True).assign(
    datetime=lambda frame: pd.to_datetime(frame['timestamp'], unit='ms'),
    datetime_str=lambda frame: frame['datetime'].dt.strftime('%Y-%m-%d %H:%M:%S'),
    date=lambda frame: frame['datetime'].dt.date,
)

# 2. Keep only the needed columns and arrange each review as a nested dictionary
def get_review_info(row):
    """
    Package one review row into a nested dictionary.

    `row` is indexed by column name. The result has the keys 'parent_asin',
    'asin', 'timestamp' (taken from row['datetime_str']) and 'review_info',
    the latter being a dict with 'rating', 'title', 'text' and 'helpful_vote'.
    """
    nested_fields = ('rating', 'title', 'text', 'helpful_vote')
    return dict(
        parent_asin=row['parent_asin'],
        asin=row['asin'],
        timestamp=row['datetime_str'],
        review_info={field: row[field] for field in nested_fields},
    )

# Attach the nested review dictionary to every row.
df['review_info'] = df.apply(get_review_info, axis=1)

# 3. Group by user_id, then split each user's reviews into per-day groups.
# Result: { user_id: [ [review1, review2, ...] (same day), [review3, ...], ... ] }
# Only users whose reviews span MORE THAN ONE distinct date are kept.
user_groups = {}
for user, group in tqdm(df.groupby('user_id')):
    # One list of review dicts per date; groupby sorts its keys by default,
    # so the day groups come out in ascending date order.
    day_groups = [
        day_df['review_info'].tolist()
        for _, day_df in group.groupby('date')
    ]
    if len(day_groups) > 1:
        user_groups[user] = day_groups

# The same data without the user_id keys: one entry per retained user.
all_users_data = list(user_groups.values())

# Preview: print the day groups of the first retained user.
if user_groups:
    example_user = next(iter(user_groups))
    print(f"User {example_user} grouped reviews:")
    # Number the groups from 1; starting at 2 mislabelled the first group as "Day group 2".
    for i, day_group in enumerate(user_groups[example_user], 1):
        print(f"  Day group {i}:")
        for review in day_group:
            print("   ", review)
else:
    # Indexing an empty key list would raise IndexError; report the situation instead.
    print("No user has reviews on more than one day.")

100%|██████████| 631986/631986 [03:24<00:00, 3086.79it/s]

User AE224HM2QAW5TTSDL2QRDERA6KMA grouped reviews:
  Day group 2:
    {'parent_asin': 'B0002KHT8Y', 'asin': 'B0002KHT8Y', 'timestamp': '2004-09-11 00:15:34', 'review_info': {'rating': 4, 'title': 'Switched from Sonicare and Prefer Oral B', 'text': "I had been happily using Sonicare for about seven years.  I read positive reviews about Oral B and decided to try it.  A key reason I decided to try the Oral B is that I had to unscrew the brush head of the Sonicare once a week and clean the gunk out of it.  I hated doing this and often needed a cotton swab to clean the gunk out of the inside of the brush head.<br /><br />In contrast, the Oral B is much easier to clean.  After each brushing I just pull the bush head off and rinse it off and wipe everything off with a towel.  It takes less than a minute and the toothbrush is always clean.  The Sonicare takes much longer to clean and the process is less pleasant.<br /><br />But how does the Oral B clean?  The smaller brush head of the Oral B m




In [19]:
# Spot-check a single user: take the key at position 1433 (insertion order)
# and print every day group stored for that user.
example_user = list(user_groups)[1433]
print(user_groups[example_user])

[[{'parent_asin': 'B06Y41PQQD', 'asin': 'B06Y41PQQD', 'timestamp': '2018-01-19 01:26:30', 'review_info': {'rating': 4, 'title': 'Nice clips!', 'text': 'These were nice clips! Curled ok..but rolled and lasted better! Enough for a full head! Would recommend!', 'helpful_vote': 0}}], [{'parent_asin': 'B07VGBZ4HK', 'asin': 'B07VGBZ4HK', 'timestamp': '2019-11-23 20:38:18', 'review_info': {'rating': 1, 'title': 'NOT 100% HUMAN HAIR! ITS BLENDED!!!', 'text': 'This is NOT 100% Human Hair!! It barely curls and nearly burned! I wish I tested before seeing in! Now I have to take a full sew-in out. The only other way is to roll it. HORRIBLE!', 'helpful_vote': 1}}], [{'parent_asin': 'B08ZSH51H7', 'asin': 'B08ZSH51H7', 'timestamp': '2021-04-20 18:12:10', 'review_info': {'rating': 1, 'title': 'SYNTHETIC! BEWARE!!', 'text': 'Buyer BEWARE! This hair is SYNTHETIC!!!  I know..I know...I should have done a strain test.  And honestly, as it was going in, it was a tangling, but I thought it was due to the le