## Data collection

### Task: Import modules and working path setup

In [9]:
import sys, os, json, gzip, csv, urllib.request, shutil
from pathlib import Path
import pandas as pd

# Make the sibling "utilities" folder importable so the project-local imports below resolve
module_path = str(Path("..", "utilities").resolve())
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from logger import Logger
from configurations import Configurations


### Task: Get configuration variables and initializations

In [2]:
# Logger for this notebook; the log file location comes from the project configuration
LOG_FILE = Configurations.LOG_PATH
logger = Logger(process_name="data_collection", log_file=LOG_FILE)

# Folder that stores the raw downloads (created if missing)
RAW_DIR = Path(Configurations.DATA_RAW_PATH)
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Folder that stores the processed data (created if missing)
PROCESSED_DIR = Path(Configurations.DATA_PROCESSED_PATH)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Amazon Reviews dataset, collected in 2023 by McAuley Lab
CATEGORIES, CORES, SPLITS, BASE_URL = (
    Configurations.CATEGORIES,
    Configurations.CORES,
    Configurations.SPLITS,
    Configurations.BASE_URL,
)

# URL template of the per-category metadata archive; {category} is filled in below
meta_base_url = (
    "https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/"
    "meta_categories/meta_{category}.jsonl.gz"
)

# Map every configured category to its metadata URL
meta_urls = dict((cat, meta_base_url.format(category=cat)) for cat in CATEGORIES)

### Task: Define functions

#### Build the download URL for one dataset split

In [3]:
def build_url(core: str, category: str, split: str) -> str:
    """Return the download URL of one gzip-compressed CSV split.

    Args:
        core: Path segment placed directly after ``BASE_URL``.
        category: Category name used as the first part of the file name.
        split: Split name placed between the category and ``.csv.gz``.

    Returns:
        ``{BASE_URL}/{core}/last_out_w_his/{category}.{split}.csv.gz``.
    """
    return "{}/{}/last_out_w_his/{}.{}.csv.gz".format(BASE_URL, core, category, split)

#### local_path_for_parquet

In [4]:
def local_path_for_parquet(core: str, category: str, split: str, raw_dir=RAW_DIR) -> Path:
    """Return the flat-folder file path for one (category, core, split) combination.

    Args:
        core: Core label embedded in the file name.
        category: Category name; any "/" is replaced by "-" to keep the layout flat.
        split: Split label embedded in the file name.
        raw_dir: Must equal ``RAW_DIR`` (result ends in ``.csv.gz``) or
            ``PROCESSED_DIR`` (result ends in ``.parquet``).

    Returns:
        ``<dir>/<category>.<core>.<split>.<extension>`` inside the selected folder.

    Raises:
        ValueError: If ``raw_dir`` is neither ``RAW_DIR`` nor ``PROCESSED_DIR``.
    """
    flat_name = category.replace("/", "-")

    # Pick the target folder and the extension that belongs to it
    if raw_dir == RAW_DIR:
        target_dir, extension = RAW_DIR, "csv.gz"
    elif raw_dir == PROCESSED_DIR:
        target_dir, extension = PROCESSED_DIR, "parquet"
    else:
        raise ValueError(f"Invalid directory: {raw_dir}")

    return target_dir / f"{flat_name}.{core}.{split}.{extension}"

#### Download files

In [5]:
def download_file(url: str, out_path: Path, max_retries: int = 3) -> None:
    """Download ``url`` to ``out_path``, retrying on failure.

    The payload is written to ``<out_path>.part`` first and renamed to
    ``out_path`` only after the transfer finished, so a non-empty ``out_path``
    always denotes a completed download.

    Args:
        url: Source URL.
        out_path: Destination file; the download is skipped when it already
            exists with a non-zero size.
        max_retries: Maximum number of download attempts.

    Raises:
        RuntimeError: If every attempt failed. The error of the last attempt
            is attached as ``__cause__``.
    """
    if out_path.exists() and out_path.stat().st_size > 0:
        logger.log_info(f"Exists, skip: {out_path.name}")
        return

    tmp = str(out_path) + ".part"
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            logger.log_info(f"Downloading (attempt {attempt}/{max_retries}): {url}")
            urllib.request.urlretrieve(url, tmp)
            os.replace(tmp, out_path)
            logger.log_info(f"Saved: {out_path.name}")
            return
        except Exception as e:
            last_error = e
            logger.log_warning(f"Failed attempt {attempt} for {url}: {e}")
            # Best-effort cleanup: do not leave a truncated ".part" file behind
            try:
                os.remove(tmp)
            except OSError:
                pass
    # Chain the last failure so the root cause stays visible in the traceback
    raise RuntimeError(f"Exceeded retries: {url}") from last_error

#### Preview

In [6]:
def preview_gz_csv_head(gz_path: Path, n_lines: int = 3) -> None:
    """Log the header and the first data rows of a gzip-compressed CSV file.

    Args:
        gz_path: Path of the ``.csv.gz`` file to inspect.
        n_lines: Number of data rows to log after the header; values <= 0 log
            the header only.

    Any error raised while reading is logged as a warning instead of being
    propagated, so a failed preview never interrupts the caller.
    """
    try:
        with gzip.open(gz_path, "rt", encoding="utf-8", newline="") as fin:
            reader = csv.reader(fin)
            header = next(reader, None)
            logger.log_info(f"[Preview] {gz_path.name} header: {header}")
            # Pull exactly n_lines rows so that n_lines <= 0 logs no data row
            for row_number in range(1, n_lines + 1):
                row = next(reader, None)
                if row is None:  # file has fewer rows than requested
                    break
                # Only the first 8 fields are shown to keep log lines short
                logger.log_info(f"[Preview] row {row_number}: {row[:8]}")
    except Exception as e:
        logger.log_warning(f"Preview failed for {gz_path.name}: {e}")

#### Save to parquet format

In [10]:
def save_dataset_to_parquet(csv_gz_path: Path, out_parquet_path: Path):
    """Convert a gzip-compressed CSV file to Parquet using pandas.

    The Parquet data is written to ``<out_parquet_path>.part`` and renamed
    afterwards, so an interrupted write can never be mistaken for a finished
    file by the "already exists" check on a later run.

    Args:
        csv_gz_path: Source ``.csv.gz`` file.
        out_parquet_path: Destination Parquet file; the conversion is skipped
            when it already exists with a non-zero size.

    Failures are logged through ``logger.log_exception`` and not re-raised.
    """
    if out_parquet_path.exists() and out_parquet_path.stat().st_size > 0:
        logger.log_info(f"Exists, skip: {out_parquet_path.name}")
        return

    tmp_path = str(out_parquet_path) + ".part"
    try:
        df = pd.read_csv(csv_gz_path, compression='gzip')
        df.to_parquet(tmp_path, index=False)
        # Publish the file only once it has been written completely
        os.replace(tmp_path, out_parquet_path)
        logger.log_info(f"Saved Parquet: {out_parquet_path.name}")
    except Exception as e:
        logger.log_exception(f"Failed to save Parquet for {out_parquet_path}: {e}")
        # Best-effort cleanup of a partially written temporary file
        try:
            os.remove(tmp_path)
        except OSError:
            pass

#### Collect meta files for enriching answers

In [14]:
def _meta_path(category: str) -> Path:
    """Return the local path of the raw metadata archive for ``category``.

    Any "/" in the category is replaced by "-" so the file always lands
    directly inside ``RAW_DIR`` instead of a sub-directory that does not exist.
    """
    safe_cat = category.replace("/", "-")
    return RAW_DIR / f"{safe_cat}.meta.jsonl.gz"

def download_meta(category: str, url: str | None = None):
    """Download the metadata archive of ``category`` unless it is already present.

    The archive is fetched into ``<destination>.part`` and renamed afterwards,
    so a truncated transfer is never mistaken for a finished file by the
    size check on a later run.

    Args:
        category: Category whose metadata should be fetched.
        url: Explicit source URL; when omitted the entry of ``meta_urls`` for
            ``category`` is used.

    A missing URL is logged as a warning; download errors are logged through
    ``logger.log_exception``. Neither is re-raised.
    """
    url = url or meta_urls.get(category)
    logger.log_info(f"Link for {category}: {url}")
    if not url:
        logger.log_warning(f"[META] No URL configured for category={category}")
        return

    dst = _meta_path(category)
    if dst.exists() and dst.stat().st_size > 0:
        logger.log_info(f"[META] Exists: {dst} (skip)")
        return

    logger.log_info(f"[META] Downloading: {category} → {dst}")
    tmp = str(dst) + ".part"
    try:
        urllib.request.urlretrieve(url, tmp)
        # Publish the archive only once it has been transferred completely
        os.replace(tmp, dst)
        logger.log_info(f"[META] Done: {dst} ({dst.stat().st_size} bytes)")
    except Exception as e:
        logger.log_exception(f"[META] Download failed for {category}: {e}")
        # Best-effort cleanup of a partially downloaded temporary file
        try:
            os.remove(tmp)
        except OSError:
            pass

def preview_meta(category: str, n: int = 5):
    """Log and display the first parsed records of a category's metadata archive.

    Args:
        category: Category whose metadata archive (see ``_meta_path``) is read.
        n: Reading stops as soon as this many records have been parsed.

    Lines that are not valid JSON are skipped; the number of skipped lines is
    logged as a warning. Nothing is returned.
    """
    fp = _meta_path(category)
    if not fp.exists():
        logger.log_warning(f"[META] Preview: file not found for {category}: {fp}")
        return

    rows = []
    skipped = 0
    # Decode explicitly as UTF-8 instead of relying on the platform default
    with gzip.open(fp, "rt", encoding="utf-8") as f:
        for line in f:
            try:
                rows.append(json.loads(line))
            except ValueError:
                # json.JSONDecodeError is a ValueError: blank or malformed line
                skipped += 1
            if len(rows) >= n:
                break
    if skipped:
        logger.log_warning(f"[META] Preview: skipped {skipped} unparsable line(s) for {category}")
    if not rows:
        logger.log_warning(f"[META] Preview: no rows parsed for {category}")
        return

    df = pd.DataFrame(rows)
    logger.log_info(f"[META] Preview {category}: shape={df.shape}")
    logger.log_info(f"[META] Columns: {df.columns.tolist()}")
    display(df.head())

def save_meta_sample_for_ui(category: str, sample_n: int = 2000):
    """Save a reduced sample of a category's metadata as a Parquet file.

    For each parsed record only ``parent_asin``, ``title``, ``price``,
    ``details``, ``store`` and the first entry of ``images`` (stored as
    ``image``) are kept.

    Args:
        category: Category whose metadata archive (see ``_meta_path``) is read.
        sample_n: Reading stops as soon as this many records were collected.

    Returns:
        Path of the written ``<category>.meta.sample.parquet`` file inside
        ``PROCESSED_DIR``, or ``None`` when the archive is missing or no
        record could be collected.
    """
    fp = _meta_path(category)
    if not fp.exists():
        logger.log_warning(f"[META] Sample: file not found for {category}: {fp}")
        return None

    rows = []
    skipped = 0
    # Decode explicitly as UTF-8 instead of relying on the platform default
    with gzip.open(fp, "rt", encoding="utf-8") as f:
        for line in f:
            try:
                obj = json.loads(line)
                img = obj.get("images")
                if isinstance(img, list):
                    # Keep only the first image entry (or None for an empty list)
                    img = img[0] if img else None
                rows.append({
                    "parent_asin": obj.get("parent_asin"),
                    "title": obj.get("title"),
                    "price": obj.get("price"),
                    "details": obj.get("details"),
                    "image": img,
                    "store": obj.get("store")
                })
            except (ValueError, AttributeError):
                # ValueError: line is not valid JSON; AttributeError: the JSON
                # value is not an object and therefore has no .get()
                skipped += 1
                continue
            if len(rows) >= sample_n:
                break
    if skipped:
        logger.log_warning(f"[META] Sample: skipped {skipped} unusable line(s) for {category}")
    if not rows:
        logger.log_warning(f"[META] Sample: no rows collected for {category}")
        return None

    out_df = pd.DataFrame(rows)
    out_path = PROCESSED_DIR / f"{category.replace('/', '-')}.meta.sample.parquet"
    out_df.to_parquet(out_path, index=False)
    logger.log_info(f"[META] Saved sample: {out_path} | shape={out_df.shape}")
    display(out_df.head())
    return out_path


### Task: Main execution logic

In [12]:
def run():
    """Run the whole collection: review splits first, then category metadata.

    Stage 1 visits every (core, category, split) combination: download the
    ``.csv.gz`` file, log a short preview and convert it to Parquet.
    Stage 2 visits every category: download the metadata archive, preview it
    and save a 1000-record sample. An error inside either stage is logged and
    the loop moves on to the next item.
    """
    logger.log_info("=== Data Collection (0core & 5core; flat folder) started ===")
    logger.log_info(f"Output dir: {RAW_DIR} | Categories: {CATEGORIES}")

    # Lazily enumerate the combinations in the same order as three nested loops
    combinations = (
        (core, cat, split)
        for core in CORES
        for cat in CATEGORIES
        for split in SPLITS
    )
    for core, cat, split in combinations:
        source_url = build_url(core, cat, split)
        raw_path = local_path_for_parquet(core, cat, split, raw_dir=RAW_DIR)
        try:
            download_file(source_url, raw_path)
            preview_gz_csv_head(raw_path, n_lines=3)
            parquet_path = local_path_for_parquet(core, cat, split, raw_dir=PROCESSED_DIR)
            save_dataset_to_parquet(raw_path, parquet_path)
        except Exception as err:
            logger.log_exception(f"Skip {cat}-{core}-{split}: {err}")

    logger.log_info("Collected all cores datasets ===")
    logger.log_info("=== Data Collection metadata for enrich answers started ===")
    for cat in CATEGORIES:
        try:
            download_meta(cat)
            preview_meta(cat, n=5)
            save_meta_sample_for_ui(cat, sample_n=1000)
        except Exception as err:
            logger.log_exception(f"[META] {cat} failed: {err}")

    logger.log_info("=== Data Collection completed ===")


### Task: Run the pipeline (end-to-end check)

In [15]:
# Execute the full collection; downloads and Parquet conversions whose output already exists are skipped
run()

2025-09-28 09:50:27,871 - INFO - === Data Collection (0core & 5core; flat folder) started ===
2025-09-28 09:50:27,874 - INFO - Output dir: /Users/kevin/Documents/GitHub/Python/VESKL/Personal/NEU/NEU/NEU_7275/Prj/Prj_1/APRS_7275_G6/Amazon-Product-Recommendation-System/data/raw | Categories: ['Electronics', 'Beauty_and_Personal_Care']
2025-09-28 09:50:27,874 - INFO - Exists, skip: Electronics.0core.train.csv.gz
2025-09-28 09:50:27,876 - INFO - [Preview] Electronics.0core.train.csv.gz header: ['user_id', 'parent_asin', 'rating', 'timestamp', 'history']
2025-09-28 09:50:27,877 - INFO - [Preview] row 1: ['AFKZENTNBQ7A7V7UXW5JJI6UGRYQ', 'B01G8JO5F2', '5.0', '1523093017534', '']
2025-09-28 09:50:27,877 - INFO - [Preview] row 2: ['AGCI7FAH4GL5FI65HYLKWTMFZ2CQ', 'B0047T79VS', '3.0', '1344406083000', '']
2025-09-28 09:50:27,878 - INFO - [Preview] row 3: ['AGCI7FAH4GL5FI65HYLKWTMFZ2CQ', 'B004S2JX7W', '5.0', '1384912340000', 'B0047T79VS']
2025-09-28 09:50:27,879 - INFO - Exists, skip: Electronics.

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together
0,All Electronics,FS-1051 FATSHARK TELEPORTER V3 HEADSET,3.5,6,[],[Teleporter V3 The “Teleporter V3” kit sets a ...,,[{'thumb': 'https://m.media-amazon.com/images/...,[],Fat Shark,"[Electronics, Television & Video, Video Glasses]","{'Date First Available': 'August 2, 2014', 'Ma...",B00MCW7G9M,
1,All Electronics,Ce-H22B12-S1 4Kx2K Hdmi 4Port,5.0,1,"[UPC: 662774021904, Weight: 0.600 lbs]",[HDMI In - HDMI Out],,[{'thumb': 'https://m.media-amazon.com/images/...,[],SIIG,"[Electronics, Television & Video, Accessories,...",{'Product Dimensions': '0.83 x 4.17 x 2.05 inc...,B00YT6XQSE,
2,Computers,Digi-Tatoo Decal Skin Compatible With MacBook ...,4.5,246,[WARNING: Please IDENTIFY MODEL NUMBER on the ...,[],19.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'AL 2Sides Video', 'url': 'https://...",Digi-Tatoo,"[Electronics, Computers & Accessories, Laptop ...","{'Brand': 'Digi-Tatoo', 'Color': 'Fresh Marble...",B07SM135LS,
3,AMAZON FASHION,NotoCity Compatible with Vivoactive 4 band 22m...,4.5,233,[☛NotoCity 22mm band is designed for Vivoactiv...,[],9.99,[{'thumb': 'https://m.media-amazon.com/images/...,[],NotoCity,"[Electronics, Wearable Technology, Clips, Arm ...","{'Date First Available': 'May 29, 2020', 'Manu...",B089CNGZCW,
4,Cell Phones & Accessories,Motorola Droid X Essentials Combo Pack,3.8,64,"[New Droid X Essentials Combo Pack, Exclusive ...",[all Genuine High Quality Motorola Made Access...,14.99,[{'thumb': 'https://m.media-amazon.com/images/...,[],Verizon,"[Electronics, Computers & Accessories, Compute...",{'Product Dimensions': '11.6 x 6.9 x 3.1 inche...,B004E2Z88O,


2025-09-28 09:50:28,091 - INFO - [META] Saved sample: /Users/kevin/Documents/GitHub/Python/VESKL/Personal/NEU/NEU/NEU_7275/Prj/Prj_1/APRS_7275_G6/Amazon-Product-Recommendation-System/data/processed/Electronics.meta.sample.parquet | shape=(1000, 6)


Unnamed: 0,parent_asin,title,price,details,image,store
0,B00MCW7G9M,FS-1051 FATSHARK TELEPORTER V3 HEADSET,,"{'Date First Available': 'August 2, 2014', 'Ma...",{'thumb': 'https://m.media-amazon.com/images/I...,Fat Shark
1,B00YT6XQSE,Ce-H22B12-S1 4Kx2K Hdmi 4Port,,{'Product Dimensions': '0.83 x 4.17 x 2.05 inc...,{'thumb': 'https://m.media-amazon.com/images/I...,SIIG
2,B07SM135LS,Digi-Tatoo Decal Skin Compatible With MacBook ...,19.99,"{'Brand': 'Digi-Tatoo', 'Color': 'Fresh Marble...",{'thumb': 'https://m.media-amazon.com/images/I...,Digi-Tatoo
3,B089CNGZCW,NotoCity Compatible with Vivoactive 4 band 22m...,9.99,"{'Date First Available': 'May 29, 2020', 'Manu...",{'thumb': 'https://m.media-amazon.com/images/I...,NotoCity
4,B004E2Z88O,Motorola Droid X Essentials Combo Pack,14.99,{'Product Dimensions': '11.6 x 6.9 x 3.1 inche...,{'thumb': 'https://m.media-amazon.com/images/I...,Verizon


2025-09-28 09:50:28,096 - INFO - Link for Beauty_and_Personal_Care: https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/meta_Beauty_and_Personal_Care.jsonl.gz
2025-09-28 09:50:28,097 - INFO - [META] Exists: /Users/kevin/Documents/GitHub/Python/VESKL/Personal/NEU/NEU/NEU_7275/Prj/Prj_1/APRS_7275_G6/Amazon-Product-Recommendation-System/data/raw/Beauty_and_Personal_Care.meta.jsonl.gz (skip)
2025-09-28 09:50:28,098 - INFO - [META] Preview Beauty_and_Personal_Care: shape=(5, 14)
2025-09-28 09:50:28,099 - INFO - [META] Columns: ['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'parent_asin', 'bought_together']


Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together
0,All Beauty,"Shiyeen 10 Colors Hair Chalk for Girls Gift, K...",3.9,57,"[🌼[MEET YOUR HAIR COLOR NEEDS] Bright color, h...",[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],shiyeen,"[Beauty & Personal Care, Hair Care, Hair Color...","{'Color': 'Orange,Blue,Cyan,White,Green,Red,Pi...",B08BLDKYHB,
1,All Beauty,"Ebbfurln Bob Wig Human Hair, 13x4 HD Lace Fron...",4.1,60,[Frontal Wigs Human Hair Material: 100% unproc...,[],45.65,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Cute bob', 'url': 'https://www.ama...",Ebbfurln,"[Beauty & Personal Care, Hair Care, Hair Exten...","{'Color': '13x4 Bob Wigs', 'Material': 'Human'...",B0BWJGQ32Y,
2,All Beauty,Makeup brush cleaner and dryer electronic spin...,4.4,7,[],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],J-ANKKA,"[Beauty & Personal Care, Tools & Accessories, ...","{'Is Discontinued By Manufacturer': 'No', 'Pac...",B07DC9S9PF,
3,All Beauty,"3 Inch Clipper Guards, Hair Clipper Guide Comb...",4.0,68,[3 Inch Clipper Guards: The only 3 inch hair c...,[],24.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Wahl Clipper Guards NO.16 NO.12 NO...,CR8GR8,"[Beauty & Personal Care, Hair Care, Hair Cutti...","{'Recommended Uses For Product': 'Clipping', '...",B0BM8WLSXF,
4,All Beauty,Cathy Doll L-Glutathione Magic Cream SPF 50 Wh...,4.0,65,[Magic Cream is formulated with an additional ...,"[Extend the flawless, smooth complexion from f...",14.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'MCHOIX Sunscreen SPF 50 ', 'url': ...",Cathy Doll,"[Beauty & Personal Care, Skin Care, Sunscreens...","{'Product Benefits': 'Whitening', 'Sun Protect...",B00N4LMZZK,


2025-09-28 09:50:28,247 - INFO - [META] Saved sample: /Users/kevin/Documents/GitHub/Python/VESKL/Personal/NEU/NEU/NEU_7275/Prj/Prj_1/APRS_7275_G6/Amazon-Product-Recommendation-System/data/processed/Beauty_and_Personal_Care.meta.sample.parquet | shape=(1000, 6)


Unnamed: 0,parent_asin,title,price,details,image,store
0,B08BLDKYHB,"Shiyeen 10 Colors Hair Chalk for Girls Gift, K...",,"{'Color': 'Orange,Blue,Cyan,White,Green,Red,Pi...",{'thumb': 'https://m.media-amazon.com/images/I...,shiyeen
1,B0BWJGQ32Y,"Ebbfurln Bob Wig Human Hair, 13x4 HD Lace Fron...",45.65,"{'Color': '13x4 Bob Wigs', 'Material': 'Human'...",{'thumb': 'https://m.media-amazon.com/images/I...,Ebbfurln
2,B07DC9S9PF,Makeup brush cleaner and dryer electronic spin...,,"{'Is Discontinued By Manufacturer': 'No', 'Pac...",{'thumb': 'https://m.media-amazon.com/images/I...,J-ANKKA
3,B0BM8WLSXF,"3 Inch Clipper Guards, Hair Clipper Guide Comb...",24.99,"{'Recommended Uses For Product': 'Clipping', '...",{'thumb': 'https://m.media-amazon.com/images/I...,CR8GR8
4,B00N4LMZZK,Cathy Doll L-Glutathione Magic Cream SPF 50 Wh...,14.99,"{'Product Benefits': 'Whitening', 'Sun Protect...",{'thumb': 'https://m.media-amazon.com/images/I...,Cathy Doll


2025-09-28 09:50:28,255 - INFO - === Data Collection completed ===
