In [15]:
# Use the %pip magic (not !pip) so ftfy is installed into the running kernel's environment.
%pip install ftfy
from ftfy import fix_text
import gzip
import json
import polars as pl
from collections.abc import Mapping, Sequence
from collections import OrderedDict
import tqdm
import re



In [16]:
# Create the download directory; -p makes this a no-op if it already exists.
!mkdir -p ./data/raw_data

In [17]:
# -L follows redirects; --fail aborts on an HTTP error instead of saving the
# server's error page as items.json.gz (which would only fail later, at gzip/JSON parsing).
!curl -L --fail -o ./data/raw_data/items.json.gz https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/meta_categories/meta_Electronics.jsonl.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1252M  100 1252M    0     0  54.1M      0  0:00:23  0:00:23 --:--:-- 53.7M


In [18]:
# List the download directory to confirm items.json.gz was written.
!ls ./data/raw_data

items.json.gz


In [19]:
# Substrings of flattened keys to exclude: filter_and_flatten_json drops every key
# whose dotted path contains one of these fragments.
ATTR_STOP_WORDS = [
    "bought_together",
    "image",
    "rating_number",
    "average_rating",
    "video",
]

In [22]:
def filter_and_flatten_json(json_obj, parent_key='', separator='.'):
    """
    Flatten a nested JSON object into a single-level dict, dropping unwanted keys.

    Nested mappings contribute keys joined with ``separator``; list elements
    contribute keys suffixed with a 1-based ``[i]`` index. Any key whose full
    path contains one of the substrings in ``ATTR_STOP_WORDS`` is skipped
    together with everything beneath it. A top-level ``main_category`` key is
    emitted as ``categories[0]``.

    Args:
        json_obj: The mapping to flatten.
        parent_key: Path prefix for the keys of ``json_obj`` (used in recursion).
        separator: The separator placed between nested key segments (default: '.').

    Returns:
        A flat dictionary mapping key paths to leaf values.
    """
    def _is_container(obj):
        # Mappings and non-string sequences need further flattening.
        return isinstance(obj, (Mapping, Sequence)) and not isinstance(obj, (str, bytes))

    flat = {}
    for key, value in json_obj.items():
        path = key if not parent_key else f"{parent_key}{separator}{key}"

        # Substring match against the path built so far.
        if any(stop_word in path for stop_word in ATTR_STOP_WORDS):
            continue

        if path == 'main_category':
            path = 'categories[0]'

        if isinstance(value, Mapping):
            flat.update(filter_and_flatten_json(value, path, separator))
        elif _is_container(value):
            for position, element in enumerate(value, start=1):
                indexed_key = f"{path}[{position}]"
                if _is_container(element):
                    # Wrap the element so the recursion sees the indexed key as its own key.
                    flat.update(filter_and_flatten_json({indexed_key: element}, '', separator))
                else:
                    flat[indexed_key] = element
        else:
            flat[path] = value
    return flat

def aggregate_sequential_attrs(attrs_dict):
    """
    Collapse indexed ``categories[i]`` / ``description[i]`` / ``features[i]`` keys
    of a flattened item into three aggregate fields.

    The indexed values are ordered by their numeric index. A plain string sort
    of the keys would place ``features[10]`` before ``features[2]`` and scramble
    any list with ten or more entries.

    Args:
        attrs_dict: Flat dict of attribute paths to values.

    Returns:
        dict: Every non-indexed key of ``attrs_dict`` (inserted in sorted key
        order), followed by:
            "categories": list of the category values in index order,
            "description": the description values joined with ". ",
            "features": the feature values joined with "; ".
    """
    indexed_pattern = re.compile(r'^(categories|description|features)\[(\d+)\]$')
    indexed_values = {"categories": [], "description": [], "features": []}
    aggregated_dict = dict()

    for k, v in sorted(attrs_dict.items(), key=lambda x: x[0]):
        match = indexed_pattern.match(k)
        if match:
            # Keep the parsed index so the values can be ordered numerically below.
            indexed_values[match.group(1)].append((int(match.group(2)), v))
        else:
            aggregated_dict[k] = v

    def _in_index_order(name):
        # Values of one indexed attribute, sorted by numeric index.
        return [value for _, value in sorted(indexed_values[name], key=lambda pair: pair[0])]

    aggregated_dict["categories"] = _in_index_order("categories")
    aggregated_dict["description"] = ". ".join(_in_index_order("description"))
    aggregated_dict["features"] = "; ".join(_in_index_order("features"))

    return aggregated_dict

def extract_text_metadata_as_kv(metadata_file):
    """
    Read a gzipped JSONL metadata file and build one row per item.

    Each non-blank line is parsed as JSON, stripped of its ``asin`` /
    ``parent_asin`` keys, passed through ``filter_and_flatten_json`` and
    ``aggregate_sequential_attrs``, and serialised back to text.

    Args:
        metadata_file (str): Path to the metadata JSONL.gz file (e.g., 'data/meta_All_Beauty.jsonl.gz').

    Returns:
        polars.DataFrame: Columns ``item_id`` (0-based line number in the file),
        ``asin`` (the item's ``asin``, falling back to ``parent_asin``),
        ``text`` (the aggregated attributes as text) and ``categories``
        (list of category values). Lines with neither id are skipped. The
        frame is empty, with the same columns, when no line yields a row.
    """
    batch_size = 128
    # Explicit schema: per-batch inference could otherwise disagree between batches
    # (e.g. a batch whose category lists are all empty), which breaks the final concat.
    schema = {
        "item_id": pl.Int64,
        "asin": pl.Utf8,
        "text": pl.Utf8,
        "categories": pl.List(pl.Utf8),
    }
    frames = []
    data = []

    with gzip.open(metadata_file, 'rt', encoding='utf-8') as f:
        for item_id, line in enumerate(tqdm.tqdm(f)):
            if not line.strip():  # Tolerate blank lines in the JSONL stream
                continue
            item = json.loads(line)
            # Fall back to parent_asin when asin is missing *or* empty.
            asin = item.get('asin') or item.get('parent_asin')
            if not asin:  # Skip if no asin or parent_asin
                continue

            # Remove asin and parent_asin from the JSON to avoid duplication in text
            item_copy = item.copy()
            item_copy.pop('asin', None)
            item_copy.pop('parent_asin', None)

            # Flatten the remaining JSON
            flattened = filter_and_flatten_json(item_copy)
            aggregated = aggregate_sequential_attrs(flattened)
            # Convert flattened dictionary to JSON string
            text = json.dumps(aggregated)

            # json.dumps escapes non-ASCII as \uXXXX; the unicode-escape round trip turns
            # those escapes back into characters. It also un-escapes JSON backslash
            # sequences, so `text` is not guaranteed to be valid JSON afterwards.
            data.append({'item_id':item_id, 'asin': asin, 'text': fix_text(text.encode().decode('unicode-escape')), 'categories': aggregated["categories"]})

            # Convert rows to a polars frame in small batches to keep the
            # list of Python dicts short.
            if len(data) >= batch_size:
                frames.append(pl.from_dicts(data, schema=schema))
                data = []

    if data:
        frames.append(pl.from_dicts(data, schema=schema))
    if not frames:
        return pl.DataFrame(schema=schema)
    # Concatenate once at the end instead of re-concatenating the growing frame per batch.
    return pl.concat(frames, rechunk=True)

In [23]:
# Parse the downloaded metadata into a polars DataFrame (columns: item_id, asin, text, categories).
# NOTE: slow on the full file; the recorded run below took about 43 minutes.
df = extract_text_metadata_as_kv("./data/raw_data/items.json.gz")

1610012it [43:06, 622.41it/s]


In [24]:
# Number of rows kept (lines without an asin/parent_asin are skipped by the extractor).
len(df)

1610012

In [26]:
# Persist the extracted items as parquet; create the output directory first (-p: no error if it exists).
!mkdir -p ./data/lvl1_data
df.write_parquet("./data/lvl1_data/items.parquet")

In [None]:
# Lookup table asin -> item_id built from the (asin, item_id) row tuples;
# if an asin occurs in more than one row, the last row's item_id wins.
item_id_mapping = dict(df.select("asin", "item_id").iter_rows())