In [1]:
# Enable IPython's autoreload extension so edits to imported project modules
# (e.g. the helpers pulled from `src.utils`) are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
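# Optional: uncomment the two lines below if `src` cannot be imported from the
# notebook's working directory; they add the parent directory to `sys.path`.
# import sys, os
# sys.path.append(os.path.abspath(".."))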

In [6]:
import datasets
from pydantic import BaseModel
from datasets import load_dataset
from src.utils import parse_dt, handle_dtypes

In [7]:
class Args(BaseModel):
    """Notebook-level configuration, validated by pydantic."""
    # Hugging Face Hub dataset id passed as the first argument to `load_dataset`.
    hf_dataset_path: str = "McAuley-Lab/Amazon-Reviews-2023"
    # NOTE(review): not used in the cells visible here; presumably the row count
    # for a sampled report further down — confirm.
    report_sample_num_rows: int = 10000
    # NOTE(review): not used in the cells visible here; presumably the seed for
    # any sampling done later — confirm.
    random_seed: int = 41
# Instantiate with the defaults above and print the effective configuration as
# JSON so each run records the settings it used.
args = Args()
print(args.json())

{"hf_dataset_path": "McAuley-Lab/Amazon-Reviews-2023", "report_sample_num_rows": 10000, "random_seed": 41}


In [8]:
# Item metadata: load the "raw_meta_All_Beauty" configuration, "full" split.
# NOTE(review): `trust_remote_code=True` allows the dataset repository's loading
# script to run locally — confirm the source is trusted.
metadata = load_dataset(args.hf_dataset_path, "raw_meta_All_Beauty", split="full", trust_remote_code=True)

In [9]:
# Inspect the first item-metadata record to see which fields are available.
metadata[0]

{'main_category': 'All Beauty',
 'title': 'Howard LC0008 Leather Conditioner, 8-Ounce (4-Pack)',
 'average_rating': 4.8,
 'rating_number': 10,
 'features': [],
 'description': [],
 'price': 'None',
 'images': {'hi_res': [None,
   'https://m.media-amazon.com/images/I/71i77AuI9xL._SL1500_.jpg'],
  'large': ['https://m.media-amazon.com/images/I/41qfjSfqNyL.jpg',
   'https://m.media-amazon.com/images/I/41w2yznfuZL.jpg'],
  'thumb': ['https://m.media-amazon.com/images/I/41qfjSfqNyL._SS40_.jpg',
   'https://m.media-amazon.com/images/I/41w2yznfuZL._SS40_.jpg'],
  'variant': ['MAIN', 'PT01']},
 'videos': {'title': [], 'url': [], 'user_id': []},
 'store': 'Howard Products',
 'categories': [],
 'details': '{"Package Dimensions": "7.1 x 5.5 x 3 inches; 2.38 Pounds", "UPC": "617390882781"}',
 'parent_asin': 'B01CUPMQZE',
 'bought_together': None,
 'subtitle': None,
 'author': None}

In [10]:
# User-item interactions (reviews/ratings): load the
# "5core_timestamp_All_Beauty" configuration. No `split=` is given, so every
# available split is returned together.
# `trust_remote_code=True` matches the item-metadata load and avoids the
# `datasets` warning/prompt; the library states the flag will become mandatory
# for this dataset in its next major release.
# NOTE(review): the flag allows the dataset repository's loading script to run
# locally — confirm the source is trusted.
dataset = load_dataset(
    args.hf_dataset_path,
    name="5core_timestamp_All_Beauty",
    trust_remote_code=True,
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [11]:
# Show the loaded splits with their column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['user_id', 'parent_asin', 'rating', 'timestamp'],
        num_rows: 2237
    })
    valid: Dataset({
        features: ['user_id', 'parent_asin', 'rating', 'timestamp'],
        num_rows: 276
    })
    test: Dataset({
        features: ['user_id', 'parent_asin', 'rating', 'timestamp'],
        num_rows: 22
    })
})

In [12]:
# Inspect the first interaction record of the train split.
dataset['train'][0]

{'user_id': 'AFSKPY37N3C43SOI5IEXEK5JSIYA',
 'parent_asin': 'B07J3GH1W1',
 'rating': '5.0',
 'timestamp': '1547589356557'}

In [13]:
# Convert the train split to a pandas DataFrame (kept as the untouched "raw"
# frame) and preview the first rows.
train_raw = dataset['train'].to_pandas()
train_raw.head()

Unnamed: 0,user_id,parent_asin,rating,timestamp
0,AFSKPY37N3C43SOI5IEXEK5JSIYA,B07J3GH1W1,5.0,1547589356557
1,AFSKPY37N3C43SOI5IEXEK5JSIYA,B07W397QG4,5.0,1593352422858
2,AFSKPY37N3C43SOI5IEXEK5JSIYA,B07KG1TWP5,5.0,1596473351088
3,AFSKPY37N3C43SOI5IEXEK5JSIYA,B08JTNQFZY,5.0,1617904219785
4,AFSKPY37N3C43SOI5IEXEK5JSIYA,B07SLFWZKN,3.0,1619737501209


In [14]:
# Convert the item metadata to a pandas DataFrame (kept as the untouched "raw"
# frame) and preview the first rows.
meta_raw = metadata.to_pandas()
meta_raw.head()

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,All Beauty,"Howard LC0008 Leather Conditioner, 8-Ounce (4-...",4.8,10,[],[],,"{'hi_res': [None, 'https://m.media-amazon.com/...","{'title': [], 'url': [], 'user_id': []}",Howard Products,[],"{""Package Dimensions"": ""7.1 x 5.5 x 3 inches; ...",B01CUPMQZE,,,
1,All Beauty,Yes to Tomatoes Detoxifying Charcoal Cleanser ...,4.5,3,[],[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Yes To,[],"{""Item Form"": ""Powder"", ""Skin Type"": ""Acne Pro...",B076WQZGPM,,,
2,All Beauty,Eye Patch Black Adult with Tie Band (6 Per Pack),4.4,26,[],[],,"{'hi_res': [None, None], 'large': ['https://m....","{'title': [], 'url': [], 'user_id': []}",Levine Health Products,[],"{""Manufacturer"": ""Levine Health Products""}",B000B658RI,,,
3,All Beauty,"Tattoo Eyebrow Stickers, Waterproof Eyebrow, 4...",3.1,102,[],[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Cherioll,[],"{""Brand"": ""Cherioll"", ""Item Form"": ""Powder"", ""...",B088FKY3VD,,,
4,All Beauty,Precision Plunger Bars for Cartridge Grips – 9...,4.3,7,"[Material: 304 Stainless Steel; Brass tip, Len...",[The Precision Plunger Bars are designed to wo...,,"{'hi_res': [None], 'large': ['https://m.media-...","{'title': [], 'url': [], 'user_id': []}",Precision,[],"{""UPC"": ""644287689178""}",B07NGFDN6G,,,


In [18]:
# Build the cleaned training frame from the raw one with the project helpers
# from `src.utils`, leaving `train_raw` bound to its original name.
# NOTE(review): judging by the preview, `parse_dt` turns the epoch-style
# `timestamp` values into datetimes; what `handle_dtypes` casts is not visible
# here — confirm both against `src/utils`.
train_df = train_raw.pipe(parse_dt).pipe(handle_dtypes)
train_df.head()

Unnamed: 0,user_id,parent_asin,rating,timestamp
0,AFSKPY37N3C43SOI5IEXEK5JSIYA,B07J3GH1W1,5.0,2019-01-15 21:55:56.557
1,AFSKPY37N3C43SOI5IEXEK5JSIYA,B07W397QG4,5.0,2020-06-28 13:53:42.858
2,AFSKPY37N3C43SOI5IEXEK5JSIYA,B07KG1TWP5,5.0,2020-08-03 16:49:11.088
3,AFSKPY37N3C43SOI5IEXEK5JSIYA,B08JTNQFZY,5.0,2021-04-08 17:50:19.785
4,AFSKPY37N3C43SOI5IEXEK5JSIYA,B07SLFWZKN,3.0,2021-04-29 23:05:01.209
