# Prepare features

Prepare the necessary features and transformations

# Set up

In [None]:
import os
import sys
import time
from typing import Optional

import dill
import pandas as pd
from loguru import logger
from pydantic import BaseModel
from feast import FeatureStore

sys.path.insert(0, "..")
import numpy as np
from datasets import load_dataset

from src.id_mapper import IDMapper, map_indice
from src.utils import parse_dt, handle_dtypes
from src.init_s3 import init_s3_client

# Controller

In [2]:
class Args(BaseModel):
    """Run configuration for this notebook.

    Holds the run name, the random seed, the names of the interaction columns
    and the sequence length. Call `init()` after construction to resolve the
    persist directory.
    """

    run_name: str = "000-prep-data"
    # When True, `init()` does not create the persist directory on disk.
    testing: bool = True
    # Absolute path derived from `run_name`; stays None until `init()` runs.
    notebook_persist_dp: Optional[str] = None
    random_seed: int = 41

    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    sequence_length: int = 10

    def init(self):
        """Resolve the persist directory and create it unless `testing` is set.

        The directory is `data/<run_name>` made absolute against the current
        working directory.

        Returns:
            This instance, so the call can be chained onto the constructor.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        if not self.testing:
            os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


args = Args().init()

print(args.model_dump_json(indent=2))

{
  "run_name": "000-prep-data",
  "testing": true,
  "notebook_persist_dp": "/mnt/d/projects/recsys/notebooks/data/000-prep-data",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "sequence_length": 10
}


# Load data

In [3]:
# Item metadata for the Video Games configuration of the Amazon Reviews 2023 dataset.
# NOTE(review): trust_remote_code=True lets the dataset repository run its own
# loading code locally — confirm the source is trusted before running.
metadata_raw = load_dataset(
    path="McAuley-Lab/Amazon-Reviews-2023",
    name="raw_meta_Video_Games",
    trust_remote_code=True,
)
# The "full" split holds the item metadata; convert it to a DataFrame for merging.
metadata_raw_df = metadata_raw["full"].to_pandas()
metadata_raw_df

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,Video Games,Dash 8-300 Professional Add-On,5.0,1,[Features Dash 8-300 and 8-Q300 ('Q' rollout l...,[The Dash 8-300 Professional Add-On lets you p...,,"{'hi_res': [None], 'large': ['https://m.media-...","{'title': [], 'url': [], 'user_id': []}",Aerosoft,"[Video Games, PC, Games]","{""Pricing"": ""The strikethrough price is the Li...",B000FH0MHO,,,
1,Video Games,Phantasmagoria: A Puzzle of Flesh,4.1,18,[Windows 95],[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Sierra,"[Video Games, PC, Games]","{""Best Sellers Rank"": {""Video Games"": 137612, ...",B00069EVOG,,,
2,Video Games,NBA 2K17 - Early Tip Off Edition - PlayStation 4,4.3,223,[The #1 rated NBA video game simulation series...,[Following the record-breaking launch of NBA 2...,58.0,{'hi_res': ['https://m.media-amazon.com/images...,{'title': ['NBA 2K17 - Kobe: Haters vs Players...,2K,"[Video Games, PlayStation 4, Games]","{""Release date"": ""September 16, 2016"", ""Best S...",B00Z9TLVK0,,,
3,Video Games,Nintendo Selects: The Legend of Zelda Ocarina ...,4.9,22,[Authentic Nintendo Selects: The Legend of Zel...,[],37.42,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Amazon Renewed,"[Video Games, Legacy Systems, Nintendo Systems...","{""Best Sellers Rank"": {""Video Games"": 51019, ""...",B07SZJZV88,,,
4,Video Games,Thrustmaster Elite Fitness Pack for Nintendo Wii,3.0,3,"[Includes (9) Total Accessories, Pedometer, Wi...",[The Thrustmaster Motion Plus Elite Fitness Pa...,,"{'hi_res': [None, None, None, None, None, None...","{'title': [], 'url': [], 'user_id': []}",THRUSTMASTER,"[Video Games, Legacy Systems, Nintendo Systems...","{""Release date"": ""November 1, 2009"", ""Pricing""...",B002WH4ZJG,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137264,,Story of Seasons: Pioneers Of Olive Town (Nint...,4.5,397,[A wild world of discovery - tame the wilderne...,"[Product Description, Inspired by Tales of you...",31.04,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Marvelous Europe,"[Video Games, Nintendo Switch, Games]","{""Release date"": ""March 26, 2021"", ""Best Selle...",B09XQJS4CZ,,,
137265,Video Games,MotoGP 18 (PC DVD) UK IMPORT REGION FREE,4.0,1,[Brand new game engine - MotoGP18 has been reb...,[Become the champion of the 2018 MotoGP Season...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Milestone,"[Video Games, Game Genre of the Month]","{""Pricing"": ""The strikethrough price is the Li...",B07DGPTGNV,,,
137266,Cell Phones & Accessories,Century Accessory Soft Silicone Protective Ski...,2.9,19,"[Easy access to all buttons, controls and port...",[This soft case cover will add a splash of col...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Century Accessory,"[Video Games, Legacy Systems, Xbox Systems, Xb...","{""Package Dimensions"": ""2.76 x 2.76 x 0.2 inch...",B00HUWCQBW,,,
137267,,Hasbro Interactive Mr. Potato Head Activity Pa...,3.9,5,[],"[Amazon.com, Everyone's favorite master-of-dis...",,"{'hi_res': [None, 'https://m.media-amazon.com/...","{'title': [], 'url': [], 'user_id': []}",Hasbro,"[Video Games, PC, Games]","{""Release date"": ""July 24, 1999"", ""Best Seller...",B00002S9MH,,,


In [4]:
# Raw interaction splits.
train_df = pd.read_parquet("../data/train.parquet")
val_df = pd.read_parquet("../data/val.parquet")

# Stack both splits, normalise timestamps and dtypes with the project helpers,
# then add an integer `timestamp_unix` column (int64 cast floor-divided by 10**9).
# NOTE(review): this yields epoch seconds only if parse_dt produces
# nanosecond-resolution datetimes — confirm against src.utils.
full_df = (
    pd.concat([train_df, val_df], axis=0)
    .pipe(parse_dt)
    .pipe(handle_dtypes)
    .assign(
        timestamp_unix=lambda frame: frame[args.timestamp_col]
        .astype("int64")
        .floordiv(10**9)
    )
)

In [5]:
# Preview the raw training interactions (timestamps are still raw integers here).
train_df

Unnamed: 0,user_id,parent_asin,rating,timestamp
54,AHATA6X6MYTC3VNBFJ3WIYVK257A,B0050SVNZ8,4.0,1321885664000
55,AHATA6X6MYTC3VNBFJ3WIYVK257A,B00LZVNWIA,4.0,1408233606000
61,AHATA6X6MYTC3VNBFJ3WIYVK257A,B0BH98D8GL,5.0,1511708554100
62,AHATA6X6MYTC3VNBFJ3WIYVK257A,B074RNL1RX,5.0,1511753174174
63,AHATA6X6MYTC3VNBFJ3WIYVK257A,B089QYP649,5.0,1531092820696
...,...,...,...,...
736763,AE3P3SRQSH7R4R7RR2KMUEWLEXPQ,B004IWRNTC,5.0,1394472136000
736764,AE3P3SRQSH7R4R7RR2KMUEWLEXPQ,B01FSKACPY,5.0,1394472165000
736765,AE3P3SRQSH7R4R7RR2KMUEWLEXPQ,B002JTX9WQ,5.0,1394472180000
736767,AE3P3SRQSH7R4R7RR2KMUEWLEXPQ,B0017QFMJU,5.0,1394472206000


`IDMapper` is the class responsible for mapping the original string IDs to integer indices, since our model expects integer indexing.

In [6]:
# Sort the IDs so that re-running the notebook always produces the same mapping.
idm = IDMapper()
unique_user_ids = sorted(train_df[args.user_col].unique())
unique_item_ids = sorted(train_df[args.item_col].unique())
# The mapper is fitted on training IDs only.
idm.fit(unique_user_ids, unique_item_ids)

In [7]:
# Apply the fitted mapper to both splits via the project's `map_indice` helper.
train_df = map_indice(train_df, idm, args.user_col, args.item_col)
val_df = map_indice(val_df, idm, args.user_col, args.item_col)

In [8]:
# Persist the fitted mapper, then reload it from disk so the rest of the
# notebook works with the persisted copy.
# NOTE(review): relies on IDMapper.load returning the mapper instance — confirm.
idm_persist_fp = "../data/idm.json"
idm.save(idm_persist_fp)
idm = IDMapper().load(idm_persist_fp)

# Load features from feature store

In [9]:
# Feast feature store handle: the repository root is the parent directory and
# the configuration is read from the offline-server YAML file.
store = FeatureStore(
    repo_path="..", fs_yaml_file="../feature_store_offline_server.yaml"
)

## Load item features

In [10]:
# Feature references in "<feature view>:<feature name>" form: for each look-back
# window, the rating count followed by the average previous rating.
item_features = [
    f"parent_asin_rating_stats:parent_asin_rating_{stat}_{window}"
    for window in ("365d", "90d", "30d", "7d")
    for stat in ("cnt", "avg_prev_rating")
]

In [13]:
%%time
features_df = store.get_historical_features(full_df[[args.item_col, args.timestamp_col]].drop_duplicates(), item_features).to_df()
assert features_df.duplicated().sum() == 0, display(features_df.loc[features_df.duplicated()])



Using timestamp as the event timestamp. To specify a column explicitly, please name it event_timestamp.
CPU times: user 1.45 s, sys: 274 ms, total: 1.72 s
Wall time: 4min 36s


In [14]:
# Preview the item-level features returned by the feature store.
features_df

Unnamed: 0,parent_asin,timestamp,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,parent_asin_rating_avg_prev_rating_30d,parent_asin_rating_cnt_7d,parent_asin_rating_avg_prev_rating_7d
0,B07DHNX18W,2020-11-02 20:18:55.046,6,3.000000,1,2.000000,0,,0,
1,B0BL3CW73P,2019-05-02 20:58:55.236,1,5.000000,1,5.000000,0,,0,
2,B00CEGCN76,2014-02-26 00:24:16.000,2,4.000000,2,4.000000,2,4.000,2,4.000000
3,B00BGA9X9W,2014-05-28 04:28:39.000,92,4.521739,27,4.592593,8,4.625,3,4.333333
4,B00KSRV19E,2014-12-14 20:56:52.000,25,4.640000,25,4.640000,0,,0,
...,...,...,...,...,...,...,...,...,...,...
165242,B0BF1BQ3D2,2021-12-10 20:33:19.570,12,3.666667,0,,0,,0,
165243,B07P27XFP7,2022-05-16 04:09:28.552,0,,0,,0,,0,
165244,B0BL65X86R,2021-08-11 17:26:08.435,16,4.750000,4,5.000000,0,,0,
165245,B01N3ASPNV,2022-06-17 07:42:54.083,10,4.000000,0,,0,,0,


In [15]:
# Attach the item features to every interaction, then add the integer
# user/item indices. `validate="many_to_one"` makes pandas raise a MergeError
# if `features_df` has more than one row per (item, timestamp) key, instead of
# silently duplicating interaction rows.
full_features_df = pd.merge(
    full_df,
    features_df,
    on=[args.item_col, args.timestamp_col],
    how="left",
    validate="many_to_one",
).pipe(map_indice, idm, args.user_col, args.item_col)
full_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,timestamp_unix,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,parent_asin_rating_avg_prev_rating_30d,parent_asin_rating_cnt_7d,parent_asin_rating_avg_prev_rating_7d,user_indice,item_indice
0,AHATA6X6MYTC3VNBFJ3WIYVK257A,B0050SVNZ8,4.0,2011-11-21 14:27:44.000,1321885664,1,4.000000,1,4.000000,1,4.0,0,,15688,1820
1,AHATA6X6MYTC3VNBFJ3WIYVK257A,B00LZVNWIA,4.0,2014-08-17 00:00:06.000,1408233606,2,3.500000,2,3.500000,2,3.5,1,3.0,15688,2778
2,AHATA6X6MYTC3VNBFJ3WIYVK257A,B0BH98D8GL,5.0,2017-11-26 15:02:34.100,1511708554,2,3.500000,1,3.000000,0,,0,,15688,4549
3,AHATA6X6MYTC3VNBFJ3WIYVK257A,B074RNL1RX,5.0,2017-11-27 03:26:14.174,1511753174,0,,0,,0,,0,,15688,3757
4,AHATA6X6MYTC3VNBFJ3WIYVK257A,B089QYP649,5.0,2018-07-08 23:33:40.696,1531092820,26,4.153846,7,4.428571,2,3.0,1,1.0,15688,4360
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165255,AE3NRCMFIBBA2XVODR47YYNLKRDA,B001EYUQC8,5.0,2021-11-13 09:59:46.634,1636797586,1,3.000000,1,3.000000,0,,0,,268,908
165256,AEV5TZDZQEP24PM3SZ7SNV4TR26Q,B01N3ASPNV,5.0,2022-06-17 07:42:54.083,1655451774,10,4.000000,0,,0,,0,,4213,3527
165257,AELRDP5MCGSCANM6GWUXAMBN75LQ,B009AGXH64,5.0,2022-06-03 18:23:36.536,1654280616,1,5.000000,0,,0,,0,,2741,2217
165258,AHERXKLMQLGPQLW4ZLKD4IRLMZAA,B07M6RVMPJ,5.0,2021-11-27 00:36:11.015,1637973371,0,,0,,0,,0,,16297,4054


## Load user features

In [None]:
%%time
user_features = [
    "user_rating_stats:user_rating_cnt_90d",
    "user_rating_stats:user_rating_avg_prev_rating_90d",
    "user_rating_stats:user_rating_list_10_recent_asin",
    "user_rating_stats:user_rating_list_10_recent_asin_timestamp",
]

features_df = store.get_historical_features(full_df[[args.user_col, args.timestamp_col]].drop_duplicates(), user_features).to_df()
assert features_df.duplicated().sum() == 0, display(features_df.loc[features_df.duplicated()])

In [17]:
# Attach the user features to every interaction. `validate="many_to_one"`
# makes pandas raise a MergeError if `features_df` has more than one row per
# (user, timestamp) key, instead of silently duplicating interaction rows.
full_features_df = pd.merge(
    full_features_df,
    features_df,
    on=[args.user_col, args.timestamp_col],
    how="left",
    validate="many_to_one",
)
full_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,timestamp_unix,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,parent_asin_rating_avg_prev_rating_30d,parent_asin_rating_cnt_7d,parent_asin_rating_avg_prev_rating_7d,user_indice,item_indice,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,user_rating_list_10_recent_asin_timestamp
0,AHATA6X6MYTC3VNBFJ3WIYVK257A,B0050SVNZ8,4.0,2011-11-21 14:27:44.000,1321885664,1,4.000000,1,4.000000,1,4.0,0,,15688,1820,1,,,
1,AHATA6X6MYTC3VNBFJ3WIYVK257A,B00LZVNWIA,4.0,2014-08-17 00:00:06.000,1408233606,2,3.500000,2,3.500000,2,3.5,1,3.0,15688,2778,1,,B0050SVNZ8,1321885664
2,AHATA6X6MYTC3VNBFJ3WIYVK257A,B0BH98D8GL,5.0,2017-11-26 15:02:34.100,1511708554,2,3.500000,1,3.000000,0,,0,,15688,4549,1,,"B0050SVNZ8,B00LZVNWIA",13218856641408233606
3,AHATA6X6MYTC3VNBFJ3WIYVK257A,B074RNL1RX,5.0,2017-11-27 03:26:14.174,1511753174,0,,0,,0,,0,,15688,3757,2,5.0,"B0050SVNZ8,B00LZVNWIA,B0BH98D8GL",132188566414082336061511708554
4,AHATA6X6MYTC3VNBFJ3WIYVK257A,B089QYP649,5.0,2018-07-08 23:33:40.696,1531092820,26,4.153846,7,4.428571,2,3.0,1,1.0,15688,4360,1,,"B0050SVNZ8,B00LZVNWIA,B0BH98D8GL,B074RNL1RX",1321885664140823360615117085541511753174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165255,AE3NRCMFIBBA2XVODR47YYNLKRDA,B001EYUQC8,5.0,2021-11-13 09:59:46.634,1636797586,1,3.000000,1,3.000000,0,,0,,268,908,1,,"B000OLXX86,B000B9RI14,B0050SWQ86,B00CTKHXFO,B0...","1342793426,1362929993,1368549700,1392151606,14..."
165256,AEV5TZDZQEP24PM3SZ7SNV4TR26Q,B01N3ASPNV,5.0,2022-06-17 07:42:54.083,1655451774,10,4.000000,0,,0,,0,,4213,3527,1,,"B00HRH79H6,B00JM57VDS,B00VILBF0Y,B072C3VM5F",1423572849142357323014244889911523051352
165257,AELRDP5MCGSCANM6GWUXAMBN75LQ,B009AGXH64,5.0,2022-06-03 18:23:36.536,1654280616,1,5.000000,0,,0,,0,,2741,2217,1,,"B07SSZGYNR,B011AH9A16,B001ELJE5Q,B003N63BPE,B0...","1618466381,1618466731,1623954406,1625231840,16..."
165258,AHERXKLMQLGPQLW4ZLKD4IRLMZAA,B07M6RVMPJ,5.0,2021-11-27 00:36:11.015,1637973371,0,,0,,0,,0,,16297,4054,1,,"B001EYUXUI,B001EYUY3Y,B00006FWUU,B003FMTZSI,B0...","1595304483,1595304825,1595304936,1606090839,16..."


In [18]:
# Sanity check: show one user's interactions in chronological order next to
# the recent-item list features attached to each row.
user_id = full_features_df[args.user_col].iloc[0]
history_cols = [
    args.user_col,
    args.timestamp_col,
    "timestamp_unix",
    args.item_col,
    "user_rating_list_10_recent_asin",
    "user_rating_list_10_recent_asin_timestamp",
]
is_selected_user = full_features_df[args.user_col].eq(user_id)
full_features_df.loc[is_selected_user].sort_values(args.timestamp_col)[history_cols]

Unnamed: 0,user_id,timestamp,timestamp_unix,parent_asin,user_rating_list_10_recent_asin,user_rating_list_10_recent_asin_timestamp
0,AHATA6X6MYTC3VNBFJ3WIYVK257A,2011-11-21 14:27:44.000,1321885664,B0050SVNZ8,,
1,AHATA6X6MYTC3VNBFJ3WIYVK257A,2014-08-17 00:00:06.000,1408233606,B00LZVNWIA,B0050SVNZ8,1321885664
2,AHATA6X6MYTC3VNBFJ3WIYVK257A,2017-11-26 15:02:34.100,1511708554,B0BH98D8GL,"B0050SVNZ8,B00LZVNWIA",13218856641408233606
3,AHATA6X6MYTC3VNBFJ3WIYVK257A,2017-11-27 03:26:14.174,1511753174,B074RNL1RX,"B0050SVNZ8,B00LZVNWIA,B0BH98D8GL",132188566414082336061511708554
4,AHATA6X6MYTC3VNBFJ3WIYVK257A,2018-07-08 23:33:40.696,1531092820,B089QYP649,"B0050SVNZ8,B00LZVNWIA,B0BH98D8GL,B074RNL1RX",1321885664140823360615117085541511753174
5,AHATA6X6MYTC3VNBFJ3WIYVK257A,2020-11-02 20:18:55.046,1604348335,B07DHNX18W,"B0050SVNZ8,B00LZVNWIA,B0BH98D8GL,B074RNL1RX,B0...","1321885664,1408233606,1511708554,1511753174,15..."


In [19]:
def convert_asin_to_idx(inp: str, sequence_length=10, padding_value=-1, id_mapper=None):
    """Turn a comma-separated string of item IDs into a fixed-length index sequence.

    Args:
        inp: Comma-separated item IDs. None, NaN/NA and the empty string are
            all treated as "no history".
        sequence_length: Length of the returned sequence.
        padding_value: Value used to left-pad sequences shorter than
            `sequence_length`.
        id_mapper: Object exposing `get_item_index(item_id)`. Defaults to the
            notebook-level `idm` mapper.

    Returns:
        A list of `padding_value` when there is no history; otherwise a NumPy
        array of item indices, left-padded to `sequence_length`. When more than
        `sequence_length` IDs are given, only the last `sequence_length` are kept.
    """
    # Missing values may arrive as None or as NaN/NA; "" would otherwise split
    # into a single empty ID.
    if inp is None or pd.isna(inp) or inp == "":
        return [padding_value] * sequence_length

    mapper = idm if id_mapper is None else id_mapper
    indices = [mapper.get_item_index(item_id) for item_id in inp.split(",")]
    # Keep only the trailing items so the padding width can never go negative
    # (np.pad rejects negative widths).
    indices = indices[max(len(indices) - sequence_length, 0):]
    padding_needed = sequence_length - len(indices)
    output = np.pad(
        indices,
        (padding_needed, 0),  # Add padding at the beginning
        "constant",
        constant_values=padding_value,
    )
    return output

def pad_timestamp_sequence(inp: str, sequence_length=10, padding_value=-1):
    """Turn a comma-separated string of integer timestamps into a fixed-length sequence.

    Args:
        inp: Comma-separated integers. None, NaN/NA and the empty string are
            all treated as "no history".
        sequence_length: Length of the returned sequence.
        padding_value: Value used to left-pad sequences shorter than
            `sequence_length`.

    Returns:
        A list of `padding_value` when there is no history; otherwise a NumPy
        array of the parsed integers, left-padded to `sequence_length`. When
        more than `sequence_length` values are given, only the last
        `sequence_length` are kept.
    """
    # Missing values may arrive as None or as NaN/NA; "" cannot be parsed as int.
    if inp is None or pd.isna(inp) or inp == "":
        return [padding_value] * sequence_length

    inp_list = [int(x) for x in inp.split(",")]
    # Keep only the trailing values so the padding width can never go negative
    # (np.pad rejects negative widths).
    inp_list = inp_list[max(len(inp_list) - sequence_length, 0):]
    padding_needed = sequence_length - len(inp_list)
    output = np.pad(
        inp_list,
        (padding_needed, 0),  # Add padding at the beginning
        "constant",
        constant_values=padding_value,
    )
    return output


In [20]:
def bucketize_seconds_diff(seconds: int):
    """Map an elapsed time in seconds to a coarse bucket index from 0 to 9.

    The exclusive upper bounds of buckets 0-8 are 10 minutes, 1 hour, 1 day,
    7 days, 30 days, 365 days, and 3, 5 and 10 times 365 days. Anything at or
    beyond the last bound falls into bucket 9.
    """
    minute = 60
    hour = 60 * minute
    day = 24 * hour
    year = 365 * day
    upper_bounds = (
        10 * minute,
        hour,
        day,
        7 * day,
        30 * day,
        year,
        3 * year,
        5 * year,
        10 * year,
    )
    for bucket, bound in enumerate(upper_bounds):
        if seconds < bound:
            return bucket
    return len(upper_bounds)


def from_ts_to_bucket(ts, current_ts: Optional[int] = None):
    """Bucketize how long before `current_ts` the timestamp `ts` occurred.

    Args:
        ts: Timestamp in seconds.
        current_ts: Reference timestamp in seconds. When omitted, the current
            wall-clock time (`time.time()`, truncated to an int) is used.

    Returns:
        The bucket index returned by `bucketize_seconds_diff` for
        `current_ts - ts`.
    """
    if current_ts is None:
        current_ts = int(time.time())
    return bucketize_seconds_diff(current_ts - ts)

def calc_sequence_timestamp_bucket(row):
    """Bucketize each timestamp in a row's padded history relative to the row itself.

    Reads `row["timestamp_unix"]` as the reference time and
    `row["item_sequence_ts"]` as the sequence. Elements equal to -1 (padding)
    are passed through unchanged; every other element is replaced by
    `from_ts_to_bucket(element, reference)`.

    Returns:
        A list with one entry per element of the input sequence.
    """
    reference_ts = row["timestamp_unix"]
    return [
        # Keep padding (blank) elements as they are.
        value if value == -1 else from_ts_to_bucket(value, reference_ts)
        for value in map(int, row["item_sequence_ts"])
    ]

In [21]:
# Derive fixed-length sequence features from the comma-separated history columns.
# The sequence length comes from the notebook configuration rather than the
# helpers' built-in defaults, so changing `args.sequence_length` takes effect here.
# `item_sequence_ts_bucket` reads the `item_sequence_ts` column created just above it.
full_features_df = full_features_df.assign(
    item_sequence=lambda df: df["user_rating_list_10_recent_asin"].apply(
        convert_asin_to_idx, sequence_length=args.sequence_length
    ),
    item_sequence_ts=lambda df: df["user_rating_list_10_recent_asin_timestamp"].apply(
        pad_timestamp_sequence, sequence_length=args.sequence_length
    ),
    item_sequence_ts_bucket=lambda df: df.apply(calc_sequence_timestamp_bucket, axis=1),
)
full_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,timestamp_unix,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,...,parent_asin_rating_avg_prev_rating_7d,user_indice,item_indice,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,user_rating_list_10_recent_asin_timestamp,item_sequence,item_sequence_ts,item_sequence_ts_bucket
0,AHATA6X6MYTC3VNBFJ3WIYVK257A,B0050SVNZ8,4.0,2011-11-21 14:27:44.000,1321885664,1,4.000000,1,4.000000,1,...,,15688,1820,1,,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]"
1,AHATA6X6MYTC3VNBFJ3WIYVK257A,B00LZVNWIA,4.0,2014-08-17 00:00:06.000,1408233606,2,3.500000,2,3.500000,2,...,3.0,15688,2778,1,,B0050SVNZ8,1321885664,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1820]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1321885664]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 6]"
2,AHATA6X6MYTC3VNBFJ3WIYVK257A,B0BH98D8GL,5.0,2017-11-26 15:02:34.100,1511708554,2,3.500000,1,3.000000,0,...,,15688,4549,1,,"B0050SVNZ8,B00LZVNWIA",13218856641408233606,"[-1, -1, -1, -1, -1, -1, -1, -1, 1820, 2778]","[-1, -1, -1, -1, -1, -1, -1, -1, 1321885664, 1...","[-1, -1, -1, -1, -1, -1, -1, -1, 8, 7]"
3,AHATA6X6MYTC3VNBFJ3WIYVK257A,B074RNL1RX,5.0,2017-11-27 03:26:14.174,1511753174,0,,0,,0,...,,15688,3757,2,5.0,"B0050SVNZ8,B00LZVNWIA,B0BH98D8GL",132188566414082336061511708554,"[-1, -1, -1, -1, -1, -1, -1, 1820, 2778, 4549]","[-1, -1, -1, -1, -1, -1, -1, 1321885664, 14082...","[-1, -1, -1, -1, -1, -1, -1, 8, 7, 2]"
4,AHATA6X6MYTC3VNBFJ3WIYVK257A,B089QYP649,5.0,2018-07-08 23:33:40.696,1531092820,26,4.153846,7,4.428571,2,...,1.0,15688,4360,1,,"B0050SVNZ8,B00LZVNWIA,B0BH98D8GL,B074RNL1RX",1321885664140823360615117085541511753174,"[-1, -1, -1, -1, -1, -1, 1820, 2778, 4549, 3757]","[-1, -1, -1, -1, -1, -1, 1321885664, 140823360...","[-1, -1, -1, -1, -1, -1, 8, 7, 5, 5]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165255,AE3NRCMFIBBA2XVODR47YYNLKRDA,B001EYUQC8,5.0,2021-11-13 09:59:46.634,1636797586,1,3.000000,1,3.000000,0,...,,268,908,1,,"B000OLXX86,B000B9RI14,B0050SWQ86,B00CTKHXFO,B0...","1342793426,1362929993,1368549700,1392151606,14...","[-1, -1, -1, 543, 417, 1835, 2407, 3123, 1526,...","[-1, -1, -1, 1342793426, 1362929993, 136854970...","[-1, -1, -1, 8, 8, 8, 8, 8, 8, 7]"
165256,AEV5TZDZQEP24PM3SZ7SNV4TR26Q,B01N3ASPNV,5.0,2022-06-17 07:42:54.083,1655451774,10,4.000000,0,,0,...,,4213,3527,1,,"B00HRH79H6,B00JM57VDS,B00VILBF0Y,B072C3VM5F",1423572849142357323014244889911523051352,"[-1, -1, -1, -1, -1, -1, 2614, 2683, 2975, 3691]","[-1, -1, -1, -1, -1, -1, 1423572849, 142357323...","[-1, -1, -1, -1, -1, -1, 8, 8, 8, 7]"
165257,AELRDP5MCGSCANM6GWUXAMBN75LQ,B009AGXH64,5.0,2022-06-03 18:23:36.536,1654280616,1,5.000000,0,,0,...,,2741,2217,1,,"B07SSZGYNR,B011AH9A16,B001ELJE5Q,B003N63BPE,B0...","1618466381,1618466731,1623954406,1625231840,16...","[-1, -1, -1, -1, -1, 4168, 3101, 777, 1503, 3526]","[-1, -1, -1, -1, -1, 1618466381, 1618466731, 1...","[-1, -1, -1, -1, -1, 6, 6, 5, 5, 5]"
165258,AHERXKLMQLGPQLW4ZLKD4IRLMZAA,B07M6RVMPJ,5.0,2021-11-27 00:36:11.015,1637973371,0,,0,,0,...,,16297,4054,1,,"B001EYUXUI,B001EYUY3Y,B00006FWUU,B003FMTZSI,B0...","1595304483,1595304825,1595304936,1606090839,16...","[1029, 1035, 220, 1470, 1019, 260, 273, 289, 1...","[1595304483, 1595304825, 1595304936, 160609083...","[6, 6, 6, 6, 5, 5, 5, 5, 5, 5]"


# Split back to train and val dfs

In [22]:
# Earliest validation timestamp: rows strictly before it go to train, the rest to val.
# `val_df` still holds raw millisecond timestamps, hence the explicit conversion;
# "int64" matches the cast used when building `full_df`.
val_timestamp = pd.to_datetime(
    val_df[args.timestamp_col].astype("int64"), unit="ms"
).min()

train_df_length = train_df.shape[0]
train_features_df = full_features_df.loc[
    lambda df: df[args.timestamp_col].lt(val_timestamp)
]
# Compare the split frame with the original split size; the previous check
# compared `train_df` with itself and could never fail.
assert train_features_df.shape[0] == train_df_length, (
    f"train split has {train_features_df.shape[0]} rows, expected {train_df_length}"
)

val_df_length = val_df.shape[0]
val_features_df = full_features_df.loc[
    lambda df: df[args.timestamp_col].ge(val_timestamp)
]
assert val_features_df.shape[0] == val_df_length, (
    f"val split has {val_features_df.shape[0]} rows, expected {val_df_length}"
)

In [23]:
def check_dup(df):
    """Assert that no (user, item, timestamp) combination appears more than once in `df`.

    Column names are taken from the notebook-level `args`.
    """
    key_cols = [args.user_col, args.item_col, args.timestamp_col]
    assert not df.duplicated(subset=key_cols).any()

In [25]:
# Item metadata columns taken from the raw metadata frame.
meta_cols = ["main_category", "title", "description", "categories", "price"]
# Feature column names are the part after the ":" in each feature reference.
rating_agg_cols = [reference.split(":")[1] for reference in item_features]
cols = [*meta_cols, *rating_agg_cols]
cols

['main_category',
 'title',
 'description',
 'categories',
 'price',
 'parent_asin_rating_cnt_365d',
 'parent_asin_rating_avg_prev_rating_365d',
 'parent_asin_rating_cnt_90d',
 'parent_asin_rating_avg_prev_rating_90d',
 'parent_asin_rating_cnt_30d',
 'parent_asin_rating_avg_prev_rating_30d',
 'parent_asin_rating_cnt_7d',
 'parent_asin_rating_avg_prev_rating_7d']

In [None]:
# Merge the item metadata (including images) into both interaction splits.
item_metadata_df = metadata_raw_df[[args.item_col] + meta_cols + ["images"]]
train_features_df = train_features_df.merge(
    item_metadata_df, how="left", on=args.item_col
)
val_features_df = val_features_df.merge(
    item_metadata_df, how="left", on=args.item_col
)

# The merge must not have duplicated any (user, item, timestamp) row.
check_dup(train_features_df)
check_dup(val_features_df)
train_features_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,timestamp_unix,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,...,user_rating_list_10_recent_asin,user_rating_list_10_recent_asin_timestamp,item_sequence,item_sequence_ts,item_sequence_ts_bucket,main_category,title,description,categories,price
0,AHATA6X6MYTC3VNBFJ3WIYVK257A,B0050SVNZ8,4.0,2011-11-21 14:27:44.000,1321885664,1,4.000000,1,4.000000,1,...,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",Video Games,Amazon Basics Carrying Case for Nintendo - New...,[],"[Video Games, Legacy Systems, Nintendo Systems...",
1,AHATA6X6MYTC3VNBFJ3WIYVK257A,B00LZVNWIA,4.0,2014-08-17 00:00:06.000,1408233606,2,3.500000,2,3.500000,2,...,B0050SVNZ8,1321885664,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1820]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1321885664]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 6]",Computers,Logitech G402 Hyperion Fury FPS Gaming Mouse,[Logitech G402 Hyperion Fury FPS Gaming Mouse],"[Video Games, PC, Accessories, Gaming Mice]",
2,AHATA6X6MYTC3VNBFJ3WIYVK257A,B0BH98D8GL,5.0,2017-11-26 15:02:34.100,1511708554,2,3.500000,1,3.000000,0,...,"B0050SVNZ8,B00LZVNWIA",13218856641408233606,"[-1, -1, -1, -1, -1, -1, -1, -1, 1820, 2778]","[-1, -1, -1, -1, -1, -1, -1, -1, 1321885664, 1...","[-1, -1, -1, -1, -1, -1, -1, -1, 8, 7]",Computers,Logitech G433 7.1 Wired Gaming Headset with DT...,[Logitech G433 gaming headset is the premium a...,"[Video Games, Xbox One, Accessories, Headsets]",44.99
3,AHATA6X6MYTC3VNBFJ3WIYVK257A,B074RNL1RX,5.0,2017-11-27 03:26:14.174,1511753174,0,,0,,0,...,"B0050SVNZ8,B00LZVNWIA,B0BH98D8GL",132188566414082336061511708554,"[-1, -1, -1, -1, -1, -1, -1, 1820, 2778, 4549]","[-1, -1, -1, -1, -1, -1, -1, 1321885664, 14082...","[-1, -1, -1, -1, -1, -1, -1, 8, 7, 2]",Video Games,Razer Wolverine Ultimate Officially Licensed X...,[Play anywhere with the Razer Wolverine Ultima...,"[Video Games, PC, Accessories, Controllers]",64.98
4,AHATA6X6MYTC3VNBFJ3WIYVK257A,B089QYP649,5.0,2018-07-08 23:33:40.696,1531092820,26,4.153846,7,4.428571,2,...,"B0050SVNZ8,B00LZVNWIA,B0BH98D8GL,B074RNL1RX",1321885664140823360615117085541511753174,"[-1, -1, -1, -1, -1, -1, 1820, 2778, 4549, 3757]","[-1, -1, -1, -1, -1, -1, 1321885664, 140823360...","[-1, -1, -1, -1, -1, -1, 8, 7, 5, 5]",Video Games,Turtle Beach Stealth 600 Wireless Surround Sou...,[The Turtle Beach Stealth 600 is the latest wi...,"[Video Games, PlayStation 4, Accessories, Head...",168.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164293,AE3P3SRQSH7R4R7RR2KMUEWLEXPQ,B004IWRNTC,5.0,2014-03-10 17:22:16.000,1394472136,4,4.250000,1,5.000000,0,...,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",Video Games,You Don't Know Jack - Xbox 360,"[Product Description, The award-winning You Do...","[Video Games, Legacy Systems, Xbox Systems, Xb...",20.55
164294,AE3P3SRQSH7R4R7RR2KMUEWLEXPQ,B01FSKACPY,5.0,2014-03-10 17:22:45.000,1394472165,3,4.333333,2,4.000000,1,...,B004IWRNTC,1394472136,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1718]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1394472136]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 0]",Video Games,Wheel of Fortune - Xbox 360,[Spin the wheel along with Pat Sajak and Vanna...,"[Video Games, Legacy Systems, Xbox Systems, Xb...",24.99
164295,AE3P3SRQSH7R4R7RR2KMUEWLEXPQ,B002JTX9WQ,5.0,2014-03-10 17:23:00.000,1394472180,2,5.000000,1,5.000000,0,...,"B004IWRNTC,B01FSKACPY",13944721361394472165,"[-1, -1, -1, -1, -1, -1, -1, -1, 1718, 3329]","[-1, -1, -1, -1, -1, -1, -1, -1, 1394472136, 1...","[-1, -1, -1, -1, -1, -1, -1, -1, 0, 0]",Video Games,Press Your Luck 2010 Edition - PC,"[Collect ""spins by answering trivia questions,...","[Video Games, PC, Games]",135.0
164296,AE3P3SRQSH7R4R7RR2KMUEWLEXPQ,B0017QFMJU,5.0,2014-03-10 17:23:26.000,1394472206,1,5.000000,0,,0,...,"B004IWRNTC,B01FSKACPY,B002JTX9WQ",139447213613944721651394472180,"[-1, -1, -1, -1, -1, -1, -1, 1718, 3329, 1370]","[-1, -1, -1, -1, -1, -1, -1, 1394472136, 13944...","[-1, -1, -1, -1, -1, -1, -1, 0, 0, 0]",Video Games,Tzou AC Power Adapter for Nintendo Wii Console,[Did you misplace or destroy the AC power adap...,"[Video Games, Legacy Systems, Nintendo Systems...",


In [27]:
# Output locations for the prepared feature frames.
train_persist_fp = "../data/train_features.parquet"
val_persist_fp = "../data/val_features.parquet"

# Write both splits as parquet without the DataFrame index.
train_features_df.to_parquet(train_persist_fp, index=False)
val_features_df.to_parquet(val_persist_fp, index=False)

In [None]:
# Upload the artifacts only when an S3 endpoint is configured in the environment.
s3_endpoint_url = os.getenv("S3_ENDPOINT_URL")
if s3_endpoint_url is not None:
    s3 = init_s3_client()

    bucket_name = "data-recsys"
    train_key = "train_features.parquet"
    val_key = "val_features.parquet"
    idm_key = "idm.json"

    # Upload the files to S3: (local path, object key) pairs, in order.
    uploads = (
        (train_persist_fp, train_key),
        (val_persist_fp, val_key),
        (idm_persist_fp, idm_key),
    )
    for local_fp, object_key in uploads:
        s3.upload_file(local_fp, bucket_name, object_key)

    logger.info("Files uploaded successfully to S3!")