# Set up

In [None]:
import json
import os
import sys
from urllib.parse import quote

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from pydantic import BaseModel
from sqlalchemy import create_engine

sys.path.insert(0, "../..")

from src.init_s3 import init_s3_client

# Controller

In [58]:
class Args(BaseModel):
    """Run configuration for loading the holdout set and appending it to the database table."""

    # Local path of the holdout parquet file; it is downloaded from S3 when missing.
    holdout_fp: str = "../../data/holdout.parquet"
    # Seed value; not referenced by the notebook cells visible here.
    random_seed: int = 41
    # Holdout window length in days; not referenced by the notebook cells visible here.
    num_days_holdout: int = 30
    # Name of the table that is queried and that the holdout rows are appended to.
    table_name: str = "amz_review_rating_raw"

    def init(self) -> "Args":
        """Return this configuration instance unchanged.

        The method must accept ``self``: without it, calling ``args.init()``
        raises ``TypeError`` because the bound instance is passed as an
        unexpected positional argument.
        """
        return self

# Build the configuration from its defaults and echo it as indented JSON.
args = Args()
args_json = json.dumps(args.model_dump(), indent=2)
print(args_json)

{
  "holdout_fp": "../../data/holdout.parquet",
  "random_seed": 41,
  "num_days_holdout": 30,
  "table_name": "amz_review_rating_raw"
}


In [None]:
# Fetch the holdout parquet from S3 only when it is not already present locally.
if not os.path.exists(args.holdout_fp):
    s3 = init_s3_client()
    bucket_name = "data-recsys"
    holdout_key = "holdout.parquet"

    # The destination folder may not exist on a fresh checkout; create it so the
    # download has somewhere to write. `or "."` covers a bare file name.
    os.makedirs(os.path.dirname(args.holdout_fp) or ".", exist_ok=True)

    s3.download_file(bucket_name, holdout_key, args.holdout_fp)

In [55]:
# Read the holdout parquet and convert every `description` / `categories`
# cell into a plain Python list.
parquet_path = args.holdout_fp
holdout_df = pd.read_parquet(parquet_path).pipe(
    lambda frame: frame.assign(
        **{col: frame[col].apply(list) for col in ("description", "categories")}
    )
)

In [None]:
# Preview the first five holdout rows.
holdout_df.head(n=5)

Unnamed: 0,user_id,parent_asin,rating,timestamp,main_category,title,description,categories,price
0,AHLBT2RDWYQWN5O2XNBNX2JPWVZA,B08NYV2VLS,4.0,2022-07-08 18:26:28.360,Video Games,Story of Seasons: Trio of Towns - Nintendo 3DS,"[""STORY OF SEASONS: Trio of Towns is a fresh n...","[""Video Games"", ""Legacy Systems"", ""Nintendo Sy...",
1,AHLBT2RDWYQWN5O2XNBNX2JPWVZA,B00KWIYPZG,5.0,2022-07-08 18:27:49.294,Video Games,Fantasy Life - 3DS,"[""Embark on a Journey that Lets You Build Your...","[""Video Games"", ""Legacy Systems"", ""Nintendo Sy...",96.65
2,AF5NKVKUZGRPBR7HAYYDUS25RGRQ,B0BKRXQ5GL,3.0,2022-07-06 12:14:32.366,Computers,Logitech G Logitech G935 Over Ear Wireless Hea...,"[""Logitech G935 Wireless DTS:X 7.1 Surround So...","[""Video Games"", ""PC"", ""Accessories"", ""Headsets""]",153.98
3,AGGRGJRYYYWAL7V5M4RG4VFKL3HA,B07BGYLS1L,5.0,2022-06-25 20:06:42.077,Video Games,Shadow of the Tomb Raider - Xbox One,"[""Experience Lara croft's defining moment as s...","[""Video Games"", ""Xbox One"", ""Games""]",14.8
4,AEKEN3WITS4ZEJ7ZIISGJDZYJB3Q,B0BH1ZL3G9,5.0,2022-06-16 19:54:29.703,Computers,Hipshotdot PRO Color and Brightness Control Do...,"[""The HipShotDot is the gaming industry\u2019s...","[""Video Games"", ""PC"", ""Accessories"", ""Controll...",22.99


# Append holdout to OLTP

In [60]:
# PostgreSQL connection details
# Load a local .env file (if one exists) so the lookups below also work on a
# fresh kernel. By default load_dotenv() does not override variables that are
# already set in the process environment.
load_dotenv()

username = os.getenv("POSTGRES_USER")
password = os.getenv("POSTGRES_PASSWORD")
host = os.getenv("POSTGRES_HOST")
port = os.getenv("POSTGRES_PORT")
database = os.getenv("POSTGRES_DB")
# Schema that holds the table the holdout rows are appended to.
schema = os.getenv("POSTGRES_OLTP_SCHEMA")

In [61]:
# Build the SQLAlchemy connection URL.
# The user name and password are percent-encoded so that characters with a
# special meaning inside a URL (for example "@", ":", "/" or "#") cannot corrupt
# the host/port/database part when the URL is parsed. `safe=""` also encodes "/".
conn_str = (
    f"postgresql+psycopg2://{quote(username or '', safe='')}:{quote(password or '', safe='')}"
    f"@{host}:{port}/{database}"
)

engine = create_engine(conn_str)

In [63]:
def get_curr_oltp_max_timestamp():
    """Return the largest ``timestamp`` value currently stored in the target table.

    Reads the module-level ``schema``, ``args.table_name`` and ``engine``; the
    value is the first row of the single-column query result.
    """
    query = f"select max(timestamp) as max_timestamp from {schema}.{args.table_name};"
    result = pd.read_sql(query, engine)
    max_timestamp_column = result["max_timestamp"]
    return max_timestamp_column.iloc[0]

# Show the newest timestamp in the table before anything is appended.
get_curr_oltp_max_timestamp()

Timestamp('2022-06-15 23:52:30.729000')

In [65]:
# Precondition for the append: the table's newest timestamp must be strictly
# earlier than the oldest holdout timestamp.
oltp_max_ts = get_curr_oltp_max_timestamp()
holdout_min_ts = holdout_df["timestamp"].min()
assert oltp_max_ts < holdout_min_ts

In [66]:
# Append the holdout rows to the existing table; the DataFrame index is not written.
holdout_df.to_sql(
    name=args.table_name,
    con=engine,
    schema=schema,
    if_exists="append",
    index=False,
)

74

In [71]:
# Post-append check: the table's newest timestamp must now equal the newest
# holdout timestamp.
oltp_max_ts_after = get_curr_oltp_max_timestamp()
holdout_max_ts = holdout_df["timestamp"].max()
assert oltp_max_ts_after == holdout_max_ts