# Set up

In [1]:
import json
import os
import sys

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from loguru import logger
from pydantic import BaseModel
from sqlalchemy import create_engine, text

sys.path.insert(0, "../..")

from src.utils import handle_dtypes, parse_dt

# Controller

In [2]:
class Args(BaseModel):
    """Notebook-level settings, validated by pydantic.

    Only ``holdout_fp`` and ``table_name`` are read by the cells visible in this
    notebook; the remaining fields are kept for compatibility with sibling
    notebooks (presumably they share this controller — confirm before removing).
    """

    # Path of the parquet file that holds the holdout rows.
    holdout_fp: str = "../../data/holdout.parquet"
    # Random seed; not read in the visible cells.
    random_seed: int = 41
    # Presumably the length of the holdout window in days; not read in the visible cells.
    num_days_holdout: int = 30
    # Name of the table (inside the OLTP schema) that the queries target.
    table_name: str = "amz_review_rating_raw"

    def init(self) -> "Args":
        """Return this instance.

        The original signature omitted ``self`` while the body returned it, so
        any call raised ``TypeError``. The method is a no-op hook: it performs
        no extra initialisation and simply allows call chaining.
        """
        return self

# Instantiate the settings with their defaults and echo them as pretty-printed
# JSON so the configuration used for this run is recorded in the output.
args = Args()
print(
    json.dumps(
        obj=args.model_dump(),
        indent=2,
    )
)

{
  "holdout_fp": "../../data/holdout.parquet",
  "random_seed": 41,
  "num_days_holdout": 30,
  "table_name": "amz_review_rating_raw"
}


# Read Data

In [3]:
# Load the holdout rows. The "description" and "categories" cells are converted
# to plain Python lists via ``list(...)`` (presumably parquet hands them back as
# numpy arrays — confirm against the file schema).
parquet_path = args.holdout_fp
holdout_df = pd.read_parquet(parquet_path).assign(
    **{
        list_col: (lambda frame, list_col=list_col: frame[list_col].apply(list))
        for list_col in ("description", "categories")
    }
)

In [4]:
# PostgreSQL connection details
# `load_dotenv` was imported but never called, so on a fresh kernel the
# variables below were only set if the shell had already exported them.
# Loading the .env file here makes the cell reproducible; by default it does
# not override variables that are already present in the environment.
load_dotenv()

username = os.getenv("POSTGRES_USER")
password = os.getenv("POSTGRES_PASSWORD")
host = os.getenv("POSTGRES_HOST")
port = os.getenv("POSTGRES_PORT")
database = os.getenv("POSTGRES_DB")
schema = os.getenv("POSTGRES_OLTP_SCHEMA")

# Surface missing settings early: an unset variable would otherwise be
# formatted into the URL / SQL text as the literal string "None".
missing_env_vars = [
    env_name
    for env_name in (
        "POSTGRES_USER",
        "POSTGRES_PASSWORD",
        "POSTGRES_HOST",
        "POSTGRES_PORT",
        "POSTGRES_DB",
        "POSTGRES_OLTP_SCHEMA",
    )
    if os.getenv(env_name) is None
]
if missing_env_vars:
    logger.warning(f"Environment variables not set: {', '.join(missing_env_vars)}")

# NOTE(review): username/password are interpolated verbatim; characters such as
# "@" or "/" in the password must already be URL-encoded in the environment.
conn_str = (
    f"postgresql+psycopg2://{username}:{password}"
    f"@{host}:{port}/{database}"
)

engine = create_engine(conn_str)

In [5]:
def get_curr_oltp_max_timestamp():
    """Return the largest ``timestamp`` value currently stored in the OLTP table.

    Runs a ``max(timestamp)`` query against ``{schema}.{args.table_name}`` through
    the module-level ``engine`` and returns the single resulting cell.
    """
    max_ts_sql = f"select max(timestamp) as max_timestamp from {schema}.{args.table_name};"
    max_ts_frame = pd.read_sql(max_ts_sql, engine)
    max_ts_column = max_ts_frame["max_timestamp"]
    return max_ts_column.iloc[0]

In [6]:
# Precondition: the newest row in the OLTP table must carry the same timestamp
# as the newest holdout row, otherwise there is nothing (or something else) to undo.
assert (
    get_curr_oltp_max_timestamp()
    == holdout_df["timestamp"].max()
)

# Undo holdout

In [7]:
# Delete every OLTP row whose timestamp is at or after the earliest holdout row.
# Schema and table come from trusted configuration; the bound itself is passed
# as a bind parameter.
delete_query = f"DELETE FROM {schema}.{args.table_name} WHERE timestamp >= :timestamp;"

earliest_holdout_ts = holdout_df["timestamp"].min()
if pd.isna(earliest_holdout_ts):
    # An empty / all-null holdout has no lower bound to delete from.
    raise ValueError("holdout_df contains no timestamps; nothing to undo")

# Bind the exact timestamp. The previous '%Y-%m-%d' string truncated the bound to
# midnight, which also deleted non-holdout rows from earlier on that same day.
min_timestamp = earliest_holdout_ts.to_pydatetime()

# engine.begin() opens a connection and a transaction together: it commits when
# the block exits normally and rolls back if an exception is raised.
with engine.begin() as connection:
    result = connection.execute(text(delete_query), {"timestamp": min_timestamp})
    logger.info(f"Deleted {result.rowcount} rows")

2025-11-12 21:24:05.368 | INFO     | __main__:<module>:9 - Deleted 74 rows


In [8]:
# Postcondition: after the delete, the newest remaining OLTP row must be strictly
# older than the earliest holdout row.
assert (
    get_curr_oltp_max_timestamp()
    < holdout_df["timestamp"].min()
)