In [1]:
import os
from pathlib import Path
import yaml
from causal_pricing.io import read_csv, to_parquet
from causal_pricing.cleaning import basic_clean, add_derived_fields, make_daily_product_city
from causal_pricing.features import make_time_features, encode_city_dummies

In [2]:
# Move working directory to project root.
# The root is located by its `configs/` directory (the config cell below reads
# configs/paths.yaml relative to the CWD). Searching from the current directory
# upwards makes this cell idempotent: re-running it is a no-op instead of
# climbing one more directory on every execution.
_start_dir = Path.cwd()
_project_root = next(
    (p for p in (_start_dir, *_start_dir.parents) if (p / "configs").is_dir()),
    _start_dir.parent,  # fallback: previous behaviour (notebook sits one level below the root)
)
os.chdir(_project_root)
print("CWD:", Path.cwd())

CWD: /home/tchen/casual-pricing-lab


In [3]:
# Load configs (paths are relative to the project root set in the previous cell).
# Context managers make sure both file handles are closed as soon as parsing is done;
# the bare open(...) calls left them open until garbage collection.
with open("configs/paths.yaml", encoding="utf-8") as paths_file:
    paths = yaml.safe_load(paths_file)
with open("configs/params.yaml", encoding="utf-8") as params_file:
    params = yaml.safe_load(params_file)


In [4]:
# 1) Load raw: read the sales CSV whose location is configured under "raw_sales".
raw_sales_path = paths["raw_sales"]
df_raw = read_csv(raw_sales_path)

In [5]:
# 2) Clean: pass the raw frame through the project's basic_clean and keep the
# result under a new name (df_clean).
# NOTE(review): the output recorded under this cell echoes a
# `pd.to_datetime(..., infer_datetime_format=True)` line twice, which looks like a
# pandas warning raised inside basic_clean — confirm, and fix it in
# causal_pricing.cleaning rather than in this notebook.
df_clean = basic_clean(df_raw)

  df[date_col] = pd.to_datetime(df[date_col], errors="coerce", infer_datetime_format=True)
  df[date_col] = pd.to_datetime(df[date_col], errors="coerce", infer_datetime_format=True)


#### Cleaning steps:
1. Drop all-NaN rows.
2. Remove repeated header rows (rows whose date column contains the header token instead of a date).
3. Coerce quantity and price to numeric.
4. Parse the order datetime.
5. Require non-null values in `Order Date`, `Product`, `Quantity Ordered`, and `Price Each`.
6. Keep only rows with strictly positive quantity and price.
7. Remove extreme outliers via IQR caps (`OUTLIER_IQR_MULT`).
8. Drop exact duplicates.
9. Sort by datetime.

In [6]:
# 3) Add engineered fields: derived fields first, then time features, then city
# dummies, written as one composed expression so df_feat is bound exactly once.
df_feat = encode_city_dummies(
    make_time_features(add_derived_fields(df_clean)),
    min_count=int(params["city_min_count"]),
)

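#### Fields added in this step:
- Revenue = Quantity Ordered * Price Each
- Date parts: year, month, day, weekday, hour
- Address parts: city, state, zip
- City dummy columns (`city_<Name>`, e.g. `city_Atlanta`), built by `encode_city_dummies` with `min_count` taken from `city_min_count` in `configs/params.yaml`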

In [7]:
# 4) Save interim: write the feature-enriched frame to the configured "interim_clean" path.
interim_clean_path = paths["interim_clean"]
to_parquet(df_feat, interim_clean_path)

In [8]:
# 5) Aggregate to daily × product × city, then persist the aggregate
# to the configured "processed_daily" path.
df_daily = make_daily_product_city(df_feat)
processed_daily_path = paths["processed_daily"]
to_parquet(df_daily, processed_daily_path)

In [9]:
# Sanity check: first rows and shapes of the row-level and the daily frames.
feat_preview = df_feat.head()
daily_preview = df_daily.head()
feat_preview, daily_preview, df_feat.shape, df_daily.shape

(  Order ID                   Product  Quantity Ordered  Price Each  \
 0   177831  Lightning Charging Cable                 1       14.95   
 1   177687          Wired Headphones                 1       11.99   
 2   186754          27in FHD Monitor                 1      149.99   
 3   185103      USB-C Charging Cable                 1       11.95   
 4   191585              20in Monitor                 1      109.99   
 
            Order Date                            Purchase Address  Revenue  \
 0 2019-04-01 03:09:00         914 6th St, San Francisco, CA 94016    14.95   
 1 2019-04-01 04:12:00       23 Meadow St, San Francisco, CA 94016    11.99   
 2 2019-04-01 05:06:00    824 Chestnut St, San Francisco, CA 94016   149.99   
 3 2019-04-01 05:18:00  610 Washington St, San Francisco, CA 94016    11.95   
 4 2019-04-01 05:58:00        686 Main St, New York City, NY 10001   109.99   
 
    year  month  day  ...      city_norm  city_Atlanta city_Austin city_Boston  \
 0  2019      