## Variable Timing Matching

In [3]:
import sys
from pathlib import Path

# Detect repo root
repo_root = Path("/workspaces/NAZ_Measure")  # absolute path to your repo root
sys.path.insert(0, str(repo_root))

# Now imports will work
from configs.shuffleboard_2025_config import start_date, end_date
from utils.data import get_raw_data_tables

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from databricks.connect import DatabricksSession

spark = DatabricksSession.builder \
    .host("<host>") \
    .token("<token>") \
    .clusterId("<cluster>") \
    .config("spark.driver.memory", "512m") \
    .config("spark.executor.memory", "512m") \
    .getOrCreate()

AttributeError: 'Builder' object has no attribute 'config'

In [1]:
import sys
print(sys.executable)

/workspaces/NAZ_Measure/.venv/bin/python


In [2]:
from databricks.connect import DatabricksSession

# This uses the environment variables from devcontainer
spark = DatabricksSession.builder.getOrCreate()

print(spark.version)

Exception: pyspark and databricks-connect cannot be installed at the same time. To use databricks-connect, uninstall databricks-connect & pyspark by running 'pip uninstall -y databricks-connect pyspark pyspark-connect pyspark-client' followed by a re-installation of databricks-connect

In [None]:
import sys
from pathlib import Path
from time import perf_counter
from datetime import date
import os
import pandas as pd
import streamlit as st  # only needed if you want Streamlit messages

# -----------------------------
# Add repo root to sys.path
# -----------------------------
repo_root = Path("/workspaces/NAZ_Measure")  # adjust if needed
sys.path.insert(0, str(repo_root))

from pyspark.sql import functions as F
from pyspark.sql import window
from databricks.connect import DatabricksSession

# -----------------------------
# Configs / Inputs
# -----------------------------
from configs.shuffleboard_2025_config import (
    start_date, 
    end_date, 
    measure_start, 
    measure_end,
    brand_cluster_code, 
    desired_premise, 
    desired_retailer_channel,
    vpid_offset_weeks, 
    match_configs,
)
from utils.data import get_raw_data_tables
from utils.matching_algo import run_matching_variable_timing

# -----------------------------
# Spark session (secure)
# -----------------------------
def get_spark():
    host = os.environ.get("DATABRICKS_HOST")
    token = os.environ.get("DATABRICKS_TOKEN")
    cluster_id = os.environ.get("DATABRICKS_CLUSTER_ID")

    if not host or not token or not cluster_id:
        raise RuntimeError("Databricks environment variables not set!")

    try:
        return DatabricksSession.builder \
            .host(host) \
            .token(token) \
            .cluster_id(cluster_id) \
            .getOrCreate()
    except Exception as e:
        raise RuntimeError(f"Failed to create Databricks session: {e}")

# -----------------------------
# Timer start
# -----------------------------
t0 = perf_counter()

# -----------------------------
# Prepare VPID DataFrame
# -----------------------------
vpid_timing_df = pd.DataFrame({
    "VPID": list(vpid_offset_weeks.keys()),
    "identifier": list(vpid_offset_weeks.values()),  
})

# -----------------------------
# Start Spark
# -----------------------------
spark = get_spark()

# -----------------------------
# Load raw tables
# -----------------------------
hv, hp, hr, hc = get_raw_data_tables(spark)

# -----------------------------
# Run matching
# -----------------------------
config_summary, matched_dfs, group_dfs = run_matching_variable_timing(
    vpid_timing_df=vpid_timing_df,
    match_configs=match_configs,
    hv=hv, hp=hp, hr=hr, hc=hc,
    base_start=start_date, base_end=end_date,
    base_measure_start=measure_start, base_measure_end=measure_end,
    brand_cluster_code=brand_cluster_code,
    desired_premise=desired_premise,
    desired_retailer_channel=desired_retailer_channel,
    offset_col="identifier", 
    max_controls_per_test=1,
    data_end_cap=date(2025, 11, 30),
)

# -----------------------------
# Timer end & display
# -----------------------------
elapsed = perf_counter() - t0
print(f"run_matching_variable_timing runtime: {elapsed:.2f} seconds")

# Optional: if running in notebooks
try:
    display(config_summary)
except NameError:
    print(config_summary)

  from .autonotebook import tqdm as notebook_tqdm
Matching configs:   0%|          | 0/3 [00:00<?, ?config/s]

: 

In [0]:
# %sql
# select * from vip_dev.retailer.vip_class

## View Individual Weeks Within a Config and Final DF

In [0]:
cfg_name = "minmax_all_blocking"
offset = 1   

df = matched_dfs.get((cfg_name, offset))
if df is not None:
    display(df)
else:
    print("No matched_df for this config/offset")
dfs = []
for i in range(0, 14):
    df = group_dfs[("minmax_all_blocking", i)].copy()
    df["identifier"] = i
    dfs.append(df)

df_all_pairs = pd.concat(dfs, ignore_index=True)

display(df_all_pairs)

## Variable Timing Validation

In [1]:


from collections import Counter
from importlib import reload
from inspect import signature
import pandas as pd
import utils.validate as validate
from utils.data import get_raw_data_tables
from pyspark.sql import SparkSession
from configs.shuffleboard_2025_config import (
    start_date, 
    end_date, 
    measure_start, 
    measure_end,
    vpid_offset_weeks, 
    match_configs
)


spark = SparkSession.getActiveSession() or SparkSession.builder.getOrCreate()
hv, hp, hr, hc = get_raw_data_tables(spark)
target_id_counts = dict(Counter(vpid_offset_weeks.values()))
iter_df = validate.run_validation_iterations_variable_timing(
    vpid_timing_df=vpid_timing_df,
    match_configs=match_configs,
    hv=hv, hp=hp, hr=hr, hc=hc,
    base_start=start_date, base_end=end_date,
    base_measure_start=measure_start, base_measure_end=measure_end,
    sample_size_total=175,
    n_iterations=100,
    rng_seed=42,
    show_progress=True,
    offset_col="identifier",
    target_id_counts=target_id_counts,
)

print(f"iterations x configs rows: {len(iter_df)}")
display(iter_df)

ModuleNotFoundError: No module named 'utils'

In [0]:
from utils.data import build_variable_timing_caches

# Build the same pre/post period caches
offsets = sorted(vpid_timing_df["identifier"].astype(int).unique().tolist())
vol_pre_period_by_offset, vol_post_period_by_offset = build_variable_timing_caches(
    hv, hp, hr, hc,
    offsets=offsets,
    base_start=start_date, base_end=end_date,
    base_measure_start=measure_start, base_measure_end=measure_end,
    vpids_for_post_period=None,
)

tests_requested_by_offset = {
    k: set(vpid_timing_df.loc[vpid_timing_df["identifier"] == k, "VPID"])
    for k in offsets
}

tests_with_pre_by_offset = {
    k: set(vol_pre_period_by_offset[k]["VPID"])
    for k in offsets
}

lost_before_matching = {
    k: tests_requested_by_offset[k] - tests_with_pre_by_offset[k]
    for k in offsets
}

print("=== Missing from pre-period data (before matching) ===")
total_lost_before = 0
for k in offsets:
    n_lost = len(lost_before_matching[k])
    total_lost_before += n_lost
    if n_lost:
        print(f"Offset {k}: {n_lost} tests missing from pre-period data")

print(f"TOTAL missing from pre-period data across all offsets: {total_lost_before}\n")

cfg_name = match_configs[0].get("minmax_all_blocking", list(match_configs[0].values())[0])  # fallback to first available key

tests_in_result_by_offset = {
    k: set(
        group_dfs[(cfg_name, k)]
        .query("Group == 'Test'")["VPID"]
        .tolist()
    )
    for k in offsets
}

lost_in_matching = {
    k: (tests_with_pre_by_offset[k] & tests_requested_by_offset[k]) - tests_in_result_by_offset[k]
    for k in offsets
}

print("=== Lost during matching (have pre-period data but not in final membership) ===")
total_lost_matching = 0
for k in offsets:
    n_lost = len(lost_in_matching[k])
    total_lost_matching += n_lost
    if n_lost:
        print(f"Offset {k}: {n_lost} tests lost in matching")

print(f"TOTAL lost in matching across all offsets: {total_lost_matching}")


## View Iteration Config

In [0]:

import pandas as pd

cfg_name = "minmax_CYTrendShare_blocking"

groups = []
for key, gdf in group_dfs.items():
    if isinstance(key, tuple):
        name, offset_weeks = key
    else:
        name, offset_weeks = key, None

    if name != cfg_name or gdf is None or (isinstance(gdf, pd.DataFrame) and gdf.empty):
        continue

    tmp = gdf.copy()
    if "offset_weeks" not in tmp.columns:
        tmp["offset_weeks"] = offset_weeks
    groups.append(tmp[["Group","VPID","offset_weeks"]])

membership = pd.concat(groups, ignore_index=True) if groups else pd.DataFrame(columns=["Group","VPID","offset_weeks"])
membership = membership.drop_duplicates(["Group","VPID","offset_weeks"]).sort_values(["offset_weeks","Group","VPID"])
print(f"Membership rows for {cfg_name}: {len(membership)}")
display(membership)

if "offset_weeks" in membership.columns:
    display(membership.groupby(["offset_weeks","Group"], as_index=False).size())


## Run Tests

In [0]:
import pandas as pd
from utils.validate import run_delta_abs_tests_less


tests = run_delta_abs_tests_less(iter_df)
display(tests)