In [16]:
import os
import ast
import json
import os
import ast
import json
import math
import time
import random
from collections import Counter, defaultdict
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from scipy import sparse
try:
    import lightgbm as lgb
    HAS_LGBM = True
except Exception:
    HAS_LGBM = False

try:
    import xgboost as xgb
    HAS_XGB = True
except Exception:
    HAS_XGB = False

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
import gc
from functools import lru_cache
import random
from collections import Counter, defaultdict
from datetime import datetime
import numpy as np
import pandas as pd
from scipy import sparse
import openpyxl

In [26]:
# -------------------------
# CONFIG
# -------------------------
class CFG:
    """Central, class-level configuration read by every stage of the pipeline.

    All attributes are plain class attributes; nothing is instantiated. The
    functions in this notebook read them directly as ``CFG.<NAME>``.

    NOTE(review): the data/output paths are absolute, machine-specific Windows
    paths; they must be edited before running anywhere else.
    """
    # Data paths: update to where your CSVs are
    ORDER_DATA_PATH = r"C:\Users\ia383\Madness_Overloaded_WWT_Comp2025\Madness_Overloaded_Codebase\dataset\order_data.csv"
    CUSTOMER_DATA_PATH = r"C:\Users\ia383\Madness_Overloaded_WWT_Comp2025\Madness_Overloaded_Codebase\dataset\customer_data.csv"
    STORE_DATA_PATH = r"C:\Users\ia383\Madness_Overloaded_WWT_Comp2025\Madness_Overloaded_Codebase\dataset\store_data.csv"
    TEST_DATA_PATH = r"C:\Users\ia383\Madness_Overloaded_WWT_Comp2025\Madness_Overloaded_Codebase\dataset\test_data_question.csv"

    # Destination of the Excel sheet written by run_inference_on_test().
    OUTPUT_EXCEL_PATH = r"C:\Users\ia383\Madness_Overloaded_WWT_Comp2025\TEAM_WWT_Comp2025_Output_fast.xlsx"
    # Seed for `random`, `numpy`, the sample shuffle and (optionally) Word2Vec.
    SEED = 42

    # Catalog / sample controls
    MAX_ITEMS = 3000          # cap on unique items kept (most-frequent)
    MIN_ITEMS_PER_CART = 2    # carts with fewer kept items are dropped / not sampled
    SAMPLE_RECENT_DAYS = None # if int, only keep orders within N most recent days
    MAX_TRAIN_SAMPLES = 400000  # cap supervised samples (None to disable)

    # Co-occurrence / neighbors
    TOP_NEIGHBORS = 50        # neighbor-list length kept per item
    ALPHA = 0.75              # weight down large carts (pair weight = 1 / len(cart) ** ALPHA)
    RECENCY_HALF_LIFE_DAYS = 180  # decays older cooccurrence counts

    # Candidate & scoring
    TOP_POPULAR_K = 40        # how many store/hour/global popular items are offered as candidates
    CAND_LIMIT = 200          # hard cap on the candidate list length
    RECALL_AT_K = 3           # number of recommendations produced / evaluated
    WEIGHT_COOCC = 0.7        # weight of the summed co-occurrence score
    WEIGHT_POP = 0.25         # weight of the normalized global popularity score
    WEIGHT_EMB = 0.05         # used only if embeddings available

    # Word2Vec (optional)
    # NOTE(review): HAS_W2V is not defined in any cell visible here; this line only
    # evaluates because `False and ...` short-circuits. Switching the literal to
    # True will raise NameError unless HAS_W2V (and Word2Vec) are defined first.
    USE_W2V = False and HAS_W2V
    W2V_SIZE = 64             # embedding dimensionality (vector_size)
    W2V_WINDOW = 5            # context window
    W2V_EPOCHS = 5            # training epochs

# Determinism: seed both the stdlib and the NumPy global RNGs from CFG.SEED.
# leave_one_out_samples() draws from `random`, so this must run before it.
random.seed(CFG.SEED)
np.random.seed(CFG.SEED)


# Helper Functions

In [3]:
def stamp(msg: str):
    """Print ``msg`` prefixed with the current local time as ``[HH:MM:SS]``."""
    clock = time.strftime('%H:%M:%S')
    print(f"[{clock}] {msg}")


In [4]:
def safe_read_csv(path):
    """Load a CSV into a DataFrame, failing early with a clear message.

    Raises:
        FileNotFoundError: if ``path`` does not exist on disk.
    """
    if os.path.exists(path):
        stamp(f"Reading: {path}")
        return pd.read_csv(path)
    raise FileNotFoundError(f"File not found: {path}")

In [5]:
def parse_orders_cell(orders_cell):
    """Robust parsing of an ORDERS cell into a list of item names.

    Accepted inputs:
      * an already-materialized sequence (list / tuple / ndarray) -> each element
        is stringified and stripped;
      * a missing value (None / NaN / NaT) -> ``[]``;
      * a string holding JSON or a Python literal:
          - a dict with an ``"orders"`` list: every ``item_name`` found under
            each order's ``item_details`` is collected;
          - a list: its ``str`` / ``int`` elements are kept;
          - any other parsed value: the raw string is split on commas.
      * a string that neither parser accepts -> ``[]``.

    Malformed nested entries (non-dict orders/details, ``null`` lists) are
    skipped instead of raising, so one bad row cannot abort a whole ``apply``.

    Returns:
        list[str]: item names with surrounding whitespace removed.
    """
    # Sequences must be handled BEFORE pd.isna(): on a list-like, pd.isna returns an
    # element-wise array whose truth value is ambiguous and raises ValueError.
    if isinstance(orders_cell, (list, tuple, np.ndarray)):
        return [str(x).strip() for x in orders_cell]
    if pd.isna(orders_cell):
        return []

    s = str(orders_cell)
    obj = None
    # try json then literal; parsing is deliberately best-effort
    for fn in (json.loads, ast.literal_eval):
        try:
            obj = fn(s)
            break
        except Exception:
            obj = None
    if obj is None:
        return []

    if isinstance(obj, dict) and "orders" in obj:
        orders = obj.get("orders")
        if not isinstance(orders, list):
            return []
        items = []
        for ord_entry in orders:
            if not isinstance(ord_entry, dict):
                continue
            details = ord_entry.get("item_details")
            if not isinstance(details, list):
                continue
            for d in details:
                name = d.get("item_name") if isinstance(d, dict) else None
                if name:
                    items.append(str(name).strip())
        return items

    if isinstance(obj, list):
        return [str(x).strip() for x in obj if isinstance(x, (str, int))]

    # fallback: split by comma (rare)
    return [part.strip() for part in s.split(",") if part.strip()]


In [6]:
def explode_orders_to_lists(df):
    """Return a Series of item-name lists, one per row, parsed from ``df["ORDERS"]``."""
    stamp("Parsing ORDERS -> ITEM_LIST")
    raw_orders = df["ORDERS"]
    return raw_orders.apply(parse_orders_cell)

In [7]:
def parse_order_date(s):
    """Convert one raw date value to a pandas Timestamp, or ``pd.NaT`` on failure.

    Missing values map to NaT and Timestamp/datetime inputs are passed through
    ``pd.to_datetime``. Anything else is tried against a fixed list of explicit
    formats (first match wins) and finally against pandas' automatic parser with
    ``errors="coerce"``.
    """
    if pd.isna(s):
        return pd.NaT
    if isinstance(s, (pd.Timestamp, datetime)):
        return pd.to_datetime(s)

    known_formats = (
        "%Y-%m-%d %H:%M:%S",
        "%Y-%m-%d",
        "%d-%m-%Y %H:%M:%S",
        "%d-%m-%Y",
        "%m/%d/%Y %H:%M:%S",
        "%m/%d/%Y",
    )
    for pattern in known_formats:
        try:
            parsed = pd.to_datetime(s, format=pattern)
        except Exception:
            # this format does not fit; try the next one
            continue
        else:
            return parsed

    # last resort: let pandas guess, coercing failures to NaT
    try:
        guessed = pd.to_datetime(s, errors="coerce")
    except Exception:
        guessed = pd.NaT
    return guessed

# Build catalog + co-occurrence matrices

In [8]:
def build_catalog_and_filtered_orders(order_df):
    """Parse carts, cap the item catalog, and drop carts that become too small.

    Steps (on a copy of ``order_df``):
      1. parse ``ORDERS`` into ``ITEM_LIST`` and drop rows with no items;
      2. parse ``ORDER_CREATED_DATE`` into ``_ORD_DATE`` (NaT column if absent),
         optionally keeping only the last ``CFG.SAMPLE_RECENT_DAYS`` days;
      3. keep only the ``CFG.MAX_ITEMS`` most frequent items, de-duplicating each
         cart while preserving first-seen order;
      4. drop carts with fewer than ``CFG.MIN_ITEMS_PER_CART`` items.

    Returns:
        tuple: ``(filtered_df, item2idx, idx2item, filtered_counter)`` where the
        index maps follow the first-seen order of items in the filtered carts.
    """
    stamp("Building item counts and applying catalog cap...")
    orders = order_df.copy()
    orders["ITEM_LIST"] = explode_orders_to_lists(orders)

    # discard rows whose cart parsed to nothing
    non_empty = orders["ITEM_LIST"].map(len) > 0
    orders = orders[non_empty].reset_index(drop=True)

    if "ORDER_CREATED_DATE" not in orders.columns:
        orders["_ORD_DATE"] = pd.NaT
    else:
        orders["_ORD_DATE"] = orders["ORDER_CREATED_DATE"].apply(parse_order_date)
        # optional recency window, measured back from the newest order
        if isinstance(CFG.SAMPLE_RECENT_DAYS, int):
            latest = orders["_ORD_DATE"].max()
            oldest_allowed = latest - pd.Timedelta(days=CFG.SAMPLE_RECENT_DAYS)
            orders = orders[orders["_ORD_DATE"] >= oldest_allowed].reset_index(drop=True)

    # raw item frequencies decide which items survive the catalog cap
    raw_counts = Counter()
    for cart in orders["ITEM_LIST"]:
        raw_counts.update(cart)
    kept_set = {name for name, _ in raw_counts.most_common(CFG.MAX_ITEMS)}

    def filter_cart(lst):
        # keep catalog items only; dict.fromkeys de-duplicates in first-seen order
        return list(dict.fromkeys(it for it in lst if it in kept_set))

    orders["ITEM_LIST"] = orders["ITEM_LIST"].apply(filter_cart)
    large_enough = orders["ITEM_LIST"].map(len) >= CFG.MIN_ITEMS_PER_CART
    orders = orders[large_enough].reset_index(drop=True)

    # frequencies over what is actually left
    filtered_ctr = Counter()
    for cart in orders["ITEM_LIST"]:
        filtered_ctr.update(cart)

    item2idx = {name: pos for pos, name in enumerate(filtered_ctr)}
    idx2item = dict(enumerate(filtered_ctr))

    stamp(f"Orders retained: {len(orders)}, unique items kept: {len(item2idx)}")
    return orders, item2idx, idx2item, filtered_ctr

In [9]:
def recency_weighted_factor(order_date, newest_date):
    """Exponential recency weight in (0, 1] for an order.

    The weight is ``0.5 ** (age_days / half_life)`` with ``half_life`` taken from
    ``CFG.RECENCY_HALF_LIFE_DAYS`` (floored at 1.0). Orders with a missing date,
    or that are not older than ``newest_date`` by at least one whole day, get 1.0.
    """
    if pd.isna(order_date) or pd.isna(newest_date):
        return 1.0
    age_days = (newest_date - order_date).days  # whole days only
    if age_days > 0:
        half_life = max(1.0, CFG.RECENCY_HALF_LIFE_DAYS)
        return 0.5 ** (age_days / half_life)
    return 1.0

In [10]:
def build_cooccurrence_neighbors(order_df, item2idx):
    """Build per-item neighbor lists from recency-weighted cart co-occurrence.

    Every unordered pair of distinct items in a cart contributes
    ``(1 / n_items ** CFG.ALPHA) * recency_weight`` to both directions of a
    dict-of-dicts. Each item's partners are then sorted by descending weight and
    truncated to ``CFG.TOP_NEIGHBORS``.

    Args:
        order_df: frame with ``ITEM_LIST`` and ``_ORD_DATE`` columns.
        item2idx: accepted for interface compatibility; not used here.

    Returns:
        dict: ``item -> [(neighbor, weight), ...]`` sorted by weight, highest first.
    """
    stamp("Computing recency-weighted co-occurrence counts (dict-of-dicts)...")
    if "_ORD_DATE" in order_df.columns:
        newest = order_df["_ORD_DATE"].max()
    else:
        newest = pd.NaT

    co_counts = defaultdict(lambda: defaultdict(float))
    for cart, order_date in zip(order_df["ITEM_LIST"], order_df["_ORD_DATE"]):
        members = list(dict.fromkeys(cart))
        size = len(members)
        if size <= 1:
            continue
        # large carts are down-weighted; old carts decay with the half-life
        pair_weight = (1.0 / (size ** CFG.ALPHA)) * recency_weighted_factor(order_date, newest)
        for pos, first in enumerate(members):
            for second in members[pos + 1:]:
                co_counts[first][second] += pair_weight
                co_counts[second][first] += pair_weight

    # keep only the strongest partners of each item
    neighbors = {}
    for item, partners in co_counts.items():
        ranked = sorted(partners.items(), key=lambda pair: -pair[1])
        neighbors[item] = [(name, float(weight)) for name, weight in ranked[:CFG.TOP_NEIGHBORS]]
    stamp(f"Built neighbor lists for {len(neighbors)} items")
    return neighbors

In [11]:
def build_popularity_tables(order_df):
    """Count item popularity globally, per store, and per hour of day.

    Args:
        order_df: frame with ``ITEM_LIST`` and ``_ORD_DATE``; ``STORE_NUMBER`` is
            optional (all carts are bucketed under ``None`` when it is absent).

    Returns:
        tuple: ``(global_pop, store_pop, hour_pop, global_ctr)`` where the first
        three hold item names ordered most-popular first (a list, and two dicts
        of lists keyed by store / hour) and ``global_ctr`` is the raw Counter.
    """
    stamp("Building popularity (global & store-wise & time-of-day)...")

    def _ranked(counter):
        # item names only, most frequent first
        return [name for name, _ in counter.most_common()]

    global_ctr = Counter()
    per_store = defaultdict(Counter)
    per_hour = defaultdict(Counter)  # hour -> Counter

    carts = order_df["ITEM_LIST"]
    stores = order_df.get("STORE_NUMBER", [None]*len(order_df))
    dates = order_df["_ORD_DATE"]
    for cart, store_id, when in zip(carts, stores, dates):
        global_ctr.update(cart)
        per_store[store_id].update(cart)
        if pd.isna(when):
            continue
        per_hour[when.hour].update(cart)

    global_pop = _ranked(global_ctr)
    store_pop = {store_id: _ranked(counts) for store_id, counts in per_store.items()}
    hour_pop = {hour: _ranked(counts) for hour, counts in per_hour.items()}
    return global_pop, store_pop, hour_pop, global_ctr

In [12]:
def train_item2vec(order_df):
    """Optionally train a Word2Vec model that treats each cart as a sentence.

    Returns ``None`` when ``CFG.USE_W2V`` is false or ``HAS_W2V`` is false;
    otherwise returns the trained model.

    NOTE(review): neither ``HAS_W2V`` nor ``Word2Vec`` is defined/imported in the
    cells visible here, so the training path will raise NameError unless they are
    provided elsewhere (presumably via gensim, per the warning text) -- confirm.
    """
    if not CFG.USE_W2V:
        return None
    if not HAS_W2V:
        stamp("[WARN] gensim not available; skipping Word2Vec.")
        return None

    stamp("Training Word2Vec on carts...")
    corpus = [cart for cart in order_df["ITEM_LIST"] if len(cart) >= 1]
    w2v_options = dict(
        vector_size=CFG.W2V_SIZE,
        window=CFG.W2V_WINDOW,
        min_count=1,
        workers=2,
        epochs=CFG.W2V_EPOCHS,
        seed=CFG.SEED,
    )
    model = Word2Vec(sentences=corpus, **w2v_options)
    stamp("Word2Vec trained.")
    return model

#  Candidate generation + scoring

In [13]:
def candidates_for_cart(cart, neighbors, global_pop, store_pop=None, store=None, hour_pop=None, hour=None):
    """Collect de-duplicated recommendation candidates for a cart.

    Sources are consulted in priority order: co-occurrence neighbors of each cart
    item, then the store's popular items, then the hour's popular items, then the
    global popular items (each popularity list capped at ``CFG.TOP_POPULAR_K``).
    Items already in the cart or already collected are skipped.

    Returns:
        list[tuple]: up to ``CFG.CAND_LIMIT`` entries ``(source, item, score)``
        where ``source`` is one of ``'nbr'``, ``'storepop'``, ``'hourpop'``,
        ``'gpop'`` and ``score`` is the neighbor weight or ``None``.
    """
    picked = []
    seen = set()

    def _offer(source, item, score):
        # first source to propose an item wins; cart items are never proposed
        if item in seen or item in cart:
            return
        seen.add(item)
        picked.append((source, item, score))

    # 1) neighbors
    for cart_item in cart:
        for nbr, weight in neighbors.get(cart_item, []):
            _offer('nbr', nbr, weight)

    # 2) store pop / hour pop prioritized
    if store is not None and store_pop and store in store_pop:
        for item in store_pop[store][:CFG.TOP_POPULAR_K]:
            _offer('storepop', item, None)
    if hour is not None and hour_pop and hour in hour_pop:
        for item in hour_pop[hour][:CFG.TOP_POPULAR_K]:
            _offer('hourpop', item, None)

    # 3) global pop fallback
    for item in global_pop[:CFG.TOP_POPULAR_K]:
        _offer('gpop', item, None)

    return picked[:CFG.CAND_LIMIT]

In [14]:
def compute_scores(cart, candidates, neighbors, global_ctr, w_co, w_pop, emb_model=None):
    """Score candidates for a cart and return ``[(item, score), ...]`` best first.

    ``score = w_co * co + w_pop * pop + CFG.WEIGHT_EMB * emb`` where
      * ``co``  is the sum, over cart items, of the neighbor weight linking that
        cart item to the candidate (0 when the candidate is not in its list);
      * ``pop`` is the candidate's global count divided by the largest count;
      * ``emb`` is the mean cosine similarity to cart items known to the
        embedding model (0.0 when no model is supplied or on any lookup error).

    Args:
        candidates: tuples ``(source, item, co_score_or_None)``; only ``item`` is used.
    """
    cand_items = [name for _source, name, _hint in candidates]

    # accumulate co-occurrence mass flowing from each cart item to each candidate
    co_sum = dict.fromkeys(cand_items, 0.0)
    for cart_item in cart:
        weights = dict(neighbors.get(cart_item, []))
        for name in cand_items:
            co_sum[name] += float(weights.get(name, 0.0))

    def _mean_cosine(item):
        # best-effort: any failure (e.g. unknown item) yields a neutral 0.0
        try:
            item_vec = emb_model.wv[item]
            sims = [
                np.dot(item_vec, emb_model.wv[other]) / (
                    np.linalg.norm(item_vec) * np.linalg.norm(emb_model.wv[other]) + 1e-9)
                for other in cart
                if other in emb_model.wv
            ]
            return float(np.mean(sims)) if sims else 0.0
        except Exception:
            return 0.0

    # popularity is normalized by the most popular item's count
    max_pop = max(global_ctr.values()) if global_ctr else 1
    scores = []
    for _source, item, _hint in candidates:
        co_s = co_sum.get(item, 0.0)
        pop_s = float(global_ctr.get(item, 0)) / max_pop
        emb_s = _mean_cosine(item) if (emb_model is not None and HAS_W2V) else 0.0
        scores.append((item, w_co * co_s + w_pop * pop_s + CFG.WEIGHT_EMB * emb_s))

    return sorted(scores, key=lambda pair: -pair[1])

# Generating Training Samples

In [15]:
def leave_one_out_samples(order_df):
    """Expand each cart into leave-one-out (remaining items -> held-out item) rows.

    For every cart with at least ``CFG.MIN_ITEMS_PER_CART`` distinct items, each
    item in turn becomes ``target_item`` and the rest become ``features_items``.
    When more than ``CFG.MAX_TRAIN_SAMPLES`` rows result, a random subset of that
    size is drawn with the stdlib ``random`` module.

    Returns:
        pandas.DataFrame with columns ``features_items``, ``target_item``,
        ``STORE_NUMBER`` and ``_ORD_DATE``.
    """
    stamp("Generating leave-one-out samples for offline eval...")
    carts = order_df["ITEM_LIST"]
    stores = order_df.get("STORE_NUMBER", [None]*len(order_df))
    dates = order_df["_ORD_DATE"]

    rows = []
    for cart, store, when in zip(carts, stores, dates):
        distinct = list(dict.fromkeys(cart))
        if len(distinct) >= CFG.MIN_ITEMS_PER_CART:
            rows.extend(
                {
                    "features_items": distinct[:pos] + distinct[pos + 1:],
                    "target_item": held_out,
                    "STORE_NUMBER": store,
                    "_ORD_DATE": when,
                }
                for pos, held_out in enumerate(distinct)
            )

    cap = CFG.MAX_TRAIN_SAMPLES
    if cap and len(rows) > cap:
        stamp(f"Sampling down supervised rows from {len(rows)} to {cap}")
        rows = random.sample(rows, cap)

    df = pd.DataFrame(rows)
    stamp(f"Total leave-one-out samples: {len(df)}")
    return df


In [17]:
def compute_recall_at_k_fast(samples_df, neighbors, global_pop, global_ctr, store_pop=None, hour_pop=None, emb_model=None, K=3):
    """Offline Recall@K over leave-one-out samples.

    For each sample the cart (``features_items``) is turned into candidates,
    scored, and counted as a hit when ``target_item`` is among the top ``K``.

    Args:
        samples_df: frame with ``features_items`` and ``target_item``; the
            ``STORE_NUMBER`` and ``_ORD_DATE`` columns are optional.
        neighbors, global_pop, global_ctr, store_pop, hour_pop, emb_model:
            forwarded to ``candidates_for_cart`` / ``compute_scores``.
        K: number of top predictions considered.

    Returns:
        float: hits / number of samples (0.0 for an empty frame).

    Progress is tracked with a positional counter rather than the DataFrame
    index label, so the interim figures stay correct for frames whose index is
    not ``0..n-1`` (e.g. an un-reset slice or sample).
    """
    stamp("Computing Recall@K (fast, candidate-limited)...")
    n = len(samples_df)
    correct = 0
    start = time.time()

    if n:
        cols = samples_df.columns
        stores = samples_df["STORE_NUMBER"] if "STORE_NUMBER" in cols else [None] * n
        dates = samples_df["_ORD_DATE"] if "_ORD_DATE" in cols else [pd.NaT] * n
        # column-wise zip avoids building a Series per row as iterrows() does
        records = zip(samples_df["features_items"], samples_df["target_item"], stores, dates)
        for done, (cart, target, store, dt) in enumerate(records, start=1):
            hour = dt.hour if not pd.isna(dt) else None

            cand = candidates_for_cart(cart, neighbors, global_pop, store_pop, store, hour_pop, hour)
            scored = compute_scores(cart, cand, neighbors, global_ctr, CFG.WEIGHT_COOCC, CFG.WEIGHT_POP, emb_model)
            preds = [it for it, _ in scored][:K]
            if target in preds:
                correct += 1

            if done % 5000 == 0 or done == n:
                elapsed = time.time() - start
                # report throughput; guard against a zero-length interval
                rate = done / elapsed if elapsed > 0 else float("inf")
                stamp(f"[val] {done}/{n} | interim Recall@{K}={correct/done:.4f} | {rate:.1f} samples/s")

    recall = correct / max(1, n)
    stamp(f"[METRIC] Recall@{K}: {recall:.4f} (time {(time.time()-start)/60:.2f} min)")
    return recall

In [18]:
def run_inference_on_test(test_df, neighbors, global_pop, global_ctr, store_pop=None, hour_pop=None, emb_model=None):
    """Produce ``CFG.RECALL_AT_K`` recommendations per test row and write them to Excel.

    The cart is read from whichever of the columns ``item1``..``item4`` exist;
    NaN cells, blank cells and the literal ``"missing"`` (case-insensitive) are
    ignored and duplicates are dropped. ``STORE_NUMBER`` and the hour of
    ``ORDER_CREATED_DATE`` are used for store/hour popularity when available.

    If scoring yields fewer than K items, the list is padded from ``global_pop``
    in a single pass. When even that cannot supply K items, the remaining
    recommendation columns are left as empty strings.

    Side effects:
        Writes the result sheet to ``CFG.OUTPUT_EXCEL_PATH``.

    Returns:
        pandas.DataFrame: ``CUSTOMER_ID``, ``ORDER_ID``, the cart columns, and
        ``RECOMMENDATION 1..K``.
    """
    stamp("Running inference on test sheet...")
    k = CFG.RECALL_AT_K
    cart_item_cols = [c for c in ["item1", "item2", "item3", "item4"] if c in test_df.columns]
    has_store = "STORE_NUMBER" in test_df.columns
    out_rows = []
    for _, row in test_df.iterrows():
        cart = []
        for c in cart_item_cols:
            it = row.get(c, None)
            if pd.notna(it):
                name = str(it).strip()
                if name and name.lower() != "missing":
                    cart.append(name)
        cart = list(dict.fromkeys(cart))
        store = row.get("STORE_NUMBER", None) if has_store else None
        dt = row.get("ORDER_CREATED_DATE", None)
        hour = None
        if dt is not None and not pd.isna(dt):
            try:
                parsed = parse_order_date(dt)
                # NaT.hour is NaN, so only take the hour from a real timestamp
                hour = None if pd.isna(parsed) else parsed.hour
            except Exception:
                hour = None

        cand = candidates_for_cart(cart, neighbors, global_pop, store_pop, store, hour_pop, hour)
        scored = compute_scores(cart, cand, neighbors, global_ctr, CFG.WEIGHT_COOCC, CFG.WEIGHT_POP, emb_model)
        preds = [it for it, _ in scored][:k]
        # ensure length: one pass over global popularity. (The previous
        # `while len(preds) < K` wrapper never terminated when global_pop could
        # not provide enough unseen items.)
        if len(preds) < k:
            for it in global_pop:
                if len(preds) >= k:
                    break
                if it not in cart and it not in preds:
                    preds.append(it)

        out = {
            "CUSTOMER_ID": row.get("CUSTOMER_ID", ""),
            "ORDER_ID": row.get("ORDER_ID", "")
        }
        for c in cart_item_cols:
            out[c] = row.get(c, "")
        for i in range(k):
            out[f"RECOMMENDATION {i+1}"] = preds[i] if i < len(preds) else ""
        out_rows.append(out)
    out_df = pd.DataFrame(out_rows)
    cols = ["CUSTOMER_ID", "ORDER_ID"] + cart_item_cols + [f"RECOMMENDATION {i+1}" for i in range(k)]
    # an empty test sheet produces a frame without columns; keep only what exists
    cols = [c for c in cols if c in out_df.columns]
    out_df = out_df[cols]
    out_df.to_excel(CFG.OUTPUT_EXCEL_PATH, index=False)
    stamp(f"Wrote output Excel -> {CFG.OUTPUT_EXCEL_PATH}")
    return out_df

In [19]:
def run_pipeline():
    """Run the whole baseline: load, build tables, validate offline, predict, export.

    Returns:
        dict: ``{"val_recall3": <offline recall>, "output_path": <Excel path>}``.

    NOTE(review): the validation samples are drawn from the same orders that
    built the neighbor/popularity tables, so the reported recall is presumably
    optimistic -- confirm before comparing against a true hold-out.
    """
    stamp("=== FAST PIPELINE START ===")

    # 1) load raw orders (customer/store tables are not needed by this baseline)
    raw_orders = safe_read_csv(CFG.ORDER_DATA_PATH)

    # 2) catalog cap + cart filtering
    orders, item2idx, _idx2item, _item_counts = build_catalog_and_filtered_orders(raw_orders)

    # 3) co-occurrence neighbors and popularity tables
    neighbors = build_cooccurrence_neighbors(orders, item2idx)
    global_pop, store_pop, hour_pop, global_ctr = build_popularity_tables(orders)

    # 4) optional embeddings
    emb_model = train_item2vec(orders) if CFG.USE_W2V else None

    # 5) leave-one-out samples, shuffled, with the head used as a quick validation slice
    samples = leave_one_out_samples(orders)
    samples = samples.sample(frac=1.0, random_state=CFG.SEED).reset_index(drop=True)
    total = len(samples)
    if total > 1000:
        n_val = int(0.15 * total)
    else:
        n_val = min(2000, total)
    val_df = samples.iloc[:n_val]
    stamp(f"Using {len(val_df)} samples for quick offline validation")
    recall = compute_recall_at_k_fast(
        val_df, neighbors, global_pop, global_ctr,
        store_pop, hour_pop, emb_model, K=CFG.RECALL_AT_K,
    )

    # 6) predictions for the test sheet (also writes the Excel file)
    test_df = safe_read_csv(CFG.TEST_DATA_PATH)
    run_inference_on_test(test_df, neighbors, global_pop, global_ctr, store_pop, hour_pop, emb_model)

    stamp("=== FAST PIPELINE END ===")
    return {"val_recall3": float(recall), "output_path": CFG.OUTPUT_EXCEL_PATH}

In [27]:
# Entry point: run the full pipeline and log the offline metric and output path.
# In a notebook kernel __name__ is "__main__", so this executes when the cell runs;
# `res` stays in the kernel namespace for later inspection.
if __name__ == "__main__":
    res = run_pipeline()
    stamp(f"Final offline Recall@{CFG.RECALL_AT_K}: {res['val_recall3']:.4f}")
    stamp(f"Output: {res['output_path']}")


[22:12:42] === FAST PIPELINE START ===
[22:12:42] Reading: C:\Users\ia383\Madness_Overloaded_WWT_Comp2025\Madness_Overloaded_Codebase\dataset\order_data.csv
[22:12:47] Building item counts and applying catalog cap...
[22:12:47] Parsing ORDERS -> ITEM_LIST
[22:16:34] Orders retained: 1414398, unique items kept: 145
[22:16:34] Computing recency-weighted co-occurrence counts (dict-of-dicts)...
[22:16:45] Built neighbor lists for 145 items
[22:16:45] Building popularity (global & store-wise & time-of-day)...
[22:16:52] Generating leave-one-out samples for offline eval...
[22:17:25] Sampling down supervised rows from 5656980 to 400000
[22:17:29] Total leave-one-out samples: 400000
[22:17:29] Using 60000 samples for quick offline validation
[22:17:29] Computing Recall@K (fast, candidate-limited)...
[22:17:31] [val] 5000/60000 | interim Recall@3=0.6086 | 5000.0/1.2s
[22:17:32] [val] 10000/60000 | interim Recall@3=0.6200 | 10000.0/2.4s
[22:17:33] [val] 15000/60000 | interim Recall@3=0.6206 | 1

In [23]:
# Debugging aid: show the working directory and confirm the order CSV is reachable.
# NOTE(review): the path literal duplicates CFG.ORDER_DATA_PATH and `os` is already
# imported in the first cell; this cell looks like a leftover sanity check.
import os
print(os.getcwd())  # Shows current working directory
print(os.path.exists(r"C:\Users\ia383\Madness_Overloaded_WWT_Comp2025\Madness_Overloaded_Codebase\dataset\order_data.csv"))


c:\Users\ia383\Madness_Overloaded_WWT_Comp2025\Madness_Overloaded_Codebase
True
