In [1]:
import pandas as pd
# Raw outflow transactions read from the local parquet file.
# NOTE(review): `Outflows` is not referenced again in the visible cells, and the same file
# is read a second time as `outflow` further down — confirm whether this load is still needed.
Outflows = pd.read_parquet("q1-ucsd-outflows.pqt")

In [2]:
import torch
from torch.optim import Adam
from datarater import InnerClassifier, DataRater

# ---- Smoke test: push random token ids through DataRater and InnerClassifier ----
# fake dataset sizes
vocab_size = 500
num_classes = 5
pad_id = 0

# instantiate models
inner = InnerClassifier(vocab_size=vocab_size, num_classes=num_classes, pad_id=pad_id)
dr = DataRater(vocab_size=vocab_size, pad_id=pad_id)

# create random inputs: 8 sequences of 20 token ids, one class label per sequence
X = torch.randint(0, vocab_size, (8, 20))
Y = torch.randint(0, num_classes, (8,))

# forward passes
scores = dr(X)                 # (8,)
assert scores.shape == (8,)
per_ex = inner(X, Y)           # (8,)
assert per_ex.shape == (8,)    # a scalar here would silently broadcast against the weights below

# compute weighted loss (weights are a softmax over the batch dimension)
loss = (torch.softmax(scores, dim=0) * per_ex).sum()

# Differentiate w.r.t. both models while keeping the graph of the derivative.
# torch.autograd.grad is used instead of loss.backward(create_graph=True): the latter stores
# graph-attached gradients on each parameter's .grad, which PyTorch documents as a reference cycle.
inner_params = [p for p in inner.parameters() if p.requires_grad]
dr_params = [p for p in dr.parameters() if p.requires_grad]
grads = torch.autograd.grad(loss, inner_params + dr_params, create_graph=True, allow_unused=True)
assert any(g is not None for g in grads[:len(inner_params)]), "loss is not connected to InnerClassifier"
assert any(g is not None for g in grads[len(inner_params):]), "loss is not connected to DataRater"

print("✅ OK: shapes & autograd paths are good.")

✅ OK: shapes & autograd paths are good.


  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


In [3]:
from sklearn.model_selection import train_test_split
from scipy.stats import ttest_ind
# Load both transaction tables from the local parquet files.
inflow = pd.read_parquet('q1-ucsd-inflows.pqt')
outflow = pd.read_parquet('q1-ucsd-outflows.pqt')

# Consumers that appear in both tables, as a sorted list.
_inflow_consumers = set(inflow["prism_consumer_id"])
_outflow_consumers = set(outflow["prism_consumer_id"])
consumers_both = sorted(_inflow_consumers & _outflow_consumers)

# 80/20 split at the consumer level with a fixed random_state.
train_ids, test_ids = train_test_split(consumers_both, test_size=0.2, random_state=42)


def _rows_for(frame, consumer_ids):
    """Return the rows of `frame` whose prism_consumer_id is in `consumer_ids`."""
    return frame[frame["prism_consumer_id"].isin(consumer_ids)]


inflow_train, inflow_test = _rows_for(inflow, train_ids), _rows_for(inflow, test_ids)
outflow_train, outflow_test = _rows_for(outflow, train_ids), _rows_for(outflow, test_ids)

In [None]:
# ---- 0) Inputs: a pandas DataFrame with columns: ["memo", "category", "amount", "posted_date"] ----
import pandas as pd
from collections import Counter
from typing import List, Dict, Tuple
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Base frame: outflow training rows whose memo differs from the category label, with
# posted_date parsed to datetimes (unparseable values become NaT) and rows missing any
# of memo / category / amount / posted_date removed.
df = (
    outflow_train.loc[outflow_train['memo'] != outflow_train['category']]
    .assign(posted_date=lambda frame: pd.to_datetime(frame['posted_date'], errors='coerce'))
    .dropna(subset=['memo', 'category', 'amount', 'posted_date'])
)

# ---- helpers for aux features (amount + posted_date) ----
def cyclical_encode(val, period):
    """Project a periodic value onto the unit circle.

    Args:
        val: position within the cycle (scalar or numpy array).
        period: length of one full cycle.

    Returns:
        Tuple ``(sin, cos)`` of the angle ``2*pi*val/period``.
    """
    theta = (2 * np.pi) * (val / period)
    sine = np.sin(theta)
    cosine = np.cos(theta)
    return sine, cosine

# ---- 1) Tokenization & vocab (unchanged) ----
def basic_tokenize(text: str) -> List[str]:
    """Lower-case `text` and split it on runs of whitespace.

    Returns an empty list for empty or whitespace-only input.
    """
    normalized = text.strip().lower()
    tokens = normalized.split()
    return tokens

def build_vocab(texts: List[str], min_freq: int = 1) -> Dict[str, int]:
    """Build a token -> id mapping from a collection of texts.

    Ids 0 and 1 are reserved for "<pad>" and "<unk>". Every other token that
    occurs at least `min_freq` times gets the next free id, in order of first
    appearance.
    """
    frequencies = Counter(tok for text in texts for tok in basic_tokenize(text))
    stoi = {"<pad>": 0, "<unk>": 1}
    for token, count in frequencies.items():
        if count < min_freq:
            continue
        # setdefault leaves reserved / already-seen tokens untouched
        stoi.setdefault(token, len(stoi))
    return stoi

def encode_text(text: str, stoi: Dict[str, int]) -> List[int]:
    """Tokenize `text` and map each token to its id, using the "<unk>" id for unknown tokens."""
    ids: List[int] = []
    for token in basic_tokenize(text):
        ids.append(stoi.get(token, stoi["<unk>"]))
    return ids


# ---- 2) Labels (unchanged) ----
def build_label_map(labels: List[str]) -> Dict[str, int]:
    """Assign consecutive integer ids (starting at 0) to the distinct labels, in sorted order."""
    ordered = sorted(set(labels))
    return dict(zip(ordered, range(len(ordered))))

def build_inverse_label_map(ltoi: Dict[str, int]) -> Dict[int, str]:
    """Invert a label -> id mapping into an id -> label mapping."""
    return dict(zip(ltoi.values(), ltoi.keys()))


# ---- 3) Split FIRST, then fit aux stats on train ONLY ----
# A seeded generator makes the 80/20 row split (and every statistic / vocabulary fitted on it)
# identical across kernel restarts; an unseeded randperm gives a different split on every run.
SPLIT_SEED = 42
_split_gen = torch.Generator().manual_seed(SPLIT_SEED)
perm = torch.randperm(len(df), generator=_split_gen)
cut  = int(0.8 * len(df))
# .iloc is given plain numpy positions rather than a torch tensor
train_df = df.iloc[perm[:cut].numpy()].reset_index(drop=True)
val_df   = df.iloc[perm[cut:].numpy()].reset_index(drop=True)

# fit amount stats on train only (avoid leakage); the epsilon guards against a zero std
_amt_mean = train_df['amount'].mean()
_amt_std  = train_df['amount'].std(ddof=0) + 1e-8

def add_aux_columns(_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `_df` with the auxiliary numeric feature columns added.

    Added columns:
        amount_z            -- amount standardised with the module-level _amt_mean / _amt_std
        amount_log_signed   -- sign(amount) * log1p(|amount|)
        dow_sin, dow_cos    -- cyclic encoding of posted_date's day of week (period 7)
        mon_sin, mon_cos    -- cyclic encoding of posted_date's month, shifted to 0..11 (period 12)

    The encodings are computed on whole columns, so an empty frame is handled
    (the columns are added with zero rows) instead of raising on unpacking.

    Args:
        _df: frame with a numeric 'amount' column and a datetime 'posted_date' column.

    Returns:
        A new DataFrame; the input is not modified.
    """
    _df = _df.copy()

    # amount features (use train stats)
    amount = _df['amount']
    _df['amount_z'] = (amount - _amt_mean) / _amt_std
    _df['amount_log_signed'] = np.sign(amount) * np.log1p(np.abs(amount))

    # day-of-week and month cyclic encodings, vectorised over plain numpy arrays
    dow = _df['posted_date'].dt.dayofweek.astype(int).to_numpy()
    mon = (_df['posted_date'].dt.month.astype(int) - 1).clip(0, 11).to_numpy()
    _df['dow_sin'], _df['dow_cos'] = cyclical_encode(dow, 7)
    _df['mon_sin'], _df['mon_cos'] = cyclical_encode(mon, 12)
    return _df

# Attach the auxiliary feature columns to both splits.
train_df, val_df = add_aux_columns(train_df), add_aux_columns(val_df)

# Names of the auxiliary feature columns, and how many there are.
AUX_COLS = [
    'amount_z', 'amount_log_signed',
    'dow_sin', 'dow_cos',
    'mon_sin', 'mon_cos',
]
AUX_DIM = len(AUX_COLS)


# ---- 4) Build vocab from TRAIN ONLY, then labels from all ----
stoi = build_vocab(train_df["memo"].astype(str).tolist(), min_freq=1)
_all_categories = pd.concat([train_df["category"], val_df["category"]], axis=0)
ltoi = build_label_map(_all_categories.astype(str).tolist())
itol = build_inverse_label_map(ltoi)

# Sizes and pad id derived from the fitted vocabulary and label map.
vocab_size, num_classes, pad_id = len(stoi), len(ltoi), stoi["<pad>"]


# ---- 5) Dataset + collate now return (tokens, AUX, label) ----  # <<< MODIFIED
class MemoDataset(Dataset):
    """In-memory dataset of (token ids, aux features, label id) triples.

    Each row of `df` becomes one sample:
      * the 'memo' text encoded with `stoi` and truncated to `max_len` tokens,
      * a float32 tensor of the AUX_COLS values, shape (len(AUX_COLS),),
      * the integer id of the row's 'category' looked up in `ltoi`.

    Samples are built column-wise rather than with DataFrame.iterrows(), which
    constructs a Series per row and is much slower on large frames.
    """

    def __init__(self, df: pd.DataFrame, stoi: Dict[str, int], ltoi: Dict[str, int], max_len: int = 64):
        self.pad_id = stoi["<pad>"]
        # One (N, F) float32 matrix for all rows; each sample keeps one row of it.
        aux_matrix = torch.tensor(df[AUX_COLS].to_numpy(dtype=np.float32))
        self.samples: List[Tuple[List[int], torch.Tensor, int]] = [
            (encode_text(str(memo), stoi)[:max_len], aux_row, ltoi[str(category)])
            for memo, aux_row, category in zip(df["memo"], aux_matrix, df["category"])
        ]

    def __len__(self):
        """Number of samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return (token ids as a long tensor, aux feature tensor, label as a long scalar tensor)."""
        x_ids, aux, y = self.samples[idx]
        return torch.tensor(x_ids, dtype=torch.long), aux, torch.tensor(y, dtype=torch.long)

def collate_batch(batch, pad_id=0):
    """Pad a list of (token_ids, aux, label) samples into batch tensors.

    Args:
        batch: non-empty sequence of (1-D long tensor, 1-D aux tensor, scalar label tensor).
        pad_id: value used to right-pad shorter token sequences.

    Returns:
        (X, AUX, Y) where X is (B, L) long with L = max(1, longest sequence),
        AUX is the stacked aux tensors (B, F) and Y the stacked labels (B,).

    Raises:
        ValueError: if `batch` is empty.
    """
    if not batch:
        raise ValueError("collate_batch received an empty batch")
    xs, auxs, ys = zip(*batch)
    # Keep at least one column so a batch made only of empty memos yields (B, 1)
    # of padding rather than a zero-width tensor.
    L = max(1, max(len(x) for x in xs))
    X = torch.full((len(xs), L), pad_id, dtype=torch.long)
    for i, x in enumerate(xs):
        X[i, :len(x)] = x
    return X, torch.stack(auxs), torch.stack(ys)


# ---- 6) Datasets / Loaders ----
def _collate_with_pad(batch):
    """Collate a batch, padding token sequences with the vocabulary's pad id."""
    return collate_batch(batch, pad_id)


train_ds, val_ds = (MemoDataset(part, stoi, ltoi, max_len=64) for part in (train_df, val_df))

# Training batches are shuffled; validation batches keep their order.
train_dl = DataLoader(train_ds, batch_size=128, shuffle=True, collate_fn=_collate_with_pad)
val_dl = DataLoader(val_ds, batch_size=256, shuffle=False, collate_fn=_collate_with_pad)

In [5]:
from torch.optim import Adam

def pick_device():
    """Choose the torch device to run on: CUDA if available, else Apple MPS, else CPU."""
    if torch.cuda.is_available():
        name = "cuda"
    elif (
        hasattr(torch.backends, "mps")
        and torch.backends.mps.is_available()
        and torch.backends.mps.is_built()
    ):
        # MPS (Apple Silicon)
        name = "mps"
    else:
        name = "cpu"
    return torch.device(name)

device = pick_device()
print("Using:", device)

# Hyper-parameters shared by both models.
_shared_kwargs = dict(vocab_size=vocab_size, pad_id=pad_id, d_model=128, max_seq_len=128)

inner = InnerClassifier(num_classes=num_classes, n_layers=2, n_heads=4, **_shared_kwargs)
inner = inner.to(device)
dr = DataRater(**_shared_kwargs).to(device)

# Only the inner classifier's parameters are handed to the optimizer.
opt_inner = Adam(inner.parameters(), lr=2e-3)

def train_one_epoch():
    """Run one pass over `train_dl`, updating `inner` with a DataRater-weighted loss.

    For each batch the DataRater scores are turned into weights with a softmax
    over the batch dimension, and the loss is the weighted sum of the inner
    model's per-example losses. Only `opt_inner` is stepped.
    """
    inner.train()
    # The loader's collate function yields (tokens, aux, labels); unpacking only two
    # values raises "too many values to unpack". The aux features are not consumed
    # by either model in this loop.
    for X, _aux, Y in train_dl:
        X, Y = X.to(device), Y.to(device)
        opt_inner.zero_grad(set_to_none=True)
        # DataRater weights (batch-softmax)
        scores  = dr(X)                               # (B,)
        weights = torch.softmax(scores, dim=0)        # sum=1
        per_ex  = inner(X, Y)                         # (B,)
        loss    = (weights * per_ex).sum()
        loss.backward()                               # no meta yet → no create_graph
        opt_inner.step()

def evaluate():
    """Return the accuracy of `inner` over `val_dl` (0.0 when the loader yields no examples)."""
    inner.eval()
    correct, total = 0, 0
    with torch.no_grad():
        # The loader's collate function yields (tokens, aux, labels); unpacking only
        # two values raises "too many values to unpack". Aux features are unused here.
        for X, _aux, Y in val_dl:
            X, Y = X.to(device), Y.to(device)
            probs = inner.predict_proba(X)
            pred  = probs.argmax(dim=-1)
            correct += (pred == Y).sum().item()
            total   += Y.numel()
    # max(..., 1) avoids a division by zero on an empty loader
    return correct / max(total, 1)

# Train for a fixed number of epochs, reporting validation accuracy after each one.
n_epochs = 15
for epoch in range(n_epochs):
    train_one_epoch()
    acc = evaluate()
    print(f"epoch {epoch}  val_acc={acc:.3f}")

Using: cuda
epoch 0  val_acc=0.980
epoch 1  val_acc=0.982
epoch 2  val_acc=0.982
epoch 3  val_acc=0.983


KeyboardInterrupt: 

In [6]:
# Display the filtered training frame built earlier (the rendered output is truncated with an ellipsis row).
df

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
646,2,acc_3,PURCHASE AUTHORIZED ON 03/11 POKE POKU HENDERS...,35.08,2021-03-15,FOOD_AND_BEVERAGES
651,2,acc_3,PURCHASE AUTHORIZED ON 10/01 LIQUOR CITY HENDE...,43.83,2021-10-04,FOOD_AND_BEVERAGES
657,2,acc_3,PURCHASE INTL AUTHORIZED ON 10/20 Rituals Cosm...,98.27,2021-10-21,GENERAL_MERCHANDISE
658,2,acc_3,Trader Joe''s,152.61,2021-04-14,GROCERIES
660,2,acc_3,PURCHASE AUTHORIZED ON 05/28 VANS #174 LAS VEG...,81.17,2021-05-28,GENERAL_MERCHANDISE
...,...,...,...,...,...,...
2597457,5941,acc_9524,DEBIT CARD WITHDRAWAL PURCHASEAmazon Prime*TI4...,15.93,2023-01-16,GENERAL_MERCHANDISE
2597462,5941,acc_9524,POS WITHDRAWALAZ LOT QUIKTRIP XXXX XXXX E INDI...,25.00,2023-01-18,EDUCATION
2597465,5941,acc_9524,POS WITHDRAWALWAL-MART #XXXX XXXX E MCKELLIPS ...,3.68,2023-01-18,FOOD_AND_BEVERAGES
2597468,5941,acc_9524,WITHDRAWAL Salt River ProjeTYPE: ONLINE PMT CO...,90.00,2023-01-20,FOOD_AND_BEVERAGES
