In [10]:
import polars as pl
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from torch_geometric.data import HeteroData
from sklearn.model_selection import train_test_split
import pandas as pd
import zipfile
from pathlib import Path

In [11]:
print("polars version:", pl.__version__)
# Probe which of the historical rolling-window method names this polars build exposes.
for label, owner, attr_name in (
    ("has groupby_rolling:", pl.DataFrame, "groupby_rolling"),
    ("has group_by_rolling:", pl.DataFrame, "group_by_rolling"),
    ("has rolling:", pl.DataFrame, "rolling"),
    ("LazyFrame has rolling:", pl.LazyFrame, "rolling"),
):
    print(label, hasattr(owner, attr_name))

polars version: 1.35.2
has groupby_rolling: False
has group_by_rolling: False
has rolling: True
LazyFrame has rolling: True


In [12]:
# Clean up the raw transaction table.

df = pl.read_csv("../../Data/raw/credit_card_transactions-ibm_v2.csv")
# maintain_order=True keeps the de-duplicated rows in file order. Without it the
# row order after unique() is unspecified, so the seeded sample below could pick
# different rows from one run to the next.
df = df.unique(maintain_order=True)
df = df.sample(fraction=0.5, seed=42)

# 1) Fill missing categorical values and build a string ZIP column.
df = df.with_columns([
    pl.col("Merchant State").fill_null(pl.lit("ONLINE")).alias("Merchant State"),
    pl.col("Errors?").fill_null("No Error").alias("Errors?"),
    # Go through an integer and left-pad to 5 digits so ZIP codes that start
    # with 0 keep their leading zeros; a direct numeric -> string cast would
    # turn 01001 into "1001"/"1001.0" and corrupt the 3-character prefix below.
    # Assumes Zip is numeric as inferred from the CSV -- TODO confirm.
    pl.col("Zip")
      .cast(pl.Float64)
      .cast(pl.Int64)
      .cast(pl.Utf8)
      .str.zfill(5)
      .alias("Zip_str"),
])

# 2) Conditional Zip_str rules (use pl.lit for string literals).
#    Branches are evaluated in order, so the second one only sees rows whose
#    Merchant State is not "ONLINE".
df = df.with_columns([
    pl.when(pl.col("Merchant State") == "ONLINE")
      .then(pl.lit("ONLINE"))
      .when(pl.col("Zip_str").is_null())
      .then(pl.lit("NON_US"))
      .otherwise(pl.col("Zip_str"))
      .alias("Zip_str")
])

# 3) Assemble "YYYY-MM-DD HH:MM" from the separate columns and parse it.
dt_expr = (
    pl.col("Year").cast(pl.Utf8)
    + "-" + pl.col("Month").cast(pl.Utf8).str.zfill(2)
    + "-" + pl.col("Day").cast(pl.Utf8).str.zfill(2)
    + " " + pl.col("Time").cast(pl.Utf8)
)
df = df.with_columns([
    dt_expr.str.strptime(pl.Datetime, "%Y-%m-%d %H:%M").alias("DateTime")
])

# 4) Extract Date and Hour, build User_card, truncate Zip_str to 3 characters.
#    The truncation also applies to the sentinels, which become "ONL" / "NON".
df = df.with_columns([
    pl.col("DateTime").dt.date().alias("Date"),
    pl.col("DateTime").dt.hour().alias("Hour"),
    (pl.col("User").cast(pl.Utf8) + "_" + pl.col("Card").cast(pl.Utf8)).alias("User_card"),
    pl.col("Zip_str").str.slice(0, 3).alias("Zip_str"),
])

# 5) Encode the label as 0/1. replace_strict(..., default=None) maps any value
#    outside the dictionary to null and replaces the deprecated
#    replace(..., default=...) form.
df = df.with_columns([
    pl.col("Is Fraud?")
      .cast(pl.Utf8)
      .replace_strict({"Yes": 1, "No": 0}, default=None, return_dtype=pl.Int8)
      .alias("Is Fraud?")
])

# 6) Strip everything except digits, '.' and '-' from Amount, then parse as float.
df = df.with_columns([
    pl.col("Amount").cast(pl.Utf8)
        .str.replace_all(r"[^0-9\.\-]", "")
        .replace('', None)
        .cast(pl.Float64)
        .alias("Amount")
])

df = df.drop("Zip")


(Deprecated in version 1.0.0)
  .replace({"Yes": 1, "No": 0}, default=None)


In [13]:
# Separate the label column from the feature columns.
baseline_y = df.select("Is Fraud?")
baseline_X = df.drop("Is Fraud?")

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(
    baseline_X,
    baseline_y,
    test_size=0.2,
    random_state=42,
)
train_X, test_X, train_y, test_y = split_parts

In [14]:
# Load the card-level and user-level reference tables.
cards_csv = "../../Data/raw/sd254_cards.csv"
users_csv = "../../Data/raw/sd254_users.csv"
cards = pl.read_csv(cards_csv)
users = pl.read_csv(users_csv)

In [15]:
# Give every user row a positional id (0 .. n-1) named "User".
user_id = pl.arange(0, users.height).alias("User")
users = users.with_columns(user_id)

# Build the "<user>_<card index>" key on the cards table.
card_key = pl.col("User").cast(pl.Utf8) + "_" + pl.col("CARD INDEX").cast(pl.Utf8)
cards = cards.with_columns([card_key.alias("User_card")])

# Keep only the attributes that are joined onto the transactions.
user_columns = [
    "State",
    "FICO Score",
    "Yearly Income - Person",
    "Total Debt",
    "Num Credit Cards",
    "User",
]
card_columns = ["User_card", "Card Brand", "Credit Limit", "Card on Dark Web"]
users = users.select(user_columns)
cards = cards.select(card_columns)

In [16]:
# Enrich each transaction with its user's and card's attributes.
# The feature frames are later written to disk separately from train_y / test_y
# and matched to them by row position, so both joins must
#   * keep the left-hand row order (maintain_order="left"), and
#   * never duplicate a transaction row (validate="m:1" raises if a key is
#     repeated in the right-hand table).
Ctrain_X = (
    train_X
    .join(users, on="User", how="left", validate="m:1", maintain_order="left")
    .join(cards, on="User_card", how="left", validate="m:1", maintain_order="left")
)
Ctest_X = (
    test_X
    .join(users, on="User", how="left", validate="m:1", maintain_order="left")
    .join(cards, on="User_card", how="left", validate="m:1", maintain_order="left")
)


In [17]:
import polars as pl
import numpy as np
import math
from typing import List as TList

# ---------------- helpers ----------------
def estimate_kappa_from_R_scalar(R: float, eps: float = 1e-12, max_kappa: float = 1e8) -> float:
    """
    Approximate the von Mises concentration kappa from a resultant length R.

    Uses the standard piecewise approximation (Mardia & Jupp / Best & Fisher)
    with guards so the result is always finite and positive.

    Parameters
    ----------
    R : resultant length, expected in [0, 1]. None, NaN, inf and values <= 0
        yield the floor value 1e-8.
    eps : R values >= 1 - eps are treated as perfectly concentrated.
    max_kappa : upper bound on the returned value.

    Returns
    -------
    float in [1e-8, max_kappa].
    """
    floor = 1e-8
    # sanitize input: missing, non-finite or non-positive R carries no concentration
    if R is None or not np.isfinite(R) or R <= 0.0:
        return floor

    ceiling = float(max_kappa)
    if R >= 1.0 - eps:
        # R extremely close to 1 -> concentration effectively infinite
        return ceiling

    if R < 0.53:
        k = 2 * R + R**3 + (5 * R**5) / 6.0
    elif R < 0.85:
        k = -0.4 + 1.39 * R + 0.43 / (1.0 - R)
    else:
        # R**3 - 4*R**2 + 3*R written in factored form R*(1-R)*(3-R), which
        # avoids subtracting nearly equal terms as R approaches 1
        denom = R * (1.0 - R) * (3.0 - R)
        if abs(denom) < 1e-12:
            return ceiling
        k = 1.0 / denom

    # final safety: ensure finite and positive
    if not np.isfinite(k) or k <= 0:
        return floor
    # max_kappa is a hard cap: the R >= 0.85 branch can otherwise exceed it
    # for R just below 1 - eps (e.g. R = 1 - 1e-10 gives about 5e9)
    return float(min(k, ceiling))

def pct_last(lst):
    """
    Fraction of entries in `lst` that are <= its final entry.

    The final entry counts itself. Returns 0.0 for an empty (or None) input.
    """
    if not lst:
        return 0.0
    reference = lst[-1]
    at_or_below = len([item for item in lst if item <= reference])
    return float(at_or_below) / float(len(lst))

def card_stats_from_angles(angles):
    """
    Circular summary statistics for a collection of angles.

    angles: list or 1D-array of radian angles; None / NaN / inf entries are ignored
    returns: (mu, R, kappa) where mu is the circular mean direction, R the
             resultant length clamped to [0, 1), and kappa the concentration
             from estimate_kappa_from_R_scalar. With no usable angle the
             result is (0.0, 0.0, 1e-8).
    """
    no_data = (0.0, 0.0, 1e-8)
    if angles is None or len(angles) == 0:
        return no_data

    ang = np.asarray(angles, dtype=float)
    # Drop non-finite entries up front: a single NaN would otherwise propagate
    # through the mean, making mu NaN and collapsing R to 0.
    ang = ang[np.isfinite(ang)]
    if ang.size == 0:
        return no_data

    comp = np.mean(np.exp(1j * ang))
    mu = float(np.angle(comp))
    R = float(np.abs(comp))
    # clamp R to [0, 1)
    R = max(0.0, min(R, 1.0 - 1e-12))
    kappa = estimate_kappa_from_R_scalar(R)
    return mu, R, kappa

def process_df(df):
    """
    Add engineered features to a transaction frame and return the new frame.

    Reads the columns "Merchant State", "State", "Amount" and "DateTime".

    Columns added or replaced:
      merchant_state_diff       1 if "Merchant State" differs from "State", else 0
      Amount                    cast to Float64
      amount_is_refund          1 if Amount < 0, else 0
      amount_log                sign(Amount) * log1p(|Amount|)
      hour_angle                time of day mapped onto [0, 2*pi)
      hour_sin, hour_cos        sine / cosine of hour_angle
      card_kappa                von Mises concentration fitted to hour_angle
      von_mises_likelihood_card exp(kappa * cos(hour_angle - mu)), unnormalized

    Columns removed if present: "card_mu", "card_R", "sec_of_day", "Time".

    NOTE(review): despite the "card" naming, mu / kappa are fitted once over
    every row of `df`, not per card, and separately for each frame this is
    called on (so the train and test frames get different parameters).
    Confirm whether a per-"User_card" fit on the training data was intended.
    """
    df = df.with_columns([
        (pl.col("Merchant State") != pl.col("State"))
            .cast(pl.Int8)  # 0/1 feature
            .alias("merchant_state_diff")
    ])

    df = df.with_columns([
        pl.col("Amount").cast(pl.Float64).alias("Amount"),
        (pl.col("Amount") < 0).cast(pl.Int8).alias("amount_is_refund"),  # useful flag
        (
            pl.when(pl.col("Amount") == 0)
            .then(0.0)
            .otherwise(pl.col("Amount").abs().log1p()
                        * pl.when(pl.col("Amount") < 0).then(-1.0).otherwise(1.0))
        ).alias("amount_log")
    ])

    # Time of day as an angle. dt.hour()/dt.minute()/dt.second() yield Int8, so
    # widen to Int32 before multiplying: values up to 23 * 3600 = 82800 do not
    # fit in a small integer type and must not be allowed to wrap around.
    timestamp = pl.col("DateTime")
    sec_of_day = (
        timestamp.dt.hour().cast(pl.Int32) * 3600
        + timestamp.dt.minute().cast(pl.Int32) * 60
        + timestamp.dt.second().cast(pl.Int32)
    )
    df = df.with_columns([
        (2 * math.pi * sec_of_day / 86400.0).alias("hour_angle"),
    ])
    # Native expressions instead of a per-row Python callback.
    df = df.with_columns([
        pl.col("hour_angle").sin().alias("hour_sin"),
        pl.col("hour_angle").cos().alias("hour_cos"),
    ])

    # von Mises fit: mu and kappa come from all angles in the frame. Null angles
    # (rows without a timestamp) are left out of the fit.
    angles = df["hour_angle"].drop_nulls().to_numpy()
    mu, _, kappa = card_stats_from_angles(angles)

    # von Mises likelihood (unnormalized); rows with a null angle stay null.
    df = df.with_columns([
        pl.lit(kappa).alias("card_kappa"),
        (kappa * (pl.col("hour_angle") - mu).cos()).exp().alias("von_mises_likelihood_card"),
    ])

    # tidy up: strict=False skips any of these columns that are not present
    return df.drop(["card_mu", "card_R", "sec_of_day", "Time"], strict=False)

# Run the same feature engineering on the train frame, then the test frame.
Ctrain_X, Ctest_X = [process_df(frame) for frame in (Ctrain_X, Ctest_X)]


In [18]:
# Persist the engineered splits. Features and labels go to separate files and
# are matched by row position when read back.
tmp_dir = Path("../../Data/processed/challenger_splits/rf_splits")
# Create the output folder (and any missing parents) so a fresh checkout does
# not fail on the first write; a no-op when it already exists.
tmp_dir.mkdir(parents=True, exist_ok=True)
Ctrain_X.write_csv(tmp_dir / "train_X.csv")
Ctest_X.write_csv(tmp_dir / "test_X.csv")
train_y.write_csv(tmp_dir / "train_y.csv")
test_y.write_csv(tmp_dir / "test_y.csv")
