In [2]:
import polars as pl
from sklearn.model_selection import train_test_split
import math
import numpy as np
from pathlib import Path

In [3]:
print("polars version:", pl.__version__)
print("has groupby_rolling:", hasattr(pl.DataFrame, "groupby_rolling"))
print("has group_by_rolling:", hasattr(pl.DataFrame, "group_by_rolling"))
print("has rolling:", hasattr(pl.DataFrame, "rolling"))
print("LazyFrame has rolling:", hasattr(pl.LazyFrame, "rolling"))

polars version: 1.35.2
has groupby_rolling: False
has group_by_rolling: False
has rolling: True
LazyFrame has rolling: True


In [4]:
#Clean up 

df = pl.read_csv("../../Data/raw/credit_card_transactions-ibm_v2.csv")
df = df.sample(fraction=0.5, seed=42)
df.head(n = 10)

df = df.with_columns(
    pl.col("Merchant State").fill_null("ONLINE").alias("Merchant State")
)

df = df.with_columns([
    pl.col("Merchant State").fill_null(pl.lit("ONLINE")).alias("Merchant State"),
    pl.col("Zip").cast(pl.Utf8).alias("Zip_str"),
])

# 2) Conditional Zip_str rules (use pl.lit for string literals)
df = df.with_columns([
    pl.when(pl.col("Merchant State") == "ONLINE")
      .then(pl.lit("ONLINE"))
      .when(pl.col("Zip_str").is_null() & (pl.col("Merchant State") != "ONLINE"))
      .then(pl.lit("NON_US"))
      .otherwise(pl.col("Zip_str"))
      .alias("Zip_str")
])

dt_expr = (
    pl.col("Year").cast(pl.Utf8)
    + "-" + pl.col("Month").cast(pl.Utf8).str.zfill(2)
    + "-" + pl.col("Day").cast(pl.Utf8).str.zfill(2)
    + " " + pl.col("Time").cast(pl.Utf8)
)
df = df.with_columns([
    dt_expr.str.strptime(pl.Datetime, "%Y-%m-%d %H:%M").alias("DateTime")
])

# 5) Extract Date and Hour, build User_card, truncate Zip_str
df = df.with_columns([
    pl.col("DateTime").dt.date().alias("Date"),
    pl.col("DateTime").dt.hour().alias("Hour"),
    (pl.col("User").cast(pl.Utf8) + "_" + pl.col("Card").cast(pl.Utf8)).alias("User_card"),
    pl.col("Zip_str").str.slice(0, 3).alias("Zip_str"),
])

df = df.with_columns([
    pl.col("Is Fraud?")
      .cast(pl.Utf8)
      .replace({"Yes": 1, "No": 0}, default=None)
      .cast(pl.Int8)               # small int is fine
      .alias("Is Fraud?")
])

df = df.with_columns([
    pl.col("Amount").cast(pl.Utf8)
        .str.replace_all(r"[^0-9\.\-]", "")   
        .replace('', None)
        .cast(pl.Float64)                    
        .alias("Amount")
])


(Deprecated in version 1.0.0)
  .replace({"Yes": 1, "No": 0}, default=None)


In [5]:
baseline_y = df.select([
    pl.col("Is Fraud?")
])

baseline_X = df.drop("Is Fraud?")

train_X, test_X, train_y, test_y = train_test_split(
    baseline_X, baseline_y, test_size=0.2, random_state=42
)

In [6]:
tmp_dir = Path("../../data/processed/baseline_splits")
train_X.write_csv(tmp_dir / "train_X.csv")
test_X.write_csv(tmp_dir / "test_X.csv")
train_y.write_csv(tmp_dir / "train_y.csv")
test_y.write_csv(tmp_dir / "test_y.csv")


In [7]:
cards = pl.read_csv("../../Data/raw/sd254_cards.csv")
users = pl.read_csv("../../Data/raw/sd254_users.csv")

In [8]:
users = users.with_columns(
    pl.arange(0, users.height).alias("User")
)

cards = cards.with_columns([
    (pl.col("User").cast(pl.Utf8) + "_" + pl.col("CARD INDEX").cast(pl.Utf8)).alias("User_card")
    ])

users = users.select(["State", "FICO Score", "Yearly Income - Person", "Total Debt", "Num Credit Cards", "User"])
cards = cards.select(["User_card", "Card Brand", "Credit Limit", "Card on Dark Web"])

users = users.with_columns([
    pl.col("Total Debt").cast(pl.Utf8)
        .str.replace_all(r"[^0-9\.\-]", "")   
        .replace('', None)
        .cast(pl.Float64)                    
        .alias("Total Debt")
])

users = users.with_columns([
    pl.col("Yearly Income - Person").cast(pl.Utf8)
        .str.replace_all(r"[^0-9\.\-]", "")   
        .replace('', None)
        .cast(pl.Float64)                    
        .alias("Yearly Income - Person")
])

cards = cards.with_columns([
    pl.col("Credit Limit").cast(pl.Utf8)
        .str.replace_all(r"[^0-9\.\-]", "")   
        .replace('', None)
        .cast(pl.Float64)                    
        .alias("Credit Limit")
])

In [9]:
Ctrain_X = train_X.join(users, on="User", how="left").join(cards, on="User_card", how="left")
Ctest_X = test_X.join(users, on="User", how="left").join(cards, on="User_card", how="left")


In [10]:
# ---------------- helpers ----------------
def estimate_kappa_from_R_scalar(R: float, eps: float = 1e-12, max_kappa: float = 1e8) -> float:
    # sanitize input
    if R is None or not np.isfinite(R):
        return 1e-8
    # clamp into [0, 1)
    if R <= 0.0:
        return 1e-8
    if R >= 1.0 - eps:
        return float(max_kappa)

    # standard approximate formulas (Mardia & Jupp / Best & Fisher)
    if R < 0.53:
        k = 2 * R + R**3 + (5 * R**5) / 6.0
    elif R < 0.85:
        k = -0.4 + 1.39 * R + 0.43 / (1.0 - R)
    else:
        denom = (R**3 - 4 * R**2 + 3 * R)
        # protect denom from being tiny/zero
        if abs(denom) < 1e-12:
            return float(max_kappa)
        k = 1.0 / denom

    # final safety: ensure finite and positive
    if not np.isfinite(k) or k <= 0:
        return 1e-8
    return float(k)

def pct_last(lst):
    if not lst:
        return 0.0
    last = lst[-1]
    cnt_le = sum(1 for v in lst if v <= last)
    return float(cnt_le) / float(len(lst))

def card_stats_from_angles(angles):
    if angles is None or len(angles) == 0:
        return 0.0, 0.0, 1e-8
    ang = np.asarray(angles, dtype=float)
    # protect against all-nan or empty effective
    if ang.size == 0 or not np.isfinite(ang).any():
        return 0.0, 0.0, 1e-8

    comp = np.mean(np.exp(1j * ang))
    mu = float(np.angle(comp))
    R = float(np.abs(comp))
    # clamp R to [0, 1)
    R = max(0.0, min(R, 1.0 - 1e-12))
    kappa = estimate_kappa_from_R_scalar(R)
    return mu, R, kappa

def process_df (df):

    df = df.with_columns([
        (pl.col("Merchant State") != pl.col("State")).alias("merchant_state_diff")
    ])


    df = df.with_columns([
        pl.col("Amount").cast(pl.Float64).alias("Amount"),
        (pl.col("Amount") < 0).cast(pl.Int8).alias("amount_is_refund"),  # useful flag
        (
            pl.when(pl.col("Amount") == 0)
            .then(0.0)
            .otherwise(pl.col("Amount").abs().log1p()
                        * pl.when(pl.col("Amount") < 0).then(-1.0).otherwise(1.0))
        ).alias("amount_log")  
    ])


    # hour angle and sin/cos
    sec_of_day = (pl.col("DateTime").dt.hour() * 3600 + pl.col("DateTime").dt.minute() * 60 + pl.col("DateTime").dt.second())
    df = df.with_columns([
        sec_of_day.alias("sec_of_day"),
        (2 * math.pi * sec_of_day / 86400.0).alias("hour_angle"),
    ])
    df = df.with_columns([
        pl.col("hour_angle").map_elements(lambda x: float(math.sin(x))).alias("hour_sin"),
        pl.col("hour_angle").map_elements(lambda x: float(math.cos(x))).alias("hour_cos"),
    ])

    # von Mises: compute card-level mu,R,kappa from angle list then broadcast
    angles = df["hour_angle"].to_list()
    mu, R, kappa = card_stats_from_angles(angles)
    df = df.with_columns([
        pl.lit(mu).alias("card_mu"),
        pl.lit(R).alias("card_R"),
        pl.lit(kappa).alias("card_kappa"),
    ])

    # von-mises likelihood (unnormalized)
    df = df.with_columns([
        pl.struct(["card_kappa", "card_mu", "hour_angle"]).map_elements(
            lambda s: float(np.exp(s["card_kappa"] * math.cos(s["hour_angle"] - s["card_mu"])))
        ).alias("von_mises_likelihood_card")
    ])


    # tidy up per-card
    drop_cols = ["card_mu", "card_R", "sec_of_day"]
    for c in drop_cols:
        if c in df.columns:
            df = df.drop(c)

    return df

Ctrain_X = process_df(Ctrain_X)
Ctest_X = process_df(Ctest_X)


In [11]:
tmp_dir = Path("../../Data/processed/challenger_splits/original")
Ctrain_X.write_csv(tmp_dir / "train_X.csv")
Ctest_X.write_csv(tmp_dir / "test_X.csv")
train_y.write_csv(tmp_dir / "train_y.csv")
test_y.write_csv(tmp_dir / "test_y.csv")


In [12]:
Ctrain_X.head(n=10)

User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Zip_str,DateTime,Date,Hour,User_card,State,FICO Score,Yearly Income - Person,Total Debt,Num Credit Cards,Card Brand,Credit Limit,Card on Dark Web,merchant_state_diff,amount_is_refund,amount_log,hour_angle,hour_sin,hour_cos,card_kappa,von_mises_likelihood_card
i64,i64,i64,i64,i64,str,f64,str,i64,str,str,f64,i64,str,str,datetime[μs],date,i8,str,str,i64,f64,f64,i64,str,f64,str,bool,i8,f64,f64,f64,f64,f64,f64
716,0,2011,12,4,"""13:53""",9.07,"""Swipe Transaction""",6780853441840436625,"""Fremont""","""CA""",94536.0,7210,,"""945""",2011-12-04 13:53:00,2011-12-04,13,"""716_0""","""CA""",798,68765.0,75608.0,5,"""Mastercard""",39751.0,"""No""",False,0,2.309561,-1.354666,-0.976735,0.214451,0.437183,1.285982
835,3,2012,8,18,"""10:41""",52.6,"""Swipe Transaction""",6042526206085641250,"""Bronx""","""NY""",10456.0,5912,,"""104""",2012-08-18 10:41:00,2012-08-18,10,"""835_3""","""NY""",712,25434.0,45540.0,4,"""Mastercard""",10500.0,"""No""",False,0,3.981549,-2.155191,-0.834046,-0.551694,0.437183,0.921731
1761,0,2009,8,22,"""21:03""",15.85,"""Swipe Transaction""",6246550243095942766,"""Cocoa""","""FL""",32926.0,5921,,"""329""",2009-08-22 21:03:00,2009-08-22,21,"""1761_0""","""FL""",644,12796.0,22192.0,3,"""Visa""",9661.0,"""No""",False,0,2.824351,0.726348,0.664144,0.747605,0.437183,1.208288
1315,1,2017,8,6,"""08:23""",10.7,"""Chip Transaction""",4722913068560264812,"""Perrysburg""","""OH""",43551.0,5411,,"""435""",2017-08-06 08:23:00,2017-08-06,8,"""1315_1""","""OR""",680,38194.0,1674.0,3,"""Mastercard""",9707.0,"""No""",True,0,2.459589,2.101667,0.862366,-0.506285,0.437183,0.704784
1809,1,2011,10,15,"""19:51""",58.14,"""Swipe Transaction""",4123806141501734555,"""Billings""","""MT""",59102.0,4900,,"""591""",2011-10-15 19:51:00,2011-10-15,19,"""1809_1""","""MT""",695,41249.0,42268.0,2,"""Mastercard""",16187.0,"""No""",False,0,4.079908,0.207403,0.20592,0.978569,0.437183,1.432991
797,2,2012,11,18,"""17:07""",20.5,"""Swipe Transaction""",-5776107283644035423,"""Winder""","""GA""",30680.0,5813,,"""306""",2012-11-18 17:07:00,2012-11-18,17,"""797_2""","""GA""",753,37056.0,75730.0,4,"""Mastercard""",21895.0,"""No""",False,0,3.068053,-0.322013,-0.316477,0.9486,0.437183,1.546445
1152,1,2011,6,17,"""08:13""",3.44,"""Swipe Transaction""",-727612092139916043,"""Indianapolis""","""IN""",46239.0,5411,,"""462""",2011-06-17 08:13:00,2011-06-17,8,"""1152_1""","""IN""",830,49412.0,42378.0,3,"""Mastercard""",27594.0,"""No""",False,0,1.490654,2.095268,0.865589,-0.500756,0.437183,0.705972
1228,3,2013,6,20,"""22:22""",55.56,"""Swipe Transaction""",-5843779458842238113,"""Chicago""","""IL""",60636.0,5812,,"""606""",2013-06-20 22:22:00,2013-06-20,22,"""1228_3""","""MD""",692,62791.0,193215.0,4,"""Mastercard""",39870.0,"""No""",True,0,4.035302,0.996583,0.83962,0.543174,0.437183,1.080176
980,2,2003,5,7,"""14:57""",64.19,"""Swipe Transaction""",8011310777263965322,"""Novi""","""MI""",48375.0,5912,,"""483""",2003-05-07 14:57:00,2003-05-07,14,"""980_2""","""MI""",509,62103.0,33545.0,4,"""Visa""",16722.0,"""No""",False,0,4.177306,-1.094031,-0.888484,0.458908,0.437183,1.398186
177,1,2001,10,24,"""15:07""",10.08,"""Swipe Transaction""",2027553650310142703,"""Saint Cloud""","""FL""",34771.0,5541,,"""347""",2001-10-24 15:07:00,2001-10-24,15,"""177_1""","""FL""",710,45037.0,31792.0,3,"""Mastercard""",28405.0,"""No""",False,0,2.405142,-0.845612,-0.748377,0.663273,0.437183,1.482752
