In [1]:
import pandas as pd
import numpy as np

In [2]:
cons = pd.read_parquet("/uss/hdsi-prismdata/q2-ucsd-consDF.pqt")
cons = cons.drop(columns = ["credit_score"])
cons = cons.dropna()
acc = pd.read_parquet("/uss/hdsi-prismdata/q2-ucsd-acctDF.pqt")
txn = pd.read_parquet("/uss/hdsi-prismdata/q2-ucsd-trxnDF.pqt")
catmap = pd.read_csv("/uss/hdsi-prismdata/q2-ucsd-cat-map.csv")

In [3]:
income_cats = [2,3,5,7,8,9,49]

In [4]:
def build_monthly_cashflows(
    txn_df,
    window,
    date_col="posted_date",
    amt_col="amount",
    consumer_col="prism_consumer_id",
    cd_col="credit_or_debit",
    cat_col="category",
    fill_months=True,
    min_periods=None,
    consumer_level=False,
):
    if min_periods is None:
        min_periods = window

    df = txn_df.copy()
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    df = df.dropna(subset=[date_col, consumer_col])
    df = df.sort_values([consumer_col, date_col])

    df["month"] = df[date_col].dt.to_period("M").dt.to_timestamp()

    df["income_amt"] = np.where(
        (df[cd_col] == "CREDIT") & (df[cat_col].isin(income_cats)),
        df[amt_col].astype(float),
        0.0,
    )
    df["spend_amt"] = np.where(
        df[cd_col] == "DEBIT",
        df[amt_col].astype(float),
        0.0,
    )

    # ---- base monthly sums (internal only) ----
    monthly = (
        df.groupby([consumer_col, "month"], as_index=False)
          .agg(income=("income_amt", "sum"),
               spend=("spend_amt", "sum"))
          .sort_values([consumer_col, "month"])
    )

    # fill missing months
    if fill_months and not monthly.empty:
        def _fill(g):
            full = pd.date_range(g["month"].min(), g["month"].max(), freq="MS")
            g2 = (
                g.set_index("month")
                 .reindex(full, fill_value=0.0)
                 .rename_axis("month")
                 .reset_index()
            )
            g2[consumer_col] = g[consumer_col].iloc[0]
            return g2

        monthly = (
            monthly.groupby(consumer_col, group_keys=False)
                   .apply(_fill)
                   .sort_values([consumer_col, "month"])
                   .reset_index(drop=True)
        )

    w = int(window)

    # ---- WINDOW == 1 ----
    if w == 1:
        monthly = monthly.rename(columns={
            "income": "income_1m",
            "spend": "spend_1m",
        })
        monthly["net_flow_1m"] = monthly["income_1m"] - monthly["spend_1m"]

        if not consumer_level:
            return monthly

        net_flow_col = "net_flow_1m"

    # ---- WINDOW > 1 ----
    else:
        monthly[f"income_{w}m"] = (
            monthly.groupby(consumer_col)["income"]
                   .transform(lambda s: s.rolling(w, min_periods=min_periods).sum())
        )
        monthly[f"spend_{w}m"] = (
            monthly.groupby(consumer_col)["spend"]
                   .transform(lambda s: s.rolling(w, min_periods=min_periods).sum())
        )
        monthly[f"net_flow_{w}m"] = (
            monthly[f"income_{w}m"] - monthly[f"spend_{w}m"]
        )

        # ðŸš¨ drop 1-month internals
        monthly = monthly.drop(columns=["income", "spend"])

        if not consumer_level:
            return monthly

        net_flow_col = f"net_flow_{w}m"

    # ---- consumer-level aggregation ----
    cons = (
        monthly.groupby(consumer_col, as_index=False)
        .agg(
            avg_monthly_net_flow=(net_flow_col, "mean"),
            med_monthly_net_flow=(net_flow_col, "median"),
            sd_monthly_net_flow=(net_flow_col, "std"),
            min_monthly_net_flow=(net_flow_col, "min"),
            max_monthly_net_flow=(net_flow_col, "max"),
        )
    )

    cons = cons.rename(columns={
        "avg_monthly_net_flow": f"avg_{w}m_netflow",
        "med_monthly_net_flow": f"med_{w}m_netflow",
        "sd_monthly_net_flow": f"sd_{w}m_netflow",
        "min_monthly_net_flow": f"min_{w}m_netflow",
        "max_monthly_net_flow": f"max_{w}m_netflow",
    })

    return cons

In [5]:
# def aggregate_cashflows_to_consumer(monthly_df, window, net_flow = f"{window}_net_flow"): # window = '1month'
#     # monthly_df columns: prism_consumer_id, month, income, spend, net_flow
#     cons = (
#         monthly_df.groupby("prism_consumer_id", as_index=False)
#         .agg(
#             months_observed=("month", "nunique"),
#             total_income=("income", "sum"),
#             total_spend=("spend", "sum"),
#             total_net_flow=(net_flow, "sum"),
#             avg_monthly_income=("income", "mean"),
#             avg_monthly_spend=("spend", "mean"),
#             avg_monthly_net_flow=(net_flow, "mean"),
#             sd_monthly_income=("income", "std"),
#             sd_monthly_spend=("spend", "std"),
#             sd_monthly_net_flow=(net_flow, "std"),
#             min_monthly_net_flow=(net_flow, "min"),
#             max_monthly_net_flow=(net_flow, "max"),
#         )
#     )

#     # rename
#     cons.rename(columns = {avg_monthly_income: f'avg_{window}_income',
#                            avg_monthly_spend: f'avg_{window}_spend',
#                            avg_monthly_net_flow: f'avg_{window}_netflow',
#                            sd_monthly_income: f'sd_{window}_income',
#                            sd_monthly_spend: f'sd_{window}_spend',
#                            sd_monthly_net_flow: f'sd_{window}_netflow',
#                            min_monthly_net_flow: f'min_{window}_netflow',
#                            max_monthly_net_flow: f'max_{window}_netflow'
#                           }
#                )
    
#     # helpful ratios (safe divide)
#     cons["spend_to_income"] = cons["total_spend"] / cons["total_income"].replace(0, np.nan)
#     cons["net_to_income"] = cons["total_net_flow"] / cons["total_income"].replace(0, np.nan)
    
#     return cons


In [6]:
# build_monthly_cashflows(txn, window=6, consumer_level=True)

In [5]:
feats = ((build_monthly_cashflows(txn, window=1, consumer_level=False))
          .groupby(['prism_consumer_id'], as_index=False)
          .agg(
              months_observed=("month", "nunique"),
              total_income=("income_1m", "sum"),
              total_spend=("spend_1m", "sum"),
              total_net_flow=("net_flow_1m", "sum"),
              )
         )

windows = [1, 3, 4, 6, 9, 12]
for w in windows:
    feats = feats.merge(build_monthly_cashflows(txn, window=w, consumer_level=True), 
                        left_on='prism_consumer_id', 
                        right_on='prism_consumer_id'
                       )

feats

  .apply(_fill)
  .apply(_fill)
  .apply(_fill)
  .apply(_fill)
  .apply(_fill)
  .apply(_fill)
  .apply(_fill)


Unnamed: 0,prism_consumer_id,months_observed,total_income,total_spend,total_net_flow,avg_1m_netflow,med_1m_netflow,sd_1m_netflow,min_1m_netflow,max_1m_netflow,...,avg_9m_netflow,med_9m_netflow,sd_9m_netflow,min_9m_netflow,max_9m_netflow,avg_12m_netflow,med_12m_netflow,sd_12m_netflow,min_12m_netflow,max_12m_netflow
0,0,7,9320.56,14908.41,-5587.85,-798.264286,-212.560,1363.160352,-3466.67,513.28,...,,,,,,,,,,
1,1,7,13411.59,23098.37,-9686.78,-1383.825714,-1039.490,1234.856462,-3232.42,79.83,...,,,,,,,,,,
2,10,7,15420.74,21766.60,-6345.86,-906.551429,-978.850,601.382838,-1915.14,-197.84,...,,,,,,,,,,
3,100,6,24411.78,39742.61,-15330.83,-2555.138333,-2087.665,2408.512923,-5515.09,624.68,...,,,,,,,,,,
4,1000,7,48378.60,77914.99,-29536.39,-4219.484286,-618.080,7192.050945,-18367.94,1348.32,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14487,9995,4,11226.84,17047.39,-5820.55,-1455.137500,-1838.745,965.382706,-2118.14,-24.92,...,,,,,,,,,,
14488,9996,4,0.03,1190.89,-1190.86,-297.715000,-293.790,141.146612,-457.36,-145.92,...,,,,,,,,,,
14489,9997,4,16672.86,16702.92,-30.06,-7.515000,137.395,1133.167566,-1516.15,1211.30,...,,,,,,,,,,
14490,9998,3,7203.23,15399.47,-8196.24,-2732.080000,-2817.060,530.568934,-3215.03,-2164.15,...,,,,,,,,,,


In [8]:
# def add_net_cashflow_change_features(
#     monthly: pd.DataFrame,
#     windows=(3, 4, 6, 9, 12),
#     consumer_col="prism_consumer_id",
#     month_col="month",
#     net_flow_col=None,   # if None, auto-detect net_flow_{w}m or net_flow_1m
# ):
#     """
#     Adds net cash-flow change features to the MONTHLY panel produced by build_monthly_cashflows.

#     For each consumer (consumer_col), using base series `net_flow_col`:
#       - {base}_diff_1m                         (month-over-month change)
#       - {base}_roll_mean_{W}m                  (rolling mean of base over W months)
#       - {base}_roll_mean_diff_{W}m             (current W-mean minus previous W-mean; diff(W))
#       - {base}_trend_slope_{W}m                (OLS slope over last W months)

#     Notes:
#     - This expects `monthly` to have consumer_col + month_col and a net flow column from build_monthly_cashflows.
#     - If net_flow_col is None, it will pick:
#         1) a column like "net_flow_3m" / "net_flow_6m" etc (largest W it finds), else
#         2) "net_flow_1m"
#     """

#     m = monthly.copy()

#     # ---- auto-detect net flow col if not provided ----
#     if net_flow_col is None:
#         candidates = [c for c in m.columns if c.startswith("net_flow_") and c.endswith("m")]
#         # candidates includes net_flow_1m, net_flow_3m, ...
#         if candidates:
#             # prefer the largest window if multiple exist
#             def _win(c):
#                 # net_flow_{w}m -> extract w
#                 try:
#                     return int(c.split("_")[2].replace("m", ""))
#                 except Exception:
#                     return -1
#             net_flow_col = max(candidates, key=_win)
#         elif "net_flow_1m" in m.columns:
#             net_flow_col = "net_flow_1m"
#         else:
#             raise ValueError(
#                 "Could not find a net flow column. Provide net_flow_col explicitly "
#                 "or ensure monthly has columns like 'net_flow_1m' or 'net_flow_{w}m'."
#             )

#     if net_flow_col not in m.columns:
#         raise ValueError(f"net_flow_col='{net_flow_col}' not found in monthly dataframe.")

#     # ensure sorting for rolling ops
#     m = m.sort_values([consumer_col, month_col]).reset_index(drop=True)

#     base = net_flow_col  # e.g., net_flow_6m or net_flow_1m

#     # ---- 1-month (one-step) diff of the base series, per consumer ----
#     m[f"{base}_diff_1m"] = (
#         m.groupby(consumer_col)[base]
#          .diff(1)
#     )

#     # ---- rolling mean + rolling mean diff, per consumer ----
#     for W in windows:
#         W = int(W)

#         m[f"{base}_roll_mean_{W}m"] = (
#             m.groupby(consumer_col)[base]
#              .transform(lambda s: s.rolling(W, min_periods=W).mean())
#         )

#         # change in rolling mean compared to W months ago (same as your prior diff(W), per consumer)
#         m[f"{base}_roll_mean_diff_{W}m"] = (
#             m.groupby(consumer_col)[f"{base}_roll_mean_{W}m"]
#              .diff(W)
#         )

#     # ---- rolling slope helper ----
#     def rolling_slope(arr: np.ndarray) -> float:
#         x = np.arange(len(arr), dtype=float)
#         x_mean = x.mean()
#         y = arr.astype(float)
#         y_mean = y.mean()
#         denom = ((x - x_mean) ** 2).sum()
#         if denom == 0:
#             return np.nan
#         return float(((x - x_mean) * (y - y_mean)).sum() / denom)

#     # ---- trend slope over last W months, per consumer ----
#     for W in windows:
#         W = int(W)
#         m[f"{base}_trend_slope_{W}m"] = (
#             m.groupby(consumer_col)[base]
#              .transform(lambda s: s.rolling(W, min_periods=W)
#                                    .apply(lambda x: rolling_slope(np.asarray(x)), raw=False))
#         )

#     return m

In [9]:
# monthly = build_monthly_cashflows(txn, window=1, consumer_level=False)
# add_net_cashflow_change_features(monthly, windows=windows)


# unfinished

In [15]:
def add_category_to_income_ratios_from_monthly(
    monthly_df: pd.DataFrame,
    txn_df: pd.DataFrame,
    # --- monthly df columns ---
    month_col: str = "month",
    consumer_id_col: str = "prism_consumer_id",
    monthly_income_col: str = "income_1m",      # comes from your existing df
    # --- txn df columns for category spend ---
    txn_date_col: str = "txn_date",
    txn_amt_col: str = "amount",                # always positive
    direction_col: str = "credit_debit",        # "CREDIT"/"DEBIT"
    category_col: str = "category_id",
    category_ids=None,                          # None | int | iterable[int]
    # --- behavior ---
    windows=(1, 3, 4, 6, 9, 12),
    fill_missing_months: bool = False,          # assume monthly_df already filled; set True if not
    label: str | None = None,                   # optional override for output prefix
) -> pd.DataFrame:
    """
    Speedy: uses precomputed monthly income (monthly_df) + computes category spend from txn_df,
    then adds rolling category_spend / income ratios.

    monthly_df must have: [month_col, consumer_id_col, monthly_income_col]
    """

    # ----- prep monthly base -----
    base = monthly_df[[month_col, consumer_id_col, monthly_income_col]].copy()
    base[month_col] = pd.to_datetime(base[month_col])
    base = base.sort_values([consumer_id_col, month_col]).reset_index(drop=True)
    base = base.rename(columns={monthly_income_col: "income"})

    # Optionally fill missing months (only if your base isn't already month-complete)
    if fill_missing_months and not base.empty:
        parts = []
        for cid, g in base.groupby(consumer_id_col, sort=False):
            full_months = pd.date_range(g[month_col].min(), g[month_col].max(), freq="MS")
            gg = (
                g.set_index(month_col)
                 .reindex(full_months)
                 .rename_axis(month_col)
                 .reset_index()
            )
            gg[consumer_id_col] = cid
            gg["income"] = gg["income"].fillna(0.0)
            parts.append(gg)
        base = pd.concat(parts, ignore_index=True).sort_values([consumer_id_col, month_col]).reset_index(drop=True)

    # ----- normalize category_ids + label -----
    if category_ids is None:
        cat_label = "all_debits" if label is None else label
        use_isin = False
        cat_set = None
    else:
        if isinstance(category_ids, (int, np.integer)):
            cat_set = {int(category_ids)}
        else:
            cat_set = set(int(x) for x in category_ids)
        cat_label = (f"cat_{'_'.join(map(str, sorted(cat_set)))}" if label is None else label)
        use_isin = True

    # ----- compute monthly category spend from txns (only debits) -----
    t = txn_df[[txn_date_col, txn_amt_col, direction_col, category_col, consumer_id_col]].copy()
    t[txn_date_col] = pd.to_datetime(t[txn_date_col], errors="coerce")
    t = t.dropna(subset=[txn_date_col])
    t["month"] = t[txn_date_col].dt.to_period("M").dt.to_timestamp()

    dir_upper = t[direction_col].astype(str).str.upper()
    debit_mask = dir_upper.eq("DEBIT")

    if category_ids is None:
        cat_mask = debit_mask
    else:
        cat_mask = debit_mask & t[category_col].isin(cat_set)

    cat_m = (
        t.loc[cat_mask]
         .groupby([consumer_id_col, "month"], as_index=False)[txn_amt_col]
         .sum()
         .rename(columns={"month": month_col, txn_amt_col: "cat_spend"})
    )

    # ----- merge category spend onto base -----
    out = base.merge(cat_m, on=[consumer_id_col, month_col], how="left")
    out["cat_spend"] = out["cat_spend"].fillna(0.0)

    # ----- rolling sums + ratios (per consumer) -----
    gb = out.groupby(consumer_id_col, group_keys=False)

    for W in windows:
        out[f"income_{W}m"] = gb["income"].transform(lambda s: s.rolling(W, min_periods=W).sum())
        out[f"{cat_label}_spend_{W}m"] = gb["cat_spend"].transform(lambda s: s.rolling(W, min_periods=W).sum())

        denom = out[f"income_{W}m"].replace(0, np.nan)
        out[f"{cat_label}_to_income_ratio_{W}m"] = out[f"{cat_label}_spend_{W}m"] / denom

    return out


In [18]:
monthly = build_monthly_cashflows(txn, window=1)

add_category_to_income_ratios(
    monthly,
    txn,
    category_ids= 20,                       # int | list[int] | set[int] | tuple[int]; None => all categories
    consumer_level = True,             # True => one row per consumer-month (recommended)
)

  .apply(_fill)


Unnamed: 0,month,prism_consumer_id,income_1m,spend_1m,net_flow_1m
0,2021-03-01,0,1000.22,1999.35,-999.13
1,2021-04-01,0,2167.37,2379.93,-212.56
2,2021-05-01,0,966.68,765.50,201.18
3,2021-06-01,0,185.70,3652.37,-3466.67
4,2021-07-01,0,2144.69,2276.21,-131.52
...,...,...,...,...,...
89894,2023-10-01,9998,2307.29,5124.35,-2817.06
89895,2023-05-01,9999,5614.26,4488.76,1125.50
89896,2023-06-01,9999,6714.26,12813.20,-6098.94
89897,2023-07-01,9999,4225.36,11658.85,-7433.49


In [12]:
catmap

Unnamed: 0,category_id,category
0,0,SELF_TRANSFER
1,1,EXTERNAL_TRANSFER
2,2,DEPOSIT
3,3,PAYCHECK
4,4,MISCELLANEOUS
5,5,PAYCHECK_PLACEHOLDER
6,6,REFUND
7,7,INVESTMENT_INCOME
8,8,OTHER_BENEFITS
9,9,UNEMPLOYMENT_BENEFITS


In [6]:
feats = cons.merge(feats, left_on = 'prism_consumer_id', right_on = 'prism_consumer_id')
y = feats['DQ_TARGET']
feats

Unnamed: 0,prism_consumer_id,evaluation_date,DQ_TARGET,months_observed,total_income,total_spend,total_net_flow,avg_1m_netflow,med_1m_netflow,sd_1m_netflow,...,avg_9m_netflow,med_9m_netflow,sd_9m_netflow,min_9m_netflow,max_9m_netflow,avg_12m_netflow,med_12m_netflow,sd_12m_netflow,min_12m_netflow,max_12m_netflow
0,0,2021-09-01,0.0,7,9320.56,14908.41,-5587.85,-798.264286,-212.560,1363.160352,...,,,,,,,,,,
1,1,2021-07-01,0.0,7,13411.59,23098.37,-9686.78,-1383.825714,-1039.490,1234.856462,...,,,,,,,,,,
2,2,2021-05-01,0.0,7,1100.00,22334.58,-21234.58,-3033.511429,-2565.230,2123.624465,...,,,,,,,,,,
3,3,2021-03-01,0.0,7,10777.81,19846.01,-9068.20,-1295.457143,-1608.440,1087.179614,...,,,,,,,,,,
4,4,2021-10-01,0.0,12,12074.44,17509.71,-5435.27,-452.939167,-30.215,930.433945,...,-1660.4375,-1630.805,1199.379404,-3141.46,-238.68,-5435.27,-5435.27,,-5435.27,-5435.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11596,13995,2022-01-22,0.0,9,2.23,850.00,-847.77,-94.196667,-15.640,150.518258,...,-847.7700,-847.770,,-847.77,-847.77,,,,,
11597,13996,2022-02-01,0.0,9,15183.27,53982.27,-38799.00,-4311.000000,-4841.210,1972.628444,...,-38799.0000,-38799.000,,-38799.00,-38799.00,,,,,
11598,13997,2021-12-24,0.0,7,5246.94,7425.31,-2178.37,-311.195714,0.010,903.133362,...,,,,,,,,,,
11599,13998,2022-01-30,0.0,9,33474.07,45669.84,-12195.77,-1355.085556,-140.090,3708.765924,...,-12195.7700,-12195.770,,-12195.77,-12195.77,,,,,


In [7]:
# if logistic --> impute NaNs
sd_cols = [c for c in feats.columns if c.startswith("sd_")]
feats[sd_cols] = feats[sd_cols].fillna(0.0)

# print(sum(feats['min_3m_netflow'].isna()))
for w in windows:
    ratio_cols = [c for c in feats.columns if f'_{w}m_' in c]
    for c in ratio_cols:
        feats[c] = feats[c].fillna(feats[c].mean())
# print(sum(feats['min_3m_netflow'].isna()))
feats

Unnamed: 0,prism_consumer_id,evaluation_date,DQ_TARGET,months_observed,total_income,total_spend,total_net_flow,avg_1m_netflow,med_1m_netflow,sd_1m_netflow,...,avg_9m_netflow,med_9m_netflow,sd_9m_netflow,min_9m_netflow,max_9m_netflow,avg_12m_netflow,med_12m_netflow,sd_12m_netflow,min_12m_netflow,max_12m_netflow
0,0,2021-09-01,0.0,7,9320.56,14908.41,-5587.85,-798.264286,-212.560,1363.160352,...,-32313.398635,-32310.40609,0.000000,-32418.865107,-32222.383159,-21365.851796,-19803.641429,0.0,-28988.351429,-17608.252857
1,1,2021-07-01,0.0,7,13411.59,23098.37,-9686.78,-1383.825714,-1039.490,1234.856462,...,-32313.398635,-32310.40609,0.000000,-32418.865107,-32222.383159,-21365.851796,-19803.641429,0.0,-28988.351429,-17608.252857
2,2,2021-05-01,0.0,7,1100.00,22334.58,-21234.58,-3033.511429,-2565.230,2123.624465,...,-32313.398635,-32310.40609,0.000000,-32418.865107,-32222.383159,-21365.851796,-19803.641429,0.0,-28988.351429,-17608.252857
3,3,2021-03-01,0.0,7,10777.81,19846.01,-9068.20,-1295.457143,-1608.440,1087.179614,...,-32313.398635,-32310.40609,0.000000,-32418.865107,-32222.383159,-21365.851796,-19803.641429,0.0,-28988.351429,-17608.252857
4,4,2021-10-01,0.0,12,12074.44,17509.71,-5435.27,-452.939167,-30.215,930.433945,...,-1660.437500,-1630.80500,1199.379404,-3141.460000,-238.680000,-5435.270000,-5435.270000,0.0,-5435.270000,-5435.270000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11596,13995,2022-01-22,0.0,9,2.23,850.00,-847.77,-94.196667,-15.640,150.518258,...,-847.770000,-847.77000,0.000000,-847.770000,-847.770000,-21365.851796,-19803.641429,0.0,-28988.351429,-17608.252857
11597,13996,2022-02-01,0.0,9,15183.27,53982.27,-38799.00,-4311.000000,-4841.210,1972.628444,...,-38799.000000,-38799.00000,0.000000,-38799.000000,-38799.000000,-21365.851796,-19803.641429,0.0,-28988.351429,-17608.252857
11598,13997,2021-12-24,0.0,7,5246.94,7425.31,-2178.37,-311.195714,0.010,903.133362,...,-32313.398635,-32310.40609,0.000000,-32418.865107,-32222.383159,-21365.851796,-19803.641429,0.0,-28988.351429,-17608.252857
11599,13998,2022-01-30,0.0,9,33474.07,45669.84,-12195.77,-1355.085556,-140.090,3708.765924,...,-12195.770000,-12195.77000,0.000000,-12195.770000,-12195.770000,-21365.851796,-19803.641429,0.0,-28988.351429,-17608.252857


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

auc_df = pd.DataFrame()

feature_cols = feats.drop(
    columns=["prism_consumer_id", "DQ_TARGET", "evaluation_date"]
)

for col in feature_cols.columns:
    X = feature_cols[[col]]
    y = y
    
    model = LogisticRegression(class_weight='balanced')
    model.fit(X, y)
    
    preds = model.predict_proba(X)[:, 1]
    
    score = roc_auc_score(y, preds)

    auc_df = pd.concat([auc_df, pd.DataFrame([{"feature": col, "auc_roc": score}])],ignore_index=True)
    # print(col)
    # print(score)
auc_df

Unnamed: 0,feature,auc_roc
0,months_observed,0.658742
1,total_income,0.591362
2,total_spend,0.579206
3,total_net_flow,0.520448
4,avg_1m_netflow,0.520235
5,med_1m_netflow,0.528269
6,sd_1m_netflow,0.531666
7,min_1m_netflow,0.527243
8,max_1m_netflow,0.592466
9,avg_3m_netflow,0.528164
