In [1]:
import pandas as pd
import numpy as np

consDF = pd.read_parquet("/uss/hdsi-prismdata/q2-ucsd-consDF.pqt")
consDF = consDF.drop(columns = ["credit_score"])
testDF = consDF[consDF['DQ_TARGET'].isna()]
consDF = consDF.dropna()
acctDF = pd.read_parquet("/uss/hdsi-prismdata/q2-ucsd-acctDF.pqt")
acctDF = acctDF[acctDF["prism_consumer_id"].isin(consDF['prism_consumer_id'])]
trxnDF = pd.read_parquet("/uss/hdsi-prismdata/q2-ucsd-trxnDF.pqt")
trxnDF = trxnDF[trxnDF["prism_consumer_id"].isin(consDF['prism_consumer_id'])]
catmap = pd.read_csv("/uss/hdsi-prismdata/q2-ucsd-cat-map.csv")

In [2]:
income_cats = [2,3,5,7,8,9,49]
windows = [1,3,6,9]
categories = catmap['category_id'].unique()

# def consumer_agg(
#     df: pd.DataFrame,
#     group_col: str,
#     feat_col: str,
#     window: int,
#     label: str,
#     stats=("mean", "median", "std", "min", "max"),
#     prefix_map=None,
# ) -> pd.DataFrame:
#     """
#     Generic consumer-level aggregation for a single feature column.

#     Example output columns:
#       avg_6m_netflow, med_6m_netflow, sd_6m_netflow, ...

#     Parameters
#     ----------
#     df : DataFrame
#         Must contain group_col and feat_col
#     group_col : str
#         Consumer id
#     feat_col : str
#         Column to aggregate (e.g. 'net_flow_6m')
#     window : int
#         Rolling window length (used for naming)
#     stats : tuple[str]
#         Aggregations to compute
#     prefix_map : dict[str,str]
#         Optional mapping: {"mean":"avg","median":"med","std":"sd","min":"min","max":"max"}
#     """
#     if prefix_map is None:
#         prefix_map = {"mean":"avg","median":"med","std":"sd","min":"min","max":"max"}

#     agg_spec = {f"{prefix_map[s]}_{window}m_{label}": (feat_col, s) for s in stats}

#     return df.groupby(group_col, as_index=False).agg(**agg_spec)

def consumer_agg(
    df: pd.DataFrame,
    group_col: str,
    feat_cols: list[str],
    stats=("mean", "median", "std", "min", "max"),
    prefix_map=None,
    # optional "single-feature pretty naming"
    window = windows,
    label: str | None = None,
) -> pd.DataFrame:
    """
    Consumer-level aggregation.

    Two modes:

    A) Multi-feature mode (default):
       feat_cols = ["x_3m", "y_3m"]  ->
         avg_x_3m, med_x_3m, sd_x_3m, min_x_3m, max_x_3m,
         avg_y_3m, ...

    B) Single-feature pretty naming:
       feat_cols = ["net_flow_6m"], window=6, label="netflow" ->
         avg_6m_netflow, med_6m_netflow, sd_6m_netflow, min_6m_netflow, max_6m_netflow
    """
    if prefix_map is None:
        prefix_map = {"mean": "avg", "median": "med", "std": "sd", "min": "min", "max": "max"}

    feat_cols = list(feat_cols)

    missing = [c for c in feat_cols if c not in df.columns]
    if missing:
        raise KeyError(f"consumer_agg: missing columns in df: {missing}")

    agg_spec = {}

    single_pretty = (window is not None) and (label is not None) and (len(feat_cols) == 1)
    if single_pretty:
        c = feat_cols[0]
        for s in stats:
            agg_spec[f"{prefix_map.get(s, s)}_{int(window)}m_{label}"] = (c, s)
    else:
        for c in feat_cols:
            for s in stats:
                agg_spec[f"{prefix_map.get(s, s)}_{c}"] = (c, s)

    return df.groupby(group_col, as_index=False).agg(**agg_spec)


def build_monthly_cashflows(
    txn_df: pd.DataFrame,
    window: int,
    income_cats,
    date_col="posted_date",
    amt_col="amount",
    consumer_col="prism_consumer_id",
    cd_col="credit_or_debit",
    cat_col="category",
    fill_months=True,
    min_periods=None,
    return_consumer_level=False,
):
    """
    Builds monthly income/spend and rolling net flow features.

    Fixes:
      - avoids groupby.apply DeprecationWarning via include_groups=False
      - avoids risky reindex(fill_value=0.0) by only filling numeric columns
      - normalizes CREDIT/DEBIT casing
    """
    w = int(window)
    if min_periods is None:
        min_periods = w

    df = txn_df.copy()
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    df = df.dropna(subset=[date_col, consumer_col])
    df = df.sort_values([consumer_col, date_col])

    df["month"] = df[date_col].dt.to_period("M").dt.to_timestamp()

    # normalize CREDIT/DEBIT
    dir_upper = df[cd_col].astype(str).str.upper()

    df[amt_col] = pd.to_numeric(df[amt_col], errors="coerce")
    df = df.dropna(subset=[amt_col])

    df["income_amt"] = np.where(
        (dir_upper == "CREDIT") & (df[cat_col].isin(income_cats)),
        df[amt_col].astype(float),
        0.0,
    )
    df["spend_amt"] = np.where(
        (dir_upper == "DEBIT"),
        df[amt_col].astype(float),
        0.0,
    )

    monthly = (
        df.groupby([consumer_col, "month"], as_index=False)
          .agg(income=("income_amt", "sum"),
               spend=("spend_amt", "sum"))
          .sort_values([consumer_col, "month"])
          .reset_index(drop=True)
    )

    # --- fill missing months within each consumer (safe fill) ---
    if fill_months and not monthly.empty:
        fill_cols = ["income", "spend"]
    
        def _fill(g: pd.DataFrame) -> pd.DataFrame:
            cid = g.name  # <-- group key (consumer id) when include_groups=False
            full = pd.date_range(g["month"].min(), g["month"].max(), freq="MS")
    
            g2 = (
                g.set_index("month")
                 .reindex(full)
                 .rename_axis("month")
                 .reset_index()
            )
    
            g2[consumer_col] = cid
    
            for c in fill_cols:
                if c not in g2.columns:
                    g2[c] = 0.0
            g2[fill_cols] = g2[fill_cols].fillna(0.0)
    
            return g2
    
        monthly = (
            monthly
            .groupby(consumer_col, group_keys=False)
            .apply(_fill, include_groups=False)
            .sort_values([consumer_col, "month"])
            .reset_index(drop=True)
        )


    # --- rolling features ---
    g = monthly.groupby(consumer_col, group_keys=False)

    if w == 1:
        monthly["income_1m"] = monthly["income"]
        monthly["spend_1m"] = monthly["spend"]
        monthly["net_flow_1m"] = monthly["income_1m"] - monthly["spend_1m"]
        feat_col = "net_flow_1m"
    else:
        monthly[f"income_{w}m"] = g["income"].transform(lambda s: s.rolling(w, min_periods=min_periods).sum())
        monthly[f"spend_{w}m"]  = g["spend"].transform(lambda s: s.rolling(w, min_periods=min_periods).sum())
        monthly[f"net_flow_{w}m"] = monthly[f"income_{w}m"] - monthly[f"spend_{w}m"]
        feat_col = f"net_flow_{w}m"

    if not return_consumer_level:
        return monthly

    return consumer_agg(
        df=monthly,
        group_col=consumer_col,
        feat_cols=[feat_col],
        window=w,
        label="netflow",
    )

In [3]:
# monthly 1m table
m1 = build_monthly_cashflows(
    trxnDF,
    window=1,
    income_cats=income_cats,
    return_consumer_level=False
)

# base consumer totals
netflow_feats = (
    m1.groupby("prism_consumer_id", as_index=False)
      .agg(
          months_observed=("month", "nunique"),
          total_income=("income_1m", "sum"),
          total_spend=("spend_1m", "sum"),
          total_net_flow=("net_flow_1m", "sum"),
      )
)

# add windowed consumer-level summary stats
for w in windows:
    cons_w = build_monthly_cashflows(
        trxnDF,
        window=w,
        income_cats=income_cats,
        return_consumer_level=True
    )
    netflow_feats = netflow_feats.merge(cons_w, on="prism_consumer_id", how="left")

netflow_feats

Unnamed: 0,prism_consumer_id,months_observed,total_income,total_spend,total_net_flow,avg_1m_netflow,med_1m_netflow,sd_1m_netflow,min_1m_netflow,max_1m_netflow,...,avg_6m_netflow,med_6m_netflow,sd_6m_netflow,min_6m_netflow,max_6m_netflow,avg_9m_netflow,med_9m_netflow,sd_9m_netflow,min_9m_netflow,max_9m_netflow
0,0,7,9320.56,14908.41,-5587.85,-798.264286,-212.560,1363.160352,-3466.67,513.28,...,-5344.925,-5344.925,1069.435367,-6101.13,-4588.72,,,,,
1,1,7,13411.59,23098.37,-9686.78,-1383.825714,-1039.490,1234.856462,-3232.42,79.83,...,-8209.595,-8209.595,1732.715670,-9434.81,-6984.38,,,,,
2,10,7,15420.74,21766.60,-6345.86,-906.551429,-978.850,601.382838,-1915.14,-197.84,...,-5633.285,-5633.285,535.117199,-6011.67,-5254.90,,,,,
3,100,6,24411.78,39742.61,-15330.83,-2555.138333,-2087.665,2408.512923,-5515.09,624.68,...,-15330.830,-15330.830,,-15330.83,-15330.83,,,,,
4,1000,7,48378.60,77914.99,-29536.39,-4219.484286,-618.080,7192.050945,-18367.94,1348.32,...,-28927.255,-28927.255,12.650140,-28936.20,-28918.31,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11596,995,7,36558.09,31466.47,5091.62,727.374286,1065.550,2194.693702,-3745.92,3036.70,...,4245.425,4245.425,310.214816,4026.07,4464.78,,,,,
11597,996,7,4107.21,196249.61,-192142.40,-27448.914286,-26736.070,15990.433902,-56765.29,-7395.93,...,-171790.530,-171790.530,18322.466061,-184746.47,-158834.59,,,,,
11598,997,7,33797.46,81076.60,-47279.14,-6754.162857,-6003.820,2024.975140,-10818.18,-4970.93,...,-41756.060,-41756.060,167.867150,-41874.76,-41637.36,,,,,
11599,998,7,38813.02,66207.01,-27393.99,-3913.427143,-2019.060,5029.301050,-11758.49,560.53,...,-22484.075,-22484.075,7736.377511,-27954.52,-17013.63,,,,,


In [4]:
def build_monthly_category_to_income(
    txn_df: pd.DataFrame,
    income_cats,
    category_ids = categories,
    date_col: str = "posted_date",
    consumer_col: str = "prism_consumer_id",
    amt_col: str = "amount",
    direction_col: str = "credit_or_debit",
    category_col: str = "category",
    window=windows,
    fill_missing_months: bool = True,
    min_periods: int | None = None,
    consumer_level: bool = False,
    agg_stats=("mean", "median", "std", "min", "max"),
) -> pd.DataFrame:

    # --- normalize category_ids ---
    if isinstance(category_ids, (int, np.integer)):
        category_ids = [int(category_ids)]
    else:
        category_ids = [int(x) for x in category_ids]
    category_ids = sorted(set(category_ids))

    # --- prep txns ---
    t = txn_df[[date_col, consumer_col, amt_col, direction_col, category_col]].copy()
    t[date_col] = pd.to_datetime(t[date_col], errors="coerce")
    t = t.dropna(subset=[date_col, consumer_col])

    t[amt_col] = pd.to_numeric(t[amt_col], errors="coerce")
    t = t.dropna(subset=[amt_col])

    t["month"] = t[date_col].dt.to_period("M").dt.to_timestamp()

    dir_upper = t[direction_col].astype(str).str.upper()
    is_credit = dir_upper.eq("CREDIT")
    is_debit  = dir_upper.eq("DEBIT")

    income_cats_set = set(int(x) for x in income_cats)

    # --- monthly income ---
    income_m = (
        t.loc[is_credit & t[category_col].isin(income_cats_set)]
         .groupby([consumer_col, "month"], as_index=False)[amt_col]
         .sum()
         .rename(columns={amt_col: "income_1m"})
    )

    # --- monthly spend for requested categories (all at once) ---
    spend_1m_cols = [f"cat{cid}_spend_1m" for cid in category_ids]

    spend_m = (
        t.loc[is_debit & t[category_col].isin(category_ids)]
         .groupby([consumer_col, "month", category_col])[amt_col]
         .sum()
         .unstack(category_col)
    )

    if not spend_m.empty:
        spend_m = spend_m.reindex(columns=category_ids)  # stable order + include missing cats
        spend_m.columns = spend_1m_cols
        spend_m = spend_m.reset_index()
    else:
        spend_m = pd.DataFrame(columns=[consumer_col, "month"] + spend_1m_cols)

    # --- base (one merge) ---
    base = income_m.merge(spend_m, on=[consumer_col, "month"], how="outer")

    if base.empty:
        cols = [consumer_col, "month", "income_1m"] + spend_1m_cols
        return pd.DataFrame(columns=cols)

    base = base.sort_values([consumer_col, "month"]).reset_index(drop=True)

    # fill missing monthly values
    base["income_1m"] = base["income_1m"].fillna(0.0)
    for c in spend_1m_cols:
        if c not in base.columns:
            base[c] = 0.0
    base[spend_1m_cols] = base[spend_1m_cols].fillna(0.0)

    # --- fill missing months within each consumer (no .name / no include_groups needed) ---
    if fill_missing_months and not base.empty:
        fill_cols = ["income_1m"] + spend_1m_cols

        spans = (
            base.groupby(consumer_col, as_index=False)["month"]
                .agg(min_month="min", max_month="max")
        )
        spans["month"] = spans.apply(
            lambda r: pd.date_range(r["min_month"], r["max_month"], freq="MS"),
            axis=1
        )
        grid = spans[[consumer_col, "month"]].explode("month", ignore_index=True)

        base = grid.merge(base, on=[consumer_col, "month"], how="left")

        for c in fill_cols:
            if c not in base.columns:
                base[c] = 0.0
        base[fill_cols] = base[fill_cols].fillna(0.0)

        base = base.sort_values([consumer_col, "month"]).reset_index(drop=True)

    # --- rolling features ---
    gb = base.groupby(consumer_col, group_keys=False)
    frames = []

    # (A) If window includes 1: only create ratio_1m (DON'T re-add income_1m/spend_1m)
    if 1 in set(int(w) for w in window):
        denom_1m = base["income_1m"].replace(0, np.nan)
        ratio_1m = base[spend_1m_cols].div(denom_1m, axis=0).rename(
            columns=lambda c: c.replace("_spend_1m", "_to_income_ratio_1m")
        )
        frames.append(ratio_1m)

    # (B) W > 1: create income_Wm, spend_Wm, ratio_Wm
    for W in [int(w) for w in window if int(w) != 1]:
        mp = W if min_periods is None else int(min_periods)

        income_roll = gb["income_1m"].transform(lambda s: s.rolling(W, min_periods=mp).sum())
        denom = income_roll.replace(0, np.nan)

        spend_roll = gb[spend_1m_cols].transform(lambda df: df.rolling(W, min_periods=mp).sum())
        spend_roll = spend_roll.rename(columns=lambda c: c.replace("_spend_1m", f"_spend_{W}m"))

        ratio = spend_roll.div(denom, axis=0).rename(
            columns=lambda c: c.replace("_spend_", "_to_income_ratio_")
        )

        frames.append(pd.concat([income_roll.rename(f"income_{W}m"), spend_roll, ratio], axis=1))

    base = pd.concat([base] + frames, axis=1).copy()

    # --- return monthly or consumer-level ---
    if not consumer_level:
        return base

    agg_cols = []
    for cat_id in category_ids:
        label = f"cat{cat_id}"
        for W in [int(w) for w in window]:
            agg_cols.extend([
                f"{label}_spend_{W}m" if W != 1 else f"{label}_spend_1m",
                f"{label}_to_income_ratio_{W}m",
            ])

    return consumer_agg(
        df=base,
        group_col=consumer_col,
        feat_cols=agg_cols,
        stats=agg_stats,
    )


In [5]:
# monthly = build_monthly_category_to_income(
#     txn_df=trxnDF,
#     category_ids=[18, 20],
#     income_cats=income_cats,
#     window=(3, 6),
#     consumer_level=False,
# )

cat_ratio_feats = build_monthly_category_to_income(
    txn_df=trxnDF,
    income_cats=income_cats,
    # category_ids=[18, 20],
    # window=[1,3],
    consumer_level=True,
)

  return df.groupby(group_col, as_index=False).agg(**agg_spec)


In [6]:
# monthly

In [7]:
cat_ratio_feats

Unnamed: 0,prism_consumer_id,avg_cat0_spend_1m,med_cat0_spend_1m,sd_cat0_spend_1m,min_cat0_spend_1m,max_cat0_spend_1m,avg_cat0_to_income_ratio_1m,med_cat0_to_income_ratio_1m,sd_cat0_to_income_ratio_1m,min_cat0_to_income_ratio_1m,...,avg_cat49_spend_9m,med_cat49_spend_9m,sd_cat49_spend_9m,min_cat49_spend_9m,max_cat49_spend_9m,avg_cat49_to_income_ratio_9m,med_cat49_to_income_ratio_9m,sd_cat49_to_income_ratio_9m,min_cat49_to_income_ratio_9m,max_cat49_to_income_ratio_9m
0,0,67.338571,43.64,61.742993,10.00,176.44,0.097667,0.045144,0.163582,0.009998,...,,,,,,,,,,
1,1,1300.428571,1065.00,696.831605,569.00,2619.00,0.753606,0.584402,0.506534,0.301338,...,,,,,,,,,,
2,10,271.507143,230.00,273.997385,0.00,600.55,0.141775,0.068539,0.160186,0.000000,...,,,,,,,,,,
3,100,1992.113333,1860.16,969.847252,1101.88,3160.16,0.527689,0.442395,0.341886,0.251854,...,,,,,,,,,,
4,1000,7054.987143,4546.66,7569.116043,1000.00,22899.63,1.046170,0.702107,0.784976,0.265232,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11511,995,0.148571,0.00,0.393083,0.00,1.04,0.000018,0.000000,0.000046,0.000000,...,,,,,,,,,,
11512,996,11337.808571,11500.00,8273.802795,0.00,23971.42,1.625856,1.625856,2.299308,0.000000,...,,,,,,,,,,
11513,997,4655.654286,4843.70,1034.651032,2454.07,5559.99,2.388590,1.027782,3.816735,0.584379,...,,,,,,,,,,
11514,998,2689.627143,900.00,3106.537795,0.00,7657.39,0.650571,0.125355,0.980806,0.000000,...,,,,,,,,,,


In [8]:
mean_impute = netflow_feats.merge(cat_ratio_feats, on = 'prism_consumer_id')
# mean impute

# sd_cols = [c for c in mean_impute.columns if c.startswith("sd_")]
# mean_impute[sd_cols] = mean_impute[sd_cols].fillna(0.0)

# # print(sum(feats['min_3m_netflow'].isna()))
# for w in windows:
#     # ratio_cols = [c for c in mean_impute.columns if f'_{w}m_' in c]
#     for c in mean_impute.columns:
#         mean_impute[c] = mean_impute[c].fillna(mean_impute[c].mean())
# # print(sum(feats['min_3m_netflow'].isna()))

for c in mean_impute.columns:
    if c != 'prism_consumer_id':
        mean_impute[c] = mean_impute[c].fillna(mean_impute[c].mean())

mean_impute = consDF.merge(mean_impute, on = 'prism_consumer_id')
y = mean_impute['DQ_TARGET']

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

auc_df = pd.DataFrame()

feature_cols = mean_impute.drop(
    columns=["prism_consumer_id", "DQ_TARGET", "evaluation_date"]
)

for col in feature_cols.columns:
    X = feature_cols[[col]]
    y = y
    
    model = LogisticRegression(class_weight='balanced')
    model.fit(X, y)
    
    preds = model.predict_proba(X)[:, 1]
    
    score = roc_auc_score(y, preds)

    auc_df = pd.concat([auc_df, pd.DataFrame([{"feature": col, "auc_roc_mean": score}])],ignore_index=True)
    # print(col)
    # print(score)

# auc_df

In [9]:
# med_impute = netflow_feats.merge(cat_ratio_feats, on = 'prism_consumer_id')
# # median impute

# # sd_cols = [c for c in med_impute.columns if c.startswith("sd_")]
# # med_impute[sd_cols] = med_impute[sd_cols].fillna(0.0)

# # # print(sum(feats['min_3m_netflow'].isna()))
# # for w in windows:
# #     # ratio_cols = [c for c in med_impute.columns if f'_{w}m_' in c]
# #     for c in med_impute.columns:
# #         med_impute[c] = med_impute[c].fillna(med_impute[c].median())
# # # print(sum(feats['min_3m_netflow'].isna()))

# for c in med_impute.columns:
#     if c != 'prism_consumer_id':
#         med_impute[c] = med_impute[c].fillna(med_impute[c].median())

# med_impute = consDF.merge(med_impute, on = 'prism_consumer_id')
# y = med_impute['DQ_TARGET']

# # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# feature_cols = med_impute.drop(
#     columns=["prism_consumer_id", "DQ_TARGET", "evaluation_date"]
# )

# temp_df = pd.DataFrame()
# for col in feature_cols.columns:
#     X = feature_cols[[col]]
#     y = y
    
#     model = LogisticRegression(class_weight='balanced')
#     model.fit(X, y)
    
#     preds = model.predict_proba(X)[:, 1]
    
#     score = roc_auc_score(y, preds)

#     temp_df = pd.concat([temp_df, pd.DataFrame([{"feature": col, "auc_roc_med": score}])],ignore_index=True)

# auc_df = auc_df.merge(temp_df, on='feature')
#     # auc_df = pd.concat([auc_df, pd.DataFrame([{"feature_med": col, "auc_roc_med": score}])],axis = 1,ignore_index=True)
#     # print(col)
#     # print(score)

In [10]:
# sd_0_impute = netflow_feats.merge(cat_ratio_feats, on = 'prism_consumer_id')

# sd_cols = [c for c in sd_0_impute.columns if c.startswith("sd_")]
# sd_0_impute[sd_cols] = sd_0_impute[sd_cols].fillna(0.0)

# y = med_impute['DQ_TARGET']

# # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# # feature_cols = sd_0_impute.drop(
# #     columns=["prism_consumer_id", "DQ_TARGET", "evaluation_date"]
# # )

# temp_df = pd.DataFrame()
# for col in sd_cols:
#     X = sd_0_impute[[col]]
#     y = y
    
#     model = LogisticRegression(class_weight='balanced')
#     model.fit(X, y)
    
#     preds = model.predict_proba(X)[:, 1]
    
#     score = roc_auc_score(y, preds)

#     temp_df = pd.concat([temp_df, pd.DataFrame([{"feature": col, "auc_roc_sd_0": score}])],ignore_index=True)

# auc_df = auc_df.merge(temp_df, on='feature', how='left')
#     # print(col)
#     # print(score)
# auc_df

In [11]:
# auc_df['max_auc'] = np.where(auc_df['auc_roc_mean'] == auc_df['auc_roc_med'], 'same', 
#                             np.where(auc_df['auc_roc_mean'] > auc_df['auc_roc_med'], 'auc_roc_mean', 'auc_roc_med')
#                             )
# sd_rows = auc_df.index[auc_df["auc_roc_sd_0"].notna()].tolist()
# for r in sd_rows:
#     if auc_df.loc[r, 'max_auc'] == 'same':
#         if auc_df.loc[r, 'auc_roc_sd_0'] > auc_df.loc[r, 'auc_roc_mean']:
#             auc_df.loc[r, 'max_auc'] = 'auc_roc_sd_0'
#     else:
#         if auc_df.loc[r, 'auc_roc_sd_0'] > auc_df.loc[r, auc_df.loc[r, 'max_auc']]:
#             auc_df.loc[r, 'max_auc'] = 'auc_roc_sd_0'

# auc_df

In [12]:
from IPython.display import display

# Assume 'df' is your DataFrame

with pd.option_context('display.max_rows', None, 
                       'display.max_columns', None, 
                       'display.width', None,
                       'display.max_colwidth', None):
    display(auc_df.sort_values(by='auc_roc_mean', ascending=False))

Unnamed: 0,feature,auc_roc_mean
0,months_observed,0.660786
1,total_income,0.594597
2,total_spend,0.582362
3,total_net_flow,0.522839
4,avg_1m_netflow,0.51829
5,med_1m_netflow,0.526434
6,sd_1m_netflow,0.543766
7,min_1m_netflow,0.529795
8,max_1m_netflow,0.592898
9,avg_3m_netflow,0.527424


In [13]:
from IPython.display import display

# Assume 'df' is your DataFrame

with pd.option_context('display.max_rows', None, 
                       'display.max_columns', None, 
                       'display.width', None,
                       'display.max_colwidth', None):
    display(auc_df.sort_values(by='auc_roc_mean', ascending=False))

Unnamed: 0,feature,auc_roc_mean
970,med_cat23_to_income_ratio_6m,0.688311
973,max_cat23_to_income_ratio_6m,0.688053
969,avg_cat23_to_income_ratio_6m,0.687923
533,max_cat12_to_income_ratio_6m,0.687762
529,avg_cat12_to_income_ratio_6m,0.687587
530,med_cat12_to_income_ratio_6m,0.685975
972,min_cat23_to_income_ratio_6m,0.685498
813,max_cat19_to_income_ratio_6m,0.685386
1093,max_cat26_to_income_ratio_6m,0.684781
532,min_cat12_to_income_ratio_6m,0.684715


In [14]:
# auc_df[auc_df['max_auc'] == 'auc_roc_mean']

In [15]:
# auc_df[auc_df['max_auc'] == 'auc_roc_med']

In [16]:
X = feature_cols
y = y

model = LogisticRegression(class_weight='balanced')
model.fit(X, y)

preds = model.predict_proba(X)[:, 1]

score = roc_auc_score(y, preds)

# auc_df = pd.concat([auc_df, pd.DataFrame([{"feature": col, "auc_roc_mean": score}])],ignore_index=True)
# print(col)
print(len(feature_cols))
print(score)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


11516
0.7797731539667023
