In [None]:
# Notebook 2 — Logistic Regression (Unsettled vs the rest)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, classification_report

# Load the transfer-level dataset (the three timestamp columns are parsed as datetimes)
# and derive two calendar features from the creation timestamp: hour and day of week.
transfers = pd.read_csv(
    "../data/transfers_level.csv",
    parse_dates=["created_dt", "funded_dt", "transferred_dt"],
).assign(
    created_hour=lambda frame: frame["created_dt"].dt.hour,
    created_dow=lambda frame: frame["created_dt"].dt.dayofweek,
)

# Binary target (per AP): 1 where status is exactly "unsettled", 0 for every other status.
# NOTE(review): the match is case-sensitive — confirm the CSV stores this status in lower case.
y = transfers["status"].eq("unsettled").astype(int)

# Column groups consumed by the preprocessing step: categoricals and numerics.
cat = ["region", "platform", "experience"]
num = ["created_hour", "created_dow"]

# Feature matrix: categoricals first, then numerics.
X = transfers[cat + num]

# Preprocessing: one-hot encode the categorical columns, pass the numeric columns through untouched.
# handle_unknown="ignore" lets the encoder accept categories at transform time that were absent at fit time.
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat)
numeric_step = ("num", "passthrough", num)
pre = ColumnTransformer(transformers=[categorical_step, numeric_step])

# Model: preprocessing followed by a logistic regression with balanced class weights.
logreg = LogisticRegression(max_iter=1000, class_weight="balanced")
clf = Pipeline(steps=[("prep", pre), ("lr", logreg)])

# Hold out 25% of the rows for evaluation, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Fit the whole pipeline (preprocessing + model) on the training split only.
clf.fit(X_train, y_train)

# Score the held-out split: column 1 of predict_proba is used as the positive-class score,
# and predict() supplies the hard labels for the classification report.
proba = clf.predict_proba(X_test)[:, 1]
pred = clf.predict(X_test)

auc = roc_auc_score(y_test, proba)
print("ROC-AUC:", auc)
print(classification_report(y_test, pred))

In [None]:

# Inspect approximate feature effects via the fitted model's coefficients
import numpy as np

# Pull the fitted one-hot encoder and the first row of coefficients out of the pipeline.
prep_step = clf.named_steps["prep"]
ohe = prep_step.named_transformers_["cat"]
coef = clf.named_steps["lr"].coef_[0]

# Feature names listed in the same order as the transformers were declared:
# one-hot encoded columns first, then the passthrough numeric columns.
cat_features = list(ohe.get_feature_names_out(cat))
feature_names = [*cat_features, *num]

# One row per feature, largest coefficient first.
coef_df = (
    pd.DataFrame({"feature": feature_names, "coef": coef})
    .sort_values("coef", ascending=False)
)
coef_df.head(25)
