# RiskSampler – Intelligent Sample Weighting for XGBoost  
Demo notebook showcasing multiple weighting strategies available in **RiskSampler** and their impact on model performance.

## Objectives  
1. Load and clean a **binary classification** dataset (Lending Club Loans).  
2. Create different sample–weight schemes with **RiskSampler** (`balanced`, `equal_vintage`, `stabilise_er`, `recency_decay`, `expected_loss`, `combo`).  
3. Train an **XGBoost** classifier **with and without** sample weights.  
4. Compare metrics (MCC, ROC‑AUC, Precision, Recall, F1).  
5. Provide a template you can adapt to your own datasets and targets.

In [1]:
# Uncomment the next line the first time you run the notebook
# %pip install -e ..   # path to RiskSampler repo (edit as needed)

import pandas as pd
import numpy as np
import warnings, os, pathlib, sys, datetime as dt
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import (matthews_corrcoef, roc_auc_score,
                             precision_score, recall_score, f1_score)
from xgboost import XGBClassifier

# RiskSampler – adjust import if your package layout differs
# Example API (as used below): RiskSampler(**params).fit_transform(X_train)
from risk_sampler import RiskSampler, BehaviorPDBuilder

In [2]:
# ------------------------------------------------------------------
# Configuration – every value a reader may want to tune lives here
# ------------------------------------------------------------------

# Reproducibility and hold-out size
RANDOM_SEED = 42
TEST_SIZE = 0.25

# Input data
DATA_PATH = pathlib.Path('../../datasets/lending_club/accepted_2007_to_2018Q4.csv')
NROWS = 200_000  # rows read from the CSV; lower it for faster runs

# Column names
ID_COLS = ['id', 'member_id']  # identifier columns; the first one is passed to the builders as id_col
DATE_COL = 'issue_d'           # issue date; parsed with pd.to_datetime when vintages are built
DEFAULT_RAW = 'loan_status'    # raw status column that is mapped to the binary flag
DEFAULT = 'default'            # name of the binary flag column derived from DEFAULT_RAW
TARGET = 'EVER90M12'           # one of the target columns requested from TargetBuilder


In [3]:
def load_dataset(path: pathlib.Path, nrows=None):
    """Load the dataset from ``path`` or fall back to a synthetic demo frame.

    Parameters
    ----------
    path : pathlib.Path
        Location of the CSV file.
    nrows : int or None
        Forwarded to ``pd.read_csv``; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The CSV contents when ``path`` exists. Otherwise a 20,000-row synthetic
        frame with 30 ``feat_*`` columns, an id column, a date column and a raw
        status column (class weights 0.88 / 0.12 are requested from
        ``make_classification``).
    """
    if path.exists():
        df = pd.read_csv(path, nrows=nrows, low_memory=False)
        # Report the row count, not the whole (rows, cols) shape tuple.
        print(f'Loaded {df.shape[0]} rows × {df.shape[1]} cols')
        return df

    print(f'⚠️ Dataset not found at {path}. Generating synthetic demo dataset…')
    # Imported lazily: only the fallback path needs it.
    from sklearn.datasets import make_classification
    X, y = make_classification(n_samples=20_000, n_features=30,
                               n_informative=10, n_redundant=5,
                               weights=[0.88, 0.12],
                               random_state=RANDOM_SEED)
    df = pd.DataFrame(X, columns=[f'feat_{i}' for i in range(X.shape[1])])

    # Later cells pass ID_COLS[0] as the id column to the target builders,
    # so the synthetic frame needs one as well.
    df[ID_COLS[0]] = np.arange(len(df))

    # Seeded generator so the synthetic dates are reproducible, like the features.
    rng = np.random.default_rng(RANDOM_SEED)
    day_offsets = rng.integers(0, 365 * 10, size=len(df))
    # Plain ASCII hyphens: the date literal must be parseable by pd.Timestamp.
    df[DATE_COL] = pd.Timestamp('2010-01-01') + pd.to_timedelta(day_offsets, unit='D')

    df[DEFAULT_RAW] = np.where(y == 1, 'Charged Off', 'Fully Paid')
    return df

# Read the configured file (or the synthetic fallback) and preview the first rows.
df = load_dataset(path=DATA_PATH, nrows=NROWS)
df.head()


Loaded (200000, 151) rows × 151 cols


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,68407277,,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,...,,,Cash,N,,,,,,
1,68355089,,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,...,,,Cash,N,,,,,,
2,68341763,,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,...,,,Cash,N,,,,,,
3,66310712,,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,...,,,Cash,N,,,,,,
4,68476807,,10400.0,10400.0,10400.0,60 months,22.45,289.91,F,F1,...,,,Cash,N,,,,,,


In [4]:
# Binary flag: 1 when the raw status is one of the labels below, 0 otherwise.
positive_statuses = {
    'Charged Off',
    'Default',
    'Does not meet the credit policy. Status:Charged Off',
}
df[DEFAULT] = df[DEFAULT_RAW].isin(positive_statuses).astype(int)

# Share of each class, labelled as the event rate.
event_rate = df[DEFAULT].value_counts(normalize=True).rename('event_rate')
print(event_rate)

default
0    0.824545
1    0.175455
Name: event_rate, dtype: float64


In [5]:
from risk_sampler import TargetBuilder

# Request both target columns from TargetBuilder, keyed on the first id column
# and the issue-date column.
# NOTE(review): dpd_col receives the 0/1 DEFAULT flag, and the recorded
# value_counts output further down shows EVER90M12 == 0 for all 200000 rows.
# Presumably a days-past-due column is expected here – confirm against the
# TargetBuilder documentation.
builder = TargetBuilder(
    targets=["EVER90M12", "OVER90M12"],
    dpd_col=DEFAULT,
    date_col=DATE_COL,
    id_col=ID_COLS[0],
)

# Rebinds df to whatever the builder returns.
df = builder.transform(df)

In [7]:
# Peek at the two target columns produced by the builder.
df.loc[:, ["EVER90M12", "OVER90M12"]].head()

Unnamed: 0,EVER90M12,OVER90M12
50494,0,0
33065,0,0
163459,0,0
1480,0,0
68327,0,0


In [8]:
# Class counts of the EVER90M12 target.
df["EVER90M12"].value_counts()

EVER90M12
0    200000
Name: count, dtype: int64

In [None]:
# Configure the behavioural-PD builder on the same id / date / default columns.
builder = BehaviorPDBuilder(
    id_col=ID_COLS[0],
    ref_col=DATE_COL,
    default_col=DEFAULT,
    target_col=TARGET,  # if it already exists; otherwise leave it as None
    cure_gap=3,         # three performing months define a cure
)

# Print the frame shape before and after the transform to see what it changed.
print(df.shape)
df = builder.transform(df)
print(df.shape)

In [None]:
# Very naive feature subset for demo – drop identifiers, the date column and
# every label column so that none of them ends up in the feature matrix.
#   * DEFAULT_RAW is the raw status column from the config cell (the notebook
#     never defines TARGET_RAW, so referencing it raised a NameError).
#   * TARGET and 'OVER90M12' are the columns requested from TargetBuilder above;
#     they are numeric and derived from the label, so they must not be features.
# NOTE(review): other numeric columns recorded after origination may also leak
# the outcome – verify the remaining feature list before trusting the metrics.
drop_cols = [DEFAULT_RAW, DEFAULT, TARGET, 'OVER90M12', DATE_COL, *ID_COLS]
num_cols  = df.select_dtypes(include='number').columns.difference(drop_cols)
cat_cols  = df.select_dtypes(include='object').columns.difference(drop_cols)

# For simplicity, use only numeric columns
features = num_cols.tolist()
print(f'Selected {len(features)} numeric features')


In [None]:
# Monthly vintage label: the issue date truncated to a calendar month, as a string.
issue_dates = pd.to_datetime(df[DATE_COL])
df['vintage'] = issue_dates.dt.to_period('M').astype(str)

# Feature matrix and binary label.
X, y = df[features], df[DEFAULT]

# Stratified split; the vintage labels are split alongside so they stay aligned
# with the training and test rows.
split_parts = train_test_split(
    X,
    y,
    df['vintage'],
    test_size=TEST_SIZE,
    stratify=y,
    random_state=RANDOM_SEED,
)
X_train, X_test, y_train, y_test, vint_train, vint_test = split_parts

print('Train shape:', X_train.shape, '| Pos rate:', y_train.mean().round(4))
print('Test  shape:', X_test.shape, '| Pos rate:', y_test.mean().round(4))


In [None]:
def evaluate(model, X_tr, y_tr, X_te, y_te):
    """Score a fitted classifier on the training and test sets.

    Hard predictions come from ``model.predict``; the ROC-AUC values use the
    second column of ``model.predict_proba``.

    Returns a dict with the keys ``MCC_train``, ``MCC_test``, ``ROC_AUC_train``,
    ``ROC_AUC_test``, ``F1_test``, ``Precision_test`` and ``Recall_test``.
    """
    pred_train = model.predict(X_tr)
    pred_test = model.predict(X_te)

    # Column 1 of predict_proba is used as the score for ROC-AUC.
    score_train = model.predict_proba(X_tr)[:, 1]
    score_test = model.predict_proba(X_te)[:, 1]

    return {
        'MCC_train': matthews_corrcoef(y_tr, pred_train),
        'MCC_test': matthews_corrcoef(y_te, pred_test),
        'ROC_AUC_train': roc_auc_score(y_tr, score_train),
        'ROC_AUC_test': roc_auc_score(y_te, score_test),
        'F1_test': f1_score(y_te, pred_test),
        'Precision_test': precision_score(y_te, pred_test),
        'Recall_test': recall_score(y_te, pred_test),
    }


In [None]:
# Reference model: trained without sample weights. fit() returns the estimator
# itself, so construction and training are chained.
baseline = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    random_state=RANDOM_SEED,
).fit(X_train, y_train)

baseline_metrics = evaluate(baseline, X_train, y_train, X_test, y_test)
print('Baseline (no weights):')
baseline_metrics


In [None]:
# Keyword arguments shared by every RiskSampler configuration below.
# NOTE(review): whole Series (not column names) are passed for the *_col
# arguments – confirm this matches the RiskSampler API.
common = {'target_col': y_train, 'date_col': vint_train}

recency_decay_cfg = {
    'strategies': 'recency_decay',
    'vintage_col': vint_train,
    'decay_rate': 0.1,
    **common,
}

schemes = {
    'balanced': {'strategies': 'balanced', **common},
    'equal_vintage': {
        'strategies': 'equal_vintage',
        'vintage_col': vint_train,
        **common,
    },
    'stabilise_er': {'strategies': 'stabilise_er', **common},
    'recency_decay': recency_decay_cfg,
}

# expected_loss reads the loan amount from X_train, so the guard must test
# X_train itself: a column that exists in df but was excluded from the feature
# list would otherwise raise a KeyError on X_train['loan_amnt'].
if 'loan_amnt' in X_train.columns:
    expected_loss_cfg = {
        'strategies': 'expected_loss',
        'loss_col': X_train['loan_amnt'],
        **common,
    }
    schemes['expected_loss'] = expected_loss_cfg
    schemes['combo'] = {
        'strategies': 'combo',
        # Shallow copies keep the component configs separate from the
        # stand-alone entries of the same strategies.
        'components': [dict(expected_loss_cfg), dict(recency_decay_cfg)],
        **common,
    }


models_metrics = {}

# One weighted model per scheme, with the same hyper-parameters as the baseline.
for name, params in schemes.items():
    print(f'\n=== {name.upper()} ===')
    factory = RiskSampler(**params)
    sw = factory.fit_transform(X_train)   # adjust API if different

    model = XGBClassifier(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=RANDOM_SEED,
        n_jobs=-1
    )
    model.fit(X_train, y_train, sample_weight=sw)
    metrics = evaluate(model, X_train, y_train, X_test, y_test)
    models_metrics[name] = metrics

models_metrics


In [None]:
import pandas as pd

# One row per weighting scheme, plus the unweighted baseline, ordered from the
# highest to the lowest test-set MCC.
results_df = pd.DataFrame(models_metrics).transpose()
results_df.loc['baseline'] = baseline_metrics
results_df = results_df.sort_values(by='MCC_test', ascending=False)
results_df


In [None]:
import matplotlib.pyplot as plt

# Horizontal bars of test-set MCC; the y-axis is flipped so the first row of
# results_df is drawn at the top.
ax = results_df['MCC_test'].plot.barh()
ax.set_title('MCC on test set by weighting scheme')
ax.set_xlabel('Matthews Corr. Coef')
ax.invert_yaxis()
plt.show()


## Conclusions & Next Steps  
* **RiskSampler** allows flexible weighting strategies that can substantially improve model performance on imbalanced, drifting, or cost‑sensitive datasets.  
* In this demo the schemes are ranked by **MCC** on the test set, so the *best* scheme is simply the one with the highest test MCC. You might choose a different metric aligned with your business objective.  
* Try tweaking:
  * hyper‑parameters of XGBoost,
  * decay rates or target event‑rates inside each weighting scheme,
  * combining more than two strategies in `combo`.  

Replace the dataset with your own and adjust `DEFAULT_RAW`, `positive_statuses`, and `DATE_COL` accordingly.

In [None]:
# === Re‑run RiskSampler schemes with corrected parameters ===
from pprint import pprint


def _scheme(strategy, **extra):
    """Return one RiskSampler keyword set: strategy name, shared date/target Series, plus extras."""
    return {
        'strategies': strategy,
        'date_col': vint_train,
        'target_col': y_train,
        **extra,
    }


schemes = {
    'balanced': _scheme('balanced'),
    'equal_vintage': _scheme('equal_vintage', vintage_col=vint_train),
    'stabilise_er': _scheme('stabilise_er'),
    'recency_decay': _scheme('recency_decay', vintage_col=vint_train, decay_rate=0.1),
}

# Add expected_loss and combo only when loan_amnt is among the training columns
if 'loan_amnt' in X_train.columns:
    schemes['expected_loss'] = _scheme('expected_loss', loss_col=X_train['loan_amnt'])
    schemes['combo'] = _scheme(
        'combo',
        components=[
            _scheme('expected_loss', loss_col=X_train['loan_amnt']),
            _scheme('recency_decay', vintage_col=vint_train, decay_rate=0.1),
        ],
    )

# Hyper-parameters shared by every weighted model in this cell.
xgb_kwargs = dict(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=RANDOM_SEED,
    n_jobs=-1,
)

models_metrics = {}
for name, params in schemes.items():
    print(f"\n=== {name.upper()} ===")
    # Build the sample weights for this scheme, then train and score a fresh model.
    rs = RiskSampler(**params)
    sw = rs.fit_transform(X_train)
    model = XGBClassifier(**xgb_kwargs)
    model.fit(X_train, y_train, sample_weight=sw)
    metrics = evaluate(model, X_train, y_train, X_test, y_test)
    models_metrics[name] = metrics

pprint(models_metrics)
