# ROFL contextual bandit demo — v2

This notebook extends the previous version with:
- Neural bootstrap ensemble contextual bandit
- IPS-weighted regression trainer
- Instructions to plug in DuckDB (with placeholders)
- How to run the Streamlit visualizer app included in this folder


In [None]:
import numpy as np
import pandas as pd
from rofl_bandit import generate_synthetic_data, LinUCB, LinearThompson, NeuralBootstrapBandit, train_reward_model, OffPolicyEvaluator, ips_weighted_regression, predict_from_models
# Seed NumPy's global RNG; the generator below additionally receives its own seed.
np.random.seed(42)

# Generate synthetic dataset
data = generate_synthetic_data(n=4000, n_arms=5, d=8, seed=42)

# Pull the logged-bandit arrays out of the returned mapping in one pass.
X, rewards, actions, propensities = (
    data[field] for field in ('X', 'rewards', 'actions', 'propensities')
)

# The arm count is read off the leading dimension of the 'true_thetas' entry.
n_arms = data['true_thetas'].shape[0]

print(
    'Dataset sizes: n=', X.shape[0],
    'd=', X.shape[1],
    'n_arms=', n_arms,
)


## Train LinUCB, Linear Thompson, and NeuralBootstrapBandit (offline imitation)


In [None]:
d = X.shape[1]


def _replay_logged_rounds(policy):
    """Call ``policy.update(action, context, reward)`` for every logged row, in row order."""
    for row in range(X.shape[0]):
        policy.update(actions[row], X[row], rewards[row])


# Each linear bandit is built and then updated on the full log before the next
# one is created, so the order of constructor and update calls is fixed.
linucb = LinUCB(n_arms=n_arms, n_features=d, alpha=0.8)
_replay_logged_rounds(linucb)

lts = LinearThompson(n_arms=n_arms, n_features=d, v2=1.0, lambda_reg=1.0)
_replay_logged_rounds(lts)

# The neural ensemble (6 models) is trained with a single fit call on the whole log.
neural = NeuralBootstrapBandit(n_arms=n_arms, n_features=d, n_models=6)
neural.fit(X, actions, rewards)

print('Trained linucb, lts (imitation), and neural ensemble')


## Off-policy evaluation (IPS & DR) for all policies


In [None]:
def _arms_chosen_by(policy):
    """Return an array with the arm ``policy.select_arm`` picks for each logged context, in row order."""
    return np.array([policy.select_arm(X[row]) for row in range(X.shape[0])])


# Target-policy actions on the logged contexts (evaluated in the order linucb, lts, neural).
target_linucb = _arms_chosen_by(linucb)
target_lts = _arms_chosen_by(lts)
target_neural = _arms_chosen_by(neural)

# Reward model trained on the logged data; it is handed to the DR estimator below.
q_hat = train_reward_model(X, actions, rewards, n_arms)


def _ips_estimate(target):
    """IPS value of ``target`` on the logged data, with clip=100.0."""
    return OffPolicyEvaluator.ips(rewards, actions, target, propensities, clip=100.0)


def _dr_estimate(target):
    """DR value of ``target`` on the logged data, using ``q_hat``."""
    return OffPolicyEvaluator.dr(rewards, actions, target, propensities, q_hat)


_targets = (target_linucb, target_lts, target_neural)
ips_linucb, ips_lts, ips_neural = [_ips_estimate(target) for target in _targets]
dr_linucb, dr_lts, dr_neural = [_dr_estimate(target) for target in _targets]

# One row per policy; left as the last expression so the notebook renders the table.
pd.DataFrame({
    'policy': ['linucb', 'linear_thompson', 'neural'],
    'IPS': [ips_linucb, ips_lts, ips_neural],
    'DR': [dr_linucb, dr_lts, dr_neural],
})


## IPS-weighted regression trainer (to learn a policy via offline weighted supervised learning)


In [None]:
# Train the IPS-weighted regression models on the logged data, then score every context.
models = ips_weighted_regression(X, actions, rewards, propensities, n_arms)
preds = predict_from_models(models, X)

# Greedy policy: per row, take the index of the largest prediction along axis 1
# (presumably one column per arm — confirm against predict_from_models).
greedy_from_ips = np.argmax(preds, axis=1)

# Both estimators share the same leading positional arguments.
_logged_vs_greedy = (rewards, actions, greedy_from_ips, propensities)
ips_ipsgreedy = OffPolicyEvaluator.ips(*_logged_vs_greedy, clip=100.0)
dr_ipsgreedy = OffPolicyEvaluator.dr(*_logged_vs_greedy, q_hat)

print('IPS-weighted regression policy evaluation')
pd.DataFrame({
    'policy': ['ips_weighted_regression'],
    'IPS': [ips_ipsgreedy],
    'DR': [dr_ipsgreedy],
})


## Replace synthetic data with DuckDB

Below is an example snippet to load your `claims_snapshot` from DuckDB and construct the required arrays. Edit column names and preprocessing to match your schema.


In [None]:
# Optional: replace the synthetic arrays with logged data loaded from DuckDB.
#
# This cell is a no-op while USE_DUCKDB is False, so "Restart & Run All" keeps
# working without duckdb installed. To use your own data: edit DUCKDB_PATH, the
# query, and the column names to match your schema, set USE_DUCKDB = True, and
# run this cell in place of the synthetic-data generation before re-running the
# training and evaluation cells.
USE_DUCKDB = False
DUCKDB_PATH = '/path/to/your.duckdb'

if USE_DUCKDB:
    import duckdb  # imported lazily: only needed when the DuckDB path is enabled

    # Triple double quotes are used so the single-quoted SQL date literal cannot
    # terminate the Python string early.
    q = """
        SELECT feature_1, feature_2, feature_3, /* ...add more feature columns... */
               action_taken AS action, reward, propensity
        FROM claims_snapshot
        WHERE date >= '2025-01-01'
    """

    con = duckdb.connect(DUCKDB_PATH)
    try:
        df = con.execute(q).fetchdf()
    finally:
        # Release the database file even if the query raises.
        con.close()

    # Keep this list in sync with the feature columns in the SELECT above.
    feature_cols = ['feature_1', 'feature_2', 'feature_3']
    X = df[feature_cols].values
    actions = df['action'].astype(int).values
    rewards = df['reward'].values
    propensities = df['propensity'].values

    # The other cells read n_arms, so refresh it for the new data.
    # Assumes arms are encoded as integers 0..K-1 and the highest arm appears in
    # the log — TODO confirm for your schema, or set n_arms explicitly.
    n_arms = int(actions.max()) + 1

## Streamlit visualizer

A Streamlit app `rofl_streamlit_app.py` is included next to this notebook. Run it with:
```
streamlit run rofl_streamlit_app.py
```
The app loads either the synthetic data or your DuckDB database (edit the path in the script), lets you choose which policies to compare, and displays their IPS/DR estimates and action distributions using Altair.
