# Policy Learning Examples

## Imports

In [1]:
import sys

# Put the parent directory on the import path before importing `blbf`
# (presumably that is where the local package lives -- confirm).
sys.path.append("../")

from blbf.STBT import STBT
from blbf.PolicyLearning import EvaluationMetrics, RewardPredictor, OutcomeWeightedLearning, CounterfactualRiskMinimization, CounterfactualRiskMinimizationCV, LinearModel, NonLinearModel
import blbf.DataReader as DataReader
import blbf.utils as utils
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
# Import from the public package: the private `sklearn.ensemble.forest`
# submodule was deprecated in scikit-learn 0.22 and later removed.
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import pandas as pd

# Show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)



## Read data

In [2]:
# Seed NumPy's global random number generator before anything else runs.
np.random.seed(1)

# Load the 'glass' dataset as a (features, labels) pair.
X, y = DataReader.get_data(dataset='glass')

## Perform Supervised-to-Bandit Conversion 

Performs Supervised to Bandit Conversion for classification datasets. This conversion is generally used to test the limits of counterfactual learning in a well-controlled environment. See [1-3]. 

Here, we take a supervised dataset with features $x$ and labeled classes $y$, and simulate a bandit feedback dataset from a logging policy. This involves two steps: (i) simulating a stochastic logging policy, which may be uniform (`logging_type='uniform'`) or given as a function of the covariates (`logging_type='biased'`); and (ii) observing a positive reward whenever the action chosen by the logging policy for a given observation equals the optimal action (the true label).
    

In [3]:
# Simulate bandit feedback from the supervised (X, y) data using the
# 'biased' logging policy.
data = (
    STBT(train_frac=0.5, logging_type='biased')
    .generate_batch(X, y, max_iter=1000)
)

## Skyline 

Best possible error rate, assuming we have full feedback (this can only be computed in the simulation, since in practice we only observe bandit feedback).


In [4]:
# Fully supervised reference model: cross-validated multinomial logistic
# regression fitted on the true training labels.
clf = LogisticRegressionCV(
    multi_class='multinomial',
    max_iter=2000,
).fit(data.X_train, data.y_train)

# Its test-set predictions serve as the "optimal" policy.
optimal_policy = clf.predict(data.X_test)

print(
    "Skyline Error:",
    EvaluationMetrics.error_rate(optimal_policy, data.y_test),
)

Skyline Error: 0.30841121495327106


##  Reward Predictor (RP)

In [5]:
# Learn a policy with the reward predictor on the simulated bandit data.
rp = RewardPredictor()
rp.learn_policy(data, max_iter=1000)

# Score the learned policy against the true test labels.
print(
    "Reward Predictor Error:",
    rp.error_rate(rp.est_best_policy, data.y_test),
)

Reward Predictor Error: 0.7009345794392523


## Outcome Weighted Learning (OWL)

In [6]:
# Outcome weighted learning with 'LogisticRegressionCV' as the classifier.
owl = OutcomeWeightedLearning()
owl.learn_policy(data, clf='LogisticRegressionCV', max_iter=1000)

# Score the learned policy against the true test labels.
print(
    "OWL-LR:",
    owl.error_rate(owl.est_best_policy, data.y_test),
)

OWL-LR: 0.7289719626168225


## Counterfactual Risk Minimization (CRM)

In [7]:
# Counterfactual risk minimization with a linear model; verbose=True
# produces the per-epoch training-loss log shown below the cell.
crm = CounterfactualRiskMinimization(verbose=True, lambda_=1e-06)
crm.learn_policy(model=LinearModel, data=data, epochs=2000)

# Score the learned policy against the true test labels.
print(
    "CRM:",
    crm.error_rate(crm.est_best_policy, data.y_test),
)

Epoch 0: | Train Poem Loss: -0.10522
Epoch 100: | Train Poem Loss: -0.25576
Epoch 200: | Train Poem Loss: -0.39687
Epoch 300: | Train Poem Loss: -0.49617
Epoch 400: | Train Poem Loss: -0.54126
Epoch 500: | Train Poem Loss: -0.56482
Epoch 600: | Train Poem Loss: -0.58157
Epoch 700: | Train Poem Loss: -0.59403
Epoch 800: | Train Poem Loss: -0.60416
Epoch 900: | Train Poem Loss: -0.61307
Epoch 1000: | Train Poem Loss: -0.62120
Epoch 1100: | Train Poem Loss: -0.62863
Epoch 1200: | Train Poem Loss: -0.63533
Epoch 1300: | Train Poem Loss: -0.64126
Epoch 1400: | Train Poem Loss: -0.64643
Epoch 1500: | Train Poem Loss: -0.65089
Epoch 1600: | Train Poem Loss: -0.65471
Epoch 1700: | Train Poem Loss: -0.65798
Epoch 1800: | Train Poem Loss: -0.66077
Epoch 1900: | Train Poem Loss: -0.66315
CRM: 0.7009345794392523


## Experiments 

In [8]:
# --- Experiment parameters ---

B = 10                    # Number of simulations per dataset
EPOCHS = 500              # Passed as `epochs` to the CRM learner
LOGGING_TYPE = 'biased'   # Passed as `logging_type` to STBT
MODEL = LinearModel       # Passed as `model` to the CRM learner
LAMBDA = 1e-06            # Passed as `lambda_` to the CRM learner
DATASETS = [
    'ecoli', 'glass', 'lymphography', 'yeast',
    'digits', 'breast-cancer', 'wine', 'letter-recognition',
]

# --- Result accumulators: one entry per (dataset, simulation) pair ---

dat = []
skyline_error = []
randomized_error = []
reward_predictor_error = []
owl_lrcv_error = []
crm_error = []

In [9]:
# Run B supervised-to-bandit simulations per dataset, fit every learner on
# each simulated batch, and collect the test error rates.
#
# NOTE: this cell rebinds X, y, optimal_policy, rp and crm from the earlier
# single-dataset cells; after it runs they refer to the last simulation.

# Empty the module-level accumulators first so that re-running this cell does
# not append a second copy of every result (which would duplicate rows in
# `res` and distort the per-dataset statistics).
for _acc in (dat, skyline_error, randomized_error, reward_predictor_error,
             owl_lrcv_error, crm_error):
    _acc.clear()

for s in DATASETS:

    X, y = DataReader.get_data(dataset=s)

    for b in range(B):
        if (b % 10) == 0:
            print("Sample: %d - Dataset: %s" % (b, s))

        # Fresh bandit-feedback batch for this simulation.
        d = STBT(logging_type=LOGGING_TYPE).generate_batch(X, y, max_iter=1000)
        dat.append(s)

        # Fully supervised reference model trained on the true labels.
        skyline = LogisticRegression(multi_class='multinomial', max_iter=2000).fit(d.X_train, d.y_train)
        optimal_policy = skyline.predict(d.X_test)

        # Learners trained on the simulated batch.
        rp = RewardPredictor().learn_policy(data=d, max_iter=1000)
        erm_lrcv = OutcomeWeightedLearning().learn_policy(data=d, clf='LogisticRegressionCV', max_iter=1000)
        crm = CounterfactualRiskMinimization(lambda_=LAMBDA).learn_policy(model=MODEL, data=d, epochs=EPOCHS)

        # Test-set error rates. `randomized_error` compares d.y_test_logging
        # (presumably the logging policy's actions on the test split) with
        # the true labels.
        skyline_error.append(EvaluationMetrics.error_rate(optimal_policy, d.y_test))
        randomized_error.append(EvaluationMetrics.error_rate(d.y_test_logging, d.y_test))
        reward_predictor_error.append(rp.error_rate(rp.est_best_policy, d.y_test))
        owl_lrcv_error.append(erm_lrcv.error_rate(erm_lrcv.est_best_policy, d.y_test))
        crm_error.append(crm.error_rate(crm.est_best_policy, d.y_test))


# One row per (dataset, simulation) pair.
res = pd.DataFrame.from_dict({'dataset': dat, 'skyline_error': skyline_error, 'randomized_error': randomized_error, 'reward_predictor_error': reward_predictor_error,
                              'owl_lrcv_error': owl_lrcv_error, 'crm_error': crm_error})

# Mean and standard deviation of every error column, per dataset.
res_summary = res.groupby(['dataset'], as_index=False).agg({
                            'skyline_error': ['mean', 'std'],
                            'randomized_error': ['mean', 'std'],
                            'reward_predictor_error': ['mean', 'std'],
                            'owl_lrcv_error': ['mean', 'std'],
                            'crm_error': ['mean', 'std']
                            })

Sample: 0 - Dataset: ecoli
Sample: 0 - Dataset: glass
Sample: 0 - Dataset: lymphography
Sample: 0 - Dataset: yeast
Sample: 0 - Dataset: digits
Sample: 0 - Dataset: breast-cancer
Sample: 0 - Dataset: wine
Sample: 0 - Dataset: letter-recognition


In [10]:
# Display the per-dataset mean and standard deviation of each error rate.
res_summary

Unnamed: 0_level_0,dataset,skyline_error,skyline_error,randomized_error,randomized_error,reward_predictor_error,reward_predictor_error,owl_lrcv_error,owl_lrcv_error,crm_error,crm_error
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
0,breast-cancer,0.029474,0.006233,0.525614,0.227465,0.027719,0.009127,0.170526,0.185646,0.102807,0.143245
1,digits,0.039933,0.007786,0.892102,0.04424,0.362291,0.082093,0.52525,0.091903,0.415239,0.112133
2,ecoli,0.126786,0.015636,0.850595,0.099818,0.301786,0.070432,0.429167,0.198541,0.344643,0.103649
3,glass,0.411215,0.047654,0.848598,0.076155,0.579439,0.077193,0.603738,0.107664,0.651402,0.083015
4,letter-recognition,0.22943,0.003287,0.96272,0.010833,0.73679,0.047752,0.71368,0.041299,0.73955,0.033839
5,lymphography,0.216216,0.047244,0.75,0.114533,0.274324,0.039295,0.327027,0.100683,0.414865,0.120541
6,wine,0.024719,0.014793,0.744944,0.151215,0.070787,0.04827,0.410112,0.22851,0.188764,0.151655
7,yeast,0.418329,0.016835,0.899461,0.038157,0.523046,0.03112,0.623181,0.03918,0.558086,0.036266
