In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier

### Contextual bandit problem - reward dynamics are state dependent

In this case, there is no universally "best" option - instead we have user preferences, which are as follows:
1. Mobile users prefer `a0`.
2. Desktop weekend traffic prefers `a1`.
3. Desktop weekday morning traffic prefers `a2`.
4. Desktop weekday non-morning traffic prefers `a3`.

In [2]:
is_mobile_df = pd.DataFrame({'is_mobile': [0, 1], 'key': [0, 0]})
is_weekend_df = pd.DataFrame({'is_weekend': [0, 1], 'key': [0, 0]})
is_morning_df = pd.DataFrame({'is_morning': [0, 1], 'key': [0, 0]})
action_df = pd.DataFrame({'action': ['a0', 'a1', 'a2', 'a3'], 'key': [0, 0, 0, 0]})
dynamics_df = is_mobile_df.merge(is_weekend_df).merge(is_morning_df).merge(action_df)
dynamics_df = dynamics_df.drop(columns='key')
dynamics_df['reward_rate'] = pd.Series([0.02, 0.02, 0.02, 0.04,
                                        0.02, 0.02, 0.04, 0.02,
                                        0.02, 0.04, 0.02, 0.02,
                                        0.02, 0.04, 0.02, 0.02,
                                        0.04, 0.02, 0.02, 0.02,
                                        0.04, 0.02, 0.02, 0.02,
                                        0.04, 0.02, 0.02, 0.02,
                                        0.04, 0.02, 0.02, 0.02,])

In [3]:
def get_state(batch_size):
    random_row = dynamics_df.sample(n=batch_size, replace=True, axis=0)
    random_row = random_row.drop(columns=['action', 'reward_rate'])
    return random_row.reset_index(drop=True)

In [4]:
def get_rewards(state_df, action_df, dynamics_df):
    sa_df = state_df.merge(action_df, left_index=True, right_index=True)
    sarr_df = sa_df.merge(dynamics_df)
    sarr_df['reward'] = np.random.binomial(n=1, p=sarr_df['reward_rate'])
    sar_df = sarr_df.drop(columns='reward_rate')
    return sar_df

In [5]:
num_batches = 100
batch_size = 100

#### Random Policy

In [6]:
cumulative_reward = 0
for _ in range(num_batches):
    state = get_state(batch_size)
    action = pd.DataFrame({'action': np.random.choice(['a0', 'a1', 'a2', 'a3'], size=batch_size)})
    sar_df = get_rewards(state, action, dynamics_df)
    cumulative_reward += sar_df['reward'].sum()
print(cumulative_reward)

253


#### Epsilon Greedy Policy

In [7]:
cat_tx = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('categorical', cat_tx, ['action'])
])

clf = GradientBoostingClassifier(n_estimators=1000)

pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', clf)])

In [8]:
col_names = ['is_mobile', 'is_weekend', 'is_morning', 'action']
cumulative_reward = 0
first_batch = True
history = pd.DataFrame({'is_mobile': [], 'is_weekend': [], 'is_morning': [], 'action': [], 'reward': []})
for _ in range(num_batches):
    if first_batch:
        state = get_state(batch_size)
        action = pd.DataFrame({'action': np.random.choice(['a0', 'a1', 'a2', 'a3'], size=batch_size)})
        sar_df = get_rewards(state, action, dynamics_df)
        history = pd.concat([history, sar_df]).reset_index(drop=True)
        cumulative_reward += sar_df['reward'].sum()
        first_batch = False
    else:
        #Learn on the history
        model_pipeline = pipeline.fit(history[col_names], history['reward'])
        #Get new state
        state = get_state(batch_size)
        #Get greedy actions
        state['key'] = 0
        state['row_num'] = range(batch_size)
        action_choice_df = pd.DataFrame({'action': ['a0', 'a1', 'a2', 'a3'], 'key': 0})
        input_df = state.merge(action_choice_df)
        input_df['q_value'] = model_pipeline.predict_proba(input_df[col_names])[:,1]
        ba_df = input_df.loc[input_df.groupby(['row_num'])['q_value'].idxmax()].set_index('row_num')
        ba_df = ba_df.drop(columns=['key', 'q_value'])
        actions = ba_df[['action']].copy()
        #Add in exploration
        random_indices = list(np.random.choice(range(batch_size), size=int(batch_size/10)))
        for i in random_indices:
            actions.loc[i, 'action'] = np.random.choice(['a0', 'a1', 'a2', 'a3'])
        #Interact with the environment
        sar_df = get_rewards(state, actions, dynamics_df)
        history = pd.concat([history, sar_df]).reset_index(drop=True)
        cumulative_reward += sar_df['reward'].sum()

In [9]:
cumulative_reward

275