In [1]:
import warnings
warnings.filterwarnings("ignore")

from Environment import *
from Estimator import SlateEstimator
from Estimator_CPME import (
    DirectEstimator,
    IPSEstimator,
    CMEbis,
    DoublyRobustEstimator,
    DoublyRobustbis,
    BehaviorPolicyEstimator
)
from Policy import *
from ParameterSelector import ParameterSelectorWithBehaviorEstimator
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import rbf_kernel, linear_kernel
from tqdm import tqdm
# import matplotlib.pyplot as plt
import joblib
import os
import sys

# Make sure the output directory exists. `exist_ok=True` replaces the
# check-then-create pattern, which can fail if another process creates the
# directory between the check and the mkdir call.
os.makedirs("results", exist_ok=True)

# An empty CUDA_VISIBLE_DEVICES is meant to hide every GPU from this process.
# NOTE(review): this is set after the project modules above were imported; if
# any of them initialises CUDA at import time the setting may come too late —
# confirm, and move it above those imports if so.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

def simulate_data(null_policy, target_policy, environment, item_vectors):
    """
    Draw one context from the environment and record what both policies do with it.

    :param null_policy: logging policy; ``recommend(user)`` must return
        ``(reco, multinomial, user_vector)``
    :param target_policy: policy under evaluation, same ``recommend`` contract
        (its returned user vector is discarded)
    :param environment: provides ``get_context()`` and ``get_reward(user, reco)``
    :param item_vectors: array indexed by the item ids contained in a recommendation
    :return: dict holding, for both policies, the recommended slate (as a tuple),
        its concatenated item-vector encoding, the multinomial returned by the
        policy and the reward, plus the sampled ``user``. Both ``*_context_vec``
        entries carry the null policy's user vector.
    """
    def _slate_vector(slate):
        # A slate is encoded as its item vectors laid end to end.
        return np.concatenate(item_vectors[slate])

    user = environment.get_context()

    # Logging (null) policy: recommend, encode, then observe the reward.
    null_reco, null_multinomial, null_user_vector = null_policy.recommend(user)
    null_reco_vec = _slate_vector(null_reco)
    null_reward = environment.get_reward(user, null_reco)

    # Target policy on the very same user.
    target_reco, target_multinomial, _ = target_policy.recommend(user)
    target_reco_vec = _slate_vector(target_reco)
    target_reward = environment.get_reward(user, target_reco)

    # Key order is kept as-is because it determines the column order of the
    # DataFrame built from these observations.
    observation = dict(
        null_context_vec=null_user_vector,
        target_context_vec=null_user_vector,
        null_reco=tuple(null_reco),
        null_reco_vec=null_reco_vec,
        null_reward=null_reward,
        target_reco=tuple(target_reco),
        null_multinomial=null_multinomial,
        target_multinomial=target_multinomial,
        target_reco_vec=target_reco_vec,
        target_reward=target_reward,
        user=user,
    )

    return observation

# new_data = pd.concat([pd.DataFrame({'null_reward': data.null_reward.iloc[train],
#                                                     'null_context_vec': data.null_context_vec.iloc[train],
#                                                     'null_reco_vec': data.null_reco_vec.iloc[train],
#                                                     'null_reco': data.null_reco.iloc[train],
#                                                     'null_multinomial': data.null_multinomial.iloc[train]}),
#                                       pd.DataFrame({'target_context_vec': data.null_context_vec.iloc[test],
#                                                     'target_reco_vec': data.null_reco_vec.iloc[test],
#                                                     'target_reco': data.null_reco.iloc[test],
#                                                     'target_multinomial': data.target_multinomial.iloc[test]})],
#                                      axis=1)

def get_actual_reward(target_policy, environment, n=100000):
    """
    Monte-Carlo estimate of the target policy's average reward.

    :param target_policy: policy whose ``recommend(user)`` returns a 3-tuple
        whose first element is the recommendation
    :param environment: provides ``get_context()`` and ``get_reward(user, reco)``
    :param n: number of independent context draws to average over
    :return: sum of the ``n`` sampled rewards divided by ``n``
    """
    def _sample_reward():
        # One draw: sample a user, let the policy recommend, observe the reward.
        user = environment.get_context()
        reco, _multinomial, _user_vec = target_policy.recommend(user)
        return environment.get_reward(user, reco)

    total = sum(_sample_reward() for _ in range(n))
    return total / float(n)

def grid_search(params, estimator, sim_data, n_iterations):
    """
    Evaluate ``estimator`` once per candidate parameter setting.

    For every entry of ``params`` the estimator's ``params`` attribute is set to
    that entry and ``estimator.estimate(sim_data)`` is called ``n_iterations``
    times; the mean of those estimates is compared against the mean of
    ``sim_data.target_reward``.

    :param params: iterable of candidate parameter settings
    :param estimator: object exposing a writable ``params`` attribute and an
        ``estimate(sim_data)`` method
    :param sim_data: DataFrame with a ``target_reward`` column
    :param n_iterations: number of repeated estimates averaged per candidate
    :return: DataFrame with one row per candidate and the columns ``param``,
        ``estimated_value``, ``actual_value``, ``error`` (absolute) and
        ``percent_error`` (absolute error as a percentage of ``actual_value``)
    """
    columns = ['param', 'estimated_value', 'actual_value', 'error', 'percent_error']
    actual_value = sim_data.target_reward.mean()

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic anyway.
    records = []
    for param in params:
        estimated_values = []
        for _ in range(n_iterations):
            # Re-assigned before every call, as the setter may have side effects.
            estimator.params = param
            estimated_values.append(estimator.estimate(sim_data))
        mean_value = np.array(estimated_values).mean()
        abs_error = np.abs(mean_value - actual_value)
        records.append({'param': param,
                        'estimated_value': mean_value,
                        'actual_value': actual_value,
                        'error': abs_error,
                        'percent_error': 100.0 * abs_error / actual_value})

    return pd.DataFrame(records, columns=columns)

2025-05-14 16:02:09.780892: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747234929.801315 3735590 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747234929.807692 3735590 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747234929.822971 3735590 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747234929.822989 3735590 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747234929.822991 3735590 computation_placer.cc:177] computation placer alr

In [2]:
seed = 0
# Seed NumPy's global generator BEFORE any random draw. Previously the seed was
# set only right before the simulation loop, i.e. after the user/item vectors
# had already been sampled, so each kernel restart produced a different
# experiment despite the fixed seed.
np.random.seed(seed)

num_items = 20
config = {
    "n_users": 50,
    "n_items": num_items,
    "n_reco": 5,
    "n_observation": 100,
    "context_dim": 10
}

result_df = pd.DataFrame()
multiplier = -0.3
num_iter = 30

# Latent user/item representations. The target policy's user vectors are the
# environment's user vectors with each coordinate independently zeroed with
# probability 0.5.
user_vectors = np.random.normal(0, 1, size=(config['n_users'], config['context_dim']))
target_user_vectors = user_vectors * np.random.binomial(1, 0.5, size=user_vectors.shape)
item_vectors = np.random.normal(0, 1, size=(config['n_items'], config['context_dim']))

# The logging policy uses a scaled copy of the target vectors (sign-flipped
# when `multiplier` is negative).
null_user_vectors = multiplier * target_user_vectors

# The policy we use to generate sim data
null_policy = MultinomialPolicy(item_vectors, null_user_vectors, config['n_items'], config['n_reco'],
                                temperature=0.5, cal_gamma=True)

# The target policy
target_policy = MultinomialPolicy(item_vectors, target_user_vectors, config['n_items'], config['n_reco'],
                                temperature=1.0, cal_gamma=False)

environment = AvgEnvironment(item_vectors, user_vectors)

# One row per simulated interaction (see simulate_data for the column set).
sim_data = [simulate_data(null_policy, target_policy, environment, item_vectors)
            for _ in range(config['n_observation'])]
sim_data = pd.DataFrame(sim_data)

# === Prepare estimators ===
# Fit a model of the logging policy from the logged contexts; only the first
# item of each logged slate is used as the action label.
behavior_estimator = BehaviorPolicyEstimator(config["n_items"])
user_features = np.vstack(sim_data["null_context_vec"].values)
actions = [r[0] for r in sim_data["null_reco"].values]  # Taking first item as action
print(user_features.shape)
behavior_estimator.fit(user_features, actions)

# Default kernel-estimator parameters: [regularisation, bandwidth, bandwidth].
# NOTE(review): the meaning of the two bandwidth slots is inferred from the
# variable names — confirm against the estimator implementation.
reg_pow = -1
reg_params = (10.0 ** reg_pow) / config['n_observation']
bw_params = (10.0 ** 0)
params = [reg_params, bw_params, bw_params]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.


2025-05-14 16:02:18.335525: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-14 16:02:18.346928: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-14 16:02:18.357712: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-14 16:02:18.359360: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-14 16:02:18.369405: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for 

[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done   5 out of  50 | elapsed:  1.3min remaining: 12.1min
[Parallel(n_jobs=-1)]: Done   7 out of  50 | elapsed:  1.4min remaining:  8.5min
[Parallel(n_jobs=-1)]: Done   9 out of  50 | elapsed:  1.4min remaining:  6.5min
[Parallel(n_jobs=-1)]: Done  11 out of  50 | elapsed:  1.6min remaining:  5.7min
[Parallel(n_jobs=-1)]: Done  13 out of  50 | elapsed:  1.6min remaining:  4.6min
[Parallel(n_jobs=-1)]: Done  15 out of  50 | elapsed:  1.6min remaining:  3.8min
[Parallel(n_jobs=-1)]: Done  17 out of  50 | elapsed:  1.7min remaining:  3.2min
[Parallel(n_jobs=-1)]: Done  19 out of  50 | elapsed:  1.7min remaining:  2.7min
[Parallel(n_jobs=-1)]: Done  21 out of  50 | elapsed:  1.7min remaining:  2.3min
[Parallel(n_jobs=-1)]: Done  23 out of  50 | elapsed:  1.7min remaining:  2.0min
[Par

In [11]:
# Sanity check: number of missing values in the logged-reward column.
sim_data["null_reward"].isna().sum() #.loc[:, sim_data.isna().any()]

0

In [6]:
# List the columns of the simulated-observation frame.
sim_data.columns

Index(['null_context_vec', 'target_context_vec', 'null_reco', 'null_reco_vec',
       'null_reward', 'target_reco', 'null_multinomial', 'target_multinomial',
       'target_reco_vec', 'target_reward', 'user'],
      dtype='object')

In [6]:
# For the first simulated observation, display as a pair: the target policy's
# propensity for the logged slate, and the fitted behaviour model's predicted
# probability for the first logged item given the logged context vector.
row = sim_data.iloc[0]
# null_reco = row.null_reco
target_policy.get_propensity(row.target_multinomial, row.null_reco), behavior_estimator.predict_proba(row.null_context_vec, row.null_reco[0])

(7.669967066542437e-16, 0.09679033628977758)

In [7]:
# Display the multinomial the target policy returned for the row selected above.
row.target_multinomial

array([1.73935259e-03, 2.38212503e-04, 1.46035301e-03, 1.90904822e-04,
       5.28800198e-03, 2.41339985e-02, 3.00833337e-03, 4.53617055e-04,
       3.72627913e-02, 6.58069183e-02, 2.23973007e-04, 1.77826257e-03,
       1.84736984e-04, 8.19902066e-01, 9.23724281e-04, 3.14726870e-02,
       2.39932310e-04, 1.45168628e-04, 3.88030434e-03, 1.66666162e-03])

In [53]:
import copy
from sklearn.model_selection import StratifiedKFold

# K-fold selection of the CMEbis parameter setting: on each fold every
# candidate estimator is compared against an importance-weighted estimate
# computed on the held-out rows, and the candidate with the smallest mean
# squared difference wins.
data = sim_data
params_grid = [[(10.0 ** p) / config['n_observation'], 1.0, 1.0] for p in np.arange(-7, 0, 1)]
n_splits = 5
estimator = CMEbis(rbf_kernel, rbf_kernel, params)
num_params = len(params_grid)
num_data = len(data)

kfold = StratifiedKFold(n_splits=n_splits)
errors = np.zeros(num_params)

# create estimators using parameter grid
# NOTE(review): `estimators` is also the name of the list of baseline
# estimators built in another cell; whichever cell runs last wins.
estimators = [copy.deepcopy(estimator) for _ in params_grid]
# The loop variable is deliberately NOT called `params`: that name is the
# notebook-level parameter list and must not be overwritten by this loop.
for grid_params, e in zip(params_grid, estimators):
    e.params = grid_params

for train, test in kfold.split(np.zeros(num_data), data.null_reward):

    # split the data: the null_* columns come from the training rows, the
    # target_* columns from the held-out rows (the held-out *logged* slates are
    # used as target recommendations). The two frames have disjoint row labels,
    # so concat(axis=1) aligns on the index and leaves NaN in the other half.
    # NOTE(review): presumably the estimator drops those NaN itself — confirm.
    new_data = pd.concat([pd.DataFrame({'null_reward': data.null_reward.iloc[train].dropna(axis=0),
                                        'null_context_vec': data.null_context_vec.iloc[train].dropna(axis=0),
                                        'null_reco_vec': data.null_reco_vec.iloc[train].dropna(axis=0),
                                        'null_reco': data.null_reco.iloc[train].dropna(axis=0),
                                        'null_multinomial': data.null_multinomial.iloc[train].dropna(axis=0)}),
                          pd.DataFrame({'target_context_vec': data.null_context_vec.iloc[test].dropna(axis=0),
                                        'target_reco_vec': data.null_reco_vec.iloc[test].dropna(axis=0),
                                        'target_reco': data.null_reco.iloc[test].dropna(axis=0),
                                        'target_multinomial': data.target_multinomial.iloc[test].dropna(axis=0)})],
                         axis=1)
    # evaluate the estimator on each split
    validate_data = data.iloc[test].dropna(axis=0)
    # Take the rewards from the same filtered rows as the propensities below so
    # the two arrays always have matching length and order.
    validate_reward = validate_data["null_reward"].values

    # Logging propensity comes from the fitted behaviour model (first slate item only).
    nullProb = [behavior_estimator.get_propensity(row.null_context_vec, row.null_reco[0]) for _,row in validate_data.iterrows()]
    if not target_policy.greedy:
        targetProb = [target_policy.get_propensity(row.target_multinomial, row.null_reco) for _,row in validate_data.iterrows()]
    else:
        targetProb = [1.0 if row.null_reco == row.target_reco else 0 for _,row in validate_data.iterrows()]

    # Self-normalised importance-weighted reward on the held-out fold serves as
    # the reference value for this split.
    ips_w = np.divide(targetProb, nullProb)
    actual_value = np.mean(ips_w * validate_reward) / np.mean(ips_w)

    # estimated_values = parallel(joblib.delayed(e.estimate)(new_data) for e in estimators)
    estimated_values = [e.estimate(new_data) for e in estimators]
    errors += [(est - actual_value) ** 2 for est in estimated_values]
errors /= n_splits

# Parameter setting with the lowest mean squared error across folds.
parameters = params_grid[np.argmin(errors)]

In [47]:
# Mean squared error per candidate in `params_grid`, averaged over the folds.
errors

array([0.24283445, 0.24283436, 0.24283338, 0.24282366, 0.24272695,
       0.24181297, 0.23594965])

In [54]:
# The grid entry with the smallest cross-validated error.
parameters

[0.001, 1.0, 1.0]

In [51]:
# Parameters handed to the kernel-based estimators below: the first entry is
# 10**reg_pow divided by the number of observations, the other two are 1.0.
# NOTE: this re-assigns the notebook-level `reg_pow`, `reg_params` and `params`.
reg_pow = 0
reg_params = (10.0 ** reg_pow) / config['n_observation']
bw_params = (10.0 ** 0)
params = [reg_params, bw_params, bw_params]

# Estimators to compare. Positions matter: other cells index into this list
# (index 4 is CMEbis, index 5 is DoublyRobustbis).
estimators = [  IPSEstimator(behavior_estimator, target_policy),
                SlateEstimator(config['n_reco'], null_policy),
                DirectEstimator(),
                DoublyRobustEstimator(behavior_estimator, target_policy),
                CMEbis(rbf_kernel, rbf_kernel, params),
                DoublyRobustbis(rbf_kernel, rbf_kernel, params, behavior_estimator, target_policy)
                ]

# cme_selector = ParameterSelectorWithBehaviorEstimator(estimators[4])  # cme estimator
# params_grid = [[(10.0 ** p) / config['n_observation'], 1.0, 1.0] for p in np.arange(-7, 0, 1)]
# cme_selector.select_from_propensity(sim_data, params_grid, behavior_estimator, target_policy)
# estimators[4] = cme_selector.estimator

In [52]:
# Display the current notebook-level parameter list.
params

[0.01, 1.0, 1.0]

In [40]:
# NOTE(review): in the visible notebook `cme_selector` is only assigned inside
# commented-out code, so this cell raises NameError on a fresh kernel unless
# that code is re-enabled.
cme_selector.estimator._params

[0.001, 1.0, 1.0]

In [36]:
# Inspect the private `_params` attribute of the fifth entry of `estimators`.
# NOTE(review): `estimators` is assigned by more than one cell, so which object
# this refers to depends on execution order.
estimators[4]._params

[0.001, 1.0, 1.0]

In [55]:
# Inspect `params` of the sixth entry of `estimators`.
# NOTE(review): `estimators` is assigned by more than one cell, so which object
# this refers to depends on execution order.
estimators[5].params

[0.0001, 1.0, 1.0]