In [None]:
# 1. Initialization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from src.environment import RecSysEnv
from src.policies import get_logging_policy, get_target_policy
from src.estimators import calculate_ips, calculate_dr
from src.utils import plot_ope_convergence

# Problem dimensions shared by the environment and both policies. Naming them
# once keeps the three constructors below from drifting out of sync.
N_ARMS = 5
CONTEXT_DIM = 10

# Seed NumPy's global RNG so that Restart & Run All reproduces the same numbers.
# NOTE(review): this only has an effect if src.environment / src.policies draw
# from NumPy's global generator -- confirm, or pass a seed/Generator to them.
SEED = 42
np.random.seed(SEED)

# Initialize our research environment together with the logging policy (pi_0)
# and the target policy (pi_e), all built on the same problem dimensions.
env = RecSysEnv(n_arms=N_ARMS, context_dim=CONTEXT_DIM)
pi_0 = get_logging_policy(n_arms=N_ARMS, context_dim=CONTEXT_DIM)
pi_e = get_target_policy(n_arms=N_ARMS, context_dim=CONTEXT_DIM)

In [None]:
# 2. Ground Truth: The Oracle
def get_ground_truth(env, policy, iterations=10000):
    """Estimate a policy's value by running it directly in the environment.

    Each iteration draws a context from ``env``, lets ``policy`` pick an action
    for that context, and records the reward ``env`` returns for the pair.

    Args:
        env: Object exposing ``get_context()`` and ``get_reward(ctx, action)``.
        policy: Object whose ``select_action(ctx)`` returns a 2-tuple; the
            first element is used as the action, the second is ignored here.
        iterations: Number of rollouts to average over. Must be at least 1.

    Returns:
        The mean of the collected rewards, as computed by ``np.mean``.

    Raises:
        ValueError: If ``iterations`` is less than 1. Without this guard the
            function would average an empty list and silently return NaN.
    """
    if iterations < 1:
        raise ValueError(
            f"iterations must be a positive integer, got {iterations!r}"
        )

    rewards = []
    for _ in range(iterations):
        ctx = env.get_context()
        # select_action returns (action, <second value>); only the action is
        # needed to query the environment for a reward.
        action, _ = policy.select_action(ctx)
        rewards.append(env.get_reward(ctx, action))
    return np.mean(rewards)

# Simulated value of the target policy; the estimators are compared against it.
v_true = get_ground_truth(env=env, policy=pi_e)
print(f"True Value of Target Policy (Ground Truth): {v_true:.4f}")

In [None]:
# 3. Data Collection (The Logged Dataset)
# Number of logged interactions to collect under the logging policy.
N_LOGGED_SAMPLES = 5000

data_logs = []
for _ in range(N_LOGGED_SAMPLES):
    ctx = env.get_context()
    # The logging policy returns the chosen action and a second value, which
    # is stored below as the logging propensity 'p0'.
    action, prob_0 = pi_0.select_action(ctx)
    reward = env.get_reward(ctx, action)

    # Get the probability that the NEW policy would have taken this action
    prob_e = pi_e.get_action_probabilities(ctx)[action]

    data_logs.append({
        'reward': reward,
        'p0': prob_0,
        'pe': prob_e,
    })

df = pd.DataFrame(data_logs)

# The importance weights divide by p0, so a zero or negative propensity would
# turn them into inf/NaN. Fail here with a clear message instead.
assert (df['p0'] > 0).all(), "logging propensities (p0) must be strictly positive"

In [None]:
# 4. Running the Estimators
# IPS estimate: mean of the logged rewards, each scaled by its weight pe / p0.
ips_est = df['reward'].mul(df['pe'].div(df['p0'])).mean()

print(f"Ground Truth: {v_true:.4f}")
print(f"IPS Estimate: {ips_est:.4f}")
print(f"Estimation Error: {abs(v_true - ips_est):.4f}")

In [None]:
# 5. Visualizing the Variance
# Per-sample importance weights (target probability over logging probability).
importance_weights = df['pe'] / df['p0']

# Use an explicit Figure/Axes so the histogram always gets its own figure
# rather than being drawn onto whatever pyplot figure happens to be current.
fig, ax = plt.subplots()
ax.hist(importance_weights, bins=30, color='skyblue', edgecolor='black')
ax.set_title("Distribution of Importance Weights")
ax.set_xlabel("Weight (pi_e / pi_0)")
ax.set_ylabel("Frequency")
plt.show()