In [2]:
import pandas as pd
df = pd.read_csv("/content/finalencodeddsfr.csv")
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print("Shape:", df.shape)
print(df[["coupon_id", "reward"]].head())
import numpy as np
from collections import defaultdict

# unique actions (coupons)
actions = df["coupon_id"].unique()
n_actions = len(actions)
print("Number of unique coupons (actions):", n_actions)

# map coupon ids to indices
action_to_idx = {a: i for i, a in enumerate(actions)}
# epsilon-greedy params
epsilon = 0.1
n_steps = len(df)   # one step per row
Q = np.zeros(n_actions)   # estimated value of each coupon
N = np.zeros(n_actions)   # count how many times each coupon was tried

# store rewards
total_rewards = []

for row in df.itertuples(index=False):
    reward = row.reward

    # choose action
    if np.random.rand() < epsilon:
        action_idx = np.random.randint(n_actions)   # explore
    else:
        action_idx = np.argmax(Q)                   # exploit

    # update estimates
    N[action_idx] += 1
    Q[action_idx] += (reward - Q[action_idx]) / N[action_idx]

    total_rewards.append(reward)

print("Average reward:", np.mean(total_rewards))
best_coupons = np.argsort(Q)[::-1][:10]   # top 10
print("Top coupons:")
for idx in best_coupons:
    print(f"Coupon {actions[idx]} → Estimated value {Q[idx]:.2f}")

Shape: (25249, 36)
   coupon_id  reward
0          7       1
1        443       2
2          8       2
3          7       1
4         22       1
Number of unique coupons (actions): 610
Average reward: 1.4988316368965107
Top coupons:
Coupon 609 → Estimated value 1.54
Coupon 1019 → Estimated value 1.54
Coupon 579 → Estimated value 1.54
Coupon 1012 → Estimated value 1.52
Coupon 776 → Estimated value 1.52
Coupon 342 → Estimated value 1.52
Coupon 301 → Estimated value 1.52
Coupon 357 → Estimated value 1.51
Coupon 531 → Estimated value 1.51
Coupon 436 → Estimated value 1.51
