In [5]:
import pandas as pd
df = pd.read_csv(r"/content/finalencodedds.csv")
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print("Shape:", df.shape)
print(df[["coupon_id", "reward"]].head())
import numpy as np
from collections import defaultdict

# unique actions (coupons)
actions = df["coupon_id"].unique()
n_actions = len(actions)
print("Number of unique coupons (actions):", n_actions)

# map coupon ids to indices
action_to_idx = {a: i for i, a in enumerate(actions)}
# epsilon-greedy params
epsilon = 0.1
n_steps = len(df)   # one step per row
Q = np.zeros(n_actions)   # estimated value of each coupon
N = np.zeros(n_actions)   # count how many times each coupon was tried

# store rewards
total_rewards = []

for row in df.itertuples(index=False):
    reward = row.reward

    # choose action
    if np.random.rand() < epsilon:
        action_idx = np.random.randint(n_actions)   # explore
    else:
        action_idx = np.argmax(Q)                   # exploit

    # update estimates
    N[action_idx] += 1
    Q[action_idx] += (reward - Q[action_idx]) / N[action_idx]

    total_rewards.append(reward)

print("Average reward:", np.mean(total_rewards))
best_coupons = np.argsort(Q)[::-1][:10]   # top 10
print("Top coupons by learned Q-values:")
for idx in best_coupons:
    print(f"Coupon {actions[idx]} → Estimated value {Q[idx]:.2f}")

Shape: (313919, 54)
   coupon_id  reward
0         20       1
1         23       2
2         23       2
3       1031       3
4         23       3
Number of unique coupons (actions): 769
Average reward: 1.964560921766443
Top coupons by learned Q-values:
Coupon 131 → Estimated value 1.98
Coupon 854 → Estimated value 1.97
Coupon 64 → Estimated value 1.97
Coupon 882 → Estimated value 1.97
Coupon 459 → Estimated value 1.97
Coupon 314 → Estimated value 1.97
Coupon 4 → Estimated value 1.97
Coupon 677 → Estimated value 1.97
Coupon 675 → Estimated value 1.97
Coupon 334 → Estimated value 1.97
