In [2]:
import pandas as pd
import numpy as np
from collections import defaultdict

# Read the logged MDP transitions that the training loop replays.
df = pd.read_csv("/content/mdprewards.csv")

# Q-learning settings
episodes = 5   # full sweeps over the dataset; raise for more training
gamma = 0.9    # discount factor applied to the next state's best Q-value
alpha = 0.1    # learning rate (step size of each Q-value update)

# Q-table keyed by (state, action) tuples; pairs not yet seen read as 0.0
Q = defaultdict(float)

# Map every state to the distinct actions logged for it. This depends only on
# df, so it is built once here; filtering the DataFrame again for each row
# inside the loop made every pass quadratic in the number of rows.
# Rows whose state is NaN are dropped by groupby, which matches the equality
# filter it replaces (NaN never compares equal to anything).
actions_by_state = (
    df.groupby("state", sort=False)["action"].unique().to_dict()
)

# training loop: replay the logged transitions `episodes` times, applying
#   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
for _ in range(episodes):
    for state, action, reward, next_state in zip(
        df["state"], df["action"], df["mdpreward"], df["next_state"]
    ):
        # current Q value
        old_q = Q[(state, action)]

        # Best future Q value from next_state. Stays 0 when next_state is
        # missing or never occurs in the "state" column.
        future_q = 0
        if pd.notna(next_state):
            future_actions = actions_by_state.get(next_state)
            if future_actions is not None:
                future_q = max(Q[(next_state, a)] for a in future_actions)

        # Q-learning update
        Q[(state, action)] = old_q + alpha * (reward + gamma * future_q - old_q)

print("Training done! Learned Q-values for", len(Q), "state-action pairs.")

# Example: best (action, Q-value) pair per state, picked greedily from Q.
# Stored as `best_by_state` rather than `policy`: the name `policy` is bound to
# a lookup function further down this file, and reusing it for this dict made
# the same name mean two different things depending on how far the script ran.
best_by_state = {}
for (s, a), val in Q.items():
    # strict ">" keeps the first-seen action when Q-values tie
    if s not in best_by_state or val > best_by_state[s][1]:
        best_by_state[s] = (a, val)

print("\nBest coupon per state (sample):")
for s, (a, v) in list(best_by_state.items())[:10]:
    # str() so the 50-character preview also works for non-string states
    print(f"State={str(s)[:50]}... -> Best coupon {a} (Q={v:.2f})")

def policy(state):
    """Return the action with the highest Q-value recorded for ``state``.

    Scans the module-level Q-table for entries whose state equals ``state``.
    Returns ``None`` when the table holds no entry for that state. When
    several actions share the highest value, the one that appears first in
    the Q-table's iteration order is returned.
    """
    q_by_action = {act: q for (st, act), q in Q.items() if st == state}
    if q_by_action:
        return max(q_by_action, key=q_by_action.get)
    return None

# Look up the greedy coupon for one hand-written state string.
# NOTE(review): the recorded output of this cell prints None for this state,
# i.e. policy() found no Q entry for it. The sampled states printed by this
# cell look like "_Spend=131.44", so this string presumably does not match the
# state format in the dataset -- confirm against the CSV's "state" column.
example_state = "Marital=Single|Age=18-25|Spend=120.5"
best_coupon = policy(example_state)
print("Best coupon for", example_state, ",", best_coupon)


Training done! Learned Q-values for 4970 state-action pairs.

Best coupon per state (sample):
State=_Spend=131.44... -> Best coupon 6 (Q=1.07)
State=_Spend=1131.65... -> Best coupon 416 (Q=-0.22)
State=_Spend=403.94... -> Best coupon 868 (Q=-0.09)
State=_Spend=106.5... -> Best coupon 786 (Q=1.50)
State=_Spend=1722.58... -> Best coupon 416 (Q=-0.21)
State=_Spend=142.12... -> Best coupon 8 (Q=1.27)
State=_Spend=67.32... -> Best coupon 325 (Q=1.62)
State=_Spend=177.74... -> Best coupon 468 (Q=0.72)
State=_Spend=248.98... -> Best coupon 23 (Q=0.79)
State=_Spend=2687.53... -> Best coupon 416 (Q=-0.45)
Best coupon for Marital=Single|Age=18-25|Spend=120.5 , None
