In [None]:
import numpy as np

# --- Feature map: RBFs over x in [0,1] ---
def rbf_features(x, centers, sigma):
    """Evaluate Gaussian radial-basis features at position ``x``.

    Each feature is ``exp(-0.5 * ((x - c) / sigma) ** 2)`` for one center
    ``c`` in ``centers``. With a scalar ``x`` and a 1-D ``centers`` array the
    result has one entry per center.
    """
    # Distance from x to every center, measured in units of the RBF width.
    scaled_offset = (x - centers) / sigma
    return np.exp(-0.5 * scaled_offset ** 2)

# --- Environment: drift right, reward near the end ---
def step(x):
    """Advance the 1-D environment by one transition.

    The position moves right by 0.07 plus Gaussian noise (std 0.01) drawn
    from NumPy's global RNG, and is clipped to [0, 1].

    Returns:
        (x_next, r, done): the new position, a reward of 1.0 when the new
        position exceeds 0.95 (else 0.0), and a bool that is True once the
        new position exceeds 0.99.
    """
    noise = 0.01 * np.random.randn()
    x_next = np.clip(x + 0.07 + noise, 0.0, 1.0)

    reward = float(x_next > 0.95)
    reached_end = bool(x_next > 0.99)
    return x_next, reward, reached_end

def run_td_lambda(lam=0.9, n_episodes=50, seed=0):
    """Learn V(x) with semi-gradient TD(lambda) on RBF features.

    Every episode starts at x = 0 and follows ``step`` until it reports
    ``done``. The value function is linear in 15 Gaussian RBF features
    (``V(x) = w @ phi(x)``) and is trained with accumulating eligibility
    traces.

    Args:
        lam: Trace-decay parameter lambda; 0 gives one-step TD(0).
        n_episodes: Number of training episodes.
        seed: Seed for NumPy's global RNG, which ``step`` draws its noise from.

    Returns:
        (xs, Vs, w): a 51-point grid over [0, 1], the learned value at each
        grid point, and the final weight vector.
    """
    # step() uses np.random.randn(), i.e. the global RNG, so that is the
    # generator that must be seeded for reproducible runs.
    np.random.seed(seed)

    n_feat = 15
    centers = np.linspace(0.0, 1.0, n_feat)
    sigma = 0.08

    gamma = 0.98
    alpha = 0.05

    w = np.zeros(n_feat)

    for _ in range(n_episodes):
        x = 0.0
        e = np.zeros_like(w)  # eligibility trace, one entry per weight
        phi = rbf_features(x, centers, sigma)

        done = False
        while not done:
            x_next, r, done = step(x)
            phi_next = rbf_features(x_next, centers, sigma)

            # A terminal state has value 0 by definition. Bootstrapping from
            # w @ phi_next on the final transition would let the terminal
            # reward feed back on itself and inflate V near the goal.
            v_next = 0.0 if done else w @ phi_next
            delta = r + gamma * v_next - w @ phi

            # Accumulating trace: decay the old credit, add current features.
            e = gamma * lam * e + phi

            w += alpha * delta * e

            # The next step's features are exactly phi_next; reuse them.
            x, phi = x_next, phi_next

    # Evaluate learned V(x) on a grid
    xs = np.linspace(0, 1, 51)
    Vs = np.array([w @ rbf_features(x_eval, centers, sigma) for x_eval in xs])
    return xs, Vs, w

# Train twice with identical episode count and seed, once with lambda=0
# (no trace) and once with lambda=0.9 (longer credit assignment).
xs0, V0, w0 = run_td_lambda(n_episodes=80, seed=1, lam=0.0)
xs9, V9, w9 = run_td_lambda(n_episodes=80, seed=1, lam=0.9)

print("Sample V(x) near the middle vs near the end")
# For each query position, report both estimates at the nearest grid point.
for x_query in (0.2, 0.5, 0.8, 0.95):
    i = np.abs(xs0 - x_query).argmin()
    print(f"x={xs0[i]:.2f}  V_lambda0={V0[i]:.3f}   V_lambda0.9={V9[i]:.3f}")

# If you want to see the trace behavior directly for ONE episode:
def demo_one_episode(lam=0.9, seed=2):
    """Print how the eligibility trace evolves at the start of one episode.

    Starting from x = 0, runs up to 10 environment steps (stopping early if
    ``step`` reports ``done``). At each step it prints the norms of the
    current feature vector and of the trace, plus the three features with
    the largest trace values. No weights are learned here; only the trace
    recursion ``e = gamma * lam * e + phi`` is shown.

    Args:
        lam: Trace-decay parameter lambda.
        seed: Seed for NumPy's global RNG, which ``step`` draws from.
    """
    np.random.seed(seed)

    n_feat = 10
    sigma = 0.10
    gamma = 0.98
    centers = np.linspace(0.0, 1.0, n_feat)
    decay = gamma * lam  # per-step shrink factor applied to the old trace

    x = 0.0
    e = np.zeros(n_feat)

    print("\n--- Trace demo (first ~10 steps) ---")
    for t in range(10):
        phi = rbf_features(x, centers, sigma)
        e = decay * e + phi

        print(f"t={t:2d} x={x:.3f}  ||phi||={np.linalg.norm(phi):.3f}  ||e||={np.linalg.norm(e):.3f}")
        # Indices of the three largest trace entries, largest first.
        most_eligible = np.argsort(-e)[:3]
        print("   top-eligible feature idx:", most_eligible, "values:", np.round(e[most_eligible], 3))

        # The reward is irrelevant for this demo; only position and done matter.
        x, _, done = step(x)
        if done:
            break

# Run the trace demo once with lambda=0.9, using the function's default seed.
demo_one_episode(lam=0.9)
