In [None]:
import time

import numpy as np
import numpy.linalg as la
import matplotlib.pyplot as plt
import scipy.stats as scist

from sklearn.preprocessing import PolynomialFeatures
from sklearn.kernel_approximation import RBFSampler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor

import gymnasium as gym

from rl import Rollout

# Playground

The cells below experiment with how to do training and validation for policy evaluation.

In [None]:
def get_rollout_object():
    """Build a LunarLander environment and a ``Rollout`` attached to it.

    Returns:
        tuple: ``(env, rollout)`` where ``env`` is the environment created by
        ``gym.make("LunarLander-v2")`` and ``rollout`` is
        ``Rollout(env, gamma=0.99)``.
    """
    discount = 0.99
    lander_env = gym.make("LunarLander-v2")
    return lander_env, Rollout(lander_env, gamma=discount)

### Warm up

In [None]:
def get_samples(n_steps=4096):
    """Collect ``n_steps`` transitions under a uniformly random policy.

    A fresh environment/rollout pair is created via ``get_rollout_object()``.
    At every step a random action is sampled, the transition is recorded with
    ``rollout.add_step_data`` and the environment is reset whenever the
    episode terminates or is truncated.

    Args:
        n_steps: Number of environment steps to take in total (across episodes).

    Returns:
        Whatever ``rollout.get_est_stateaction_value()`` returns; callers in
        this notebook unpack it as ``(q_est, adv_est, s_visited, a_visited)``.
    """
    (env, rollout) = get_rollout_object()

    try:
        s, _ = env.reset()
        for _ in range(n_steps):
            a = env.action_space.sample()
            (s_next, r, term, trunc, _) = env.step(a)
            rollout.add_step_data(s,a,r,term or trunc)
            s = s_next

            if term or trunc:
                s,_ = env.reset()

        # this is training data
        return rollout.get_est_stateaction_value()
    finally:
        # The environment is private to this call; release it even if a step
        # raises, so repeated calls do not accumulate open environments.
        env.close()

### SGD for RKHS with validation

In [None]:
import sklearn.pipeline
import sklearn.preprocessing 
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDRegressor

In [None]:
# initialize
# Random Fourier features for an RBF kernel. (A degree-1 PolynomialFeatures
# featurizer used to be assigned here too, but it was overwritten immediately
# and never used.)
featurizer = RBFSampler(gamma=1.0, n_components=100)

# warm up
n_warmup_steps = 100
(env, rollout) = get_rollout_object()
# One sampled observation per ROW, i.e. shape (n_warmup_steps, obs_dim), which
# is the (n_samples, n_features) layout `fit` expects. The previous
# `np.atleast_2d(...).T` produced (obs_dim, n_warmup_steps) for vector
# observations, so the featurizer was fitted with the wrong feature count.
# `reshape(n, -1)` also covers scalar observations (giving one column).
X = np.array(
    [env.observation_space.sample() for _ in range(n_warmup_steps)]
).reshape(n_warmup_steps, -1)
featurizer.fit(X)

In [None]:
def custom_SGD(solver, X, y, n_epochs, max_regress = 5, frac_val=0.1, minibatch=64, tol=1e-3):
    """Train ``solver`` with mini-batch ``partial_fit`` and validation-based early stopping.

    The data is split ONCE into a training part and a held-out validation
    part. Previously the split was redrawn every epoch, so rows used for
    "validation" in one epoch had already been trained on in earlier epochs
    and the validation loss (and the early stopping built on it) was not
    measured on unseen data. Only the order of the training rows is reshuffled
    from epoch to epoch.

    Args:
        solver: Estimator exposing ``partial_fit(X, y)`` and ``predict(X)``
            (e.g. ``SGDRegressor``). It is updated in place.
        X: Feature matrix, one sample per row.
        y: Regression targets, one per row of ``X``; flattened to 1-D.
        n_epochs: Maximum number of passes over the training split.
        max_regress: Stop once the validation loss has risen by more than a
            factor ``1 + tol`` for this many consecutive epochs. Pass
            ``np.inf`` to disable early stopping.
        frac_val: Fraction of the data held out for validation.
        minibatch: Number of samples per ``partial_fit`` call.
        tol: Relative increase of the validation loss that counts as a
            regression.

    Returns:
        tuple: ``(train_losses, test_losses)``, two 1-D arrays with the mean
        squared error on the training and validation split after each
        completed epoch.
    """
    if n_epochs <= 0:
        return np.array([]), np.array([])

    # Flatten targets so `predict(...) - y` cannot broadcast into a matrix
    # when y arrives as a column vector.
    y = np.ravel(y)

    # Fixed hold-out split: the validation rows are never shown to the solver.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=0, shuffle=True, test_size=frac_val
    )
    X_train, X_test = np.asarray(X_train), np.asarray(X_test)
    n_train = len(X_train)

    n_consec_regress_epochs = 0

    train_losses = []
    test_losses = []

    for epoch in range(n_epochs):
        # Deterministic per-epoch reshuffle of the training rows only.
        order = np.random.default_rng(epoch).permutation(n_train)
        for k_s in range(0, n_train, minibatch):
            batch = order[k_s:k_s + minibatch]
            # mini-batch update
            solver.partial_fit(X_train[batch], y_train[batch])

        train_losses.append(np.mean((solver.predict(X_train) - y_train) ** 2))
        test_losses.append(np.mean((solver.predict(X_test) - y_test) ** 2))

        if len(test_losses) > 1 and test_losses[-1] > (1.+tol)*test_losses[-2]:
            n_consec_regress_epochs += 1
        else:
            n_consec_regress_epochs = 0
        if n_consec_regress_epochs >= max_regress:
            print("Early stopping")
            break

    return np.array(train_losses), np.array(test_losses)

In [None]:
# Multi-scale random Fourier features: one RBFSampler per kernel bandwidth,
# concatenated into a single feature vector by FeatureUnion.
rbf_gammas = [25, 10, 5.0, 2.0, 1.0, 0.5, 0.1, 0.01]
featurizer = sklearn.pipeline.FeatureUnion([
    (f"rbf{k}", RBFSampler(gamma=g, n_components=100))
    for k, g in enumerate(rbf_gammas, start=1)
])

# Collect data and keep only the samples of the most frequently taken action,
# so a single regressor is fitted for that action.
(q_est, adv_est, s_visited, a_visited) = get_samples()
act_mode = scist.mode(a_visited)[0].flat[0]
idx_with_mode = np.where(np.squeeze(a_visited) == act_mode)
y = q_est[idx_with_mode]
X = featurizer.fit_transform(s_visited[idx_with_mode])

# setup solve
n_epochs = 200
solver = SGDRegressor(
    max_iter=n_epochs,
    tol=1e-3,
    learning_rate="constant",
    eta0=0.01,
)

# train
train_losses, test_losses = custom_SGD(solver, X, y, n_epochs)

# plot: per-epoch training and validation loss
plt.style.use('ggplot')
_, ax = plt.subplots()

train_epochs = np.arange(len(train_losses))
test_epochs = np.arange(len(test_losses))
ax.plot(train_epochs, train_losses, label="train", color="gray")
ax.plot(test_epochs, test_losses, label="test", linestyle="dotted", color="red")
ax.set(xlabel="epoch", ylabel="loss", title="Cross validation")
ax.legend()

print(f"Final training loss: {train_losses[-1]:.2f}")

Compare with the loss reached by scikit-learn's built-in `SGDRegressor.fit` on the same data.

In [None]:
# Baseline: let the solver run its own training loop on all of X, y (no
# held-out split), then report the mean squared error on that same data.
# `a` only captures the return value of fit.
a = solver.fit(X, y)
residual = solver.predict(X) - y
print(f"Final loss: {la.norm(residual)**2/len(y):.2f}")

### Compare runtime
Let's compare runtime over `10` trials.

In [None]:
# Time 10 trials of the hand-written mini-batch SGD loop.
n_epochs = 32
s_time = time.time()
custom_sgd_losses = []
for i in range(10):
    # NOTE(review): the freshly collected samples are not featurized; training
    # below reuses the X, y built in the earlier cell. The call is kept so the
    # timed work matches the SGDRegressor.fit timing cell.
    (q_est, adv_est, s_visited, a_visited) = get_samples()
    # Start every trial from an untrained model. Reusing one solver made each
    # trial continue training from the previous one (custom_SGD only calls
    # partial_fit), so the reported losses were not independent trials.
    solver = SGDRegressor(max_iter=n_epochs, tol=1e-3, learning_rate="constant", eta0=0.01)
    train_losses, test_losses = custom_SGD(solver, X, y, max_regress=np.inf, n_epochs=n_epochs)
    custom_sgd_losses.append(train_losses[-1])
total_time = time.time() - s_time
print(f"Total time: {total_time:.2f}s. Losses:\n{custom_sgd_losses}")

In [None]:
# Reference model: scikit-learn runs the epoch loop itself, capped at
# n_epochs iterations, with the same constant learning rate.
solver = SGDRegressor(
    learning_rate="constant",
    eta0=0.01,
    max_iter=n_epochs,
    tol=1e-3,
    early_stopping=False,
)

# Time 10 trials of collecting samples followed by a full `fit`.
sgd_losses = []
s_time = time.time()
for i in range(10):
    # The sampled tuple is not used below; fit runs on the X, y from earlier.
    (q_est, adv_est, s_visited, a_visited) = get_samples()
    solver.fit(X, y)
    residual = solver.predict(X) - y
    sgd_losses.append(la.norm(residual)**2/len(y))
total_time = time.time() - s_time
print(f"Total time: {total_time:.2f}s. Losses:\n{sgd_losses}")

We can see the `SGDRegressor` is about 3-4x faster than our implementation.

# SGD for NN 

Start with imports.

In [None]:
import torch
from torch import nn