In [1]:
from lspi.basis_functions import ExactBasis, RadialBasisFunction
from lspi.policy import Policy
from lspi.policy_ct import QuadraticPolicy
from lspi.sample import Sample
from lspi.solvers import LSTDQSolver
import lspi
from lspi import domains
import numpy as np

In [4]:
# Load the array stored in ssar.npy and display its dimensions as the cell output.
# NOTE(review): `ssar` is not referenced by any other visible cell — confirm it is still needed.
ssar = np.load("ssar.npy")
ssar.shape

(1000, 12)

In [2]:
import numpy as np
from lspi.policy import Policy

def lspi_loop_offline(solver, samples, discount, epsilon, max_iterations = 5, initial_policy=None):
    """Run an offline policy-iteration loop over a fixed batch of samples.

    Every round hands the most recent policy to ``lspi.learn`` and measures how
    far the weights of the returned policy moved. The loop stops when the L2
    distance between consecutive weight vectors is at most ``epsilon`` or when
    ``max_iterations`` rounds have been executed.

    Args:
        solver: Solver object forwarded unchanged to ``lspi.learn``.
        samples: Batch of samples forwarded unchanged to ``lspi.learn``.
        discount: Discount handed to ``QuadraticPolicy`` when no
            ``initial_policy`` is supplied.
        epsilon: Convergence threshold on the L2 distance between the weights
            of two consecutive policies.
        max_iterations: Upper bound on the number of rounds.
        initial_policy: Policy to start from. When ``None`` a
            ``QuadraticPolicy(n_action=1, n_state=4, discount=discount)`` is
            created and used as the starting point.

    Returns:
        tuple: ``(policy, all_policies)`` where ``policy`` is the last policy
        produced (the starting policy if no round ran) and ``all_policies`` is
        the list of every policy seen, beginning with the starting policy.
    """
    # Only build the default policy when the caller did not provide one.
    if initial_policy is None:
        initial_policy = QuadraticPolicy(n_action= 1, n_state= 4, discount = discount)

    # Initialize policy iteration
    policy = initial_policy
    iteration = 0
    distance = float('inf')
    all_policies = [initial_policy]

    # If no samples, return. len() is tried first so that array-like inputs
    # (whose truth value is ambiguous) work; objects without a length fall
    # back to plain truthiness.
    try:
        no_samples = len(samples) == 0
    except TypeError:
        no_samples = not samples
    if no_samples:
        print('Warning: Empty sample set')
        return policy, all_policies

    # Main LSPI loop
    while iteration < max_iterations and distance > epsilon:
        iteration += 1
        print('*********************************************************')
        print(f'LSPI iteration: {iteration}')

        # Evaluate the *current* policy (and implicitly improve). Feeding the
        # starting policy again on every round would reproduce the same
        # result each time and report convergence after two rounds.
        previous_policy = all_policies[-1]
        policy = lspi.learn(samples, previous_policy, solver)

        # Compute the distance between the current and the previous policy
        if len(policy.weights) == len(previous_policy.weights):
            difference = policy.weights - previous_policy.weights
            lmax_norm = np.linalg.norm(difference, np.inf)
            l2_norm = np.linalg.norm(difference)
        else:
            # Weight vectors of different length cannot be subtracted, so
            # compare their norms instead.
            lmax_norm = abs(np.linalg.norm(policy.weights, np.inf) -
                            np.linalg.norm(previous_policy.weights, np.inf))
            l2_norm = abs(np.linalg.norm(policy.weights) -
                          np.linalg.norm(previous_policy.weights))
        distance = l2_norm

        # Print some information
        print(f'   Norms -> Lmax: {lmax_norm:.6f}   L2: {l2_norm:.6f}')

        # Store the current policy
        all_policies.append(policy)

    # Display some info
    print('*********************************************************')
    if distance > epsilon:
        print(f'LSPI finished in {iteration} iterations WITHOUT CONVERGENCE to a fixed point')
    else:
        print(f'LSPI converged in {iteration} iterations')
    print('*********************************************************')

    return policy, all_policies



In [3]:
def load_samples_from_file(filename):
    """Parse a comma-separated sample file into a list of ``Sample`` objects.

    Each non-blank line must hold at least eleven comma-separated fields:
    fields 0-3 are the state (floats), field 4 the action (int), field 5 the
    reward (float), fields 6-9 the next state (floats) and field 10 the done
    flag (``0`` or ``1``).

    Args:
        filename: Path of the text file to read.

    Returns:
        list: One ``Sample(state, action, reward, next_state, done)`` per
        non-blank line, in file order.
    """
    samples = []
    with open(filename, 'r') as file:
        for line in file:
            stripped = line.strip()
            # Blank or whitespace-only lines (e.g. a trailing empty line)
            # carry no sample and would otherwise fail on the field lookup.
            if not stripped:
                continue
            data = stripped.split(',')
            state = np.array([float(x) for x in data[0:4]])
            action = int(data[4])
            reward = float(data[5])
            next_state = np.array([float(x) for x in data[6:10]])
            done = bool(int(data[10]))
            samples.append(Sample(state, action, reward, next_state, done))
    return samples

def generate_file(file_name, num_samples=1000, seed=None):
    """Write a file of random comma-separated samples, one per line.

    Every line has eleven fields: four state values, an action drawn from
    ``{0, 1}``, a reward, four next-state values and a done flag drawn from
    ``{0, 1}``. State, reward and next-state values come from ``rand`` and
    therefore lie in ``[0, 1)``.

    Args:
        file_name: Path of the file to create (overwritten if it exists).
        num_samples: Number of lines to write.
        seed: Optional seed. When given, a private ``RandomState(seed)`` is
            used so the file content is reproducible and the global NumPy
            random state is left untouched. When ``None`` the global
            ``np.random`` functions are used.
    """
    # RandomState exposes the same rand/randint/choice API as the np.random
    # module, so both sources can be used interchangeably below.
    rng = np.random if seed is None else np.random.RandomState(seed)
    with open(file_name, 'w') as file:
        for _ in range(num_samples):
            state = rng.rand(4)
            action = rng.randint(0, 2)
            reward = rng.rand()
            next_state = rng.rand(4)
            done = rng.choice([0, 1])
            file.write(f"{','.join(map(str, state))},{action},{reward},{','.join(map(str, next_state))},{done}\n")

In [4]:
# Seed NumPy's global RNG so the randomly generated sample file is the same on every run.
np.random.seed(0)
generate_file('samples.txt', num_samples=1000)

# Read the generated samples back (once) and run the offline LSPI loop on them.
# NOTE(review): the recorded output of this cell ends in a broadcasting
# ValueError (shapes (15,1) vs (5,1)); the cause is not visible in this notebook.
samples = load_samples_from_file('samples.txt')
solver = LSTDQSolver()
policy, all_policies = lspi_loop_offline(solver, samples, discount=0.9, epsilon=0.01, max_iterations=5)


*********************************************************
LSPI iteration: 1


ValueError: operands could not be broadcast together with shapes (15,1) (5,1) 

In [None]:

# Gather 1000 transitions by repeatedly asking the sampling policy for an
# action in the domain's current state and applying it to the domain.
# NOTE(review): `domain` is not defined in any visible cell — confirm it is
# created before this cell runs.
sampling_policy = lspi.Policy(lspi.basis_functions.DummyBasis(2), .9, 1)

samples = []
for i in range(1000):
    current_state = domain.current_state()
    action = sampling_policy.select_action(current_state)
    transition = domain.apply_action(action)
    samples.append(transition)

# Total reward collected while following the sampling policy.
rewards = [sample.reward for sample in samples]
random_policy_cum_rewards = np.sum(rewards)

# samples

In [None]:
# Learn a policy from the collected samples, starting from a policy built on a
# radial-basis-function basis, then roll it out for 1000 steps and compare the
# reward it collects with the total from the sampling run.
# NOTE(review): `domain` is not defined in any visible cell — confirm it is
# created before this cell runs.
solver = lspi.solvers.LSTDQSolver()

rbf_basis = lspi.basis_functions.RadialBasisFunction(
    means = np.array([[0], [2], [4], [6], [8]]),
    gamma=.5,
    num_actions=2)
initial_policy = lspi.Policy(rbf_basis, discount=.9, explore=0)

learned_policy = lspi.learn(samples, initial_policy, solver)

# Start the rollout from a freshly reset domain.
domain.reset()
cumulative_reward = 0
for i in range(1000):
    current_state = domain.current_state()
    action = learned_policy.select_action(current_state)
    sample = domain.apply_action(action)
    cumulative_reward += sample.reward

# Displayed as (learned-policy reward, sampling-policy reward).
cumulative_reward, random_policy_cum_rewards


(879, np.int64(207))