In [1]:
import sys


PACKAGE_PARENT = '../../'
sys.path.append(PACKAGE_PARENT)

from alphaslime.evaluate.eval_agents import EvaluateGameSA
from alphaslime.agents.other.semiGradSarsa import SemiGradSarsa
from alphaslime.approx.linearq import LinearQApprox

from pandas import read_csv
import numpy as np
import matplotlib.pyplot as plt

import gym
import csv

In [2]:
# config intial properties

env_id = 'CartPole-v1'
env = gym.make(env_id)
# seed the random numbers and the gym environment
seed = 42
np.random.seed(seed)    
env.seed(seed)

# agent properties
alpha = 0.1 # step size
epsilon = 1
gamma = 0.95
training_episodes = 1000
observation_dimension=4
action_table = [0, 1]


# q function approximator
q_hat = LinearQApprox()

# set config file for agent
config = {
    'alpha': None,
    'gamma': gamma,
    'epsilon': epsilon,
    'action_table': action_table,
    'd': observation_dimension,
    't_max': 500,
    'max_score': 500,
    'episode_printer': 100,
    'env': env,
    'weights':None,
    'q_hat': q_hat
}



In [3]:
# we want to determine the best alpha value
# so iterate alpha
# then train, then determine average score per alpha

# function for saving weights to disk
def save_weight(alpha, weights):
    path = './train/sarsa/'
    file_name = 'weights_alpha' + ".csv"
    path += file_name
    with open(path, 'a', encoding='UTF8', newline='') as f:
        writer = csv.writer(f)

        # write the header
        writer.writerow(str(alpha))

        # write multiple rows
        writer.writerows(weights)

In [4]:
# create alpha list
alphas = np.arange(0.1,1,0.05)

# testing trails
trails = 1

average_rewards = []

# configure evaluation test
base_dir = './'
RENDER = False

agent_rewards = np.zeros((trails,))

In [5]:
# train for different alpha values
for alpha in alphas:
    print('Training agent for alpha = {}'.format(alpha))

    # set config file
    config['alpha'] = alpha
    config['epsilon'] = epsilon

    # init agent
    agent = SemiGradSarsa(config)

    # update alpha value
    agent.alpha = alpha 
    # reset episolon value
    agent.epsilon = epsilon
    # train agent
    agent.train(episodes=training_episodes)

    # get trained weight value
    weights = agent.w

    # save weights
    # TODO: use a thread for write operation
    save_weight(alpha, weights)

    # determine average score per episode
    eval_game = EvaluateGameSA(agent=agent, base_dir_path=base_dir, render=RENDER, env=env)
    agent.epsilon = 0 # make greedy actions
    for episode in range(trails):
        reward = eval_game.evaluate_episode()
        agent_rewards[episode] = reward
    
    # store average reward
    average_rewards.append(np.mean(agent_rewards))

average_rewards = np.array(average_rewards)

    
    

Training agent for alpha = 0.1
Completed Episodes = 0
Completed Episodes = 100
Completed Episodes = 200
Completed Episodes = 300
Completed Episodes = 400
Completed Episodes = 500
Completed Episodes = 600
Completed Episodes = 700
Completed Episodes = 800
Completed Episodes = 900


Error: iterable expected, not numpy.float64

In [None]:
# plot average reward per alpha
plt.plot(alphas, average_rewards)