In [None]:
import gymnasium as gym
from qlearning_solver import QLearningSolver
import numpy as np
import time

In [None]:
class Tester():
    """Harness around a Gymnasium environment for training and testing a Q-table.

    The instance owns one non-rendering environment (``self.env``) which is
    exposed through the ``reset_f`` / ``step_f`` callbacks and used by
    ``evaluate``. ``render`` builds its own short-lived environment with
    ``render_mode='human'`` so that on-screen playback does not disturb the
    state of ``self.env``.

    Args:
        episodes: Number of episodes run by ``evaluate``.
        max_steps: Upper bound on the steps taken per episode in
            ``evaluate`` and ``render``.
        env_id: Gymnasium environment id passed to ``gym.make``.
    """

    def __init__(self, episodes=100, max_steps=100, env_id='CliffWalking-v0'):
        self.episodes = episodes
        self.max_steps = max_steps
        self.env_id = env_id
        self.env = gym.make(env_id)

    def reset_f(self):
        """Reset ``self.env`` and return only the initial state (info is dropped)."""
        state, _ = self.env.reset()
        return state

    def step_f(self, action):
        """Apply ``action`` to ``self.env``.

        Returns:
            ``(next_state, reward, done)`` where ``done`` is true when the
            environment reported either termination or truncation.
        """
        next_state, reward, terminated, truncated, _ = self.env.step(action)
        done = terminated or truncated
        return next_state, reward, done

    def get_obs_size(self):
        """Return ``n`` of the environment's observation space."""
        return self.env.observation_space.n

    def get_act_size(self):
        """Return ``n`` of the environment's action space."""
        return self.env.action_space.n

    def evaluate(self, q_table):
        """Run the greedy policy of ``q_table`` for ``self.episodes`` episodes.

        Args:
            q_table: Indexable by state; ``q_table[state]`` is passed to
                ``np.argmax`` to select the action.

        Returns:
            A list with the summed reward of each episode. An episode stops
            when the environment signals done or after ``self.max_steps`` steps.
        """
        total_rewards = []
        for _ in range(self.episodes):
            state = self.reset_f()
            total_reward = 0
            for _ in range(self.max_steps):
                action = np.argmax(q_table[state])
                # Reuse step_f so termination/truncation is handled in one place.
                state, reward, done = self.step_f(action)
                total_reward += reward
                if done:
                    break
            total_rewards.append(total_reward)
        return total_rewards

    def render(self, q_table, episodes=5, delay=0.05):
        """Play the greedy policy of ``q_table`` in a ``render_mode='human'`` env.

        Args:
            q_table: Indexable by state; ``q_table[state]`` is passed to
                ``np.argmax`` to select the action.
            episodes: Number of episodes to play back.
            delay: Seconds to sleep between steps.

        The rendering environment is always closed on exit, including when
        playback is interrupted (e.g. by Ctrl+C).
        """
        test_env = gym.make(self.env_id, render_mode='human')
        try:
            for episode in range(episodes):
                state, _ = test_env.reset()
                total_reward = 0

                print('Episode', episode)

                for _ in range(self.max_steps):
                    test_env.render()
                    action = np.argmax(q_table[state])
                    state, reward, terminated, truncated, _ = test_env.step(action)
                    total_reward += reward

                    if terminated or truncated:
                        test_env.render()
                        # Report the episode's cumulative reward, matching evaluate().
                        print('finished episode', episode, 'with reward', total_reward)
                        break

                    # short time delay when rendering to allow linux ctrl+c to work
                    # otherwise the app could not be closed
                    time.sleep(delay)
        finally:
            # The render env is created here, so it must be released here.
            test_env.close()

    def close(self):
        """Close the environment owned by this instance."""
        self.env.close()

# Harness instance with the default settings (100 episodes, 100 steps per episode).
tester = Tester()

SyntaxError: invalid syntax (2880797664.py, line 16)

In [None]:
# Training budget for the Q-learning run.
NUM_EPISODES = 10000
MAX_STEPS = 100

# NOTE(review): the size getters are handed over uncalled (bound methods, not
# integers). Confirm QLearningSolver expects callables here; if it expects
# sizes, these should be tester.get_obs_size() / tester.get_act_size().
solver = QLearningSolver(tester.get_obs_size, tester.get_act_size)

# Train the agent through the tester's step/reset callbacks, then play the
# resulting greedy policy on screen.
q_table = solver.train(
    tester.step_f,
    tester.reset_f,
    num_episodes=NUM_EPISODES,
    max_steps=MAX_STEPS,
)
tester.render(q_table)

In [3]:
# Random-policy rollout on CliffWalking with text ('ansi') rendering.
env = gym.make("CliffWalking-v0", render_mode='ansi')

try:
    # Reset environment
    state, _ = env.reset()
    # In 'ansi' mode render() returns the frame as a string, so it has to be
    # printed explicitly to show the starting position.
    print(env.render())

    done = False
    total_reward = 0

    while not done:
        action = env.action_space.sample()  # Random action
        print(action)
        next_state, reward, terminated, truncated, info = env.step(action)
        # An episode ends on either signal; checking only `terminated` would
        # keep stepping an environment that has been truncated.
        done = terminated or truncated
        total_reward += reward

        print(env.render())
        print(f"Action: {action}, Reward: {reward}, State: {next_state}\n")

    print(f"Total reward: {total_reward}")
finally:
    # Release the environment even if the rollout is interrupted.
    env.close()

0
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T


Action: 0, Reward: -1, State: 24

0
o  o  o  o  o  o  o  o  o  o  o  o
x  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T


Action: 0, Reward: -1, State: 12

0
x  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T


Action: 0, Reward: -1, State: 0

3
x  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T


Action: 3, Reward: -1, State: 0

3
x  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T


Action: 3, Reward: -1, State: 0

2
o  o  o  o  o  o  o  o  o  o  o  o
x  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  