<a href="https://colab.research.google.com/github/jargnar/notebooks/blob/main/qlearning_custom_environment_openai_gym.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import numpy as np
from gym import Env
from gym.spaces import Discrete, Box

In [27]:
an_example_space = Box(low=0, high=2, shape=(3,3), dtype=np.int32)

In [28]:
an_example_space.sample()

array([[2, 2, 1],
       [1, 0, 0],
       [1, 2, 0]], dtype=int32)

In [65]:
class SimpleFrozenLake(Env):
  """
  A 3x3 grid world with 1s and 0s where one can only move right or down.
  It's preferable to live inside 1s than in 0s in this grid world,
  and that's what an agent must do.
  """
  def __init__(self, grid):
    self.action_space = Discrete(2)
    self.observation_space = Box(low=0, high=1, shape=(3, 3), dtype=np.int32)
    self.grid = np.array(grid)
    assert self.grid.shape == (3, 3)
    self.state = (0, 0)

  def step(self, action):
    if action == 0:
      self.state = (self.state[0], self.state[1] + 1)
    elif action == 1:
      self.state = (self.state[0] + 1, self.state[1])
    
    done = False
    reward = self.grid[self.state]
    if self.state[0] == 2 or self.state[1] == 2:
      done = True
  
    return self.state, reward, done, {}
  
  def render(self):
    pass
  
  def reset(self):
    self.state = (0, 0)
    return self.state

In [66]:
env = SimpleFrozenLake([[1, 1, 1], [0, 1, 0], [1, 0, 0]])

In [70]:
qtable = np.zeros((env.observation_space.shape[0] * env.observation_space.shape[1], env.action_space.n))
lr = 0.8
gamma = 0.95
eps = 1.0
maxeps = 1.0
mineps = 0.01 
dr = 0.001

happiness = []

for episode in range(1000):
  state = env.reset()
  step = 0
  done = False
  rewards = 0
  reward = 0
    
  for step in range(100):
    ee = np.random.uniform(0, 1)
    action = np.argmax(qtable[state,:], axis=1)[0] if ee > eps else env.action_space.sample()
    new_state, reward, done, info = env.step(action)
    qtable[state, action] = qtable[state, action] + lr * (reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])
    rewards += reward
    state = new_state
    if done == True: 
        break
        
    episode += 1
    eps = mineps + (maxeps - mineps)*np.exp(-dr*episode) 
    happiness.append(rewards)

print(sum(happiness)/1000)
print(qtable)

1.249
[[19.96392854 19.90523515]
 [19.96327115 19.93844323]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]
 [ 0.          0.        ]]
