In [1]:
pip install gym 

You should consider upgrading via the '/Users/hoganma/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import gym
import random
import numpy as np
from gym.envs.registration import register
import time
from IPython.display import clear_output

#Note: an epsilon-greedy strategy is needed to decide when to prioritise exploration over exploitation.
#Without it, the agent just bounces back and forth between the states whose randomly initialised Q-values happen to be largest.

In [3]:
#Register a FrozenLake variant with is_slippery=False so that moves are deterministic
try:
    register(
        id='FrozenLakeNoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name': '4x4', 'is_slippery': False},
        max_episode_steps=100,
        reward_threshold=0.78,
    )
except gym.error.Error:
    #Re-running this cell tries to register the same id again, which gym rejects;
    #only that case is ignored so that genuine mistakes in the arguments still surface
    pass

env_name = "FrozenLakeNoSlip-v0"
env = gym.make(env_name) #handle for interacting with the environment
print("Observation spaces: ", env.observation_space)
print("Action spaces: ", env.action_space)
type(env.action_space)

Observation spaces:  Discrete(16)
Action spaces:  Discrete(4)


gym.spaces.discrete.Discrete

In [4]:
#Action spaces can be discrete or continuous; the agent below handles each case with its own branch

class Agent():
    """Baseline agent that ignores the state and acts uniformly at random.

    Two kinds of action space are handled:

    * discrete spaces, where an action is an integer in ``[0, action_size)``;
    * any other space, which is expected to expose ``low``, ``high`` and
      ``shape`` and is sampled uniformly between ``low`` and ``high``.
    """

    def __init__(self, env):
        """Read the action-space description from ``env`` and print it.

        Args:
            env: environment object exposing an ``action_space`` attribute.
        """
        #isinstance (instead of an exact type comparison) also accepts subclasses of Discrete
        self.is_discrete = isinstance(env.action_space, gym.spaces.discrete.Discrete)

        if self.is_discrete:
            self.action_size = env.action_space.n
            print("Action size: ", self.action_size)
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            #note: despite its name, this attribute holds the *shape* of the action space
            self.action_space = env.action_space.shape
            print("Action range: ", self.action_low, self.action_high)

    #pick the action that the agent does
    def get_action(self, state):
        """Return a uniformly random action.

        Args:
            state: current state; unused here, kept so subclasses can use it.

        Returns:
            An integer in ``[0, action_size)`` for discrete spaces, otherwise
            an array drawn uniformly between ``action_low`` and ``action_high``.
        """
        if self.is_discrete:
            return random.randrange(self.action_size)
        return np.random.uniform(self.action_low, self.action_high, self.action_space)

In [5]:
#Implementing Q-learning

class QAgent(Agent):
    """Tabular Q-learning agent with an epsilon-greedy policy.

    The Q-table has one row per state and one column per action. Epsilon (the
    probability of taking a random action) is multiplied by ``eps_decay``
    every time an episode finishes.
    """

    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01, eps = 1.0, eps_decay = 0.99):
        """Set up the hyper-parameters and the Q-table.

        Args:
            env: environment with a discrete ``observation_space`` (its ``n``
                is read) and an action space understood by ``Agent``.
            discount_rate: weight given to the best next-state Q-value.
            learning_rate: step size of each Q-table update.
            eps: initial exploration probability.
            eps_decay: factor applied to ``eps`` at the end of every episode.
        """
        super().__init__(env) #the parent already defines action_size; only state_size is added here
        self.state_size = env.observation_space.n
        print("State size: ", self.state_size)

        self.eps = eps
        self.eps_decay = eps_decay
        self.learning_rate = learning_rate
        self.discount_rate = discount_rate
        self.build_model()

    #creating the q table (random values to start off) // row = state; col = actions
    def build_model(self):
        """Create the Q-table filled with small random values in ``[0, 1e-4)``."""
        self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])

    def get_action(self, state):
        """Choose an action epsilon-greedily.

        With probability ``eps`` the parent's random action is returned
        (explore); otherwise the action with the highest Q-value for
        ``state`` is returned (exploit).
        """
        q_state = self.q_table[state]
        action_greedy = np.argmax(q_state) #exploit
        #the random action is drawn before the epsilon test, so both draws happen on every call
        action_random = super().get_action(state) #explore
        return action_random if random.random() < self.eps else action_greedy

    def train(self, experience):
        """Apply one Q-learning update and decay epsilon at episode end.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``.
        """
        state, action, next_state, reward, done = experience

        #when the episode is done there is no future value, so bootstrap from zeros
        q_next = np.zeros([self.action_size]) if done else self.q_table[next_state]
        #target = immediate reward + discounted best Q-value of the next state
        q_target = reward + self.discount_rate * np.max(q_next)

        #move the current estimate a fraction (learning_rate) of the way towards the target
        q_update = q_target - self.q_table[state, action]
        self.q_table[state, action] += self.learning_rate * q_update

        if done:
            self.eps *= self.eps_decay #exponential decay for epsilon
        

In [None]:
agent = QAgent(env)
total_reward = 0

#train for 200 episodes, redrawing the board and the Q-table after every single step
for ep in range(200):
    state = env.reset()
    while True:
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action) #result of taking the action

        #feed the full transition to the agent, then advance to the new state
        experience = (state, action, next_state, reward, done)
        agent.train(experience)
        state = next_state
        total_reward += reward

        status_line = "Episode: {} , Total Reward: {}, Epsilon: {}".format(ep, total_reward, agent.eps)
        print("State: ", state, "Action: ", action)
        print(status_line)
        env.render()
        print(agent.q_table)  #can get rid of
        time.sleep(0.05)
        clear_output(wait = True) #wipe the previous frame so only one lake is shown

        if done:
            break
    

State:  2 Action:  2
Episode: 126 , Total Reward: 0.0, Epsilon: 0.28186069554046345
  (Right)
SF[41mF[0mF
FHFH
FFFH
HFFG
[[6.14143103e-05 5.25521830e-05 5.39192306e-05 4.38000092e-05]
 [2.89529826e-05 6.30557004e-05 8.18296783e-05 4.20969580e-05]
 [8.32551447e-05 7.49658664e-05 7.67255952e-05 8.32563370e-05]
 [8.29176491e-05 2.12979591e-05 1.84864891e-05 8.25930187e-05]
 [5.98172014e-05 4.07627362e-05 4.62405852e-06 2.66861829e-05]
 [8.13690890e-05 1.41173144e-05 1.57894708e-05 4.20640417e-05]
 [3.29704999e-05 4.19971872e-05 7.34190666e-05 9.19849432e-05]
 [5.21337980e-05 8.85672852e-05 3.73126776e-05 4.66814984e-05]
 [1.48148479e-05 5.42789613e-05 1.53693929e-05 9.86900049e-06]
 [7.73168773e-05 3.87086223e-05 3.06318808e-05 6.47253564e-05]
 [7.90818751e-05 7.67541194e-05 1.66687333e-05 7.36143497e-05]
 [1.24429558e-05 5.65180744e-05 2.53553970e-05 8.38253534e-05]
 [6.87154949e-05 4.06474354e-05 9.88894650e-05 2.58162005e-05]
 [5.67915965e-05 5.38483679e-05 3.43035673e-05 8.50573497e