# IMPLEMENTING TEMPORAL DIFFRENCE LEARNING IN FROZEN LAKE (openai's gym environment) using q-learning (qtable)

In [2]:
!python3 -m pip install gym

Collecting gym
  Downloading gym-0.17.2.tar.gz (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 13 kB/s 
Collecting pyglet<=1.5.0,>=1.4.0
  Downloading pyglet-1.5.0-py2.py3-none-any.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 16 kB/s 
[?25hCollecting cloudpickle<1.4.0,>=1.2.0
  Downloading cloudpickle-1.3.0-py2.py3-none-any.whl (26 kB)
Collecting future
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 26 kB/s 
[?25hBuilding wheels for collected packages: gym, future
  Building wheel for gym (setup.py) ... [?25ldone
[?25h  Created wheel for gym: filename=gym-0.17.2-py3-none-any.whl size=1650891 sha256=350afba66562fd108dbd1c13befabcc3a9f2c8a99ca410a9e5cf3dd42bf99bff
  Stored in directory: /Users/user/Library/Caches/pip/wheels/48/bf/7c/44b1b8e4ad998fc48e31caedbb9e028351861b8d20632642bc
  Building wheel for future (setup.py) ... [?25ldone
[?25h  Created wheel for future: filename=future-0.18.2-py3-none-any.whl 

## import packages gym for environment and performing trials, random for determining exploitation or exploration using episilon, and numpy for numerical computation in array-like manner.

In [25]:
import numpy as np
import gym
import random

## random example from https://gym.openai.com/docs/ to test my gym

In [12]:
env = gym.make('CartPole-v0')
env.reset()
for _ in range(1000):
    env.render()
    env.step(env.action_space.sample()) # take a random action
env.close()

## define enironment for the agent

In [26]:
env = gym.make("FrozenLake-v0")

## get environment size for determining qtable. size is observable space and possible action is action space

In [27]:
action_size = env.action_space.n
state_size = env.observation_space.n

## initialize qtable

In [52]:
qtable = np.zeros((state_size, action_size))
print (qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


## defining our hyper parameters

In [75]:
total_episodes = 100000 #number of times the episode/ trials are done.
learning_rate = 0.7 #rate of update of action_space values in a state
max_steps = 99 #max_number of step to take in environment before end of episode
gamma = 0.95 #discount

## initialising learning episodial learning parameters

In [76]:
epsilon  = 1 # variable used in epsilon greedy algorithm
max_epsilon = 1 # the ceiling value of epsilon
min_epsilon = 0.01 # the floor value of epsilon 
epsilon_range = max_epsilon - min_epsilon # 
decay_rate = 0.005 # the power of e = 2.7182818285, natural number, by which we decrease epsilon

## initializing the agent's accumulated reward

In [77]:
accumulated_rewards = []
explore_exploit_data = []

## this block of code represents our agent performing actions in the environment 

In [78]:
for episode in range(total_episodes):
    state = env.reset() # we are starting from the left-uppermost position (i.e the beginning state of the frozen lake environment)
    total_reward = 0 # the reward our agent accumulates over the 100000 episodes
    dead = False
    for step in range(max_steps):
        num = random.random()
        if num > epsilon: # condition to determin exploration or exploitation
            action = np.argmax(qtable[state,:]) # exploitation
            explore_exploit_data.append("exploiting")
        else:
            action = env.action_space.sample()
            explore_exploit_data.append("exploring")
            # action = random.choice(qtable[state,:]) # exploration

        new_state, reward, dead, info = env.step(action) # getting the result of our action. a new state, a reward, information on our health, and general debugging info

        qtable[state,action] += learning_rate * (reward + gamma * max(qtable[new_state,:]) - qtable[state,action]) # the q-function for updating state value

        total_reward += reward # rewards gotten in the episode
        state = new_state # moving our agent to the newly acquired position or state

        if dead == True: # if our agent happened to die, stop the episode 
            break

    epsilon = min_epsilon + epsilon_range * np.exp(-decay_rate * episode) # carrying out exponential decay on epsilon to encourage less exploration as more episodes pass
    accumulated_rewards.append(total_reward) # keeps track of the reward accumulated

In [79]:
for i in range(0,len(accumulated_rewards),100):
    print (accumulated_rewards[i])
print (sum(accumulated_rewards)) # to observe performance of agent for the hyperparameters we used.
print (qtable) # observe the value of state (the table can help tell us the state to be in a point in time and how to navigate the game to obtain maximium sccores)

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
0.0
1.0
0.0
1.0
0.0
1.0
1.0
0.0
0.0
1.0
1.0
0.0
1.0
1.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
0.0
1.0
0.0
1.0
1.0
0.0
1.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
0.0
1.0
1.0
0.0
0.0
1.0
0.0
1.0
0.0
1.0
1.0
0.0
0.0
0.0
1.0
1.0
0.0
1.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
1.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
1.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
1.0
1.0
0.0
0.0
1.0
0.0
0.0
1.0
1.0
0.0
1.0
0.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0


## analysing exploit to explore ratio

In [43]:
import pandas as pd

In [44]:
df = pd.DataFrame(explore_exploit_data)

In [45]:
df.describe()

Unnamed: 0,0
count,36127
unique,2
top,exploiting
freq,31309


## agent exploited 31309 time while exploring 

## testing: navigating the frozen lake environment with the goal and start state set at 15 and 0 respectively

In [85]:
accumulated_rewards = [] # resetting our accumulate rewards
env.reset() # resetting our environment
for i in range(10): # performing 10 episodes using qtable
    state = env.reset()
    dead = False
    total_reward = 0
    for step in range(max_steps):
        action = np.argmax(qtable[state,:])

        new_state, reward, dead, info = env.step(action)

        total_reward += reward
        if new_state == 15: # if goal state is reached
            print ("eureka, agent has won!! in steps" + str(step+1) )
        if dead == True:
            env.render() # prints the environment if the agent dies
            break
        state = new_state

    accumulated_rewards.append(total_reward)

eureka, agent has won!! in steps53
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
eureka, agent has won!! in steps51
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
eureka, agent has won!! in steps23
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
eureka, agent has won!! in steps28
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
  (Right)
SFFF
FHF[41mH[0m
FFFH
HFFG
eureka, agent has won!! in steps18
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
eureka, agent has won!! in steps16
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
eureka, agent has won!! in steps76
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
