# Basic Q Tables with FrozenLake-v0

In [24]:
import gym
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import random

In [69]:
env = gym.make('FrozenLake-v0')
print("Observation space", env.observation_space)
print("Action space: ", env.action_space)

Observation space Discrete(16)
Action space:  Discrete(4)


# Implementing Q-Table Learning algorithm

In [70]:
#Initialize with Os
Q = np.zeros([env.observation_space.n, env.action_space.n]) # [16, 4] matrix with rows for states, cols for actions
learning_rate = 0.8
discount = 0.95
num_episodes = 2000

rList = []
for i in range(num_episodes):
    s = env.reset() #gives first observation
    rAll = 0
    d = False
    j = 0
    
    #Q-Table learning algorithm 
    while j < 99:
        j += 1
        #Choose a greedy action + noise
        a = np.argmax(Q[s,:] + np.random.randn(1, env.action_space.n) * (1./(i+1)))
        #Get new state and reward
        s1, r, d, _ = env.step(a) #this is a greedy action
        #Update the Q-table
        Q[s, a] = Q[s, a] + learning_rate*(r + discount*np.max(Q[s1, :]) - Q[s, a])
        rAll += r
        s = s1  #Update to new state
        if d == True:
            break
    rList.append(rAll)
    

In [23]:
print("Score over time: " + str(sum(rList) / num_episodes))
print("\nFinal Q-table values: \n", Q)

Score over time: 0.369

Final Q-table values: 
 [[1.72225274e-03 1.44678490e-03 5.78905101e-02 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 3.51131221e-04 3.66267346e-02]
 [5.39540164e-04 4.31081165e-04 1.05397709e-03 2.36448731e-02]
 [4.03126312e-04 5.80626680e-04 3.53212456e-04 1.95949509e-02]
 [6.32960702e-02 0.00000000e+00 0.00000000e+00 1.44678490e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [4.50833495e-04 1.20092335e-08 3.88851358e-05 1.54651619e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 3.87326450e-03 0.00000000e+00 1.35635738e-01]
 [0.00000000e+00 4.56046517e-01 0.00000000e+00 0.00000000e+00]
 [1.01509051e-01 0.00000000e+00 0.00000000e+00 7.65427698e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 6.88889153e-01 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 2.37715329e-01 0.00000000e+00]
 [0.000

# Same with a Neural Net instead of a Table

In [65]:
tf.reset_default_graph()

In [71]:
inputs1 = tf.placeholder(shape=[1, 16], dtype=tf.float32)
W = tf.Variable(tf.random_uniform([16, 4], 0, 0.01))
Qout = tf.matmul(inputs1, W)
predict = tf.argmax(Qout, 1)

nextQ = tf.placeholder(shape=[1,4], dtype=tf.float32)
loss = tf.reduce_sum(tf.square(nextQ - Qout))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = optimizer.minimize(loss)

In [72]:
init = tf.global_variables_initializer()
discount = 0.99
epsilon = 0.1    #for epsilon greedy obv
num_episodes = 2000

jList = [] #step list
rList = [] #reward list

with tf.Session() as sess:
    sess.run(init)
    for i in range(num_episodes):
        s = env.reset()
        rAll = 0
        d = False
        j = 0
        while j < 99:
            j += 1
            
            a, allQ = sess.run([predict, Qout], feed_dict={inputs1:np.identity(16)[s:s+1]})
            if np.random.rand(1) < epsilon:
                a[0] = env.action_space.sample()  #choose a random action with epsilon probability
            #take a step
            s1, r, d, _ = env.step(a[0])
            #Obtain the Q' values by feeding the new state through our network
            Q1 = sess.run(Qout, feed_dict={inputs1:np.identity(16)[s1:s1+1]})
            #Obtain the maxQ' and set our target value for chosen action
            maxQ1 = np.max(Q1)
            targetQ = allQ
            targetQ[0, a[0]] = r + discount * maxQ1
            #Training with target and predicted
            _, W1 = sess.run([updateModel, W], feed_dict={inputs1:np.identity(16)[s:s+1], nextQ:targetQ})
            
            rAll += r
            s = s1
            if d == True:
                #Reducing chance of random action as the model learns more and more
                epsilon = 1./((i/50) + 10)
                break
        jList.append(j)
        rList.append(rAll)


In [73]:
print("Percent of successful episodes: " + str(sum(rList)/num_episodes) + "%")


Percent of successful episodes: 0.1535%


# Personal experiment
> Trying the FrozenLake8x8-v0 

In [45]:
env = gym.make('FrozenLake8x8-v0')
print("Observation space", env.observation_space)
print("Action space: ", env.action_space)

Observation space Discrete(64)
Action space:  Discrete(4)
