# Setting up

In [1]:
pip install gym




In [2]:
import numpy as np
import random 
import time
import gym
from IPython.display import clear_output

In [3]:
# creating the environment
env = gym.make('FrozenLake-v1')

# Initialzing Q_table

In [4]:
states_num = env.observation_space.n
actions_num = env.action_space.n

# Q = np.zeros((states_num,actions_num))

# Fixed Parameters

In [5]:
max_steps = 100
max_epsilon = 1
min_epsilon = 0.01

# Training Q

#### Defining a function for updating Q_table by using episodes:


In [6]:
def Q_trainer(discount_rate = 0.99, no_of_episodes = 10000, learning_rate=0.01, epsilon_decay=0.001):
    """
    
    This is a function to improve our Q_table from ground level.
    
    Inputs = (no_of_episodes, learning_rate, epsilon_decay, Q_table)
    
    outputs = (Q_table , all_episodic_returns)

    """
    Q = np.zeros((states_num,actions_num))
    epsilon = 1
    all_episodic_rewards = []
    
    for episode in range(no_of_episodes):
        state,_ = env.reset()
        done = False
        episode_reward = 0

        for step in range(max_steps):
            random_number = random.random()
            if random_number > epsilon:
                action = np.argmax(Q[state,:])
            else :
                action = env.action_space.sample()

            new_state, reward,done,info,_ = env.step(action)
            Q[state,action] = (learning_rate)*(reward + discount_rate*(np.max(Q[new_state,:]))) + (1-learning_rate)*Q[state,action]
            state = new_state
            episode_reward += reward

            if(done==True):
                break

        all_episodic_rewards.append(episode_reward)
        epsilon = min_epsilon + (max_epsilon-min_epsilon)*np.exp(-epsilon_decay*episode)
    
    return Q, all_episodic_rewards
        

In [7]:
Q_new, all_episodic_rewards = Q_trainer(0.99,10000, 0.01, 0.001)

  if not isinstance(terminated, (bool, np.bool8)):


In [8]:
average_rewards_per_thousand = []
for i in range(9):
    sum = np.sum(all_episodic_rewards[i*1000:(i+1)*1000])
    average_rewards_per_thousand.append(sum/1000)

print('average_rewards_per_thousand =' , average_rewards_per_thousand,'\n') 
print("Q-Table: \n", Q_new)

average_rewards_per_thousand = [0.018, 0.035, 0.036, 0.037, 0.05, 0.044, 0.043, 0.037, 0.04] 

Q-Table: 
 [[3.58367774e-02 1.60160797e-02 1.59187759e-02 1.40467565e-02]
 [1.75957130e-04 6.69486795e-03 2.25151761e-05 6.42091352e-05]
 [2.80364023e-05 6.73226685e-03 2.98634189e-04 7.90951003e-06]
 [4.49910119e-06 1.70298368e-04 4.39705366e-07 1.66018692e-06]
 [1.73089736e-02 1.02595805e-02 3.66826067e-02 5.51842886e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [3.02273331e-03 7.42054585e-02 1.67931498e-03 6.99927047e-06]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [4.58064609e-03 1.05917227e-02 7.44886315e-02 1.66934023e-02]
 [1.11563854e-02 1.50139889e-02 1.90180487e-01 8.68047357e-03]
 [3.85822191e-02 3.00242105e-02 2.24847496e-01 2.96377478e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [8.09150172e-03 2.50200953e-02 3.87691425e-01 1.54772749e-02]
 [3.97374852

# Watching the game

In [9]:
env_test = gym.make('FrozenLake-v1',render_mode='human')

In [10]:
wins = 0
loses = 0

for epi in range(20):
    print('Starting a new episode')
    state = env_test.reset()[0]
    env_test.render()
    done  = False
    
    for step in range(max_steps):
        action = np.argmax(Q_new[state,:])
        next_state , reward, done, info, _ = env_test.step(action)
        
        if reward==1:
            wins = wins+1
            break
            
        elif done==True:
            loses = loses+1
            break  
            
        state = next_state

env.close()

Starting a new episode
Starting a new episode
Starting a new episode
Starting a new episode
Starting a new episode
Starting a new episode
Starting a new episode
Starting a new episode
Starting a new episode
Starting a new episode
Starting a new episode
Starting a new episode
Starting a new episode
Starting a new episode
Starting a new episode
Starting a new episode
Starting a new episode
Starting a new episode
Starting a new episode
Starting a new episode


# Results and Accuracy

In [11]:
print('no of times our agent won out of 20 times:', int(wins) )
print('no of times our agent lost out of 20 times:', int(loses) )
print('no of times our agent draw out of 20 times:', int(20-wins-loses) )

no of times our agent won out of 20 times: 2
no of times our agent lost out of 20 times: 18
no of times our agent draw out of 20 times: 0


# Improving the Model

In [12]:
# need epsilon decay and learning rate patterns
help(Q_trainer)

Help on function Q_trainer in module __main__:

Q_trainer(discount_rate=0.99, no_of_episodes=10000, learning_rate=0.01, epsilon_decay=0.001)
    This is a function to improve our Q_table from ground level.
    
    Inputs = (no_of_episodes, learning_rate, epsilon_decay, Q_table)
    
    outputs = (Q_table , all_episodic_returns)



In [13]:
learning_rates = [0.5,0.1,0.01,0.001]
epsilon_decays = [0.1,0.01,0.001]
accuracy = []

for i in learning_rates:
    for j in epsilon_decays:
        Q_improve,Rewards = Q_trainer(0.99,10000,i,j)
        r = np.sum(Rewards[9000:10000])
        accuracy.append(r)
        print(f'accuracy for lr={i}, decay={j} is {r}')

accuracy for lr=0.5, decay=0.1 is 0.0
accuracy for lr=0.5, decay=0.01 is 0.0
accuracy for lr=0.5, decay=0.001 is 586.0
accuracy for lr=0.1, decay=0.1 is 0.0
accuracy for lr=0.1, decay=0.01 is 675.0
accuracy for lr=0.1, decay=0.001 is 693.0
accuracy for lr=0.01, decay=0.1 is 0.0
accuracy for lr=0.01, decay=0.01 is 113.0
accuracy for lr=0.01, decay=0.001 is 48.0
accuracy for lr=0.001, decay=0.1 is 0.0
accuracy for lr=0.001, decay=0.01 is 0.0
accuracy for lr=0.001, decay=0.001 is 59.0


In [15]:
discount_rates = [0.99,0.9,0.8,0.7]
for i in discount_rates:
    Q_improve,Rewards = Q_trainer(i,10000,0.1,0.01)
    r = np.sum(Rewards[9000:10000])
    accuracy.append(r)
    print(f'accuracy for dr={i}, decay={j} is {r}')

accuracy for dr=0.99, decay=0.001 is 0.0
accuracy for dr=0.9, decay=0.001 is 0.0
accuracy for dr=0.8, decay=0.001 is 0.0
accuracy for dr=0.7, decay=0.001 is 0.0
