### Import all required modules

In [31]:
import gym
import numpy as np
import random
import matplotlib.pyplot as plt
from itertools import product
from tqdm import tqdm

#### Create an environment for the problem definition

In [2]:
# Create the Taxi-v3 environment. `.env` reads the environment object held by
# whatever gym.make returns (presumably the unwrapped environment without gym's
# wrappers -- confirm against the installed gym version).
env = gym.make("Taxi-v3").env

# render() visualizes the current state of the problem.
env.render()

+---------+
|[35mR[0m: | : :G|
| : | : :[43m [0m|
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



#### do some basic operations with env and get info about env

In [3]:
# Pack a hand-picked configuration into a single state index.
# Arguments, in order: taxi row = 4, taxi column = 2,
# passenger location = 2, destination location = 0.
state = env.encode(4, 2, 2, 0)

# Move the environment to that state, then report the encoded index.
env.s = state
print("state val : ", state)

state val :  448


In [4]:
# Render again to show the state that was assigned to env.s in the previous cell.
env.render()

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| :[43m [0m|B: |
+---------+



In [5]:
# env.P is a table holding the transition information for every state;
# here we look up the entry for state 448 (the state encoded above).
env.P[448]
# The result maps each action to a list of tuples ordered as
# (probability, next state, reward, goal reached).

{0: [(1.0, 448, -1, False)],
 1: [(1.0, 348, -1, False)],
 2: [(1.0, 448, -1, False)],
 3: [(1.0, 428, -1, False)],
 4: [(1.0, 448, -10, False)],
 5: [(1.0, 448, -10, False)]}

In [6]:
# observation_space -> all states the environment can be in
# action_space      -> all actions the agent can take

print("Observation space : ", env.observation_space)
print("Action space : ", env.action_space)

Observation space :  Discrete(500)
Action space :  Discrete(6)


### Training and evaluation of Agent

- First we train the agent on a set of examples. Training produces the Q-table, which the agent uses to select the appropriate action in each state.

- Then we evaluate that q-table on the set of examples as the part of evaluation.

In [43]:

def q_learning(alpha,gamma,epsilon,episodes=7000):
    """Train a tabular Q-learning agent on the notebook-level ``env``.

    Parameters
    ----------
    alpha : float
        Learning rate: weight given to the newly observed target when
        updating a Q-value.
    gamma : float
        Discount factor applied to the best Q-value of the next state.
    epsilon : float
        Exploration rate: probability of taking a random action instead
        of the greedy one at each step.
    episodes : int, optional
        Number of training episodes to run (default 7000).

    Returns
    -------
    tuple
        ``(metrics, q_table)`` where ``metrics`` is the dictionary returned
        by ``evaluate_q_learning(q_table)`` and ``q_table`` is the learned
        array with one row per state and one column per action.
    """

    # initialize q-table with all zeros.
    q_table = np.zeros((env.observation_space.n, env.action_space.n))

    # Run exactly `episodes` training episodes.
    for _ in range(episodes):

        # reset environment
        state = env.reset()
        done = False

        while not done:
            # Epsilon-greedy policy: explore with probability `epsilon`,
            # otherwise exploit the current Q-table.
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                # choose action with maximum q-value
                action = np.argmax(q_table[state])

            # apply chosen action on environment
            next_state, reward, done, _ = env.step(action)

            old_q_value = q_table[state, action]
            next_max_q_value = np.max(q_table[next_state])

            # Blend the old estimate with the new target
            # (reward plus discounted best value of the next state).
            q_table[state, action] = (
                (1 - alpha) * old_q_value
                + alpha * (reward + gamma * next_max_q_value)
            )

            state = next_state

    return evaluate_q_learning(q_table), q_table

def evaluate_q_learning(q_table,total_episodes=30,max_trip_length=30,environment=None):
    """Evaluate a trained Q-table by running greedy episodes.

    Three measures are reported:

    * trip length -- number of steps the agent takes in an episode
      (should be small);
    * penalty -- number of steps that returned a reward of -10
      (should be small);
    * loops -- episodes that hit ``max_trip_length`` without finishing,
      i.e. the agent is stuck (should be zero).

    Parameters
    ----------
    q_table : array-like
        Table indexed as ``q_table[state]`` giving one value per action.
    total_episodes : int, optional
        Number of evaluation episodes (default 30). Must be at least 1.
    max_trip_length : int, optional
        Step cap per episode; an unfinished episode that reaches it is
        counted as a loop (default 30).
    environment : object, optional
        Environment exposing ``reset()`` and ``step(action)``. Defaults to
        the notebook-level ``env``.

    Returns
    -------
    dict
        Keys ``"avg_penalty"``, ``"avg_triplength"`` and ``"avg_loops"``,
        each averaged over ``total_episodes``.

    Raises
    ------
    ValueError
        If ``total_episodes`` is smaller than 1.
    """
    if total_episodes < 1:
        raise ValueError("total_episodes must be at least 1")

    # Fall back to the notebook-level environment when none is supplied.
    if environment is None:
        environment = env

    total_trips, total_penalties, total_loops = 0, 0, 0

    for _ in range(total_episodes):
        trip_length = 0
        penalty = 0
        done = False
        state = environment.reset()

        while not done:
            # Reaching the cap without finishing counts as a loop / dead end.
            if trip_length >= max_trip_length:
                total_loops += 1
                break

            # Always act greedily during evaluation.
            action = np.argmax(q_table[state])
            state, reward, done, _ = environment.step(action)

            if reward == -10:
                penalty += 1

            trip_length += 1

        total_trips += trip_length
        total_penalties += penalty

    return {"avg_penalty":total_penalties/total_episodes,
              "avg_triplength":total_trips/total_episodes,
              "avg_loops":total_loops/total_episodes}

#### Hyper Parameter tuning

##### Three parameters can be tuned here:
- alpha or learning rate
- gamma or discount factor
- epsilon (factor of randomness)

In [32]:
alpha_list = [0.1,0.2,0.3,0.4]
gamma_list = [0.3,0.4,0.5,0.6]
epsilon_list = [0.2,0.3,0.4,0.15]

# stats    -> evaluation metrics per parameter combination
# q_tables -> trained Q-table per parameter combination
stats = {}
q_tables = {}

# product() yields the cartesian product of the three parameter lists.
for a,g,e in tqdm(list(product(alpha_list,gamma_list,epsilon_list))):
    label = f"alpha={a},gamma={g},epsilon={e}"
    # q_learning returns (metrics, q_table); keep them apart so that
    # stats[label] is the metrics dictionary itself.
    stats[label], q_tables[label] = q_learning(a,g,e)


100%|██████████| 64/64 [04:52<00:00,  3.41s/it]


In [41]:
# Rank the parameter combinations by average trip length (shortest first)
# and show the ten best, each followed by its metrics.
for key in sorted(stats, key=lambda name: stats[name]["avg_triplength"])[:10]:
    print(key, stats[key], sep="\n")


alpha=0.3,gamma=0.3,epsilon=0.3
{'avg_penalty': 0.0, 'avg_triplength': 11.766666666666667, 'avg_loops': 0.0}
alpha=0.4,gamma=0.5,epsilon=0.2
{'avg_penalty': 0.0, 'avg_triplength': 11.933333333333334, 'avg_loops': 0.0}
alpha=0.3,gamma=0.6,epsilon=0.2
{'avg_penalty': 0.0, 'avg_triplength': 12.233333333333333, 'avg_loops': 0.0}
alpha=0.4,gamma=0.4,epsilon=0.15
{'avg_penalty': 0.0, 'avg_triplength': 12.333333333333334, 'avg_loops': 0.0}
alpha=0.4,gamma=0.6,epsilon=0.15
{'avg_penalty': 0.0, 'avg_triplength': 12.333333333333334, 'avg_loops': 0.0}
alpha=0.4,gamma=0.5,epsilon=0.4
{'avg_penalty': 0.0, 'avg_triplength': 12.533333333333333, 'avg_loops': 0.0}
alpha=0.4,gamma=0.5,epsilon=0.15
{'avg_penalty': 0.0, 'avg_triplength': 12.666666666666666, 'avg_loops': 0.0}
alpha=0.3,gamma=0.3,epsilon=0.2
{'avg_penalty': 0.0, 'avg_triplength': 12.7, 'avg_loops': 0.0}
alpha=0.3,gamma=0.5,epsilon=0.3
{'avg_penalty': 0.0, 'avg_triplength': 12.7, 'avg_loops': 0.0}
alpha=0.2,gamma=0.4,epsilon=0.15
{'avg_penal

In [1]:
# Results can differ between runs with identical parameter values because training and evaluation are random.
# For example, alpha=0.3, gamma=0.3, epsilon=0.3 gave an average trip length of about 11.8 above;
# training again with the same alpha, gamma and epsilon values gives a different average trip length.
# This makes selecting the optimal parameters more difficult.

#### Reference:

https://medium.com/analytics-vidhya/a-beginners-guide-to-reinforcement-learning-and-its-basic-implementation-from-scratch-2c0b5444cc49