In [11]:
## Ignore all Warnings
import warnings
warnings.filterwarnings("ignore")

In [12]:
### Import other supporting libraries for practice
import time
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay
import numpy as np
import pandas as pd
import random

In [13]:
## Environment and training configuration
N_States = 6  ## number of cells in the 1-D track; env_render draws the last one as the goal 'T'
Actions = ['left', 'right'] ## action labels; also used as the Q-table column names


# NOTE(review): the recorded Q-table output further down shows 0.409510 for
# state 4 / 'right', which equals 1 - 0.9**5 (what lr = 0.1 gives after 5
# episodes), so the saved output looks stale relative to lr = 0.01 -- re-run to confirm.
lr = 0.01  # learning rate: step size of each Q-value update
gamma = 0.99   # discount factor applied to the best next-state Q-value

max_episode = 5  # number of training episodes run by train_rl_agent



In [15]:
def initialise_q(n_states,actions):
    """Build an all-zero Q-table.

    Parameters
    ----------
    n_states : int
        Number of rows (one per state, indexed ``0 .. n_states - 1``).
    actions : sequence
        Action labels; each becomes one column of the table.

    Returns
    -------
    pandas.DataFrame
        ``n_states x len(actions)`` frame filled with ``0.0``.
    """
    table_shape = (n_states, len(actions))
    zero_values = np.zeros(table_shape)
    return pd.DataFrame(zero_values, columns=actions)
    

In [16]:
# Scratch check: draw one sample with the same call the epsilon-greedy test uses.
np.random.uniform()

0.8565175889895095

In [17]:
def epsilon_greedy_action(state,q_table,epsilon = 0.9):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    Parameters
    ----------
    state : int
        Positional row of ``q_table`` holding the current state's Q-values.
    q_table : pandas.DataFrame
        One row per state, one column per action label.
    epsilon : float, default 0.9
        Probability of exploring. A uniform draw below ``epsilon`` picks a
        random action; otherwise the highest-valued action is exploited.

    Returns
    -------
    A column label of ``q_table`` naming the chosen action.
    """
    state_actions = q_table.iloc[state, :]  # Q-values of every action in this state
    if np.random.uniform() >= epsilon:
        # Exploitation: best known action (idxmax keeps the first label on ties).
        action_name = state_actions.idxmax()
    else:
        # Exploration: sample from the actions this table actually defines,
        # not from a module-level list that may not match its columns.
        action_name = np.random.choice(list(q_table.columns))
    return action_name


In [7]:
def take_step(state,action):
    """Apply ``action`` in ``state`` and report the transition.

    Returns a ``(next_state, reward, done)`` tuple:

    * any action other than ``'right'`` is treated as a move left; it earns
      reward 0 and is blocked by the wall at state 0;
    * ``'right'`` from state ``N_States - 2`` reaches the goal: the next state
      is the string ``'terminal'``, reward is 1 and ``done`` is True;
    * ``'right'`` from any other state advances one cell with reward 0.
    """
    if action != 'right':
        # Left move: stay put when already against the wall.
        previous_cell = 0 if state == 0 else state - 1
        return previous_cell, 0, False

    if state == N_States - 2:
        # Stepping right from the cell next to the goal ends the episode.
        return 'terminal', 1, True

    return state + 1, 0, False
        
                
    

In [18]:
def env_render(S,episode,step_counter,delay=1):
    """Draw the current situation on a single, continuously overwritten line.

    Parameters
    ----------
    S : int or 'terminal'
        Agent position on the track, or ``'terminal'`` once the goal is reached.
    episode : int
        Zero-based episode index (printed as ``episode + 1``).
    step_counter : int
        Steps taken so far in this episode; shown in the end-of-episode summary.
    delay : float, default 1
        Seconds to pause after drawing so the frame stays visible. Pass 0 to
        render without waiting.
    """
    if S == 'terminal':
        interaction = 'Episode %s: Total steps = %s' %(episode+1, step_counter)
        print('\r{}'.format(interaction),end ='')
        time.sleep(delay)
        # Blank exactly as many characters as were printed, so long summaries
        # are erased completely before the next frame is drawn.
        print('\r{}'.format(' ' * len(interaction)),end ='')
    else:
        # Track looks like '-----T'; the agent's cell is drawn as 'o'.
        env_list = ['-']*(N_States - 1) + ['T']
        env_list[S] = 'o'
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction),end = '')
        time.sleep(delay)
        

In [19]:
def train_rl_agent(gamma = 0.8,lr = 0.1):
    """Train a tabular Q-learning agent on the 1-D track and return its Q-table.

    Runs ``max_episode`` episodes, each starting in state 0 and ending when
    ``take_step`` reports the ``'terminal'`` state. Actions come from
    ``epsilon_greedy_action`` with its default epsilon, and every frame is
    drawn with ``env_render``.

    Parameters
    ----------
    gamma : float, default 0.8
        Discount applied to the best Q-value of the next state.
    lr : float, default 0.1
        Learning rate: fraction of the TD error added to the current estimate.

    Returns
    -------
    pandas.DataFrame
        Q-table with ``N_States`` rows and one column per entry of ``Actions``.
    """
    q_values = initialise_q(N_States, Actions)

    for episode in range(max_episode):
        steps_taken = 0
        state = 0
        env_render(state, episode, steps_taken)  # draw the starting frame

        episode_over = False
        while not episode_over:
            action = epsilon_greedy_action(state, q_values)
            next_state, reward, _ = take_step(state, action)

            # Current estimate for the (state, action) pair just taken.
            old_estimate = q_values.loc[state, action]

            if next_state == 'terminal':
                # No future value beyond the goal: the target is the reward alone.
                td_target = reward
                episode_over = True
            else:
                best_next_value = q_values.iloc[next_state, :].max()
                td_target = reward + gamma * best_next_value

            # Move the estimate a fraction ``lr`` of the way towards the target.
            q_values.loc[state, action] = old_estimate + lr * (td_target - old_estimate)

            # Advance the agent, count the step, then draw the new frame.
            state = next_state
            steps_taken += 1
            env_render(state, episode, steps_taken)

    return q_values
    

In [20]:
# Train with the notebook-level hyper-parameters, then print the learned table as text.
q_table = train_rl_agent(gamma=gamma, lr=lr)
print(q_table)

                                      left     right
0  0.000118  0.000470
1  0.000039  0.002668
2  0.000340  0.022865
3  0.002740  0.124165
4  0.014526  0.409510
5  0.000000  0.000000


In [21]:
# Show the learned Q-table as the cell's output.
q_table

Unnamed: 0,left,right
0,0.000118,0.00047
1,3.9e-05,0.002668
2,0.00034,0.022865
3,0.00274,0.124165
4,0.014526,0.40951
5,0.0,0.0


In [22]:

### Watch the Agent 
def watch_agent(optimal_q):
    """Replay one episode using ``optimal_q`` and render every step.

    The agent starts in state 0 and acts through ``epsilon_greedy_action``
    with ``epsilon=0.1`` until ``take_step`` returns the ``'terminal'`` state.
    Each frame is drawn with ``env_render`` (always labelled as episode 0),
    and ``Finished`` is printed once the goal is reached.
    """
    position = 0
    moves = 0
    env_render(position, 0, moves)

    # The start state is never terminal, so the body runs at least once.
    while position != 'terminal':
        action = epsilon_greedy_action(position, optimal_q, epsilon=0.1)
        position, _reward, _done = take_step(position, action)
        moves += 1
        env_render(position, 0, moves)

    print("Finished")

In [23]:
# Replay one episode with the trained table (watch_agent uses epsilon=0.1).
watch_agent(q_table)

                               Finished


In [35]:
# q_table.loc[0,:].argmax()

1