In [2]:
import numpy as np

In [3]:
# Grid-world states are numbered 1..9; the order of `actions` fixes the column
# order of every Q-table built from it.
states = list(range(1, 10))
actions = [
    'droite',
    'haut',
    'gauche',
    'bas',
]

In [4]:
def transition_matrix(u):
    """Return the 9x9 transition matrix of the 3x3 grid world for action ``u``.

    States 1..9 are laid out row by row (1-3 on the top row, 7-9 on the bottom
    row), and state ``s`` lives at matrix index ``s - 1``. Entry ``[i, j]`` is
    the probability of landing in state ``j + 1`` when ``u`` is played in state
    ``i + 1``. Moves are deterministic, so every row contains a single 1, and a
    move that would leave the grid keeps the agent where it is.

    Parameters
    ----------
    u : str
        One of ``'gauche'`` (left), ``'droite'`` (right), ``'haut'`` (up) or
        ``'bas'`` (down).

    Returns
    -------
    numpy.ndarray
        Array of shape (9, 9) whose rows each sum to 1.

    Raises
    ------
    ValueError
        If ``u`` is not one of the four known actions.
    """
    # One-step moves along a single axis of length 3, clamped at the walls.
    toward_first = np.array([[1, 0, 0], [1, 0, 0], [0, 1, 0]])  # k -> max(k - 1, 0)
    toward_last = np.array([[0, 1, 0], [0, 0, 1], [0, 0, 1]])   # k -> min(k + 1, 2)

    # np.kron(A, B) is the block matrix whose (i, j) block is A[i, j] * B.
    # Horizontal moves shift the column inside each grid row (block-diagonal);
    # vertical moves shift the grid row and keep the column (identity blocks).
    # The identity dtypes mirror the historical return dtypes of this function.
    if u == 'gauche':
        return np.kron(np.eye(3, dtype=int), toward_first)
    if u == 'droite':
        return np.kron(np.eye(3, dtype=int), toward_last)
    if u == 'haut':
        return np.kron(toward_first, np.eye(3))
    if u == 'bas':
        return np.kron(toward_last, np.eye(3))

    raise ValueError(
        "unknown action %r; expected one of 'droite', 'haut', 'gauche', 'bas'" % (u,)
    )

In [5]:
def transition(s, u):
    """Sample the state reached from state ``s`` (1-based) under action ``u``.

    The row of ``transition_matrix(u)`` at index ``s - 1`` is used as the
    probability vector over ``states``.
    """
    probabilities = transition_matrix(u)[s - 1]
    drawn = np.random.choice(states, size=1, replace=True, p=probabilities)
    return drawn[0]

In [6]:
def reward(s, u):
    """Return the immediate reward attached to state ``s``.

    State 3 pays +10, state 4 pays -10 and every other state pays 0. The
    action ``u`` is accepted but does not influence the result.
    """
    if s == 3:
        return 10
    return -10 if s == 4 else 0

In [7]:
def env_step(s, u):
    """Play action ``u`` in state ``s`` and report the outcome.

    Returns a tuple ``(next_state, rew, done)`` where ``next_state`` is drawn
    by ``transition``, ``rew`` is ``reward(s, u)`` (the reward of the state
    being left, not of the one reached) and ``done`` is true when either the
    current or the next state is state 3.
    """
    successor = transition(s, u)
    reached_goal = ((s == 3) or (successor == 3))
    return successor, reward(s, u), reached_goal

# Q-learning algorithm

In [65]:
# Q-table: one row per state (index = state - 1), one column per action.
n_states, n_actions = len(states), len(actions)
q_table = np.zeros((n_states, n_actions))

In [66]:
# Reproducibility: exploration and start states are drawn from NumPy's global
# generator, so it is seeded here, before any training cell runs.
SEED = 42
np.random.seed(SEED)

# Hyperparameters
alpha = 0.1    # learning rate of the Q-learning update
gamma = 0.6    # discount factor
epsilon = 0.1  # probability of taking a random (exploratory) action

# For plotting metrics
all_epochs = []
all_penalties = []

In [67]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
N_EPISODES = 10000   # number of training episodes
EPISODE_LENGTH = 3   # steps played in every episode

for i in range(N_EPISODES):
    s = np.random.randint(1, len(states) + 1)  # initial state, uniform over 1..9
    state_index = s - 1

    epochs, penalties = 0, 0

    for j in range(EPISODE_LENGTH):
        if np.random.uniform(0, 1) < epsilon:
            action_index = np.random.randint(0, len(actions))  # Explore action space
        else:
            action_index = np.argmax(q_table[state_index])  # Exploit learned values

        u = actions[action_index]

        # `done` is returned by env_step but not acted on: every episode runs
        # for the full EPISODE_LENGTH steps, even through state 3.
        next_state, rew, done = env_step(s, u)
        next_state_index = next_state - 1

        old_value = q_table[state_index, action_index]
        next_max = np.max(q_table[next_state_index])

        # Q(s, u) <- (1 - alpha) * Q(s, u) + alpha * (r + gamma * max_u' Q(s', u'))
        new_value = (1 - alpha) * old_value + alpha * (rew + gamma * next_max)
        q_table[state_index, action_index] = new_value

        if rew == -10:
            penalties += 1

        s = next_state
        state_index = s - 1

        epochs += 1

    # Keep the per-episode counters so they can be plotted afterwards.
    all_epochs.append(epochs)
    all_penalties.append(penalties)

print("Training finished.\n")

Training finished.



In [68]:
# Show the learned Q-table: one row per state (1..9, top to bottom) and one
# column per action, in the order printed on the first line.
print(actions)
print(q_table)

['droite', 'haut', 'gauche', 'bas']
[[  9.           5.39126415   5.39530811  -2.74919147]
 [ 15.           8.99944262   5.39483282   5.39605199]
 [ 25.          25.          19.          19.        ]
 [ -4.61347     -4.6        -12.50596377  -7.7408071 ]
 [  9.           8.68931742  -2.60033457   3.18391972]
 [  8.9999709   15.           5.39994817   5.39999697]
 [  3.24        -2.6434041    1.8703958    1.8772461 ]
 [  5.4          5.39213509   1.92701706   3.23352088]
 [  5.39958833   9.           3.23769652   5.3998709 ]]


## Computing Q-values using dynamic programming (value iteration)


In [57]:
# Starting point of the dynamic-programming sweeps: all zeros, except the rows
# of state 3 (index 2) and state 4 (index 3), filled with +10 and -10.
q_dp = np.zeros((len(states), len(actions)))
q_dp[2, :] = 10
q_dp[3, :] = -10


In [69]:
# Q-value iteration: each sweep rebuilds the whole table from the previous one,
#   Q_new(s, u) = reward(s, u) + gamma * sum_s' P(s' | s, u) * max_u' Q(s', u').
# The transition matrices do not change between sweeps, so they are built once.
transition_matrices = [transition_matrix(u) for u in actions]

for t in range(100, 0, -1):
    # Best value of every state under the previous sweep; it is shared by all
    # (state, action) pairs because only q_dp_new is written during a sweep.
    q_max = np.max(q_dp, axis=1)
    q_dp_new = q_dp.copy()
    for i, s in enumerate(states):
        for j, u in enumerate(actions):
            ligne_i = transition_matrices[j][i]  # P(. | s, u)
            q_dp_new[i][j] = reward(s, u) + gamma * np.dot(q_max, ligne_i)
    q_dp = q_dp_new

In [70]:
# Column labels first, then the Q-values obtained from the dynamic-programming sweeps.
print(actions)
q_dp

['droite', 'haut', 'gauche', 'bas']


array([[  9.   ,   5.4  ,   5.4  ,  -2.76 ],
       [ 15.   ,   9.   ,   5.4  ,   5.4  ],
       [ 25.   ,  25.   ,  19.   ,  19.   ],
       [ -4.6  ,  -4.6  , -12.76 ,  -8.056],
       [  9.   ,   9.   ,  -2.76 ,   3.24 ],
       [  9.   ,  15.   ,   5.4  ,   5.4  ],
       [  3.24 ,  -2.76 ,   1.944,   1.944],
       [  5.4  ,   5.4  ,   1.944,   3.24 ],
       [  5.4  ,   9.   ,   3.24 ,   5.4  ]])

In [71]:
# Q-learning estimate again, for a side-by-side comparison with q_dp.
q_table

array([[  9.        ,   5.39126415,   5.39530811,  -2.74919147],
       [ 15.        ,   8.99944262,   5.39483282,   5.39605199],
       [ 25.        ,  25.        ,  19.        ,  19.        ],
       [ -4.61347   ,  -4.6       , -12.50596377,  -7.7408071 ],
       [  9.        ,   8.68931742,  -2.60033457,   3.18391972],
       [  8.9999709 ,  15.        ,   5.39994817,   5.39999697],
       [  3.24      ,  -2.6434041 ,   1.8703958 ,   1.8772461 ],
       [  5.4       ,   5.39213509,   1.92701706,   3.23352088],
       [  5.39958833,   9.        ,   3.23769652,   5.3998709 ]])