# Problem Description

<img src='images/reward_map.PNG' /></a>

The yellow box is the initial state. The Green box is goal state. The task is creating the most profitable using Q-Learning

In [3]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

## Import the rewardmap data

In [4]:
rw = pd.read_csv('data.txt', delimiter = '\t', header = None)
rw = np.array(rw).tolist()
rw

[[-1, -3, -5, -1, -3, -3, -5, -5, -1, 100],
 [-2, -1, -1, -4, -2, -5, -3, -5, -5, -5],
 [-3, -4, -4, -1, -3, -5, -5, -4, -3, -5],
 [-3, -5, -2, -5, -1, -4, -5, -1, -3, -4],
 [-4, -3, -3, -2, -1, -1, -1, -4, -3, -4],
 [-4, -2, -5, -2, -4, -5, -1, -2, -2, -4],
 [-4, -3, -2, -3, -1, -3, -4, -3, -1, -3],
 [-4, -2, -5, -4, -1, -4, -5, -5, -2, -4],
 [-2, -1, -1, -4, -1, -3, -5, -1, -4, -1],
 [-5, -3, -1, -2, -4, -3, -5, -2, -2, -2]]

In [5]:
def build_R(rw):
    xlen = len(rw)
    ylen = len(rw[0])

    d = pd.DataFrame(columns = ['up', 'right', 'down', 'left', 'this'])
    idx = 0 
    
    for i in range(ylen):
        for j in range(xlen):
            move = [float('-inf'),float('-inf'),float('-inf'),float('-inf'), rw[i][j]]
            if i>0: #up
                move[0] = rw[i-1][j]
            if i<ylen-1: #down
                move[2] = rw[i+1][j]
            if j>0: #left
                move[3] = rw[i][j-1]
            if j<xlen-1: #right
                move[1] = rw[i][j+1]
            d.loc[idx] = move
            idx+=1
    return d
    

In [6]:
R = build_R(rw)
R = np.array(R)
R

array([[ -inf,   -3.,   -2.,  -inf,   -1.],
       [ -inf,   -5.,   -1.,   -1.,   -3.],
       [ -inf,   -1.,   -1.,   -3.,   -5.],
       [ -inf,   -3.,   -4.,   -5.,   -1.],
       [ -inf,   -3.,   -2.,   -1.,   -3.],
       [ -inf,   -5.,   -5.,   -3.,   -3.],
       [ -inf,   -5.,   -3.,   -3.,   -5.],
       [ -inf,   -1.,   -5.,   -5.,   -5.],
       [ -inf,  100.,   -5.,   -5.,   -1.],
       [ -inf,  -inf,   -5.,   -1.,  100.],
       [  -1.,   -1.,   -3.,  -inf,   -2.],
       [  -3.,   -1.,   -4.,   -2.,   -1.],
       [  -5.,   -4.,   -4.,   -1.,   -1.],
       [  -1.,   -2.,   -1.,   -1.,   -4.],
       [  -3.,   -5.,   -3.,   -4.,   -2.],
       [  -3.,   -3.,   -5.,   -2.,   -5.],
       [  -5.,   -5.,   -5.,   -5.,   -3.],
       [  -5.,   -5.,   -4.,   -3.,   -5.],
       [  -1.,   -5.,   -3.,   -5.,   -5.],
       [ 100.,  -inf,   -5.,   -5.,   -5.],
       [  -2.,   -4.,   -3.,  -inf,   -3.],
       [  -1.,   -4.,   -5.,   -3.,   -4.],
       [  -1.,   -1.,   -2.,   -

## Build Q Matrice

In [7]:
Q = np.zeros((100,5))
Q

array([[ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  

## Build Transition Matrix
format row = 

    [up, right, down, left, none]

In [8]:
def build_trans(rw):
    xlen = len(rw)
    ylen = len(rw[0])
    
    d = {'up':[] ,'right':[], 'down':[], 'left':[]}
    d = pd.DataFrame(columns = ['up', 'right', 'down', 'left', 'none'])
    idx = 0 
    
    for i in range(ylen):
        for j in range(xlen):
            trans = [-1,-1,-1,-1, idx]
            if i>0: #up
                trans[0] = idx-xlen
            if i<ylen-1: #down
                trans[2] = idx+xlen
            if j>0: #left
                trans[3] = idx-1
            if j<xlen-1: #right
                trans[1] = idx+1
            d.loc[idx] = trans
            idx+=1
    return d

In [9]:
trans = build_trans(rw)
trans = np.array(trans)

In [10]:
trans

array([[-1, 1, 10, -1, 0],
       [-1, 2, 11, 0, 1],
       [-1, 3, 12, 1, 2],
       [-1, 4, 13, 2, 3],
       [-1, 5, 14, 3, 4],
       [-1, 6, 15, 4, 5],
       [-1, 7, 16, 5, 6],
       [-1, 8, 17, 6, 7],
       [-1, 9, 18, 7, 8],
       [-1, -1, 19, 8, 9],
       [0, 11, 20, -1, 10],
       [1, 12, 21, 10, 11],
       [2, 13, 22, 11, 12],
       [3, 14, 23, 12, 13],
       [4, 15, 24, 13, 14],
       [5, 16, 25, 14, 15],
       [6, 17, 26, 15, 16],
       [7, 18, 27, 16, 17],
       [8, 19, 28, 17, 18],
       [9, -1, 29, 18, 19],
       [10, 21, 30, -1, 20],
       [11, 22, 31, 20, 21],
       [12, 23, 32, 21, 22],
       [13, 24, 33, 22, 23],
       [14, 25, 34, 23, 24],
       [15, 26, 35, 24, 25],
       [16, 27, 36, 25, 26],
       [17, 28, 37, 26, 27],
       [18, 29, 38, 27, 28],
       [19, -1, 39, 28, 29],
       [20, 31, 40, -1, 30],
       [21, 32, 41, 30, 31],
       [22, 33, 42, 31, 32],
       [23, 34, 43, 32, 33],
       [24, 35, 44, 33, 34],
       [25, 36, 45, 34,

## Build List of Valid Actions for Each States
format : <br>
0: up
1 : right
2 : down
3 : left
4 : none

In [11]:
def build_va(trans):
    va = []
    for i in range(len(trans)):
        current_va = []
        if (trans[i][0] != -1): #up
            current_va.append(0)
        if (trans[i][1] != -1): #right
            current_va.append(1)
        if (trans[i][2] != -1): #down
            current_va.append(2)
        if (trans[i][3] != -1): #left
            current_va.append(3)
        current_va.append(4)
        va.append(current_va)
    return va

In [12]:
va = build_va(trans)
va = np.array(va)
va

array([list([1, 2, 4]), list([1, 2, 3, 4]), list([1, 2, 3, 4]),
       list([1, 2, 3, 4]), list([1, 2, 3, 4]), list([1, 2, 3, 4]),
       list([1, 2, 3, 4]), list([1, 2, 3, 4]), list([1, 2, 3, 4]),
       list([2, 3, 4]), list([0, 1, 2, 4]), list([0, 1, 2, 3, 4]),
       list([0, 1, 2, 3, 4]), list([0, 1, 2, 3, 4]), list([0, 1, 2, 3, 4]),
       list([0, 1, 2, 3, 4]), list([0, 1, 2, 3, 4]), list([0, 1, 2, 3, 4]),
       list([0, 1, 2, 3, 4]), list([0, 2, 3, 4]), list([0, 1, 2, 4]),
       list([0, 1, 2, 3, 4]), list([0, 1, 2, 3, 4]), list([0, 1, 2, 3, 4]),
       list([0, 1, 2, 3, 4]), list([0, 1, 2, 3, 4]), list([0, 1, 2, 3, 4]),
       list([0, 1, 2, 3, 4]), list([0, 1, 2, 3, 4]), list([0, 2, 3, 4]),
       list([0, 1, 2, 4]), list([0, 1, 2, 3, 4]), list([0, 1, 2, 3, 4]),
       list([0, 1, 2, 3, 4]), list([0, 1, 2, 3, 4]), list([0, 1, 2, 3, 4]),
       list([0, 1, 2, 3, 4]), list([0, 1, 2, 3, 4]), list([0, 1, 2, 3, 4]),
       list([0, 2, 3, 4]), list([0, 1, 2, 4]), list([0, 1, 2, 3

# Main Program


In [13]:
gamma = 0.8
episodes = 10000
reward_list = []
for i in range(episodes):
    start_state = 90
    goal_state = 9
    current_state = start_state
    current_reward = R[current_state][4]
    while current_state != goal_state:
        action = random.choice(va[current_state])
        next_state = trans[current_state][action]
        future_rewards = []
        for action_nxt in va[next_state]:
            future_rewards.append(Q[next_state][action_nxt])
            
        #update Q
#         print('CS : {}\nAct : {}'.format(current_state, action))
        qstate = R[current_state][action] + gamma*max(future_rewards)
        Q[current_state][action] = qstate
#         print(Q)
        current_state = next_state
        current_reward += R[current_state][4]
        if(current_state == goal_state):
            print('Episode {}, score : {}'.format(i, current_reward))
            reward_list.append(current_reward)

Episode 0, score : -245.0
Episode 1, score : -3440.0
Episode 2, score : -737.0
Episode 3, score : -1244.0
Episode 4, score : -2195.0
Episode 5, score : -3047.0
Episode 6, score : -3034.0
Episode 7, score : -995.0
Episode 8, score : -2350.0
Episode 9, score : -663.0
Episode 10, score : -720.0
Episode 11, score : -1956.0
Episode 12, score : -8690.0
Episode 13, score : -5706.0
Episode 14, score : -2932.0
Episode 15, score : -2386.0
Episode 16, score : -3411.0
Episode 17, score : -4017.0
Episode 18, score : -4262.0
Episode 19, score : -1006.0
Episode 20, score : -458.0
Episode 21, score : -259.0
Episode 22, score : -3065.0
Episode 23, score : -2748.0
Episode 24, score : -766.0
Episode 25, score : -3445.0
Episode 26, score : -5523.0
Episode 27, score : -1208.0
Episode 28, score : -2372.0
Episode 29, score : -333.0
Episode 30, score : -10471.0
Episode 31, score : -2152.0
Episode 32, score : -1357.0
Episode 33, score : -1052.0
Episode 34, score : -5680.0
Episode 35, score : -526.0
Episode 36,

KeyboardInterrupt: 

In [None]:
Q_df = pd.DataFrame(columns = ['up', 'right', 'down', 'left', 'none'])

In [19]:
for i in range(len(Q)):
    Q_df.loc[i] = Q[i]

In [20]:
Q_df

Unnamed: 0,up,right,down,left,none
0,0.000000,3.213581,1.048692,0.000000,1.570865
1,0.000000,7.766976,3.810865,1.570865,3.213581
2,0.000000,15.958720,6.013581,3.213581,7.766976
3,0.000000,21.198400,8.766976,7.766976,15.958720
4,0.000000,30.248000,14.958720,15.958720,21.198400
5,0.000000,41.560000,19.198400,21.198400,30.248000
6,0.000000,58.200000,30.248000,30.248000,41.560000
7,0.000000,79.000000,41.560000,41.560000,58.200000
8,0.000000,100.000000,58.200000,58.200000,79.000000
9,0.000000,0.000000,0.000000,0.000000,0.000000


In [21]:
max(reward_list)

69.0