<a href="https://colab.research.google.com/github/kesavan7287/RiverSwim-RL/blob/main/RL_RiverSwim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import random as random
from copy import deepcopy as dcopy

In [5]:
np.random.gamma(2, 2, 2)

array([1.02166293, 0.19959217])

In [41]:
class RiverSwim:
    def __init__(self, n_states, n_actions=2, gamma=0.05, alpha=0.1, epsilon=0.35):
        self.nstates = n_states
        self.nactions = n_actions
        self.qmat = np.zeros((self.nstates, self.nactions))
        self.gamma = gamma
        self.alpha = alpha
        self.epsilon = epsilon
        pass

    def prize(self, state, action, ret)->float:
        if state == 0:
            if action == 0:
                return 5/1000 if ret == 0 else 0
        if state == self.nstates-1:
            if action == 1:
                return 1 if ret == 1 else 0
        return 0
        pass

    def qlearning(self, state, action, reward, landing):
        self.qmat[state, action] += self.alpha * (reward + self.gamma * np.max(self.qmat[landing,:]) - self.qmat[state, action])
        pass

    def step(self, state, action):
        ret = np.argmax(np.random.gamma(2, 2, 2))
        if action == 0:
            if state == 0:
                landing = 0
            else:
                landing = state - 1 if np.random.rand() < .6 else state
        else:
            if state == self.nstates - 1:
                landing = self.nstates - 1
            else:
                landing = state + 1 if np.random.rand() < .6 else state
        reward = self.prize(state, action, ret)
        self.qlearning(state, action, reward, landing)
        flag = True if landing == self.nstates-1 else False
        msg = 1 if state == self.nstates - 1 and action == 1 else 0
        return reward, landing, flag, msg
        pass

    def greedy(self, state):
        return np.argmax(self.qmat[state,:])

    def epsilon_greedy(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.nactions)
        else:
            return self.greedy(state)

In [42]:
env = RiverSwim(6)

In [43]:
trials = 100

In [44]:
for trial in range(1, trials+1):
    state = 0
    total_reward = 0
    steps = 0
    end = False
    curmat = np.zeros((env.nstates, env.nactions))
    print(f'\tTrail: {trial}: ')
    while True:
        action = np.argmax(np.random.gamma(2, 2, 2))
        reward, landing, end, msg = env.step(state, action)

        state = landing
        total_reward += reward
        steps += 1

        if end is True and msg == 1:
            break

    print(f'Matrix:\n{np.matrix(env.qmat)}')
    if np.allclose(np.matrix(env.qmat), curmat, atol = 1e-04):
        break
    else:
        curmat = dcopy(np.matrix(env.qmat))

	Trail: 1: 
Matrix:
[[5.00e-04 2.25e-06]
 [0.00e+00 0.00e+00]
 [0.00e+00 0.00e+00]
 [0.00e+00 0.00e+00]
 [0.00e+00 0.00e+00]
 [0.00e+00 1.00e-01]]
	Trail: 2: 
Matrix:
[[2.22209527e-03 2.61406354e-05]
 [1.33423208e-05 1.56491623e-08]
 [1.59917979e-07 4.66031873e-10]
 [0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 5.00000000e-04]
 [0.00000000e+00 9.05000000e-02]]
	Trail: 3: 
Matrix:
[[2.40009834e-03 4.33736401e-05]
 [3.71607985e-05 4.03227153e-07]
 [2.91292335e-07 1.02875179e-09]
 [0.00000000e+00 2.50000000e-06]
 [0.00000000e+00 9.02500000e-04]
 [0.00000000e+00 1.81902500e-01]]
	Trail: 4: 
Matrix:
[[2.65445501e-03 7.78049255e-05]
 [3.45259852e-05 2.72961649e-07]
 [6.56379138e-07 3.54024308e-08]
 [2.73692288e-08 6.76250000e-06]
 [4.08381250e-06 1.45437138e-03]
 [9.09512500e-04 1.64621762e-01]]
	Trail: 5: 
Matrix:
[[2.56057972e-03 7.31990710e-05]
 [5.71374431e-05 4.09020323e-07]
 [1.13879039e-06 2.40743266e-07]
 [2.18123766e-07 2.98405917e-05]
 [3.21480800e-06 2.74194757e-03]
 [8.2922146

In [52]:
for state in range(env.nstates):
    print(env.epsilon_greedy(state))

0
1
1
1
1
1
