<a href="https://colab.research.google.com/github/jesung/RL-Book-Exercises/blob/master/RLbook_4_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import math
import random
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns


grid = 20
upper_bound = 10

In [0]:
def poisson(n, l):
    return pow(l,n) * math.exp(-l) / math.factorial(n)

class location:
    def __init__(self):
        self.request_rate = 1
        self.return_rate = 1
        self.requests = np.zeros((grid+1))
        self.returns = np.zeros((grid+1))
  
    def calc_prob(self):
        for i in range(grid+1):
          self.requests[i] = poisson(i, self.request_rate)
          self.returns[i] = poisson(i, self.return_rate)

def expected_return(state, action, state_value, gamma):
    returns = 0.0
    returns -= 2 * abs(action)
    
    #one free move from loc 1 to 2
    if action > 0:
      returns += 2
    
    for loc1_req in range(0,upper_bound):
        for loc2_req in range(0,upper_bound):
            # moving cars
            num_of_cars_first_loc = int(min(state[0] - action, grid))
            num_of_cars_second_loc = int(min(state[1] + action, grid))

            # valid rental requests should be less than actual # of cars
            real_rental_first_loc = min(num_of_cars_first_loc, loc1_req)
            real_rental_second_loc = min(num_of_cars_second_loc, loc2_req)

            # get credits for renting
            reward = (real_rental_first_loc + real_rental_second_loc) * 10
            num_of_cars_first_loc -= real_rental_first_loc
            num_of_cars_second_loc -= real_rental_second_loc
            
            # probability for current combination of rental requests
            p = loc1.requests[loc1_req] * loc2.requests[loc2_req]

            # get returned cars, those cars can be used for renting tomorrow
            num_of_cars_first_loc = min(num_of_cars_first_loc + loc1.return_rate, grid)
            num_of_cars_second_loc = min(num_of_cars_second_loc + loc2.return_rate, grid)
            
            #add parking cost
            reward -= 4 * (max(0, num_of_cars_first_loc - 10) + max(0, num_of_cars_second_loc - 10))
            
            returns += p * (reward + gamma * state_value[num_of_cars_first_loc, num_of_cars_second_loc])
    return returns
  
def car_rental(loc1, loc2, gamma=0.9, e=0.001):
    #initialization
    v_s = np.zeros((grid+1,grid+1))
    pi = np.zeros((grid+1,grid+1), dtype=np.int)
    
    while True:
      #policy evaluation
      d = e+1
      while d > e:
        d = 0.0
        new_val = np.copy(v_s)

        for i in range(0,grid+1):
          for j in range(0,grid+1):
            new_val[i,j] = expected_return([i,j], pi[i,j], new_val, gamma)
            
        d = np.abs((new_val - v_s)).sum()
        v_s = new_val
        #print(d)

      #print(v_s)

      #policy improvement
      stable = True
      print(pi)
      for i in range(0,grid+1):
        for j in range(0,grid+1):
          s = (i,j)
          old = np.copy(pi[s])
          tmp_a = np.zeros(11)

          for k in range(11):
            a = range(-5,6)[k]
            
            tmp_a[k] = expected_return([i,j], a, v_s, gamma)

          #print(tmp_a, np.argmax(tmp_a))
          pi[s] = range(-5,6)[np.argmax(tmp_a)]
          if old != pi[s]:
            stable = False
          
      if stable:
        fig = sns.heatmap(np.flipud(pi), cmap="YlGnBu")
        return v_s, pi, fig

In [0]:
loc1 = location()
loc1.request_rate = 3
loc1.return_rate = 3
loc1.calc_prob()

loc2 = location()
loc2.request_rate = 4
loc2.return_rate = 2
loc2.calc_prob()

#print('requests', loc2.requests)

#print(np.sum(loc1.requests))
#0.9999999999168558

In [280]:
v_s, pi, fig = car_rental(loc1, loc2, gamma=0.9, e=0.001)

#print(v_s, pi)
#plt.show()

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
[[ 0  0  0  0  0  0  0  0 -1 -1 -1 -2 -2 -3 -3 -3 -4 -

In [0]:
print(v_s, pi)