In [1]:
# jack's car rental
import numpy as np
import sys
sys.path.append('C:\Program Files\python36\Lib\site-packages')
import gym
from gym.utils import seeding
from gym import Env, spaces
from gym.envs.toy_text import discrete

In [39]:
# car rental enviroment
# distribution model is difficult, try to establish a sample model

def categorical_sample(prob_n, np_random):
    prob_n = np.asarray(prob_n)
    csprob_n = np.cumsum(prob_n)
    return (csprob_n > np_random.rand()).argmax()

class CarRentalEnv(Env):
    # action means take how many cars from place one to place two, -5 means take 5 cars from place two to place one
    def __init__(self, max_cars, max_removeable_cars, isd=None):
        self.max_cars = max_cars
        self.nS = max_cars ** 2
        self.nA = 2 * max_removeable_cars + 1
        if isd == None:
            self.isd = np.ones(self.nS) / self.nS
        self.max_removeable_cars = max_removeable_cars
        self.shape = (max_cars, max_cars)
        self.observation_space = spaces.Discrete(self.nS)
        self.action_space = spaces.Discrete(self.nA)

        self.seed()
        self.s = categorical_sample(self.isd, self.np_random)
        self.lastaction = None

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def _render(self):
        pass
    
    def reset(self):
        self.s = categorical_sample(self.isd, self.np_random)
        self.lastaction = None
        return self.s
    
    def step(self,a):
        x, y = np.unravel_index(self.s, self.shape)
        # resize action
        if a - self.max_removeable_cars > x:
            a = self.max_removeable_cars + x
        if self.max_removeable_cars - a > y:
            a = self.max_removeable_cars - y
        available_x = x - (a - self.max_removeable_cars)
        available_y = y + (a - self.max_removeable_cars)
        # sample rent cars and return cars
        rent_cars_in_x = np.random.poisson(3)
        print(f"{rent_cars_in_x} cars wanted in place one")
        return_cars_in_x = np.random.poisson(3)
        print(f"{return_cars_in_x} cars returned in place one")
        actual_rent_cars_in_x = min(rent_cars_in_x, available_x)
        print(f"{actual_rent_cars_in_x} cars rented actually")
        newx = min(available_x - actual_rent_cars_in_x + return_cars_in_x, self.max_cars)
        print(f"newx is {newx}")
              
        rent_cars_in_y = np.random.poisson(4)
        print(f"{rent_cars_in_y} cars wanted in place two")
        return_cars_in_y = np.random.poisson(2)
        print(f"{return_cars_in_y} cars returned in place two")
        actual_rent_cars_in_y = min(rent_cars_in_y, available_y)
        print(f"{actual_rent_cars_in_y} cars rented actually")
        newy = min(available_y - actual_rent_cars_in_y + return_cars_in_y, self.max_cars)
        print(f"newy is {newy}")
              
        s = np.ravel_multi_index((newx, newy), self.shape)
        self.s = s
        self.lastaction = a
        rewards = 10 * (actual_rent_cars_in_x + actual_rent_cars_in_y) - 2 * abs(a - self.max_removeable_cars)
        # output infomation
        if a < self.max_removeable_cars:
            print(f"Action: Take {self.max_removeable_cars - a} cars  from place two to place one")
        elif a == self.max_removeable_cars:
            print(f"Action: No cars are moved")
        else:
            print(f"Action: Take {a - self.max_removeable_cars} cars from place two to place one")
        print(f"Next State is ({newx},{newy})")
        return (s, rewards, False, '')

In [40]:
# policy iteration to solve the car rental problem
env = CarRentalEnv(20, 5)

In [41]:
env.s
np.unravel_index(env.s, env.shape)

(1, 8)

In [70]:
env.step(3)


1 cars wanted in place one
2 cars returned in place one
1 cars rented actually
newx is 6
5 cars wanted in place two
0 cars returned in place two
3 cars rented actually
newy is 0
Action: Take 2 cars  from place two to place one
Next State is (6,0)


(120, 36, False, '')

In [68]:
policy = np.random.randint(9, size=(20, 20))
for i in range(20):
    for j in range(20):
        if policy[i, j] - 5 > i:
            policy[i, j] = 5 + i
        if 5 - policy[i, j] > j:
            policy[i, j] = 5 - j

In [69]:
policy
            

array([[5, 5, 5, 5, 4, 5, 4, 1, 1, 3, 1, 5, 3, 5, 5, 5, 5, 3, 5, 5],
       [5, 4, 6, 2, 4, 6, 6, 3, 6, 6, 6, 5, 5, 2, 2, 1, 1, 0, 2, 6],
       [5, 4, 4, 6, 2, 1, 4, 7, 0, 5, 7, 0, 2, 1, 1, 5, 7, 7, 5, 7],
       [5, 4, 5, 8, 1, 1, 4, 0, 4, 8, 8, 8, 4, 7, 0, 8, 4, 5, 3, 2],
       [6, 8, 3, 5, 6, 8, 4, 6, 8, 7, 5, 0, 0, 2, 3, 5, 2, 6, 5, 7],
       [7, 4, 3, 5, 1, 3, 1, 7, 0, 2, 4, 3, 7, 2, 2, 7, 2, 4, 0, 1],
       [5, 5, 5, 7, 1, 5, 7, 0, 2, 7, 4, 7, 8, 1, 4, 0, 8, 7, 6, 6],
       [5, 6, 3, 2, 7, 2, 0, 6, 5, 4, 3, 2, 6, 3, 3, 6, 1, 3, 1, 3],
       [5, 4, 4, 3, 5, 5, 7, 5, 4, 7, 6, 8, 4, 7, 1, 4, 0, 7, 0, 8],
       [5, 7, 3, 2, 2, 3, 2, 7, 1, 5, 3, 0, 5, 3, 8, 2, 3, 4, 5, 6],
       [5, 4, 3, 4, 5, 2, 0, 3, 8, 2, 0, 2, 8, 0, 2, 1, 7, 0, 1, 0],
       [5, 4, 3, 3, 1, 1, 5, 1, 5, 7, 8, 5, 8, 5, 3, 2, 8, 5, 0, 8],
       [5, 8, 3, 2, 5, 5, 4, 0, 8, 4, 1, 3, 8, 0, 2, 1, 3, 7, 6, 6],
       [6, 6, 3, 2, 1, 6, 3, 0, 8, 2, 2, 4, 6, 5, 3, 1, 0, 1, 5, 4],
       [5, 8, 3, 2, 1, 0, 6, 2, 1,

In [None]:
env = CarRentalEnv(20, 5)
def policy_evaluation(env, policy, lam=0.9):
    # init V values
    V = np.zeros(env.shape)
    x, y = np.unravel_index(env.s, env.shape)
    a = policy[x][y]
    next_state, reward, _, _ = env.step(a)
    