# Car Rental Problem

## Approach

The idea is that we have state as a tuple - 
        - Number of cars in Location 1
        - Number of cars in Location 2
        - Number of cars in Location 3
And we can take actions in the form of movement of cars from one location to another location over night.

> For size = (10, 5, 5), it is taking overall 13s and converging in 5 iterations. Maximum Value obtained is 56.9

> For size = (20, 10, 10), it is taking around 6-7 min to complete one iteration. Maximum value obtained after 1 iteration is 194.2


In [1]:
import numpy as np
import math, itertools, time, copy

In [2]:
class Environment :
    def __init__(self, size, requestList, returnList, gamma, reward) :
        self.size = size
        self.V = np.zeros(size)
        self.policy = np.zeros((size[0], size[1], size[2], 3))
        self.gamma = gamma
        self.requestList = requestList 
        self.returnList = returnList
        self.reward = reward        
        z = max(size[0], size[1], size[2])
        self.P1 = np.zeros((z, z))
        self.P2 = np.zeros((z, z))
        self.P3 = np.zeros((z, z))
        self.R1 = np.zeros((z, z))
        self.R2 = np.zeros((z, z))
        self.R3 = np.zeros((z, z))
        
    def poisson(self, k, lam) :
        return ((lam**k) * np.exp(-lam))/np.math.factorial(k)
        
    def isValid(self, i, j, k, car1, car2, car3) :
        if((i+car1) < 0 or (i+car1) >= self.size[0] or (j+car2) < 0 or (j+car2) >= self.size[1]
              or (k+car3) < 0 or (k+car3) >= self.size[2]) :
            return False
        return True
    
    def totalActions(self, car1, car2, car3) :
        actions = []
        for i, j, k in itertools.product(np.arange(-5, 6), np.arange(-5, 6), np.arange(-5, 6)) :
            if (i + j + k == 0) :
                if((self.isValid(i, j, k, car1, car2, car3))) :
                    actions.append([i, j, k])
            
        return actions
    
    #================Pre-Calculations of Probability and Rewards============
    def storeAll(self) :
        print("Pre-calculating Probability and Rewards....")
        size1 = np.arange(self.size[0])
        size2 = np.arange(self.size[1])
        size3 = np.arange(self.size[2])
        for (car, nextCar) in itertools.product(size1, size1) :
            for req, ret in itertools.product(size1, size1) :
                if(req > car or car-req+ret != nextCar):
                    continue    
                self.P1[car, nextCar] += (self.poisson(req, requestList[0]) * self.poisson(ret, returnList[0]))
                self.R1[car, nextCar] += (self.poisson(req, requestList[0]) * self.poisson(ret, returnList[0]) 
                                     * req * self.reward)
                                     
        for (car, nextCar) in itertools.product(size2, size2) :
            for req, ret in itertools.product(size2, size2) :
                if(req > car or car-req+ret != nextCar):
                    continue    
                self.P2[car, nextCar] += (self.poisson(req, requestList[1]) * self.poisson(ret, returnList[1]))
                self.R2[car, nextCar] += (self.poisson(req, requestList[1]) * self.poisson(ret, returnList[1]) 
                                     * req * self.reward)
                                    
        for (car, nextCar) in itertools.product(size3, size3) :
            for req, ret in itertools.product(size3, size3) :
                if(req > car or car-req+ret != nextCar):
                    continue    
                self.P3[car, nextCar] += (self.poisson(req, requestList[2]) * self.poisson(ret, returnList[2]))
                self.R3[car, nextCar] += (self.poisson(req, requestList[2]) * self.poisson(ret, returnList[2]) 
                                     * req * self.reward)
        print("Pre-calculation done ...")
                
    def ValueIteration(self) :
        
        iter = 0
        while(iter < 10) :
            iter += 1
            print("======= iter ", iter, " ========== ")
            i = 0
            delta = 0   
            for car1, car2, car3 in itertools.product(np.arange(self.size[0]), np.arange(self.size[1]), np.arange(self.size[2])) :
                i += 1
                prevCar1, prevCar2, prevCar3 = car1, car2, car3
                bestAction = None
                actions = self.totalActions(car1, car2, car3)
                maxReward = 0

                for action in actions :
                    car1 = prevCar1 + action[0]
                    car2 = prevCar2 + action[1]
                    car3 = prevCar3 + action[2]
                    reward = abs(action[0]) * -2     # cost of 2 for Location 1
                    
                    for nextCar1, nextCar2, nextCar3 in itertools.product(np.arange(self.size[0]), np.arange(self.size[1]), np.arange(self.size[2])) :
                                     
                        P_car1 = self.P1[car1, nextCar1]
                        P_car2 = self.P2[car2, nextCar2]
                        P_car3 = self.P3[car3, nextCar3]
                                     
                        R_car1 = self.R1[car1, nextCar1]
                        R_car2 = self.R2[car2, nextCar2]
                        R_car3 = self.R3[car3, nextCar3]     
                        
                        immReward = (P_car1*P_car2*R_car3)+(P_car1*R_car2*P_car3)+(R_car1*P_car2* P_car3)
                        reward += immReward + (P_car1*P_car2*P_car3 * self.gamma * self.V[nextCar1,nextCar2, nextCar3])
                    
                    if(reward > maxReward) :
                        maxReward = reward
                        bestAction = action
                        
                prevVal = self.V[prevCar1, prevCar2, prevCar3]
                self.V[prevCar1, prevCar2, prevCar3] = maxReward
                delta = max(delta, abs(maxReward - prevVal))
                self.policy[prevCar1, prevCar2, prevCar3] = np.array(bestAction)
                if(i % 100 == 0) :
                    print("i = ", i)
                    print("State : ", (prevCar1, prevCar2, prevCar3), "V = ", self.V[prevCar1, prevCar2, prevCar3])
                    print("Max-reward : ", np.max(self.V))
                    print("delta : ", delta)
                    print("policy = ", self.policy[prevCar1, prevCar2, prevCar3])
                
            if(delta < 0.1) :
                print("\n========= Value Iteration Converged =========")
                print("TOTAL ITERATIONS : ", iter)
                break


In [3]:
size = [20, 10, 10]
# size = [10, 5, 5]
requestList = [3, 2, 2]
returnList = [3, 1, 1]
gamma = 0.9
reward = 10
envObj = Environment(size, requestList, returnList, gamma, reward)

a = time.time()
envObj.storeAll()
envObj.ValueIteration()
b = time.time()
print("Total Time taken : ", int((b-a)) ,"s")

np.savetxt("value.txt", np.ravel(envObj.V).reshape(50, 5), fmt = "%5.2f", header = 'Values')
np.savetxt("policy.txt", np.ravel(envObj.policy).reshape(250, 3), fmt = "%d", header = "Policy")
print("value.txt Saved")
print("policy.txt Saved")

Pre-calculating Probability and Rewards....
Pre-calculation done ...
i =  100
State :  (0, 9, 9) V =  49.64041637722675
Max-reward :  49.8771034377289
delta :  49.8771034377289
policy =  [ 5. -2. -3.]
i =  200
State :  (1, 9, 9) V =  54.88692203419342
Max-reward :  55.116566162667965
delta :  55.116566162667965
policy =  [ 5. -2. -3.]
i =  300
State :  (2, 9, 9) V =  58.08253374958155
Max-reward :  58.33270417043922
delta :  58.33270417043922
policy =  [ 4. -2. -2.]
i =  400
State :  (3, 9, 9) V =  61.291793483435825
Max-reward :  63.62533483090205
delta :  63.62533483090205
policy =  [ 4. -2. -2.]
i =  500
State :  (4, 9, 9) V =  67.07261139246069
Max-reward :  73.80044725416238
delta :  73.80044725416238
policy =  [ 2. -1. -1.]
i =  600
State :  (5, 9, 9) V =  71.9083572671826
Max-reward :  86.71913502076319
delta :  86.71913502076319
policy =  [ 2. -1. -1.]
i =  700
State :  (6, 9, 9) V =  75.32464035578926
Max-reward :  98.5625933974854
delta :  98.5625933974854
policy =  [ 1.  0. 

KeyboardInterrupt: 

In [5]:
np.max(envObj.V)

209.378044112479