In [5]:
import numpy as np

In [6]:
# Model parameters.  p0 / p1 are the probabilities that processing a job
# leaves the machine at level 0 / level 1; p2 only enters the level-2 reward.
p0, p1, p2 = 0.9, 0.7, 0.4

# Transition table, keyed by (action, l1, l2, m1, m2) -> probability, where
# (l1, m1) is the current state and (l2, m2) the successor state.
probabilities = {
    # 'M' (start maintenance): level is kept, maintenance flag goes 0 -> 1
    **{('M', level, level, 0, 1): 1 for level in range(3)},

    # 'N' (finish maintenance): level drops back to 0, flag goes 1 -> 0
    **{('N', level, 0, 1, 0): 1 for level in range(3)},

    # 'J' (process job): stay at the current level or move one level up;
    # level 2 is never left by processing a job
    ('J', 0, 0, 0, 0): p0,
    ('J', 0, 1, 0, 0): 1 - p0,
    ('J', 1, 1, 0, 0): p1,
    ('J', 1, 2, 0, 0): 1 - p1,
    ('J', 2, 2, 0, 0): 1,
}

# Expected (over next states) reward, keyed by (action, l, m).
# Only processing a job outside maintenance has an entry.
rewards = {
    ('J', level, 0): gain
    for level, gain in enumerate((2 * p0, p1, p2))
}

def p(a, l1, m1, l2, m2):
    """Return the probability of moving from state (l1, m1) to (l2, m2) under action a.

    The value is read from the module-level ``probabilities`` table;
    any combination that is not listed there has probability 0.
    """
    # The table key lists both levels first and both flags second,
    # which differs from this function's argument order.
    return probabilities.get((a, l1, l2, m1, m2), 0)

def r(a, l, m):
    """Return the expected reward for taking action a in state (l, m).

    The value is read from the module-level ``rewards`` table;
    any (action, l, m) combination that is not listed there yields 0.
    """
    return rewards.get((a, l, m), 0)

def actions(m):
    """Return the list of action labels available for maintenance flag m.

    With m == 1 (maintenance in progress) only 'N' is offered;
    for any other value the choices are 'M' and 'J'.
    """
    # don't allow any other actions if we are still in maintenance
    return ['N'] if m == 1 else ['M', 'J']

In [7]:
# horizon: number of decision epochs
N = 5

# state space: every state is a pair (l, m)
L = list(range(3))  # machine state
M = [0, 1]          # maintenance flag
I = [(level, flag) for level in L for flag in M]

# Tables indexed as [n, l, m], with one extra slice for n = 0.
# V holds float values (all zero initially); A holds one-character
# action labels (all empty strings initially).
V = np.zeros((N + 1, len(L), len(M)))
A = np.zeros(V.shape, dtype='str')

In [8]:
# number of (l, m) states in the state space
len(I)

6

In [9]:
# Backward induction over n, the index of the first axis of V and A.
# V[n, l, m] receives the best value found for state (l, m) given the
# values stored in V[n-1]; A[n, l, m] receives the action achieving it.

# no final rewards
V[0,:,:] = 0

for n in range(1,N+1):
    for l1,m1 in I:
        # Start below every attainable value so that an action is always
        # recorded.  Starting at 0 left A empty whenever the best value was
        # exactly 0 (e.g. every maintenance state at n = 1) and would have
        # clipped a negative optimum to 0.
        maximum = float('-inf')
        argmax = ''
        for a in actions(m1):
            # expected value of the successor state plus the expected reward
            S = sum(
                p(a, l1,m1, l2,m2) * V[n-1,l2,m2]
                for l2,m2 in I
            ) + r(a, l1, m1)
            # strict comparison: on ties the earlier action in actions(m1) wins
            if S > maximum:
                maximum = S
                argmax = a

        V[n,l1,m1] = maximum
        A[n,l1,m1] = argmax

In [14]:
# value stored for n = N in state l = 0, m = 0 (machine level 0, not in maintenance)
V[N,0,0]

8.04445