# 1 MIP Bellman

## 1.1 DRMDP with decision dependent ambiguity set (DRMDP DD1)

In [None]:
import gurobipy as gp
from gurobipy import GRB
import numpy as np

def solve_bellman_formulation1(act_dim, num_states, lam, next_V, k, delta_s, rho_s, delta_0_s, rho_0_s, ub_a):
    """Solve the one-step Bellman MIP of the DRMDP with decision-dependent ambiguity set.

    The model maximizes

        r + delta_0_s + sum_i delta_s[i]*a[i]
          - sum_j rho_0_s[j]*w[j] - sum_ij rho_s[i][j]*m0[i][j]
          + sum_j rho_0_s[j]*u[j] + sum_ij rho_s[i][j]*m1[i][j]

    subject to ``q - r >= 0``, ``lam*next_V[j] + w[j] - u[j] - q >= 0`` for
    every state j, and McCormick envelopes that tie ``m0[i][j]`` to the
    product ``a[i]*w[j]`` and ``m1[i][j]`` to ``a[i]*u[j]``.

    Parameters
    ----------
    act_dim : int
        Number of integer action components.
    num_states : int
        Number of states.
    lam : float
        Multiplier applied to ``next_V`` in the per-state constraint.
    next_V : sequence of float
        Next-stage value per state.
    k : sequence of float
        Upper bound of ``w[j]`` and ``u[j]`` per state.
    delta_s, delta_0_s
        Linear reward coefficients (slope per action component, intercept).
    rho_s, rho_0_s
        Linear transition coefficients (``rho_s[i][j]`` slope, ``rho_0_s[j]``
        intercept).
    ub_a : sequence
        Upper bound of each action component (the lower bound is 0).

    Returns
    -------
    (float, numpy.ndarray)
        Objective value and the action vector of the solution found.

    Raises
    ------
    RuntimeError
        If Gurobi finishes without any feasible solution.
    """
    model2 = gp.Model()
    model2.setParam('OutputFlag', False)

    # Variables are created in the same order as before: r, q, a, w, u, m0, m1.
    r = model2.addVar(vtype=GRB.CONTINUOUS, lb=-GRB.INFINITY, ub=GRB.INFINITY, name='r')
    q = model2.addVar(vtype=GRB.CONTINUOUS, lb=-GRB.INFINITY, ub=GRB.INFINITY, name='q')
    a = [model2.addVar(vtype=GRB.INTEGER, lb=0.0, ub=ub_a[i], name='a%d' % i)
         for i in range(act_dim)]
    w = [model2.addVar(vtype=GRB.CONTINUOUS, lb=0.0, ub=k[j], name='w%d' % j)
         for j in range(num_states)]
    u = [model2.addVar(vtype=GRB.CONTINUOUS, lb=0.0, ub=k[j], name='u%d' % j)
         for j in range(num_states)]
    m0 = [[model2.addVar(vtype=GRB.CONTINUOUS, lb=-GRB.INFINITY, ub=GRB.INFINITY,
                         name='m0-%d%d' % (i, j))
           for j in range(num_states)] for i in range(act_dim)]
    m1 = [[model2.addVar(vtype=GRB.CONTINUOUS, lb=-GRB.INFINITY, ub=GRB.INFINITY,
                         name='m1-%d%d' % (i, j))
           for j in range(num_states)] for i in range(act_dim)]

    # Objective
    obj = 1*r + delta_0_s
    for i in range(act_dim):
        obj.addTerms(delta_s[i], a[i])
    for j in range(num_states):
        obj.addTerms(-rho_0_s[j], w[j])
        obj.addTerms(rho_0_s[j], u[j])
    for i in range(act_dim):
        for j in range(num_states):
            obj.addTerms(-rho_s[i][j], m0[i][j])
            obj.addTerms(rho_s[i][j], m1[i][j])
    model2.setObjective(obj, GRB.MAXIMIZE)

    # Constraint 1
    model2.addConstr(q - r, GRB.GREATER_EQUAL, 0, "c1")

    # Constraint 2
    for j in range(num_states):
        model2.addConstr(lam*next_V[j] + w[j] - u[j] - q >= 0)

    # McCormick envelopes of prod = a[i]*y[j] on [0, ub_a[i]] x [0, k[j]].
    # With both lower bounds equal to 0 the four inequalities reduce to the
    # forms below (the original spelled out the vanishing 0*... terms).
    for tag, prods, ys in (('m0', m0, w), ('m1', m1, u)):
        for i in range(act_dim):
            a_ub = float(ub_a[i])
            for j in range(num_states):
                y_ub = float(k[j])
                prod = prods[i][j]
                model2.addConstr(prod >= 0, "%s-1-%d%d" % (tag, i, j))
                model2.addConstr(prod - a_ub*ys[j] - a[i]*y_ub + a_ub*y_ub >= 0,
                                 "%s-2-%d%d" % (tag, i, j))
                model2.addConstr(prod - a_ub*ys[j] <= 0, "%s-3-%d%d" % (tag, i, j))
                model2.addConstr(prod - a[i]*y_ub <= 0, "%s-4-%d%d" % (tag, i, j))

    model2.optimize()
    if model2.SolCount == 0:
        # Without this check the failure surfaces as an opaque attribute
        # error when the solution values are read.
        raise RuntimeError('solve_bellman_formulation1: no solution available '
                           '(Gurobi status %d)' % model2.Status)

    # Read the action from the variable handles instead of matching the
    # letter 'a' in every variable name, which breaks as soon as another
    # variable name happens to contain that letter.
    optimal_a = np.asarray([var.X for var in a])
    optimal_objective = model2.getObjective().getValue()
    return optimal_objective, optimal_a

## 1.3 Regular MDP

In [None]:
import gurobipy as gp
from gurobipy import GRB
import numpy as np

def solve_bellman_formulation3(act_dim, num_states, lam, next_V, delta_s, rho_s, delta_0_s, rho_0_s, ub_a):
    """Solve the one-step Bellman problem of the regular (nominal) MDP.

    Maximizes over integer actions ``0 <= a[i] <= ub_a[i]`` the expression

        delta_0_s + sum_i delta_s[i]*a[i]
          + lam * sum_j (rho_0_s[j] + sum_i rho_s[i][j]*a[i]) * next_V[j]

    Parameters
    ----------
    act_dim : int
        Number of integer action components.
    num_states : int
        Number of states summed over.
    lam : float
        Multiplier applied to the next-stage value term.
    next_V : sequence of float
        Next-stage value per state.
    delta_s, delta_0_s
        Linear reward coefficients (slope per action component, intercept).
    rho_s, rho_0_s
        Linear transition coefficients (``rho_s[i][j]`` slope, ``rho_0_s[j]``
        intercept).
    ub_a : sequence
        Upper bound of each action component.

    Returns
    -------
    (float, numpy.ndarray)
        Objective value and the action vector of the solution found.

    Raises
    ------
    RuntimeError
        If Gurobi finishes without any feasible solution.
    """
    model3 = gp.Model()
    model3.setParam('OutputFlag', False)
    a = [model3.addVar(vtype=GRB.INTEGER, lb=0.0, ub=ub_a[i], name='a%d' % i)
         for i in range(act_dim)]

    # Objective: the expression is linear in a, so collapse the per-state
    # sums into one constant and one coefficient per action component
    # instead of adding act_dim*num_states separate terms.
    values = np.asarray(next_V, dtype=float)[:num_states]
    intercepts = np.asarray(rho_0_s, dtype=float)[:num_states]
    slopes = np.asarray(rho_s, dtype=float)[:act_dim, :num_states]
    constant = float(delta_0_s) + lam*float(np.dot(intercepts, values))
    coeffs = [float(delta_s[i]) + lam*float(np.dot(slopes[i], values))
              for i in range(act_dim)]
    obj = constant + gp.quicksum(coeffs[i]*a[i] for i in range(act_dim))
    model3.setObjective(obj, GRB.MAXIMIZE)

    model3.optimize()
    if model3.SolCount == 0:
        # Without this check the failure surfaces as an opaque attribute
        # error when the solution values are read.
        raise RuntimeError('solve_bellman_formulation3: no solution available '
                           '(Gurobi status %d)' % model3.Status)

    # Read the action from the variable handles rather than by matching the
    # letter 'a' in variable names.
    optimal_a = np.asarray([var.X for var in a])
    optimal_objective = model3.getObjective().getValue()
    return optimal_objective, optimal_a

## 1.4 Test

In [None]:
import numpy as np

# Smoke test: run both Bellman solvers on a small all-ones instance.

# population size and action bounds
N = 10
L = 5
M = 5

act_dim = 2
num_states = (N + 1) ** 2
lam = 0.9
next_V = np.zeros(num_states)

# penalty coefficients
k = np.full(num_states, 10.0)
K = np.full((num_states, num_states), 10.0)

# LDR coefficients
delta_0_s = 1
delta_s = np.ones(act_dim)
sigma_0_s = 1
sigma_s = np.ones(act_dim)
rho_0_s = np.ones(num_states)
rho_s = np.ones((act_dim, num_states))
ps = np.ones(num_states)

bar_Sigma_s = np.ones((num_states, num_states))

# bound of actions
ub_a = np.array([L, M])

# DRMDP with decision-dependent ambiguity set
objective, a1 = solve_bellman_formulation1(
    act_dim, num_states, lam, next_V, k,
    delta_s, rho_s, delta_0_s, rho_0_s, ub_a,
)
print(objective)
print(a1)

# Regular MDP
objective, a3 = solve_bellman_formulation3(
    act_dim, num_states, lam, next_V,
    delta_s, rho_s, delta_0_s, rho_0_s, ub_a,
)
print(objective)
print(a3)

# 2 Vaccine Model Simulation (SEIR)

In [319]:
import numpy as np
import scipy.stats as stats
from sklearn.linear_model import LinearRegression

def get_phi(state, action, population_size, vaccine_portion_num, trans_reduc_types, tau, mu):
    """Return phi_t = 1 - exp(-(1 - alpha_t) * mu * p_I * tau).

    Parameters
    ----------
    state : sequence
        ``state[2]`` is the infectious proportion p_I; other entries are not
        used here.
    action : sequence
        ``action[1]`` is the transmission-reduction level yR; ``action[0]``
        is not used here.
    population_size, vaccine_portion_num
        Accepted for signature compatibility; not used in the formula.
    trans_reduc_types : int
        Number of transmission-reduction levels M (must be non-zero).
    tau : float
        Probability that a susceptible person becomes infected upon contact
        with an infectious individual.
    mu : float
        Rate of contacts (contacts occur according to a homogeneous Poisson
        process).

    Returns
    -------
    float
        phi_t, with alpha_t = (yR / M) * 0.8 the fractional reduction in the
        infection transmission rate.
    """
    infectious_share = state[2]
    # alpha(t): the fractional reduction in the infection transmission rate
    alpha_t = (action[1]/trans_reduc_types)*0.8
    return 1 - np.exp(-(1-alpha_t)*mu*infectious_share*tau)

def get_tilde_pas(state, action, population_size, vaccine_portion_num, trans_reduc_types, discretized_N, tau, mu, precision = 50, debug = False):
    """Estimate P(. | s, a) on the discretized (S, E, I) grid.

    Every next grid state ``(ss, se, si)`` implies a number of S->E, E->I and
    I->removed transitions.  Each count is scored with a windowed binomial
    mass around the implied value, and the score of the grid state is the
    product of the three.  Only the largest 100 scores are kept; they are
    renormalized to sum to one.

    Parameters
    ----------
    state : sequence
        Proportions ``(p_S, p_E, p_I)`` of the population.
    action : sequence
        ``(yV, yR)``: vaccination portion index and transmission-reduction
        level.
    population_size : int
        Population size N.
    vaccine_portion_num : int
        Number of vaccination portions L; ``yV / L`` of the susceptibles are
        vaccinated.
    trans_reduc_types : int
        Number of transmission-reduction levels M.
    discretized_N : int
        Grid resolution; each compartment takes values ``0..discretized_N``.
    tau, mu : float
        Infection parameters forwarded to ``get_phi``.
    precision : int
        Number of offsets on each side of the implied count included in the
        window.
    debug : bool
        Print the sum and maximum of the raw scores.

    Returns
    -------
    numpy.ndarray
        Flat vector of length ``(discretized_N + 1) ** 3`` indexed by
        ``ss*(discretized_N+1)**2 + se*(discretized_N+1) + si``.
    """
    N = population_size
    M = trans_reduc_types
    L = vaccine_portion_num
    # np.int was removed in NumPy 1.24; it was an alias of the builtin int.
    dicretize_level = int(N/discretized_N)
    xst = state[0]*N
    xet = state[1]*N
    xit = state[2]*N
    yVt = action[0]
    xst_to_vaccinate = int(yVt/L*xst) # number of susceptibles to vaccinate at time t
    xs_left = xst - xst_to_vaccinate

    lc = 1 # mean incubation period
    ld = 1 # mean infectious period
    rho_c = 1 - np.exp(-lc)
    rho_d = 1 - np.exp(-ld)

    phi_t = get_phi(state, action, N, L, M, tau, mu)

    steps = np.arange(precision)

    def window_mass(center, trials, prob):
        # NOTE(review): the offset 0 appears in both ranges below, so the
        # centre value is counted three times in total.  This reproduces the
        # original summation exactly -- confirm whether it is intended.
        return (stats.binom.pmf(center, trials, prob)
                + np.sum(stats.binom.pmf(center + steps, trials, prob))
                + np.sum(stats.binom.pmf(center - steps, trials, prob)))

    # The three masses depend only on ss, ss+se and ss+se+si respectively, so
    # each distinct value is computed once instead of once per grid cell.
    pb_by_ss = {}
    pc_by_se_sum = {}
    pd_by_total = {}

    grid = discretized_N + 1
    max_total = int((xst + xet + xit - xst_to_vaccinate)/dicretize_level)
    pas_matrix = np.zeros((grid, grid, grid))
    for ss in range(grid):
        for se in range(grid):
            for si in range(grid):
                total = ss + se + si
                if total > max_total:
                    continue
                if ss not in pb_by_ss:
                    nb = xs_left - ss*dicretize_level
                    pb_by_ss[ss] = window_mass(nb, xs_left, phi_t)
                if ss + se not in pc_by_se_sum:
                    nc = xs_left + xet - (ss + se)*dicretize_level
                    pc_by_se_sum[ss + se] = window_mass(nc, xet, rho_c)
                if total not in pd_by_total:
                    nd = xs_left + xet + xit - total*dicretize_level
                    pd_by_total[total] = window_mass(nd, xit, rho_d)
                pas_matrix[ss][se][si] = pb_by_ss[ss]*pc_by_se_sum[ss + se]*pd_by_total[total]
    pas = pas_matrix.flatten()
    if debug:
        print('sum')
        print(np.sum(pas))
        print('max')
        print(np.max(pas))

    # keep the 100 largest elements to remove noise (all of them when the
    # grid has fewer than 100 states, which previously raised in argpartition)
    keep = min(100, pas.size)
    ind = np.argpartition(pas, -keep)[-keep:]
    new_pas = np.zeros_like(pas)
    new_pas[ind] = pas[ind]
    new_pas = new_pas/np.sum(new_pas)

    return new_pas

def get_tilde_ras(state, action, population_size, vaccine_portion_num, trans_reduc_types,  
               cR_multiplier, cost_per_infect, vaccine_price, tau, mu, lam):
    """Estimate the reward r(a, s) as the negated sum of three costs.

    The reward is ``-(c_V + c_R + c_I)`` where

    * ``c_V = vaccine_price * (number of susceptibles vaccinated)``,
    * ``c_R = yR * cR_multiplier`` (looked up in a table of length M + 1, so
      ``yR`` outside ``0..M`` raises ``IndexError``),
    * ``c_I = w*E_it + lam*w*(E_et*rho_c)`` with ``w = cost_per_infect``,
      ``E_it`` the expected infected count and ``E_et`` the expected exposed
      count.

    Parameters
    ----------
    state : sequence
        Proportions ``(p_S, p_E, p_I)`` of the population.
    action : sequence
        ``(yV, yR)``: vaccination portion index and transmission-reduction
        level.
    population_size : int
        Population size N.
    vaccine_portion_num : int
        Number of vaccination portions L.
    trans_reduc_types : int
        Number of transmission-reduction levels M.
    cR_multiplier, cost_per_infect, vaccine_price : float
        Cost parameters.
    tau, mu : float
        Infection parameters forwarded to ``get_phi``.
    lam : float
        Weight of the second infection-cost term.

    Returns
    -------
    float
        The estimated reward (non-positive for non-negative costs).
    """
    N = population_size
    M = trans_reduc_types
    L = vaccine_portion_num
    xst = state[0]*N
    xet = state[1]*N
    xit = state[2]*N
    yVt = action[0]
    # np.int was removed in NumPy 1.24; it was an alias of the builtin int.
    yRt = int(action[1])
    xst_to_vaccinate = int(yVt/L*xst) # number of susceptibles to vaccinate at time t
    phi_t = get_phi(state, action, N, L, M, tau, mu)

    lc = 1 # mean incubation period
    ld = 1 # mean infectious period
    rho_c = 1 - np.exp(-lc)
    rho_d = 1 - np.exp(-ld)

    E_it = xit + xet*rho_c - xit*rho_d # expected infection
    E_et = xet + (xst - xst_to_vaccinate)*phi_t - xet*rho_c # expected exposed

    # cost of vaccination
    c_V = vaccine_price*xst_to_vaccinate
    # cost of implementing transmission reducing method
    c_R = np.arange(M+1) * cR_multiplier
    # cost of infection
    w = cost_per_infect
    c_I = w*E_it + lam*w*(E_et*rho_c)

    r = - c_V - c_R[yRt] - c_I
    return r

def compute_actual_cost(action, state, next_state, population_size, vaccine_portion_num, trans_reduc_types, cR_multiplier, cost_per_infect, vaccine_price):
    """Return the realized one-step reward (negated cost) of a transition.

    The value is ``-(c_V + c_R + c_I)`` where

    * ``c_V = vaccine_price * int(yV / L * x_S)`` is the vaccination cost,
    * ``c_R = yR * cR_multiplier`` is the intervention cost (looked up in a
      table of length M + 1, so ``yR`` outside ``0..M`` raises
      ``IndexError``),
    * ``c_I = cost_per_infect * (next_state[2] * N)`` is the infection cost
      of the realized next state.

    Parameters
    ----------
    action : sequence
        ``(yV, yR)``: vaccination portion index and transmission-reduction
        level.
    state : sequence
        Current proportions; only ``state[0]`` (susceptible) is used.
    next_state : sequence
        Next proportions; only ``next_state[2]`` (infectious) is used.
    population_size : int
        Population size N.
    vaccine_portion_num : int
        Number of vaccination portions L.
    trans_reduc_types : int
        Number of transmission-reduction levels M.
    cR_multiplier, cost_per_infect, vaccine_price : float
        Cost parameters.

    Returns
    -------
    float
        The realized reward.
    """
    N = population_size
    M = trans_reduc_types
    L = vaccine_portion_num
    xst = state[0]*N
    i_t = next_state[2]*N
    yVt = action[0]
    # np.int was removed in NumPy 1.24; it was an alias of the builtin int.
    yRt = int(action[1])
    xst_to_vaccinate = int(yVt/L*xst) # number of susceptibles to vaccinate at time t

    # cost of vaccination
    c_V = vaccine_price*xst_to_vaccinate
    # cost of implementing transmission reducing method
    c_R = np.arange(M+1) * cR_multiplier
    # cost of infection
    c_I = cost_per_infect*i_t

    cost = - c_V - c_R[yRt] - c_I
    return cost

def get_bar_pas_coefficients(state, population_size, vaccine_portion_num, trans_reduc_types, discretized_N, tau, mu):
    """Fit a linear model of the transition distribution in the action.

    ``get_tilde_pas`` is evaluated at the four corner actions
    ``(yV, yR)`` with ``yV in {0, L}`` and ``yR in {0, M}``, and an ordinary
    least-squares model ``P(. | s, a) ~ rho_0_s + a @ rho_s`` is fitted to
    those samples.

    Parameters
    ----------
    state : sequence
        Proportions ``(p_S, p_E, p_I)`` of the population.
    population_size : int
        Population size N.
    vaccine_portion_num : int
        Number of vaccination portions L (upper corner of yV).
    trans_reduc_types : int
        Number of transmission-reduction levels M (upper corner of yR).
    discretized_N : int
        Grid resolution; the model has ``(discretized_N + 1) ** 3`` outputs.
    tau, mu : float
        Infection parameters forwarded to ``get_tilde_pas``.

    Returns
    -------
    (rho_0_s, rho_s)
        The fitted intercept (one entry per state) and the fitted slopes
        transposed so that ``rho_s[i][j]`` is the slope of state j with
        respect to action component i.
    """
    N = population_size
    M = trans_reduc_types
    L = vaccine_portion_num
    num_states = (discretized_N+1)**3

    # Corner actions in the same order as the original nested loops.
    corner_actions = [np.asarray([yv, yr]) for yv in (0, L) for yr in (0, M)]
    ds_action = np.zeros((len(corner_actions), 2))
    ds_pas = np.zeros((len(corner_actions), num_states))
    for row, action in enumerate(corner_actions):
        ds_action[row,:] = action
        ds_pas[row,:] = get_tilde_pas(state, action, N, L, M, discretized_N, tau, mu)

    reg = LinearRegression().fit(ds_action, ds_pas)
    rho_0_s = reg.intercept_
    rho_s = reg.coef_
    return rho_0_s, np.transpose(rho_s)

def get_bar_pas(action, population_size, act_dim, rho_0_s, rho_s, discretized_N = 100):
    """Evaluate the linear transition model ``rho_0_s + sum_i rho_s[i] * action[i]``.

    Parameters
    ----------
    action : sequence
        Action vector; the first ``act_dim`` entries are used.
    population_size
        Accepted for signature compatibility; not used.
    act_dim : int
        Number of action components.
    rho_0_s : numpy.ndarray
        Intercept per state.  It is copied, never modified.
    rho_s : sequence of sequences
        ``rho_s[i][j]`` is the slope of state j for action component i.
    discretized_N : int
        Grid resolution; the first ``(discretized_N + 1) ** 3`` states are
        updated.  The count is clipped to ``len(rho_0_s)`` so that the
        default value no longer raises ``IndexError`` for smaller grids.

    Returns
    -------
    numpy.ndarray
        The modelled value per state.
    """
    pas = rho_0_s.copy()
    num_states = min((discretized_N+1)**3, len(pas))
    # One vectorized update per action component; the accumulation order per
    # state matches the original element-wise double loop.
    for i in range(act_dim):
        pas[:num_states] = pas[:num_states] + np.asarray(rho_s[i])[:num_states]*action[i]
    return pas

def get_worst_case_pas(state, action, population_size, vaccine_portion_num, trans_reduc_types, discretized_N, tau, mu, precision = 50):
    """Return ``get_tilde_pas`` weights with 0.5 extra mass on an adverse state.

    Among the grid states whose estimated probability exceeds 0.01, the
    smallest susceptible index, the smallest exposed index and the largest
    infectious index are combined into one grid state.  When the susceptible
    index is above 5, it is lowered by 5 and the infectious index raised by 5.
    The selected entry receives an additional weight of 0.5.

    The result is not renormalized (it sums to about 1.5); it is meant to be
    used as sampling weights.

    Parameters are the same as for ``get_tilde_pas``.

    Returns
    -------
    numpy.ndarray
        Flat weight vector of length ``(discretized_N + 1) ** 3``.

    Raises
    ------
    ValueError
        If no state has estimated probability above 0.01.
    """
    tilde_pas = get_tilde_pas(state, action, population_size, vaccine_portion_num, trans_reduc_types, discretized_N, tau, mu, precision)
    grid = discretized_N + 1

    likely_idx = np.where(tilde_pas > 0.01)[0]
    if likely_idx.size == 0:
        # The original failed here too, inside np.min on an empty list.
        raise ValueError('get_worst_case_pas: no state has probability above 0.01')

    # Decode flat indices with integer arithmetic (the original used float
    # division followed by np.int, which was removed in NumPy 1.24).
    ss_all, se_all, si_all = np.unravel_index(likely_idx, (grid, grid, grid))
    worst_case_ss = int(ss_all.min())
    worst_case_se = int(se_all.min())
    worst_case_si = int(si_all.max())

    pas = tilde_pas.copy()
    if worst_case_ss > 5:
        # NOTE(review): worst_case_si + 5 may exceed discretized_N, in which
        # case the flat index spills into the next exposed row.  Kept as in
        # the original -- confirm whether clamping is wanted.
        target = (worst_case_ss - 5)*grid**2 + worst_case_se*grid + (worst_case_si + 5)
    else:
        target = worst_case_ss*grid**2 + worst_case_se*grid + worst_case_si
    pas[target] += 0.5
    return pas

def get_ras_coefficients(state, population_size, vaccine_portion_num, trans_reduc_types, cR_multiplier,
                     cost_per_infect, vaccine_price, tau, mu, lam):
    """Fit a linear model of the reward in the action.

    ``get_tilde_ras`` is evaluated at the four corner actions ``(yV, yR)``
    with ``yV in {0, L}`` and ``yR in {0, M}``, and an ordinary least-squares
    model ``r(a, s) ~ delta_0_s + delta_s @ a`` is fitted to those samples.

    Parameters
    ----------
    state : sequence
        Proportions ``(p_S, p_E, p_I)`` of the population.
    population_size : int
        Population size N.
    vaccine_portion_num : int
        Number of vaccination portions L (upper corner of yV).
    trans_reduc_types : int
        Number of transmission-reduction levels M (upper corner of yR).
    cR_multiplier, cost_per_infect, vaccine_price, tau, mu, lam
        Forwarded unchanged to ``get_tilde_ras``.

    Returns
    -------
    (delta_0_s, delta_s)
        The fitted intercept and the fitted slope per action component.
    """
    N = population_size
    M = trans_reduc_types
    L = vaccine_portion_num

    # Corner actions in the same order as the original nested loops.
    corner_actions = [np.asarray([yv, yr]) for yv in (0, L) for yr in (0, M)]
    ds_action = np.zeros((len(corner_actions), 2))
    ds_reward = np.zeros(len(corner_actions))
    for row, action in enumerate(corner_actions):
        ds_action[row,:] = action
        ds_reward[row] = get_tilde_ras(state, action, N, L, M, cR_multiplier,
                                       cost_per_infect, vaccine_price, tau, mu, lam)

    reg = LinearRegression().fit(ds_action, ds_reward)
    delta_0_s = reg.intercept_
    delta_s = reg.coef_
    return delta_0_s, delta_s

def get_ras(action, delta_s, delta_0_s):
    """Evaluate the linear reward model ``delta_0_s + sum_i delta_s[i] * action[i]``.

    ``delta_0_s`` may be a NumPy scalar or a plain Python number (the
    original required an object with a ``.copy()`` method); it is never
    modified.

    Parameters
    ----------
    action : sequence
        Action vector.
    delta_s : sequence
        Slope per action component; must have at least ``len(action)``
        entries.
    delta_0_s : number
        Intercept.

    Returns
    -------
    number
        The modelled reward.
    """
    ras = delta_0_s
    for i, component in enumerate(action):
        # Rebinding (not +=) keeps the caller's intercept untouched.
        ras = ras + delta_s[i]*component
    return ras

def get_fast_max_ras(state, population_size, trans_reduc_types, vac_limit, delta_s, delta_0_s, vaccine_portion_num = None):
    """Maximize the linear reward model over the four corner actions.

    The candidate actions are ``(yV, yR)`` with ``yV in {0, yV_max}`` and
    ``yR in {0, M}``.  ``yV_max`` is ``L`` when the number of susceptibles
    does not exceed ``vac_limit`` and ``int(vac_limit / x_S * L)`` otherwise.

    Parameters
    ----------
    state : sequence
        Proportions; only ``state[0]`` (susceptible) is used.
    population_size : int
        Population size N.
    trans_reduc_types : int
        Number of transmission-reduction levels M.
    vac_limit : number
        Maximum number of individuals that can be vaccinated.
    delta_s, delta_0_s
        Linear reward coefficients passed to ``get_ras``.
    vaccine_portion_num : int, optional
        Number of vaccination portions L.  When omitted, the module-level
        ``L`` is used, which is what the original did implicitly.

    Returns
    -------
    (numpy.ndarray, number)
        The best corner action (as floats) and its modelled reward.  Ties
        are resolved in favour of the first candidate.
    """
    if vaccine_portion_num is None:
        # Backward-compatible fallback to the global the original relied on.
        vaccine_portion_num = L
    portions = vaccine_portion_num
    M = trans_reduc_types

    xst = state[0]*population_size
    if xst <= vac_limit:
        yV_max = portions
    else:
        # np.int was removed in NumPy 1.24; it was an alias of the builtin int.
        yV_max = int(vac_limit/xst*portions)

    corner_actions = [np.asarray([yv, yr]) for yv in (0, yV_max) for yr in (0, M)]
    ds_action = np.zeros((len(corner_actions), 2))
    ds_reward = np.zeros(len(corner_actions))
    for row, action in enumerate(corner_actions):
        ds_action[row,:] = action
        ds_reward[row] = get_ras(action, delta_s, delta_0_s)

    max_idx = np.argmax(ds_reward)
    max_ras = np.max(ds_reward)
    return ds_action[max_idx], max_ras

# 3 Large Scale Experiment

In [320]:
import random
import numpy as np

niter = 5
T = 12 # time horizon
N = 1000 # population size
discretized_N = 10
M = 5 # types of transmission-reducing interventions
L = 5 # possible portions of susceptibles to vaccinate
lam = 0.9
vac_limit = 200
act_dim = 2
# np.int was removed in NumPy 1.24; it was an alias of the builtin int.
discretize_level = int(N/discretized_N)
num_states = (discretized_N+1)**3

# penalty coefficients
k = np.ones(num_states)*1000
K = np.ones((num_states, num_states))*1000

# define initial state
ps_init = 0.7
pe_init = 0.1
pi_init = 0.2
s_init = np.asarray([ps_init, pe_init, pi_init])

# define reward parameters
unit = 50000
cR_multiplier = 3 * unit 
cost_per_infect = 1 * unit
vaccine_price = 0.1 * unit

# define pas parameters
tau = 0.8
mu = 8

# build heuristic function: V0[idx] is the best one-step linearized reward
# over the corner actions at the grid state with flat index idx
V0 = np.zeros(num_states)
for ss in range(discretized_N+1):
    for se in range(discretized_N+1):
        for si in range(discretized_N+1):
            current_s_idx = ss*(discretized_N+1)**2 + se*(discretized_N+1) + si
            ps = ss*discretize_level/N
            pe = se*discretize_level/N
            pi = si*discretize_level/N
            s = np.asarray([ps,pe,pi])
            delta_0_s, delta_s = get_ras_coefficients(s, N, L, M, cR_multiplier, cost_per_infect, vaccine_price, tau, mu, lam)
            _, max_ras = get_fast_max_ras(s, N, M, vac_limit, delta_s, delta_0_s)
            V0[current_s_idx] = max_ras 

## 3.1 Transition probability = bar_tilde_pas

### 3.1.1 DRMDP DD1

In [None]:
# Simulate niter episodes of the DRMDP DD1 policy; next states are sampled
# from get_tilde_pas.
# Value table: stages 0..T-2 start from the heuristic V0, the terminal
# column T-1 stays zero.
V = np.zeros((num_states, T))
ps_table = np.zeros((niter, T))
pe_table = np.zeros((niter, T))
pi_table = np.zeros((niter, T))
a1_table = np.zeros((niter, T-1))
a2_table = np.zeros((niter, T-1))
for t in range(T-1):
    V[:,t] = V0
episode_rewards = np.zeros(niter)
stagewise_rewards = np.zeros((niter, T-1))
for i in range(niter):
    print('Episode %d begins' %i)
    total_reward = 0 
    s = s_init
    for t in range(T-1):
        rho_0_s, rho_s = get_bar_pas_coefficients(s, N, L, M, discretized_N, tau, mu)
        delta_0_s, delta_s = get_ras_coefficients(s, N, L, M, cR_multiplier, cost_per_infect, vaccine_price, tau, mu, lam)
        # cap the vaccination portion so at most vac_limit people are vaccinated
        # (np.int was removed in NumPy 1.24; the builtin int is equivalent)
        xst = s[0]*N
        if xst <= vac_limit:
            ub_a = np.asarray([L, M])
        else:
            ub_a = np.asarray([int(vac_limit/xst*L), M])
        objective, a = solve_bellman_formulation1(act_dim, num_states, lam, V[:,t+1], k, delta_s, rho_s, delta_0_s, rho_0_s, ub_a)
        # update value table
        ss = int(s[0]*N/discretize_level)
        se = int(s[1]*N/discretize_level)
        si = int(s[2]*N/discretize_level)
        current_s_idx = ss*(discretized_N+1)**2 + se*(discretized_N+1) + si
        V[current_s_idx,t] = objective
        ps_table[i][t] = s[0]
        pe_table[i][t] = s[1]
        pi_table[i][t] = s[2]
        a1_table[i][t] = a[0]
        a2_table[i][t] = a[1]
        # sample next state 
        pas = get_tilde_pas(s, a, N, L, M, discretized_N, tau, mu, 60)
        next_s_idx = random.choices(np.arange(num_states), pas)[0]

        # decode the flat index with integer arithmetic
        ss, se_si = divmod(int(next_s_idx), (discretized_N+1)**2)
        se, si = divmod(se_si, discretized_N+1)

        ps = ss*discretize_level/N
        pe = se*discretize_level/N
        pi = si*discretize_level/N

        next_s = np.asarray([ps, pe, pi])
        # compute reward
        reward = compute_actual_cost(a, s, next_s, N, L, M, cR_multiplier, cost_per_infect, vaccine_price)
        total_reward += (lam**t) * reward
        stagewise_rewards[i][t] = reward
        s = next_s
    ps_table[i][T-1] = s[0]
    pe_table[i][T-1] = s[1]
    pi_table[i][T-1] = s[2]
    print('Reward', total_reward)
    episode_rewards[i] = total_reward
    print('Episode %d ends' %i)
print('Episodes reward', episode_rewards)
print('Average reward', np.mean(episode_rewards))
# NOTE(review): the value printed as 'std' is half the standard deviation;
# kept as-is so existing outputs stay comparable -- confirm the intent.
print('std', np.std(episode_rewards) * 0.5)
print('Stagewise reward', stagewise_rewards)
print('PS: \n', ps_table)
print('PE: \n', pe_table)
print('PI: \n', pi_table)
print('A1: \n', a1_table)
print('A2: \n', a2_table)
print('Average ps', np.mean(ps_table, axis = 0))
print('Average pe', np.mean(pe_table, axis = 0))
print('Average pi', np.mean(pi_table, axis = 0))
print('Average a1', np.mean(a1_table, axis = 0))
print('Average a2', np.mean(a2_table, axis = 0))

### 3.1.2 Regular MDP

In [None]:
# Simulate niter episodes of the regular MDP policy; next states are sampled
# from get_tilde_pas.
# Value table: stages 0..T-2 start from the heuristic V0, the terminal
# column T-1 stays zero.
V = np.zeros((num_states, T))
ps_table = np.zeros((niter, T))
pe_table = np.zeros((niter, T))
pi_table = np.zeros((niter, T))
a1_table = np.zeros((niter, T-1))
a2_table = np.zeros((niter, T-1))
for t in range(T-1):
    V[:,t] = V0
episode_rewards = np.zeros(niter)
stagewise_rewards = np.zeros((niter, T-1))
for i in range(niter):
    print('Episode %d begins' %i)
    total_reward = 0 
    s = s_init
    for t in range(T-1):
        rho_0_s, rho_s = get_bar_pas_coefficients(s, N, L, M, discretized_N, tau, mu)
        delta_0_s, delta_s = get_ras_coefficients(s, N, L, M, cR_multiplier, cost_per_infect, vaccine_price, tau, mu, lam)
        # cap the vaccination portion so at most vac_limit people are vaccinated
        # (np.int was removed in NumPy 1.24; the builtin int is equivalent)
        xst = s[0]*N
        if xst <= vac_limit:
            ub_a = np.asarray([L, M])
        else:
            ub_a = np.asarray([int(vac_limit/xst*L), M])
        objective, a = solve_bellman_formulation3(act_dim, num_states, lam, V[:,t+1], delta_s, rho_s, delta_0_s, rho_0_s, ub_a)
        # update value table
        ss = int(s[0]*N/discretize_level)
        se = int(s[1]*N/discretize_level)
        si = int(s[2]*N/discretize_level)
        current_s_idx = ss*(discretized_N+1)**2 + se*(discretized_N+1) + si
        V[current_s_idx,t] = objective
        ps_table[i][t] = s[0]
        pe_table[i][t] = s[1]
        pi_table[i][t] = s[2]
        a1_table[i][t] = a[0]
        a2_table[i][t] = a[1]
        # sample next state 
        pas = get_tilde_pas(s, a, N, L, M, discretized_N, tau, mu, 70)
        next_s_idx = random.choices(np.arange(num_states), pas)[0]

        # decode the flat index with integer arithmetic
        ss, se_si = divmod(int(next_s_idx), (discretized_N+1)**2)
        se, si = divmod(se_si, discretized_N+1)

        ps = ss*discretize_level/N
        pe = se*discretize_level/N
        pi = si*discretize_level/N

        next_s = np.asarray([ps, pe, pi])
        # compute reward
        reward = compute_actual_cost(a, s, next_s, N, L, M, cR_multiplier, cost_per_infect, vaccine_price)
        total_reward += (lam**t) * reward
        stagewise_rewards[i][t] = reward
        s = next_s
    ps_table[i][T-1] = s[0]
    pe_table[i][T-1] = s[1]
    pi_table[i][T-1] = s[2]
    print('Reward', total_reward)
    episode_rewards[i] = total_reward
    print('Episode %d ends' %i)
print('Episodes reward', episode_rewards)
print('Average reward', np.mean(episode_rewards))
# NOTE(review): the value printed as 'std' is half the standard deviation;
# kept as-is so existing outputs stay comparable -- confirm the intent.
print('std', np.std(episode_rewards) * 0.5)
print('Stagewise reward', stagewise_rewards)
print('PS: \n', ps_table)
print('PE: \n', pe_table)
print('PI: \n', pi_table)
print('A1: \n', a1_table)
print('A2: \n', a2_table)
print('Average ps', np.mean(ps_table, axis = 0))
print('Average pe', np.mean(pe_table, axis = 0))
print('Average pi', np.mean(pi_table, axis = 0))
print('Average a1', np.mean(a1_table, axis = 0))
print('Average a2', np.mean(a2_table, axis = 0))

## 3.2 Transition probability far from bar_pas

### 3.2.1 DRMDP DD1

In [321]:
# Simulate niter episodes of the DRMDP DD1 policy; next states are sampled
# from the perturbed weights of get_worst_case_pas.
# Value table: stages 0..T-2 start from the heuristic V0, the terminal
# column T-1 stays zero.
V = np.zeros((num_states, T))
ps_table = np.zeros((niter, T))
pe_table = np.zeros((niter, T))
pi_table = np.zeros((niter, T))
a1_table = np.zeros((niter, T-1))
a2_table = np.zeros((niter, T-1))
for t in range(T-1):
    V[:,t] = V0
episode_rewards = np.zeros(niter)
stagewise_rewards = np.zeros((niter, T-1))
for i in range(niter):
    print('Episode %d begins' %i)
    total_reward = 0 
    s = s_init
    for t in range(T-1):
        rho_0_s, rho_s = get_bar_pas_coefficients(s, N, L, M, discretized_N, tau, mu)
        delta_0_s, delta_s = get_ras_coefficients(s, N, L, M, cR_multiplier, cost_per_infect, vaccine_price, tau, mu, lam)
        # cap the vaccination portion so at most vac_limit people are vaccinated
        # (np.int was removed in NumPy 1.24; the builtin int is equivalent)
        xst = s[0]*N
        if xst <= vac_limit:
            ub_a = np.asarray([L, M])
        else:
            ub_a = np.asarray([int(vac_limit/xst*L), M])
        objective, a = solve_bellman_formulation1(act_dim, num_states, lam, V[:,t+1], k, delta_s, rho_s, delta_0_s, rho_0_s, ub_a)
        # update value table
        ss = int(s[0]*N/discretize_level)
        se = int(s[1]*N/discretize_level)
        si = int(s[2]*N/discretize_level)
        current_s_idx = ss*(discretized_N+1)**2 + se*(discretized_N+1) + si
        V[current_s_idx,t] = objective
        ps_table[i][t] = s[0]
        pe_table[i][t] = s[1]
        pi_table[i][t] = s[2]
        a1_table[i][t] = a[0]
        a2_table[i][t] = a[1]
        # sample next state 
        pas = get_worst_case_pas(s, a, N, L, M, discretized_N, tau, mu, 60)
        next_s_idx = random.choices(np.arange(num_states), pas)[0]

        # decode the flat index with integer arithmetic
        ss, se_si = divmod(int(next_s_idx), (discretized_N+1)**2)
        se, si = divmod(se_si, discretized_N+1)

        ps = ss*discretize_level/N
        pe = se*discretize_level/N
        pi = si*discretize_level/N

        next_s = np.asarray([ps, pe, pi])
        # compute reward
        reward = compute_actual_cost(a, s, next_s, N, L, M, cR_multiplier, cost_per_infect, vaccine_price)
        total_reward += (lam**t) * reward
        stagewise_rewards[i][t] = reward
        s = next_s
    ps_table[i][T-1] = s[0]
    pe_table[i][T-1] = s[1]
    pi_table[i][T-1] = s[2]
    print('Reward', total_reward)
    episode_rewards[i] = total_reward
    print('Episode %d ends' %i)
print('Episodes reward', episode_rewards)
print('Average reward', np.mean(episode_rewards))
# NOTE(review): the value printed as 'std' is half the standard deviation;
# kept as-is so existing outputs stay comparable -- confirm the intent.
print('std', np.std(episode_rewards) * 0.5)
print('Stagewise reward', stagewise_rewards)
print('PS: \n', ps_table)
print('PE: \n', pe_table)
print('PI: \n', pi_table)
print('A1: \n', a1_table)
print('A2: \n', a2_table)
print('Average ps', np.mean(ps_table, axis = 0))
print('Average pe', np.mean(pe_table, axis = 0))
print('Average pi', np.mean(pi_table, axis = 0))
print('Average a1', np.mean(a1_table, axis = 0))
print('Average a2', np.mean(a2_table, axis = 0))

Episode 0 begins
Reward -33288000.0
Episode 0 ends
Episode 1 begins
Reward -36240450.0
Episode 1 ends
Episode 2 begins
Reward -22812500.0
Episode 2 ends
Episode 3 begins
Reward -30007500.0
Episode 3 ends
Episode 4 begins
Reward -32883000.0
Episode 4 ends
Episodes reward [-33288000. -36240450. -22812500. -30007500. -32883000.]
Average reward -31046290.0
std 2283116.8156929687
Stagewise reward [[ -6450000. -11550000. -11750000.  -5000000.  -5000000.        -0.
         -0.        -0.        -0.        -0.        -0.]
 [ -6450000. -11550000. -11750000.  -5000000.  -5000000.  -5000000.
         -0.        -0.        -0.        -0.        -0.]
 [-11450000.  -6550000.  -6750000.        -0.        -0.        -0.
         -0.        -0.        -0.        -0.        -0.]
 [ -6450000. -11550000. -11750000.  -5000000.        -0.        -0.
         -0.        -0.        -0.        -0.        -0.]
 [ -6450000. -11550000.  -6750000. -10000000.  -5000000.        -0.
         -0.        -0.        -0

### 3.2.2 Regular MDP

In [None]:
# Simulate niter episodes of the regular MDP policy; next states are sampled
# from the perturbed weights of get_worst_case_pas.
# Value table: stages 0..T-2 start from the heuristic V0, the terminal
# column T-1 stays zero.
V = np.zeros((num_states, T))
ps_table = np.zeros((niter, T))
pe_table = np.zeros((niter, T))
pi_table = np.zeros((niter, T))
a1_table = np.zeros((niter, T-1))
a2_table = np.zeros((niter, T-1))
for t in range(T-1):
    V[:,t] = V0
episode_rewards = np.zeros(niter)
stagewise_rewards = np.zeros((niter, T-1))
for i in range(niter):
    print('Episode %d begins' %i)
    total_reward = 0 
    s = s_init
    for t in range(T-1):
        rho_0_s, rho_s = get_bar_pas_coefficients(s, N, L, M, discretized_N, tau, mu)
        delta_0_s, delta_s = get_ras_coefficients(s, N, L, M, cR_multiplier, cost_per_infect, vaccine_price, tau, mu, lam)
        # cap the vaccination portion so at most vac_limit people are vaccinated
        # (np.int was removed in NumPy 1.24; the builtin int is equivalent)
        xst = s[0]*N
        if xst <= vac_limit:
            ub_a = np.asarray([L, M])
        else:
            ub_a = np.asarray([int(vac_limit/xst*L), M])
        objective, a = solve_bellman_formulation3(act_dim, num_states, lam, V[:,t+1], delta_s, rho_s, delta_0_s, rho_0_s, ub_a)
        # update value table
        ss = int(s[0]*N/discretize_level)
        se = int(s[1]*N/discretize_level)
        si = int(s[2]*N/discretize_level)
        current_s_idx = ss*(discretized_N+1)**2 + se*(discretized_N+1) + si
        V[current_s_idx,t] = objective
        ps_table[i][t] = s[0]
        pe_table[i][t] = s[1]
        pi_table[i][t] = s[2]
        a1_table[i][t] = a[0]
        a2_table[i][t] = a[1]
        # sample next state 
        pas = get_worst_case_pas(s, a, N, L, M, discretized_N, tau, mu, 60)
        next_s_idx = random.choices(np.arange(num_states), pas)[0]

        # decode the flat index with integer arithmetic
        ss, se_si = divmod(int(next_s_idx), (discretized_N+1)**2)
        se, si = divmod(se_si, discretized_N+1)

        ps = ss*discretize_level/N
        pe = se*discretize_level/N
        pi = si*discretize_level/N

        next_s = np.asarray([ps, pe, pi])
        # compute reward
        reward = compute_actual_cost(a, s, next_s, N, L, M, cR_multiplier, cost_per_infect, vaccine_price)
        total_reward += (lam**t) * reward
        stagewise_rewards[i][t] = reward
        s = next_s
    ps_table[i][T-1] = s[0]
    pe_table[i][T-1] = s[1]
    pi_table[i][T-1] = s[2]
    print('Reward', total_reward)
    episode_rewards[i] = total_reward
    print('Episode %d ends' %i)
print('Episodes reward', episode_rewards)
print('Average reward', np.mean(episode_rewards))
# NOTE(review): the value printed as 'std' is half the standard deviation;
# kept as-is so existing outputs stay comparable -- confirm the intent.
print('std', np.std(episode_rewards) * 0.5)
print('Stagewise reward', stagewise_rewards)
print('PS: \n', ps_table)
print('PE: \n', pe_table)
print('PI: \n', pi_table)
print('A1: \n', a1_table)
print('A2: \n', a2_table)
print('Average ps', np.mean(ps_table, axis = 0))
print('Average pe', np.mean(pe_table, axis = 0))
print('Average pi', np.mean(pi_table, axis = 0))
print('Average a1', np.mean(a1_table, axis = 0))
print('Average a2', np.mean(a2_table, axis = 0))