In [1]:
import pickle
import numpy as np
import pandas as pd
from scipy.optimize import minimize

from simulation import Simulator, CustomerGenerator
from hazard_models import ExponentialHazard
from utility_learner import ProjectedVolumeLearner, diam
from degradation_learner import DegradationLearner

from utils import unit_ball_rejection_sample, correct_signs
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

import logging
# Show INFO-level progress messages (e.g. the simulator's per-customer lines).
logging.basicConfig(level=logging.INFO)
# Gurobi prints its parameter/licence banner to stdout and also emits the same
# lines as INFO records on the "gurobipy" logger, so each line shows up twice in
# the cell output. Keep that logger at WARNING to drop the duplicates.
logging.getLogger("gurobipy").setLevel(logging.WARNING)

# Print floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

In [2]:
# --- 2. Define Sampling Functions ---
# def context_sampler() -> np.ndarray:
#     """Samples a customer's context vector from a uniform distribution."""
#     return np.random.uniform(low=0.0, high=1.0, size=D)

def context_sampler() -> np.ndarray:
    """Draw one customer context vector whose components are all non-negative.

    Calls ``unit_ball_rejection_sample(D)`` and returns the element-wise
    absolute value of the result. ``D`` is the module-level context dimension
    and is looked up when the function is called, not when it is defined.

    NOTE(review): presumably the helper samples uniformly from the unit ball;
    after ``np.abs`` the sample is folded into the non-negative orthant, so it
    no longer covers the whole ball — confirm this is intended.
    """
    raw_point = unit_ball_rejection_sample(D)
    return np.abs(raw_point)

def rental_sampler(mean_duration: float = 10.0) -> float:
    """Sample a customer's desired rental duration from an exponential distribution.

    Args:
        mean_duration: Scale (i.e. mean) of the exponential distribution.
            Defaults to 10.0, the value that used to be hard-coded, so callers
            that invoke the sampler with no arguments behave exactly as before.

    Returns:
        A single non-negative draw from ``np.random.exponential``; the draw
        consumes NumPy's global random state.
    """
    return np.random.exponential(scale=mean_duration)

def interarrival_sampler(mean_interarrival: float = 5.0) -> float:
    """Sample the time until the next customer arrives.

    Args:
        mean_interarrival: Scale (i.e. mean) of the exponential distribution.
            Defaults to 5.0, the value that used to be hard-coded, so callers
            that invoke the sampler with no arguments behave exactly as before.

    Returns:
        A single non-negative draw from ``np.random.exponential``; the draw
        consumes NumPy's global random state.
    """
    return np.random.exponential(scale=mean_interarrival)

In [3]:
# --- 1. Simulation Configuration ---
D = 4                                  # Dimension of context vectors
LAMBDA_VAL = 0.001                     # Baseline hazard constant
NUM_CUSTOMERS = 100000                 # Total number of customers to simulate, i.e. T

# Set a random seed for reproducibility. This has to run before UTILITY_TRUE is
# drawn below (presumably the sampler helper uses NumPy's global RNG — confirm).
np.random.seed(41)

# Ground truth vectors
THETA_TRUE = np.array([0.5, 0.2, 0.1, 0.3])    # For degradation
# THETA_TRUE is written out by hand, so fail fast if it drifts out of sync with D.
assert THETA_TRUE.shape == (D,), (
    f"THETA_TRUE has shape {THETA_TRUE.shape}, expected ({D},)"
)
UTILITY_TRUE = context_sampler()  # For customer's willingness to pay

# --- Machine's Pricing Vector 'r' ---
# This is a fallback pricing vector, when we don't feed u_hat to calculate_price
PRICING_R = np.zeros(D)

In [4]:
# Hazard model handed to the simulator as `usage_hazard_model`.
usage_exp_hazard_model = ExponentialHazard(lambda_val=LAMBDA_VAL)
# spontaneous_exp_hazard_model = None # ExponentialHazard(lambda_val=0.01)

# Customer stream built from the three sampler functions of this notebook.
customer_gen = CustomerGenerator(
    d=D,
    context_sampler=context_sampler,
    rental_sampler=rental_sampler,
    interarrival_sampler=interarrival_sampler,
)

# Intentionally empty: no centroid options are overridden. Options that were
# tried before and are currently switched off:
#   'num_samples': 2000,
#   'thin': None,
#   'burn_in': 500 * D ** 2,
#   'tol': 1e-4,
#   'rho_target': 0.01
centroid_params = {}


def termination_rule(diameter):
    """Example custom termination rule: true once `diameter` is below 0.0005."""
    return diameter < 0.0005


projected_volume_learner = ProjectedVolumeLearner(
    T=NUM_CUSTOMERS,
    d=D,
    centroid_params=centroid_params,
    incentive_constant=1.1,
    termination_rule=termination_rule,
)

# Cost structure and optimiser settings, passed on as `mdp_params`.
mdp_params = {
    'replacement_cost': 1.5,     # Cost to replace the machine
    'failure_cost': 0.75,        # Additional penalty for in-service failure
    'holding_cost_rate': 0.02,   # Cost per unit of idle time
    'gamma': 0.99,               # Discount factor
    'learning_rate': 1e-3,       # Learning rate for the Adam optimizer
    'target_update_freq': 10,    # How often to update the target network (in iterations)
}

# Settings used for each policy (re)training.
training_hyperparams = {
    'num_iterations': 50,        # Number of training iterations per policy update
    'dataset_size': 500_000,     # Number of transitions to generate for the offline dataset
    'batch_size': 2048,          # Batch size for training
}

# NOTE(review): both 'tau' and 'epsilon' are supplied; which of them is read
# for type 'softmax' is decided in the policy code — confirm.
policy_params = {
    'type': 'softmax',
    'tau': 1.0,
    'epsilon': 0.1,
}

# Instantiate the Simulator with the new parameters
simulator = Simulator(
    d=D,
    T=NUM_CUSTOMERS,

    theta_true=THETA_TRUE,
    utility_true=UTILITY_TRUE,
    pricing_r=PRICING_R,

    usage_hazard_model=usage_exp_hazard_model,
    customer_generator=customer_gen,
    projected_volume_learner=projected_volume_learner,  # The learner configured above

    mdp_params=mdp_params,
    training_hyperparams=training_hyperparams,
    policy_params=policy_params,
    policy_update_threshold=5,
    time_normalize=True,
)

In [5]:
# simulator.projected_volume_learner.is_terminated = True
# The run can take many hours and may be stopped by hand. Building the history
# frames in `finally` keeps whatever was recorded before an interrupt or error
# available for the analysis cells; the exception itself still propagates.
simulation_data = None  # stays None when the run does not finish
try:
    simulation_data = simulator.run(num_customers=NUM_CUSTOMERS)
finally:
    degradation_df = pd.DataFrame(simulator.degradation_history)
    simulation_df = pd.DataFrame(simulator.history)

INFO:root:Starting simulation for 100000 customers...
  0%|          | 0/100000 [00:00<?, ?it/s]

Set parameter Username


INFO:gurobipy:Set parameter Username


Set parameter LicenseID to value 2651514


INFO:gurobipy:Set parameter LicenseID to value 2651514


Academic license - for non-commercial use only - expires 2026-04-14


INFO:gurobipy:Academic license - for non-commercial use only - expires 2026-04-14
INFO:root:Customer 1: Diameter: 1.0041
  0%|          | 1/100000 [00:02<82:20:35,  2.96s/it]INFO:root:Customer 2: Diameter: 0.7731
  0%|          | 2/100000 [00:06<88:01:30,  3.17s/it]INFO:root:Customer 3: Diameter: 0.6616
  0%|          | 3/100000 [00:09<94:00:59,  3.38s/it]INFO:root:Customer 4: Diameter: 0.4420
  0%|          | 4/100000 [00:13<100:14:34,  3.61s/it]INFO:root:Customer 5: Diameter: 0.3997
  0%|          | 5/100000 [00:18<106:17:59,  3.83s/it]INFO:root:Customer 6: Diameter: 0.6467
  0%|          | 6/100000 [00:22<112:59:09,  4.07s/it]INFO:root:Customer 7: Diameter: 0.4222
  0%|          | 7/100000 [00:27<120:52:14,  4.35s/it]INFO:root:Customer 8: Diameter: 0.3064
  0%|          | 8/100000 [00:32<130:04:44,  4.68s/it]INFO:root:Customer 9: Diameter: 0.2615
  0%|          | 9/100000 [00:38<139:45:03,  5.03s/it]INFO:root:Customer 10: Diameter: 0.4127
  0%|          | 10/100000 [00:44<149:52:13,

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  1.79176D+00    |proj g|=  1.00000D+00

At iterate    1    f=  6.46413D-01    |proj g|=  3.89924D-01

At iterate    2    f=  3.52838D-01    |proj g|=  0.00000D+00

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4      2      4      5     0     2   0.000D+00   3.528D-01
  F =  0.35283828155771779     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            
Using device: mps
Generating 500000 experience samples...


100%|██████████| 500000/500000 [01:00<00:00, 8241.52it/s]



Starting FQI training loop...


INFO:root:Iter 1/50 | Loss: 1.2684 | Avg Q-Value: 0.67
INFO:root:Iter 2/50 | Loss: 1.2527 | Avg Q-Value: 0.68
INFO:root:Iter 3/50 | Loss: 1.2499 | Avg Q-Value: 0.70
INFO:root:Iter 4/50 | Loss: 1.2465 | Avg Q-Value: 0.74
INFO:root:Iter 5/50 | Loss: 1.2425 | Avg Q-Value: 0.74
INFO:root:Iter 6/50 | Loss: 1.2553 | Avg Q-Value: 0.74
INFO:root:Iter 7/50 | Loss: 1.2400 | Avg Q-Value: 0.77
INFO:root:Iter 8/50 | Loss: 1.2377 | Avg Q-Value: 0.79
INFO:root:Iter 9/50 | Loss: 1.2357 | Avg Q-Value: 0.75
INFO:root:Iter 10/50 | Loss: 1.2405 | Avg Q-Value: 0.79
INFO:root:Iter 11/50 | Loss: 1.3282 | Avg Q-Value: 1.06
INFO:root:Iter 12/50 | Loss: 1.3261 | Avg Q-Value: 1.10
INFO:root:Iter 13/50 | Loss: 1.3249 | Avg Q-Value: 1.10
INFO:root:Iter 14/50 | Loss: 1.3236 | Avg Q-Value: 1.13
INFO:root:Iter 15/50 | Loss: 1.3237 | Avg Q-Value: 1.14
INFO:root:Iter 16/50 | Loss: 1.3248 | Avg Q-Value: 1.11
INFO:root:Iter 17/50 | Loss: 1.3230 | Avg Q-Value: 1.11
INFO:root:Iter 18/50 | Loss: 1.3262 | Avg Q-Value: 1.12
I


Training complete.
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  9.13130D+00    |proj g|=  1.00000D+00

At iterate    1    f=  7.08112D+00    |proj g|=  1.00000D+00

At iterate    2    f=  5.87814D+00    |proj g|=  6.40112D-01

At iterate    3    f=  5.08174D+00    |proj g|=  6.93673D-02

At iterate    4    f=  5.08010D+00    |proj g|=  3.02227D-02

At iterate    5    f=  5.07971D+00    |proj g|=  6.92841D-04

At iterate    6    f=  5.07971D+00    |proj g|=  7.14806D-06

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     T

100%|██████████| 500000/500000 [02:38<00:00, 3149.45it/s]



Starting FQI training loop...


INFO:root:Iter 1/50 | Loss: 8.9280 | Avg Q-Value: 0.22
INFO:root:Iter 2/50 | Loss: 8.9126 | Avg Q-Value: 0.25
INFO:root:Iter 3/50 | Loss: 8.9084 | Avg Q-Value: 0.26
INFO:root:Iter 4/50 | Loss: 8.9035 | Avg Q-Value: 0.29
INFO:root:Iter 5/50 | Loss: 8.9026 | Avg Q-Value: 0.26
INFO:root:Iter 6/50 | Loss: 8.9035 | Avg Q-Value: 0.28
INFO:root:Iter 7/50 | Loss: 8.8980 | Avg Q-Value: 0.28
INFO:root:Iter 8/50 | Loss: 8.8968 | Avg Q-Value: 0.27
INFO:root:Iter 9/50 | Loss: 9.5348 | Avg Q-Value: 0.29
INFO:root:Iter 10/50 | Loss: 8.8939 | Avg Q-Value: 0.29
INFO:root:Iter 11/50 | Loss: 8.9481 | Avg Q-Value: 0.32
INFO:root:Iter 12/50 | Loss: 8.9458 | Avg Q-Value: 0.34
INFO:root:Iter 13/50 | Loss: 8.9438 | Avg Q-Value: 0.32
INFO:root:Iter 14/50 | Loss: 8.9435 | Avg Q-Value: 0.34
INFO:root:Iter 15/50 | Loss: 8.9420 | Avg Q-Value: 0.34
INFO:root:Iter 16/50 | Loss: 8.9408 | Avg Q-Value: 0.32
INFO:root:Iter 17/50 | Loss: 8.9398 | Avg Q-Value: 0.34
INFO:root:Iter 18/50 | Loss: 8.9404 | Avg Q-Value: 0.34
I


Training complete.


  1%|          | 674/100000 [2:34:18<106:03:16,  3.84s/it]INFO:root:Updating optimal policy...
INFO:root:Theta updated. New theta_hat: [0.356 0.    0.44  0.636]


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  2.82477D+01    |proj g|=  1.00000D+00

At iterate    1    f=  2.51120D+01    |proj g|=  4.79646D-01

At iterate    2    f=  2.49098D+01    |proj g|=  6.12250D-01

At iterate    3    f=  2.45175D+01    |proj g|=  4.99813D-01

At iterate    4    f=  2.44816D+01    |proj g|=  2.08562D-01

At iterate    5    f=  2.44763D+01    |proj g|=  9.41274D-02

At iterate    6    f=  2.44753D+01    |proj g|=  4.46977D-02

At iterate    7    f=  2.44744D+01    |proj g|=  2.61079D-02

At iterate    8    f=  2.44743D+01    |proj g|=  7.52670D-03

At iterate    9    f=  2.44743D+01    |proj g|=  8.63931D-04

At iterate   10    f=  2.44743D+01    |proj g|=  3.20102D-04

At iterate   11    f=  2.44743D+01    |proj g|=  7.56513D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function ev

100%|██████████| 500000/500000 [03:20<00:00, 2488.46it/s]]



Starting FQI training loop...


INFO:root:Iter 1/50 | Loss: 0.8726 | Avg Q-Value: 0.11
INFO:root:Iter 2/50 | Loss: 0.8507 | Avg Q-Value: 0.12
INFO:root:Iter 3/50 | Loss: 0.8443 | Avg Q-Value: 0.12
INFO:root:Iter 4/50 | Loss: 0.8409 | Avg Q-Value: 0.15
INFO:root:Iter 5/50 | Loss: 0.8435 | Avg Q-Value: 0.15
INFO:root:Iter 6/50 | Loss: 0.8368 | Avg Q-Value: 0.16
INFO:root:Iter 7/50 | Loss: 0.8362 | Avg Q-Value: 0.18
INFO:root:Iter 8/50 | Loss: 0.8340 | Avg Q-Value: 0.16
INFO:root:Iter 9/50 | Loss: 0.8332 | Avg Q-Value: 0.18
INFO:root:Iter 10/50 | Loss: 0.8309 | Avg Q-Value: 0.18
INFO:root:Iter 11/50 | Loss: 0.9138 | Avg Q-Value: 0.27
INFO:root:Iter 12/50 | Loss: 0.9135 | Avg Q-Value: 0.24
INFO:root:Iter 13/50 | Loss: 0.9113 | Avg Q-Value: 0.26
INFO:root:Iter 14/50 | Loss: 0.9102 | Avg Q-Value: 0.28
INFO:root:Iter 15/50 | Loss: 0.9099 | Avg Q-Value: 0.25
INFO:root:Iter 16/50 | Loss: 0.9090 | Avg Q-Value: 0.27
INFO:root:Iter 17/50 | Loss: 0.9079 | Avg Q-Value: 0.30
INFO:root:Iter 18/50 | Loss: 0.9087 | Avg Q-Value: 0.31
I


Training complete.


  1%|          | 972/100000 [3:49:22<221:03:22,  8.04s/it]INFO:root:Updating optimal policy...
INFO:root:Theta updated. New theta_hat: [0.369 0.    0.479 0.661]


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  4.64943D+01    |proj g|=  1.00000D+00

At iterate    1    f=  4.06775D+01    |proj g|=  4.21109D-01

At iterate    2    f=  4.05697D+01    |proj g|=  5.57755D-01

At iterate    3    f=  4.02489D+01    |proj g|=  5.80325D-01

At iterate    4    f=  4.02064D+01    |proj g|=  3.17210D-01

At iterate    5    f=  4.01946D+01    |proj g|=  7.09349D-02

At iterate    6    f=  4.01941D+01    |proj g|=  6.74101D-02

At iterate    7    f=  4.01934D+01    |proj g|=  1.73089D-02

At iterate    8    f=  4.01934D+01    |proj g|=  9.18220D-04

At iterate    9    f=  4.01934D+01    |proj g|=  6.88940D-05

At iterate   10    f=  4.01934D+01    |proj g|=  2.11172D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cau

100%|██████████| 500000/500000 [08:16<00:00, 1006.58it/s]]



Starting FQI training loop...


INFO:root:Iter 1/50 | Loss: 1.2947 | Avg Q-Value: 0.07
INFO:root:Iter 2/50 | Loss: 1.2884 | Avg Q-Value: 0.06
INFO:root:Iter 3/50 | Loss: 1.4918 | Avg Q-Value: 0.06
INFO:root:Iter 4/50 | Loss: 1.2702 | Avg Q-Value: 0.07
INFO:root:Iter 5/50 | Loss: 1.2675 | Avg Q-Value: 0.10
INFO:root:Iter 6/50 | Loss: 1.2671 | Avg Q-Value: 0.07
INFO:root:Iter 7/50 | Loss: 1.2634 | Avg Q-Value: 0.10
INFO:root:Iter 8/50 | Loss: 1.2618 | Avg Q-Value: 0.11
INFO:root:Iter 9/50 | Loss: 1.2615 | Avg Q-Value: 0.08
INFO:root:Iter 10/50 | Loss: 1.2612 | Avg Q-Value: 0.10
INFO:root:Iter 11/50 | Loss: 1.3317 | Avg Q-Value: 0.18
INFO:root:Iter 12/50 | Loss: 1.3293 | Avg Q-Value: 0.16
INFO:root:Iter 13/50 | Loss: 1.3281 | Avg Q-Value: 0.17
INFO:root:Iter 14/50 | Loss: 1.3275 | Avg Q-Value: 0.17
INFO:root:Iter 15/50 | Loss: 1.3260 | Avg Q-Value: 0.21
INFO:root:Iter 16/50 | Loss: 1.3285 | Avg Q-Value: 0.20
INFO:root:Iter 17/50 | Loss: 1.3255 | Avg Q-Value: 0.19
INFO:root:Iter 18/50 | Loss: 1.3245 | Avg Q-Value: 0.20
I


Training complete.


  1%|▏         | 1267/100000 [5:08:55<269:13:02,  9.82s/it]INFO:root:Updating optimal policy...
INFO:root:Theta updated. New theta_hat: [0.225 0.    0.613 0.594]


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  6.72788D+01    |proj g|=  1.00000D+00

At iterate    1    f=  6.03604D+01    |proj g|=  4.55986D-01

At iterate    2    f=  6.02604D+01    |proj g|=  5.96495D-01

At iterate    3    f=  5.99737D+01    |proj g|=  4.87697D-01

At iterate    4    f=  5.99280D+01    |proj g|=  3.13049D-01

At iterate    5    f=  5.99212D+01    |proj g|=  3.32861D-02

At iterate    6    f=  5.99211D+01    |proj g|=  2.53782D-02

At iterate    7    f=  5.99210D+01    |proj g|=  1.60910D-03

At iterate    8    f=  5.99210D+01    |proj g|=  1.43592D-04

At iterate    9    f=  5.99210D+01    |proj g|=  2.78381D-06

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = nu

100%|██████████| 500000/500000 [06:11<00:00, 1346.29it/s]t]



Starting FQI training loop...


INFO:root:Iter 1/50 | Loss: 0.8905 | Avg Q-Value: 0.27
INFO:root:Iter 2/50 | Loss: 0.8850 | Avg Q-Value: 0.29
INFO:root:Iter 3/50 | Loss: 0.8719 | Avg Q-Value: 0.31
INFO:root:Iter 4/50 | Loss: 0.8690 | Avg Q-Value: 0.29
INFO:root:Iter 5/50 | Loss: 0.8672 | Avg Q-Value: 0.30
INFO:root:Iter 6/50 | Loss: 0.8654 | Avg Q-Value: 0.31
INFO:root:Iter 7/50 | Loss: 0.8650 | Avg Q-Value: 0.30
INFO:root:Iter 8/50 | Loss: 0.8668 | Avg Q-Value: 0.30
INFO:root:Iter 9/50 | Loss: 0.8689 | Avg Q-Value: 0.31
INFO:root:Iter 10/50 | Loss: 0.8599 | Avg Q-Value: 0.32
INFO:root:Iter 11/50 | Loss: 0.9444 | Avg Q-Value: 0.39
INFO:root:Iter 12/50 | Loss: 0.9430 | Avg Q-Value: 0.44
INFO:root:Iter 13/50 | Loss: 0.9435 | Avg Q-Value: 0.43
INFO:root:Iter 14/50 | Loss: 0.9426 | Avg Q-Value: 0.42
INFO:root:Iter 15/50 | Loss: 0.9404 | Avg Q-Value: 0.41
INFO:root:Iter 16/50 | Loss: 0.9390 | Avg Q-Value: 0.40
INFO:root:Iter 17/50 | Loss: 0.9379 | Avg Q-Value: 0.43
INFO:root:Iter 18/50 | Loss: 0.9374 | Avg Q-Value: 0.39
I

KeyboardInterrupt: 

## Training policy under perfect information

In [None]:
from policy import DPAgent

class PerfectDegradationLearner:
    """Degradation learner stand-in that simply returns known parameters.

    Nothing is fitted: theta is the vector supplied at construction and the
    cumulative baseline (and its inverse) are delegated to the supplied hazard
    model. In this notebook an instance is passed to ``DPAgent`` as its
    ``degradation_learner``.
    """

    def __init__(self, d, theta_true, hazard_model):
        """Store the inputs unchanged.

        Args:
            d: Context dimension (stored, not otherwise used by this class).
            theta_true: Parameter vector returned by ``get_theta``.
            hazard_model: Object providing ``Lambda_0(t)`` and
                ``Lambda_0_inverse(u)``.
        """
        self.d, self.theta_true = d, theta_true
        # Needed by cum_baseline / inverse_cum_baseline.
        self.hazard_model = hazard_model

    def get_theta(self):
        """Return the stored theta (the same object, not a copy)."""
        return self.theta_true

    def cum_baseline(self, t):
        """Return ``hazard_model.Lambda_0(t)``."""
        model = self.hazard_model
        return model.Lambda_0(t)

    def inverse_cum_baseline(self, u):
        """Return ``hazard_model.Lambda_0_inverse(u)``."""
        model = self.hazard_model
        return model.Lambda_0_inverse(u)
    
# Learner that returns the true theta and reuses the simulator's hazard model.
perfect_degradation_learner = PerfectDegradationLearner(
    d=D,
    theta_true=THETA_TRUE,
    hazard_model=usage_exp_hazard_model,
)

# Agent that is given the true utility vector instead of a learned estimate.
perfect_dpagent = DPAgent(
    d=D,
    u_hat=UTILITY_TRUE,
    degradation_learner=perfect_degradation_learner,
    customer_generator=customer_gen,
    params=mdp_params,
    time_normalize=True,
)

# Training settings for the perfect-information baseline.
perfect_training_hyperparams = {
    'num_iterations': 50,
    'dataset_size': 5_000_000,
    'batch_size': 1024,
}
perfect_dpagent.train(**perfect_training_hyperparams)

# Request the greedy policy from the trained agent.
greedy_policy_spec = {'type': 'greedy'}
perfect_policy = perfect_dpagent.get_policy(greedy_policy_spec)

Using device: mps
Generating 5000000 experience samples...


100%|██████████| 5000000/5000000 [00:42<00:00, 117220.35it/s]



Starting FQI training loop...


  0%|          | 0/50 [00:00<?, ?it/s]

### Debugging: Look at experience dataset

In [None]:
from policy import DPAgent

# Learned utility estimate next to the ground truth, for a visual comparison.
print(simulator.projected_volume_learner.get_estimate())
print(UTILITY_TRUE)

# Fresh agent wired to the simulator's current learners and settings.
dp_agent = DPAgent(
    d=simulator.d,
    u_hat=simulator.projected_volume_learner.get_estimate(),
    time_normalize=simulator.time_normalize,
    degradation_learner=simulator.degradation_learner,
    customer_generator=simulator.customer_generator,
    params=simulator.mdp_params,
)

experience_generator = dp_agent.experience_generator
dataset = experience_generator.generate(500)

# Each record unpacks into (state, action, reward, next_state).
for state, action, reward, next_state in dataset:
    print("State:", state.round(3))
    print("Action:", experience_generator.ACTION_MAP[action])
    print("Reward:", reward)
    print("Next State:", next_state.round(3))
    print("-----\n")

In [6]:
# Refit theta offline from the degradation records the simulator holds so far.
degradation_df = pd.DataFrame(simulator.degradation_history)

degradation_learner = DegradationLearner(d=D, initial_theta=np.zeros(D))
degradation_learner.fit(degradation_df)

# Last expression of the cell, so the fitted theta is displayed.
degradation_learner.get_theta()

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  6.72788D+01    |proj g|=  1.00000D+00

At iterate    1    f=  6.03604D+01    |proj g|=  4.55986D-01

At iterate    2    f=  6.02604D+01    |proj g|=  5.96495D-01

At iterate    3    f=  5.99737D+01    |proj g|=  4.87697D-01

At iterate    4    f=  5.99280D+01    |proj g|=  3.13049D-01

At iterate    5    f=  5.99212D+01    |proj g|=  3.32861D-02

At iterate    6    f=  5.99211D+01    |proj g|=  2.53782D-02

At iterate    7    f=  5.99210D+01    |proj g|=  1.60910D-03

At iterate    8    f=  5.99210D+01    |proj g|=  1.43592D-04

At iterate    9    f=  5.99210D+01    |proj g|=  2.78381D-06

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = nu

array([0.22545111, 0.00038501, 0.61250084, 0.59371561])

## Testing Policy

In [None]:
# Test Case 1: build one arrival state from recorded history and query the policy.
# NOTE(review): neither `df` nor `optimal_policy` is defined in any visible cell
# of this notebook, so this cell fails on a fresh kernel. Presumably `df` is one
# of the history frames (e.g. `simulation_df`) and `optimal_policy` a trained
# policy callable — confirm and bind both explicitly.
i = 50

X_i = df.loc[i, 'sum_of_contexts_after']
I_i = 3 # df.loc[i, '']
x_i = df.loc[i+1, 'customer_context']
T_i = df.loc[i+1, 'rental_duration']

# State vector as assembled here: X_i, then x_i, then the scalars T_i, I_i, 0.0.
# NOTE(review): the trailing 0.0 presumably marks an arrival (the departure
# test case uses 1.0 in that slot) — confirm against the policy's state encoding.
arrival_state = np.concatenate([
    X_i,
    x_i,
    [T_i, I_i, 0.0]
])
action_arrival = optimal_policy(arrival_state)
# Labels for the action ids expected at an arrival state.
action_map = {0: 'Give Max Acceptable Price', 1: 'Shutdown'}
print(f"Sample Arrival State. Optimal Action: {action_map[action_arrival]}")


In [None]:
# Test Case 2: Departure State
# Relies on X_i, x_i and I_i from the arrival-state cell and on `optimal_policy`,
# which is not defined in any visible cell (NOTE(review): confirm where it comes from).
# State vector as assembled here: X_i + 10 * x_i, then D zeros, then 0.0, I_i, 1.0.
# NOTE(review): the factor 10 is presumably a rental duration and the trailing
# 1.0 presumably marks a departure — confirm against the policy's state encoding.
departure_state = np.concatenate([
    X_i+x_i*10, 
    np.zeros(D), 
    [0.0, I_i, 1.0]
])
action_departure = optimal_policy(departure_state)
# Labels for the action ids expected at a departure state; this rebinds the
# `action_map` name used by the arrival-state cell.
action_map = {2: 'Replace Machine', 3: 'Do Not Replace'}
print(f"Sample Departure State. Optimal Action: {action_map[action_departure]}")
