In [1]:
import pickle
import numpy as np
import pandas as pd
from scipy.optimize import minimize

from simulation import Simulator, CustomerGenerator
from hazard_models import ExponentialHazard
from utility_learner import ProjectedVolumeLearner, diam
from degradation_learner import DegradationLearner

from utils import unit_ball_rejection_sample, correct_signs
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

import logging
logging.basicConfig(level=logging.INFO)

np.set_printoptions(suppress=True)

In [2]:
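# --- 2. Define Sampling Functions ---
# Disabled alternative (kept for reference): draw every context coordinate
# independently from a uniform distribution on [0, 1).
# def context_sampler() -> np.ndarray:
#     """Samples a customer's context vector from a uniform distribution."""
#     return np.random.uniform(low=0.0, high=1.0, size=D)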

def context_sampler() -> np.ndarray:
    """Draw one customer context vector with non-negative coordinates.

    A point of dimension ``D`` is requested from ``unit_ball_rejection_sample``
    and each coordinate is replaced by its absolute value, so the returned
    vector never contains a negative entry. Assuming the helper returns a
    point of the unit ball (as its name suggests -- TODO confirm), the result
    lies in the non-negative orthant of that ball, not anywhere in the ball.

    ``D`` is read from the notebook namespace at call time, so the
    configuration cell must have run before this function is called.
    """
    ball_point = unit_ball_rejection_sample(D)
    return np.abs(ball_point)

def rental_sampler(mean_duration: float = 10.0) -> float:
    """Sample a customer's desired rental duration.

    The duration is drawn from an exponential distribution using NumPy's
    global random state, so results are reproducible after ``np.random.seed``.

    Args:
        mean_duration: Scale (mean) of the exponential distribution. Defaults
            to 10.0, the value that was previously hard-coded, so existing
            zero-argument callers are unaffected.

    Returns:
        A non-negative rental duration.
    """
    return np.random.exponential(scale=mean_duration)

def interarrival_sampler(mean_gap: float = 5.0) -> float:
    """Sample the time until the next customer arrives.

    The gap is drawn from an exponential distribution using NumPy's global
    random state, so results are reproducible after ``np.random.seed``.

    Args:
        mean_gap: Scale (mean) of the exponential distribution. Defaults to
            5.0, the value that was previously hard-coded, so existing
            zero-argument callers are unaffected.

    Returns:
        A non-negative inter-arrival time.
    """
    return np.random.exponential(scale=mean_gap)

In [None]:
# --- 1. Simulation Configuration ---
D = 4                                  # Dimension of context vectors
LAMBDA_VAL = 0.001                     # Baseline hazard constant
# Total number of customers to simulate, i.e. T.
# NOTE(review): the output saved with this notebook reports
# "Starting simulation for 100000 customers", i.e. it was produced with a
# different value than the one below -- re-run before trusting the logs.
NUM_CUSTOMERS = 10_000

# Set a random seed for reproducibility. This must stay ahead of the
# context_sampler() call below, which draws from NumPy's global random state.
np.random.seed(41)

# Ground truth vectors
THETA_TRUE = np.array([0.5, 0.2, 0.1, 0.3])  # For degradation
# THETA_TRUE is written out by hand, so fail fast if it falls out of sync
# with D instead of surfacing as a shape error deep inside the simulator.
assert THETA_TRUE.shape == (D,), (
    f"THETA_TRUE has shape {THETA_TRUE.shape}, expected ({D},)"
)
UTILITY_TRUE = context_sampler()  # For customer's willingness to pay

# --- Machine's Pricing Vector 'r' ---
# This is a fallback pricing vector, when we don't feed u_hat to calculate_price
PRICING_R = np.zeros(D)

In [4]:
# Hazard model for usage-driven failures, parameterised by the baseline
# hazard constant from the configuration cell.
usage_exp_hazard_model = ExponentialHazard(
    lambda_val=LAMBDA_VAL,
)
# A separate spontaneous-failure hazard model is currently disabled
# (previous candidate: ExponentialHazard(lambda_val=0.01)).

# Customer stream: wires the three sampling callbacks defined above into the
# generator, together with the context dimension.
customer_gen = CustomerGenerator(
    interarrival_sampler=interarrival_sampler,
    rental_sampler=rental_sampler,
    context_sampler=context_sampler,
    d=D,
)

# Every candidate override below is commented out, so an empty dict is handed
# to the learner (which presumably falls back to its own defaults -- confirm
# in utility_learner).
centroid_params = {
    # 'num_samples': 2000,
    # 'thin': None,
    # 'burn_in': 500 * D ** 2,
    # 'tol': 1e-4,
    # 'rho_target': 0.01
}


def termination_rule(diameter: float, tolerance: float = 0.0005) -> bool:
    """Example custom termination rule for the projected-volume learner.

    Args:
        diameter: Diameter value supplied by the learner.
        tolerance: Threshold below which the rule fires. Defaults to 0.0005,
            the value previously hard-coded in the lambda.

    Returns:
        True when ``diameter`` is strictly smaller than ``tolerance``.
    """
    return diameter < tolerance


projected_volume_learner = ProjectedVolumeLearner(
    T=NUM_CUSTOMERS,
    d=D,
    centroid_params=centroid_params,
    incentive_constant=1.1,
    termination_rule=termination_rule,
)

# Cost structure and optimiser settings for the replacement MDP.
mdp_params = dict(
    replacement_cost=1.5,      # cost to replace the machine
    failure_cost=0.75,         # additional penalty for an in-service failure
    holding_cost_rate=0.02,    # cost per unit of idle time
    gamma=0.99,                # discount factor
    learning_rate=1e-3,        # learning rate for the Adam optimizer
    target_update_freq=10,     # iterations between target-network updates
)

# Size of each offline training run triggered by a policy update.
training_hyperparams = dict(
    num_iterations=20,         # training iterations per policy update
    dataset_size=100000,       # transitions generated for the offline dataset
    batch_size=2048,           # batch size for training
)

# Action-selection settings forwarded to the simulator.
policy_params = dict(
    type='softmax',
    tau=1.0,
    epsilon=0.1,
)

# Assemble the simulator from the objects configured above.
simulator = Simulator(
    # Problem size
    T=NUM_CUSTOMERS,
    d=D,

    # Ground-truth vectors and the fallback pricing vector
    utility_true=UTILITY_TRUE,
    theta_true=THETA_TRUE,
    pricing_r=PRICING_R,

    # Pluggable components
    customer_generator=customer_gen,
    usage_hazard_model=usage_exp_hazard_model,
    # The learner instance built earlier in this cell (custom termination
    # rule and incentive constant), not a library default.
    projected_volume_learner=projected_volume_learner,

    # Policy learning settings
    policy_params=policy_params,
    training_hyperparams=training_hyperparams,
    mdp_params=mdp_params,
    time_normalize=True,
    policy_update_threshold=5,
)

In [None]:
# Optional debugging toggle, left disabled: marks the utility learner as
# already terminated before the run starts.
# simulator.projected_volume_learner.is_terminated = True

# Run the full simulation, then snapshot the simulator's recorded histories
# as DataFrames for the analysis cells that follow.
simulation_data = simulator.run(num_customers=NUM_CUSTOMERS)

simulation_df = pd.DataFrame(simulator.history)
degradation_df = pd.DataFrame(simulator.degradation_history)

INFO:root:Starting simulation for 100000 customers...
  0%|          | 0/100000 [00:00<?, ?it/s]

Set parameter Username


INFO:gurobipy:Set parameter Username


Set parameter LicenseID to value 2651514


INFO:gurobipy:Set parameter LicenseID to value 2651514


Academic license - for non-commercial use only - expires 2026-04-14


INFO:gurobipy:Academic license - for non-commercial use only - expires 2026-04-14
INFO:root:Customer 1: Diameter: 1.0041
  0%|          | 1/100000 [00:02<80:15:44,  2.89s/it]INFO:root:Customer 2: Diameter: 0.7731
  0%|          | 2/100000 [00:06<87:17:55,  3.14s/it]INFO:root:Customer 3: Diameter: 0.6616
  0%|          | 3/100000 [00:09<92:59:12,  3.35s/it]INFO:root:Customer 4: Diameter: 0.4420
  0%|          | 4/100000 [00:13<99:57:15,  3.60s/it]INFO:root:Customer 5: Diameter: 0.3997
  0%|          | 5/100000 [00:17<105:28:09,  3.80s/it]INFO:root:Customer 6: Diameter: 0.6467
  0%|          | 6/100000 [00:22<111:47:55,  4.02s/it]INFO:root:Customer 7: Diameter: 0.4222
  0%|          | 7/100000 [00:27<120:09:34,  4.33s/it]INFO:root:Customer 8: Diameter: 0.3064
  0%|          | 8/100000 [00:32<128:35:11,  4.63s/it]INFO:root:Customer 9: Diameter: 0.2615
  0%|          | 9/100000 [00:38<137:30:51,  4.95s/it]INFO:root:Customer 10: Diameter: 0.4127
  0%|          | 10/100000 [00:44<146:53:46, 

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  1.79176D+00    |proj g|=  1.00000D+00

At iterate    1    f=  6.46413D-01    |proj g|=  3.89924D-01

At iterate    2    f=  3.52838D-01    |proj g|=  0.00000D+00

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4      2      4      5     0     2   0.000D+00   3.528D-01
  F =  0.35283828155771779     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            
Using device: mps
Generating 100000 experience samples...


100%|██████████| 100000/100000 [00:13<00:00, 7559.74it/s]



Starting FQI training loop...


INFO:root:Iter 1/20 | Loss: 0.6782 | Avg Q-Value: 0.25
INFO:root:Iter 2/20 | Loss: 0.6482 | Avg Q-Value: 0.34
INFO:root:Iter 3/20 | Loss: 0.6461 | Avg Q-Value: 0.35
INFO:root:Iter 4/20 | Loss: 0.6365 | Avg Q-Value: 0.35
INFO:root:Iter 5/20 | Loss: 0.6345 | Avg Q-Value: 0.35
INFO:root:Iter 6/20 | Loss: 0.6324 | Avg Q-Value: 0.36
INFO:root:Iter 7/20 | Loss: 0.6307 | Avg Q-Value: 0.35
INFO:root:Iter 8/20 | Loss: 0.6296 | Avg Q-Value: 0.36
INFO:root:Iter 9/20 | Loss: 0.6287 | Avg Q-Value: 0.36
INFO:root:Iter 10/20 | Loss: 0.6283 | Avg Q-Value: 0.36
INFO:root:Iter 11/20 | Loss: 0.6678 | Avg Q-Value: 0.48
INFO:root:Iter 12/20 | Loss: 0.6605 | Avg Q-Value: 0.49
INFO:root:Iter 13/20 | Loss: 0.6579 | Avg Q-Value: 0.50
INFO:root:Iter 14/20 | Loss: 0.6557 | Avg Q-Value: 0.50
INFO:root:Iter 15/20 | Loss: 0.6548 | Avg Q-Value: 0.51
INFO:root:Iter 16/20 | Loss: 0.6552 | Avg Q-Value: 0.49
INFO:root:Iter 17/20 | Loss: 0.6540 | Avg Q-Value: 0.51
INFO:root:Iter 18/20 | Loss: 0.6546 | Avg Q-Value: 0.50
I


Training complete.


  0%|          | 44/100000 [13:06<3329:48:22, 119.93s/it]INFO:root:Updating optimal policy...
INFO:root:Theta updated. New theta_hat: [0.661 0.    0.481 0.659]


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  1.16097D+01    |proj g|=  1.00000D+00

At iterate    1    f=  9.36125D+00    |proj g|=  3.55166D-01

At iterate    2    f=  9.34258D+00    |proj g|=  2.62706D-01

At iterate    3    f=  9.28888D+00    |proj g|=  2.18294D-01

At iterate    4    f=  9.26291D+00    |proj g|=  1.53639D-01

At iterate    5    f=  9.25245D+00    |proj g|=  8.51296D-02

At iterate    6    f=  9.25108D+00    |proj g|=  6.00015D-02

At iterate    7    f=  9.24955D+00    |proj g|=  1.32192D-02

At iterate    8    f=  9.24940D+00    |proj g|=  5.53577D-03

At iterate    9    f=  9.24938D+00    |proj g|=  4.64912D-05

At iterate   10    f=  9.24938D+00    |proj g|=  6.84895D-07

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cau

100%|██████████| 100000/100000 [00:18<00:00, 5523.00it/s]



Starting FQI training loop...


INFO:root:Iter 1/20 | Loss: 0.6979 | Avg Q-Value: 0.30
INFO:root:Iter 2/20 | Loss: 0.6675 | Avg Q-Value: 0.38
INFO:root:Iter 3/20 | Loss: 0.6581 | Avg Q-Value: 0.37
INFO:root:Iter 4/20 | Loss: 0.6544 | Avg Q-Value: 0.41
INFO:root:Iter 5/20 | Loss: 0.6524 | Avg Q-Value: 0.39
INFO:root:Iter 6/20 | Loss: 0.6499 | Avg Q-Value: 0.41
INFO:root:Iter 7/20 | Loss: 0.6477 | Avg Q-Value: 0.42
INFO:root:Iter 8/20 | Loss: 0.6462 | Avg Q-Value: 0.39
INFO:root:Iter 9/20 | Loss: 0.6449 | Avg Q-Value: 0.44
INFO:root:Iter 10/20 | Loss: 0.6427 | Avg Q-Value: 0.40
INFO:root:Iter 11/20 | Loss: 0.6654 | Avg Q-Value: 0.49
INFO:root:Iter 12/20 | Loss: 0.6570 | Avg Q-Value: 0.50
INFO:root:Iter 13/20 | Loss: 0.6523 | Avg Q-Value: 0.49
INFO:root:Iter 14/20 | Loss: 0.6517 | Avg Q-Value: 0.50
INFO:root:Iter 15/20 | Loss: 0.6537 | Avg Q-Value: 0.50
INFO:root:Iter 16/20 | Loss: 0.6531 | Avg Q-Value: 0.50
INFO:root:Iter 17/20 | Loss: 0.6500 | Avg Q-Value: 0.50
INFO:root:Iter 18/20 | Loss: 0.6493 | Avg Q-Value: 0.50
I


Training complete.
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  2.48235D+01    |proj g|=  1.00000D+00

At iterate    1    f=  2.38661D+01    |proj g|=  8.15025D-01

At iterate    2    f=  2.33524D+01    |proj g|=  6.86677D-01

At iterate    3    f=  2.28520D+01    |proj g|=  3.64684D-01

At iterate    4    f=  2.26763D+01    |proj g|=  2.37167D-02


INFO:root:Theta updated. New theta_hat: [0.927 0.    0.    0.212]



At iterate    5    f=  2.26762D+01    |proj g|=  3.42459D-03

At iterate    6    f=  2.26762D+01    |proj g|=  1.07736D-04

At iterate    7    f=  2.26762D+01    |proj g|=  2.89402D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4      7      9     12     0     2   2.894D-05   2.268D+01
  F =   22.676240730385409     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
Using device: mps
Generating 100000 experience samples...


100%|██████████| 100000/100000 [00:26<00:00, 3729.76it/s]



Starting FQI training loop...


INFO:root:Iter 1/20 | Loss: 1.3405 | Avg Q-Value: 0.16
INFO:root:Iter 2/20 | Loss: 1.3256 | Avg Q-Value: 0.19
INFO:root:Iter 3/20 | Loss: 1.3226 | Avg Q-Value: 0.20
INFO:root:Iter 4/20 | Loss: 1.3197 | Avg Q-Value: 0.20
INFO:root:Iter 5/20 | Loss: 1.3162 | Avg Q-Value: 0.19
INFO:root:Iter 6/20 | Loss: 1.3250 | Avg Q-Value: 0.21
INFO:root:Iter 7/20 | Loss: 1.3128 | Avg Q-Value: 0.21
INFO:root:Iter 8/20 | Loss: 1.3117 | Avg Q-Value: 0.22
INFO:root:Iter 9/20 | Loss: 1.3101 | Avg Q-Value: 0.22
INFO:root:Iter 10/20 | Loss: 1.3090 | Avg Q-Value: 0.22
INFO:root:Iter 11/20 | Loss: 1.3601 | Avg Q-Value: 0.29
INFO:root:Iter 12/20 | Loss: 1.3559 | Avg Q-Value: 0.31
INFO:root:Iter 13/20 | Loss: 1.3553 | Avg Q-Value: 0.31
INFO:root:Iter 14/20 | Loss: 1.3560 | Avg Q-Value: 0.30
INFO:root:Iter 15/20 | Loss: 1.3542 | Avg Q-Value: 0.32
INFO:root:Iter 16/20 | Loss: 1.3537 | Avg Q-Value: 0.30
INFO:root:Iter 17/20 | Loss: 1.3521 | Avg Q-Value: 0.31
INFO:root:Iter 18/20 | Loss: 1.3512 | Avg Q-Value: 0.33
I


Training complete.


  1%|          | 545/100000 [25:27<31:48:48,  1.15s/it]INFO:root:Updating optimal policy...
INFO:root:Theta updated. New theta_hat: [1.    0.    0.276 0.   ]


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  4.50397D+01    |proj g|=  1.00000D+00

At iterate    1    f=  4.31572D+01    |proj g|=  7.59225D-01

At iterate    2    f=  4.20775D+01    |proj g|=  6.84759D-01

At iterate    3    f=  4.12364D+01    |proj g|=  2.32084D-01

At iterate    4    f=  4.11235D+01    |proj g|=  2.20837D-01

At iterate    5    f=  4.11137D+01    |proj g|=  2.09318D-02

At iterate    6    f=  4.11137D+01    |proj g|=  3.16701D-04

At iterate    7    f=  4.11137D+01    |proj g|=  1.56667D-07

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function 

100%|██████████| 100000/100000 [01:07<00:00, 1488.56it/s]



Starting FQI training loop...


INFO:root:Iter 1/20 | Loss: 0.8507 | Avg Q-Value: 0.52
INFO:root:Iter 2/20 | Loss: 0.7695 | Avg Q-Value: 0.59
INFO:root:Iter 3/20 | Loss: 0.7525 | Avg Q-Value: 0.62
INFO:root:Iter 4/20 | Loss: 0.7501 | Avg Q-Value: 0.64
INFO:root:Iter 5/20 | Loss: 0.7475 | Avg Q-Value: 0.65
INFO:root:Iter 6/20 | Loss: 0.7451 | Avg Q-Value: 0.65
INFO:root:Iter 7/20 | Loss: 0.7488 | Avg Q-Value: 0.64
INFO:root:Iter 8/20 | Loss: 0.7428 | Avg Q-Value: 0.64
INFO:root:Iter 9/20 | Loss: 0.7405 | Avg Q-Value: 0.64
INFO:root:Iter 10/20 | Loss: 0.7393 | Avg Q-Value: 0.64
INFO:root:Iter 11/20 | Loss: 0.8196 | Avg Q-Value: 0.88
INFO:root:Iter 12/20 | Loss: 0.7854 | Avg Q-Value: 0.89
INFO:root:Iter 13/20 | Loss: 0.7843 | Avg Q-Value: 0.90
INFO:root:Iter 14/20 | Loss: 0.7830 | Avg Q-Value: 0.90
INFO:root:Iter 15/20 | Loss: 0.7827 | Avg Q-Value: 0.92
INFO:root:Iter 16/20 | Loss: 0.7815 | Avg Q-Value: 0.89
INFO:root:Iter 17/20 | Loss: 0.7810 | Avg Q-Value: 0.89
INFO:root:Iter 18/20 | Loss: 0.7804 | Avg Q-Value: 0.91
I


Training complete.
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  6.00777D+01    |proj g|=  1.00000D+00

At iterate    1    f=  5.76177D+01    |proj g|=  8.40421D-01

At iterate    2    f=  5.73319D+01    |proj g|=  7.97268D-01

At iterate    3    f=  5.69951D+01    |proj g|=  2.51084D-01

At iterate    4    f=  5.69873D+01    |proj g|=  1.81863D-01

At iterate    5    f=  5.69790D+01    |proj g|=  5.16988D-02

At iterate    6    f=  5.69789D+01    |proj g|=  3.39361D-02

At iterate    7    f=  5.69788D+01    |proj g|=  2.65179D-05


INFO:root:Theta updated. New theta_hat: [0.248 0.    0.377 0.123]



At iterate    8    f=  5.69788D+01    |proj g|=  5.32747D-06

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4      8     10     12     0     1   5.327D-06   5.698D+01
  F =   56.978830189602213     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            
Using device: mps
Generating 100000 experience samples...


100%|██████████| 100000/100000 [00:46<00:00, 2162.02it/s]



Starting FQI training loop...


INFO:root:Iter 1/20 | Loss: 0.6865 | Avg Q-Value: 0.09
INFO:root:Iter 2/20 | Loss: 0.6562 | Avg Q-Value: 0.23
INFO:root:Iter 3/20 | Loss: 0.6476 | Avg Q-Value: 0.23
INFO:root:Iter 4/20 | Loss: 0.6433 | Avg Q-Value: 0.23
INFO:root:Iter 5/20 | Loss: 0.6438 | Avg Q-Value: 0.24
INFO:root:Iter 6/20 | Loss: 0.6390 | Avg Q-Value: 0.24
INFO:root:Iter 7/20 | Loss: 0.6370 | Avg Q-Value: 0.24
INFO:root:Iter 8/20 | Loss: 0.6354 | Avg Q-Value: 0.25
INFO:root:Iter 9/20 | Loss: 0.6330 | Avg Q-Value: 0.25
INFO:root:Iter 10/20 | Loss: 0.6323 | Avg Q-Value: 0.25
INFO:root:Iter 11/20 | Loss: 0.6779 | Avg Q-Value: 0.27
INFO:root:Iter 12/20 | Loss: 0.6715 | Avg Q-Value: 0.28
INFO:root:Iter 13/20 | Loss: 0.6706 | Avg Q-Value: 0.28
INFO:root:Iter 14/20 | Loss: 0.6712 | Avg Q-Value: 0.28
INFO:root:Iter 15/20 | Loss: 0.6700 | Avg Q-Value: 0.29
INFO:root:Iter 16/20 | Loss: 0.6694 | Avg Q-Value: 0.29
INFO:root:Iter 17/20 | Loss: 0.6694 | Avg Q-Value: 0.30
INFO:root:Iter 18/20 | Loss: 0.6686 | Avg Q-Value: 0.29
I


Training complete.


  1%|          | 1135/100000 [38:55<13:29:26,  2.04it/s]INFO:root:Updating optimal policy...
INFO:root:Theta updated. New theta_hat: [0.328 0.083 0.261 0.   ]


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  8.90901D+01    |proj g|=  1.00000D+00

At iterate    1    f=  8.65801D+01    |proj g|=  8.48168D-01

At iterate    2    f=  8.64058D+01    |proj g|=  7.90375D-01

At iterate    3    f=  8.61877D+01    |proj g|=  2.78820D-01

At iterate    4    f=  8.61701D+01    |proj g|=  2.76096D-01

At iterate    5    f=  8.61683D+01    |proj g|=  2.84744D-02

At iterate    6    f=  8.61682D+01    |proj g|=  1.90188D-02

At iterate    7    f=  8.61682D+01    |proj g|=  1.94429D-02

At iterate    8    f=  8.61682D+01    |proj g|=  7.50174D-03

At iterate    9    f=  8.61682D+01    |proj g|=  8.83329D-04

At iterate   10    f=  8.61682D+01    |proj g|=  6.37528D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cau

100%|██████████| 100000/100000 [00:41<00:00, 2384.81it/s]



Starting FQI training loop...


INFO:root:Iter 1/20 | Loss: 1.4747 | Avg Q-Value: 0.38
INFO:root:Iter 2/20 | Loss: 1.3974 | Avg Q-Value: 0.39
INFO:root:Iter 3/20 | Loss: 1.3891 | Avg Q-Value: 0.39
INFO:root:Iter 4/20 | Loss: 1.3859 | Avg Q-Value: 0.40
INFO:root:Iter 5/20 | Loss: 1.3832 | Avg Q-Value: 0.42
INFO:root:Iter 6/20 | Loss: 1.3814 | Avg Q-Value: 0.42
INFO:root:Iter 7/20 | Loss: 1.3904 | Avg Q-Value: 0.42
INFO:root:Iter 8/20 | Loss: 1.3764 | Avg Q-Value: 0.43
INFO:root:Iter 9/20 | Loss: 1.3754 | Avg Q-Value: 0.44
INFO:root:Iter 10/20 | Loss: 1.3745 | Avg Q-Value: 0.44
INFO:root:Iter 11/20 | Loss: 1.4594 | Avg Q-Value: 0.70
INFO:root:Iter 12/20 | Loss: 1.4359 | Avg Q-Value: 0.69
INFO:root:Iter 13/20 | Loss: 1.4187 | Avg Q-Value: 0.70
INFO:root:Iter 14/20 | Loss: 1.4189 | Avg Q-Value: 0.70
INFO:root:Iter 15/20 | Loss: 1.4173 | Avg Q-Value: 0.72
INFO:root:Iter 16/20 | Loss: 1.4233 | Avg Q-Value: 0.71
INFO:root:Iter 17/20 | Loss: 1.4165 | Avg Q-Value: 0.72
INFO:root:Iter 18/20 | Loss: 1.4161 | Avg Q-Value: 0.73
I


Training complete.
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  1.07139D+02    |proj g|=  1.00000D+00

At iterate    1    f=  1.04281D+02    |proj g|=  8.51480D-01

At iterate    2    f=  1.04118D+02    |proj g|=  8.27755D-01

At iterate    3    f=  1.03804D+02    |proj g|=  1.20318D-01

At iterate    4    f=  1.03801D+02    |proj g|=  4.41988D-02


INFO:root:Theta updated. New theta_hat: [0.314 0.    0.252 0.049]



At iterate    5    f=  1.03801D+02    |proj g|=  2.61328D-02

At iterate    6    f=  1.03801D+02    |proj g|=  1.01988D-02

At iterate    7    f=  1.03801D+02    |proj g|=  3.64673D-03

At iterate    8    f=  1.03801D+02    |proj g|=  7.89233D-04

At iterate    9    f=  1.03801D+02    |proj g|=  2.86479D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4      9     11     13     0     1   2.865D-05   1.038D+02
  F =   103.80112495301478     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
Using device: mps
Generating 100000 experience samples...


100%|██████████| 100000/100000 [00:39<00:00, 2550.86it/s]



Starting FQI training loop...


INFO:root:Iter 1/20 | Loss: 1.2458 | Avg Q-Value: 0.15
INFO:root:Iter 2/20 | Loss: 1.1903 | Avg Q-Value: 0.18
INFO:root:Iter 3/20 | Loss: 1.1852 | Avg Q-Value: 0.18
INFO:root:Iter 4/20 | Loss: 1.1791 | Avg Q-Value: 0.18
INFO:root:Iter 5/20 | Loss: 1.1762 | Avg Q-Value: 0.19
INFO:root:Iter 6/20 | Loss: 1.1746 | Avg Q-Value: 0.19
INFO:root:Iter 7/20 | Loss: 1.1717 | Avg Q-Value: 0.19
INFO:root:Iter 8/20 | Loss: 1.1704 | Avg Q-Value: 0.20
INFO:root:Iter 9/20 | Loss: 1.1704 | Avg Q-Value: 0.20
INFO:root:Iter 10/20 | Loss: 1.1690 | Avg Q-Value: 0.20
INFO:root:Iter 11/20 | Loss: 1.2099 | Avg Q-Value: 0.31
INFO:root:Iter 12/20 | Loss: 1.2012 | Avg Q-Value: 0.29
INFO:root:Iter 13/20 | Loss: 1.1987 | Avg Q-Value: 0.30
INFO:root:Iter 14/20 | Loss: 1.1988 | Avg Q-Value: 0.30
INFO:root:Iter 15/20 | Loss: 1.1973 | Avg Q-Value: 0.31
INFO:root:Iter 16/20 | Loss: 1.1965 | Avg Q-Value: 0.31
INFO:root:Iter 17/20 | Loss: 1.1957 | Avg Q-Value: 0.30
INFO:root:Iter 18/20 | Loss: 1.1968 | Avg Q-Value: 0.31
I


Training complete.
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  1.28760D+02    |proj g|=  1.00000D+00

At iterate    1    f=  1.23927D+02    |proj g|=  8.13053D-01

At iterate    2    f=  1.23728D+02    |proj g|=  7.71072D-01

At iterate    3    f=  1.23258D+02    |proj g|=  1.75276D-01


INFO:root:Theta updated. New theta_hat: [0.425 0.    0.192 0.111]



At iterate    4    f=  1.23255D+02    |proj g|=  6.49286D-02

At iterate    5    f=  1.23255D+02    |proj g|=  8.68651D-03

At iterate    6    f=  1.23255D+02    |proj g|=  2.79244D-03

At iterate    7    f=  1.23255D+02    |proj g|=  3.34567D-05

At iterate    8    f=  1.23255D+02    |proj g|=  5.42252D-07

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4      8     10     12     0     1   5.423D-07   1.233D+02
  F =   123.25478802509691     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            
Using device: mps
Generating 100000 experience samples...


100%|██████████| 100000/100000 [00:49<00:00, 2007.02it/s]



Starting FQI training loop...


INFO:root:Iter 1/20 | Loss: 2.7509 | Avg Q-Value: 0.21
INFO:root:Iter 2/20 | Loss: 2.7137 | Avg Q-Value: 0.25
INFO:root:Iter 3/20 | Loss: 2.7083 | Avg Q-Value: 0.26
INFO:root:Iter 4/20 | Loss: 2.7042 | Avg Q-Value: 0.27
INFO:root:Iter 5/20 | Loss: 2.7023 | Avg Q-Value: 0.27
INFO:root:Iter 6/20 | Loss: 2.7000 | Avg Q-Value: 0.27
INFO:root:Iter 7/20 | Loss: 2.6998 | Avg Q-Value: 0.27
INFO:root:Iter 8/20 | Loss: 2.6973 | Avg Q-Value: 0.28
INFO:root:Iter 9/20 | Loss: 2.6962 | Avg Q-Value: 0.28
INFO:root:Iter 10/20 | Loss: 2.6959 | Avg Q-Value: 0.28
INFO:root:Iter 11/20 | Loss: 2.7630 | Avg Q-Value: 0.47
INFO:root:Iter 12/20 | Loss: 2.7440 | Avg Q-Value: 0.45
INFO:root:Iter 13/20 | Loss: 2.7431 | Avg Q-Value: 0.45
INFO:root:Iter 14/20 | Loss: 2.7415 | Avg Q-Value: 0.44
INFO:root:Iter 15/20 | Loss: 2.7421 | Avg Q-Value: 0.44
INFO:root:Iter 16/20 | Loss: 2.7401 | Avg Q-Value: 0.44
INFO:root:Iter 17/20 | Loss: 2.7402 | Avg Q-Value: 0.46
INFO:root:Iter 18/20 | Loss: 2.7388 | Avg Q-Value: 0.45
I


Training complete.


  2%|▏         | 1703/100000 [58:24<21:15:23,  1.28it/s]INFO:root:Updating optimal policy...
INFO:root:Theta updated. New theta_hat: [0.339 0.045 0.304 0.21 ]


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  1.53725D+02    |proj g|=  1.00000D+00

At iterate    1    f=  1.46060D+02    |proj g|=  7.77721D-01

At iterate    2    f=  1.45935D+02    |proj g|=  7.47300D-01

At iterate    3    f=  1.45711D+02    |proj g|=  1.99034D-01

At iterate    4    f=  1.45707D+02    |proj g|=  1.02092D-01

At iterate    5    f=  1.45705D+02    |proj g|=  2.00206D-02

At iterate    6    f=  1.45704D+02    |proj g|=  1.06815D-03

At iterate    7    f=  1.45704D+02    |proj g|=  9.88849D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function 

100%|██████████| 100000/100000 [00:57<00:00, 1741.59it/s]



Starting FQI training loop...


INFO:root:Iter 1/20 | Loss: 1.6054 | Avg Q-Value: 0.10
INFO:root:Iter 2/20 | Loss: 1.5720 | Avg Q-Value: 0.13
INFO:root:Iter 3/20 | Loss: 1.5603 | Avg Q-Value: 0.13
INFO:root:Iter 4/20 | Loss: 1.5578 | Avg Q-Value: 0.13
INFO:root:Iter 5/20 | Loss: 1.5568 | Avg Q-Value: 0.15
INFO:root:Iter 6/20 | Loss: 1.5539 | Avg Q-Value: 0.15
INFO:root:Iter 7/20 | Loss: 1.6269 | Avg Q-Value: 0.15
INFO:root:Iter 8/20 | Loss: 1.5513 | Avg Q-Value: 0.16
INFO:root:Iter 9/20 | Loss: 1.5504 | Avg Q-Value: 0.15
INFO:root:Iter 10/20 | Loss: 1.5516 | Avg Q-Value: 0.15
INFO:root:Iter 11/20 | Loss: 1.5861 | Avg Q-Value: 0.24
INFO:root:Iter 12/20 | Loss: 1.5824 | Avg Q-Value: 0.24
INFO:root:Iter 13/20 | Loss: 1.5804 | Avg Q-Value: 0.24
INFO:root:Iter 14/20 | Loss: 1.5795 | Avg Q-Value: 0.26
INFO:root:Iter 15/20 | Loss: 1.5788 | Avg Q-Value: 0.25
INFO:root:Iter 16/20 | Loss: 1.5782 | Avg Q-Value: 0.24
INFO:root:Iter 17/20 | Loss: 1.5777 | Avg Q-Value: 0.25
INFO:root:Iter 18/20 | Loss: 1.5773 | Avg Q-Value: 0.25
I


Training complete.


  2%|▏         | 1920/100000 [1:05:07<26:17:10,  1.04it/s]INFO:root:Updating optimal policy...
INFO:root:Theta updated. New theta_hat: [0.382 0.    0.033 0.413]


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  1.78995D+02    |proj g|=  1.00000D+00

At iterate    1    f=  1.71835D+02    |proj g|=  7.99610D-01

At iterate    2    f=  1.71637D+02    |proj g|=  7.71251D-01

At iterate    3    f=  1.71152D+02    |proj g|=  3.78093D-01

At iterate    4    f=  1.71062D+02    |proj g|=  3.93648D-01

At iterate    5    f=  1.71056D+02    |proj g|=  4.77031D-02

At iterate    6    f=  1.71056D+02    |proj g|=  5.50552D-03

At iterate    7    f=  1.71056D+02    |proj g|=  1.98585D-03

At iterate    8    f=  1.71056D+02    |proj g|=  7.82437D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg 

100%|██████████| 100000/100000 [01:12<00:00, 1376.24it/s]]



Starting FQI training loop...


INFO:root:Iter 1/20 | Loss: 1.3200 | Avg Q-Value: 0.09
INFO:root:Iter 2/20 | Loss: 1.2737 | Avg Q-Value: 0.11
INFO:root:Iter 3/20 | Loss: 1.2592 | Avg Q-Value: 0.13
INFO:root:Iter 4/20 | Loss: 1.2565 | Avg Q-Value: 0.15
INFO:root:Iter 5/20 | Loss: 1.2522 | Avg Q-Value: 0.16
INFO:root:Iter 6/20 | Loss: 1.2494 | Avg Q-Value: 0.15
INFO:root:Iter 7/20 | Loss: 1.2479 | Avg Q-Value: 0.16
INFO:root:Iter 8/20 | Loss: 1.2459 | Avg Q-Value: 0.15
INFO:root:Iter 9/20 | Loss: 1.2458 | Avg Q-Value: 0.16
INFO:root:Iter 10/20 | Loss: 1.2444 | Avg Q-Value: 0.17
INFO:root:Iter 11/20 | Loss: 1.2909 | Avg Q-Value: 0.21
INFO:root:Iter 12/20 | Loss: 1.2878 | Avg Q-Value: 0.22
INFO:root:Iter 13/20 | Loss: 1.2844 | Avg Q-Value: 0.21
INFO:root:Iter 14/20 | Loss: 1.2841 | Avg Q-Value: 0.23
INFO:root:Iter 15/20 | Loss: 1.2826 | Avg Q-Value: 0.22
INFO:root:Iter 16/20 | Loss: 1.2858 | Avg Q-Value: 0.20
INFO:root:Iter 17/20 | Loss: 1.2816 | Avg Q-Value: 0.22
INFO:root:Iter 18/20 | Loss: 1.2842 | Avg Q-Value: 0.22
I


Training complete.
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  2.06725D+02    |proj g|=  1.00000D+00

At iterate    1    f=  1.97933D+02    |proj g|=  7.81979D-01

At iterate    2    f=  1.97666D+02    |proj g|=  7.53144D-01


INFO:root:Theta updated. New theta_hat: [0.443 0.019 0.    0.426]



At iterate    3    f=  1.97008D+02    |proj g|=  4.36345D-01

At iterate    4    f=  1.96911D+02    |proj g|=  3.94109D-01

At iterate    5    f=  1.96908D+02    |proj g|=  5.94570D-02

At iterate    6    f=  1.96908D+02    |proj g|=  4.58906D-02

At iterate    7    f=  1.96908D+02    |proj g|=  2.45532D-02

At iterate    8    f=  1.96908D+02    |proj g|=  1.68986D-03

At iterate    9    f=  1.96908D+02    |proj g|=  3.07451D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4      9     11     13     0     1   3.075D-04   1.969D+02
  F =   196.90812129749932     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH   

100%|██████████| 100000/100000 [00:55<00:00, 1813.25it/s]



Starting FQI training loop...


INFO:root:Iter 1/20 | Loss: 0.9754 | Avg Q-Value: 0.37
INFO:root:Iter 2/20 | Loss: 0.9643 | Avg Q-Value: 0.40
INFO:root:Iter 3/20 | Loss: 0.9526 | Avg Q-Value: 0.38
INFO:root:Iter 4/20 | Loss: 0.9482 | Avg Q-Value: 0.39
INFO:root:Iter 5/20 | Loss: 0.9484 | Avg Q-Value: 0.39
INFO:root:Iter 6/20 | Loss: 0.9443 | Avg Q-Value: 0.41
INFO:root:Iter 7/20 | Loss: 0.9431 | Avg Q-Value: 0.40
INFO:root:Iter 8/20 | Loss: 0.9417 | Avg Q-Value: 0.42
INFO:root:Iter 9/20 | Loss: 0.9401 | Avg Q-Value: 0.43
INFO:root:Iter 10/20 | Loss: 0.9375 | Avg Q-Value: 0.43
INFO:root:Iter 11/20 | Loss: 0.9951 | Avg Q-Value: 0.63
INFO:root:Iter 12/20 | Loss: 0.9893 | Avg Q-Value: 0.61
INFO:root:Iter 13/20 | Loss: 0.9898 | Avg Q-Value: 0.62
INFO:root:Iter 14/20 | Loss: 0.9892 | Avg Q-Value: 0.58
INFO:root:Iter 15/20 | Loss: 0.9872 | Avg Q-Value: 0.60
INFO:root:Iter 16/20 | Loss: 0.9875 | Avg Q-Value: 0.60
INFO:root:Iter 17/20 | Loss: 0.9883 | Avg Q-Value: 0.61
INFO:root:Iter 18/20 | Loss: 0.9869 | Avg Q-Value: 0.62
I


Training complete.
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  2.28385D+02    |proj g|=  1.00000D+00


INFO:root:Theta updated. New theta_hat: [0.378 0.    0.099 0.44 ]



At iterate    1    f=  2.19034D+02    |proj g|=  7.83030D-01

At iterate    2    f=  2.18764D+02    |proj g|=  7.52839D-01

At iterate    3    f=  2.18186D+02    |proj g|=  3.96154D-01

At iterate    4    f=  2.18098D+02    |proj g|=  3.94225D-01

At iterate    5    f=  2.18093D+02    |proj g|=  4.64710D-02

At iterate    6    f=  2.18093D+02    |proj g|=  2.23353D-02

At iterate    7    f=  2.18093D+02    |proj g|=  1.46480D-02

At iterate    8    f=  2.18093D+02    |proj g|=  7.19213D-04

At iterate    9    f=  2.18093D+02    |proj g|=  1.46252D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4      9     11   

100%|██████████| 100000/100000 [00:43<00:00, 2324.74it/s]]



Starting FQI training loop...


INFO:root:Iter 1/20 | Loss: 1.0325 | Avg Q-Value: 0.07
INFO:root:Iter 2/20 | Loss: 1.0011 | Avg Q-Value: 0.08
INFO:root:Iter 3/20 | Loss: 0.9917 | Avg Q-Value: 0.10
INFO:root:Iter 4/20 | Loss: 0.9873 | Avg Q-Value: 0.10
INFO:root:Iter 5/20 | Loss: 0.9844 | Avg Q-Value: 0.11
INFO:root:Iter 6/20 | Loss: 0.9829 | Avg Q-Value: 0.11
INFO:root:Iter 7/20 | Loss: 0.9798 | Avg Q-Value: 0.10
INFO:root:Iter 8/20 | Loss: 0.9804 | Avg Q-Value: 0.09
INFO:root:Iter 9/20 | Loss: 0.9774 | Avg Q-Value: 0.11
INFO:root:Iter 10/20 | Loss: 0.9778 | Avg Q-Value: 0.09
INFO:root:Iter 11/20 | Loss: 1.0202 | Avg Q-Value: 0.18
INFO:root:Iter 12/20 | Loss: 1.0062 | Avg Q-Value: 0.19
INFO:root:Iter 13/20 | Loss: 1.0050 | Avg Q-Value: 0.18
INFO:root:Iter 14/20 | Loss: 1.0027 | Avg Q-Value: 0.19
INFO:root:Iter 15/20 | Loss: 1.0033 | Avg Q-Value: 0.19
INFO:root:Iter 16/20 | Loss: 1.0020 | Avg Q-Value: 0.20
INFO:root:Iter 17/20 | Loss: 1.0014 | Avg Q-Value: 0.20
INFO:root:Iter 18/20 | Loss: 1.0010 | Avg Q-Value: 0.20
I


Training complete.
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  2.51931D+02    |proj g|=  1.00000D+00

At iterate    1    f=  2.40276D+02    |proj g|=  7.63071D-01

At iterate    2    f=  2.39954D+02    |proj g|=  7.38734D-01

At iterate    3    f=  2.39039D+02    |proj g|=  3.35566D-01

At iterate    4    f=  2.38977D+02    |proj g|=  4.59784D-01

At iterate    5    f=  2.38974D+02    |proj g|=  4.78971D-02

At iterate    6    f=  2.38974D+02    |proj g|=  1.42330D-02

At iterate    7    f=  2.38974D+02    |proj g|=  1.34370D-02

At iterate    8    f=  2.38974D+02    |proj g|=  2.17896D-03


INFO:root:Theta updated. New theta_hat: [0.285 0.    0.193 0.546]



At iterate    9    f=  2.38974D+02    |proj g|=  1.30842D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4      9     11     13     0     1   1.308D-04   2.390D+02
  F =   238.97420192824444     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
Using device: mps
Generating 100000 experience samples...


100%|██████████| 100000/100000 [00:52<00:00, 1905.92it/s]]



Starting FQI training loop...


INFO:root:Iter 1/20 | Loss: 1.0408 | Avg Q-Value: 0.11
INFO:root:Iter 2/20 | Loss: 0.9950 | Avg Q-Value: 0.16
INFO:root:Iter 3/20 | Loss: 0.9914 | Avg Q-Value: 0.17
INFO:root:Iter 4/20 | Loss: 0.9878 | Avg Q-Value: 0.18
INFO:root:Iter 5/20 | Loss: 0.9833 | Avg Q-Value: 0.18
INFO:root:Iter 6/20 | Loss: 0.9817 | Avg Q-Value: 0.18
INFO:root:Iter 7/20 | Loss: 0.9800 | Avg Q-Value: 0.19
INFO:root:Iter 8/20 | Loss: 0.9781 | Avg Q-Value: 0.19
INFO:root:Iter 9/20 | Loss: 0.9794 | Avg Q-Value: 0.19
INFO:root:Iter 10/20 | Loss: 0.9759 | Avg Q-Value: 0.20
INFO:root:Iter 11/20 | Loss: 1.0222 | Avg Q-Value: 0.27
INFO:root:Iter 12/20 | Loss: 1.0185 | Avg Q-Value: 0.27
INFO:root:Iter 13/20 | Loss: 1.0168 | Avg Q-Value: 0.27
INFO:root:Iter 14/20 | Loss: 1.0179 | Avg Q-Value: 0.27
INFO:root:Iter 15/20 | Loss: 1.0161 | Avg Q-Value: 0.26
INFO:root:Iter 16/20 | Loss: 1.0309 | Avg Q-Value: 0.28
INFO:root:Iter 17/20 | Loss: 1.0283 | Avg Q-Value: 0.27
INFO:root:Iter 18/20 | Loss: 1.0157 | Avg Q-Value: 0.25
I


Training complete.


  3%|▎         | 2832/100000 [1:31:50<14:18:38,  1.89it/s]INFO:root:Updating optimal policy...


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  2.87753D+02    |proj g|=  1.00000D+00

At iterate    1    f=  2.73550D+02    |proj g|=  7.47357D-01

At iterate    2    f=  2.73208D+02    |proj g|=  7.16345D-01

At iterate    3    f=  2.72304D+02    |proj g|=  3.90205D-01

At iterate    4    f=  2.72184D+02    |proj g|=  5.85857D-01

At iterate    5    f=  2.72162D+02    |proj g|=  1.18939D-01

At iterate    6    f=  2.72162D+02    |proj g|=  2.17085D-02


INFO:root:Theta updated. New theta_hat: [0.357 0.    0.148 0.57 ]



At iterate    7    f=  2.72162D+02    |proj g|=  2.22862D-02

At iterate    8    f=  2.72162D+02    |proj g|=  6.41833D-03

At iterate    9    f=  2.72162D+02    |proj g|=  4.44857D-04

At iterate   10    f=  2.72162D+02    |proj g|=  2.38962D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     10     12     13     0     1   2.390D-05   2.722D+02
  F =   272.16221307961740     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
Using device: mps
Generating 100000 experience samples...


100%|██████████| 100000/100000 [00:47<00:00, 2089.83it/s]]



Starting FQI training loop...


INFO:root:Iter 1/20 | Loss: 0.8299 | Avg Q-Value: 0.20
INFO:root:Iter 2/20 | Loss: 0.7952 | Avg Q-Value: 0.24
INFO:root:Iter 3/20 | Loss: 0.7892 | Avg Q-Value: 0.26
INFO:root:Iter 4/20 | Loss: 0.7850 | Avg Q-Value: 0.28
INFO:root:Iter 5/20 | Loss: 0.7831 | Avg Q-Value: 0.28
INFO:root:Iter 6/20 | Loss: 0.7802 | Avg Q-Value: 0.28
INFO:root:Iter 7/20 | Loss: 0.7792 | Avg Q-Value: 0.30
INFO:root:Iter 8/20 | Loss: 0.7844 | Avg Q-Value: 0.30
INFO:root:Iter 9/20 | Loss: 0.7751 | Avg Q-Value: 0.29
INFO:root:Iter 10/20 | Loss: 0.7747 | Avg Q-Value: 0.30
INFO:root:Iter 11/20 | Loss: 0.8336 | Avg Q-Value: 0.40
INFO:root:Iter 12/20 | Loss: 0.8227 | Avg Q-Value: 0.39
INFO:root:Iter 13/20 | Loss: 0.8230 | Avg Q-Value: 0.40
INFO:root:Iter 14/20 | Loss: 0.8213 | Avg Q-Value: 0.39
INFO:root:Iter 15/20 | Loss: 0.8233 | Avg Q-Value: 0.39
INFO:root:Iter 16/20 | Loss: 0.8207 | Avg Q-Value: 0.40
INFO:root:Iter 17/20 | Loss: 0.8261 | Avg Q-Value: 0.42
INFO:root:Iter 18/20 | Loss: 0.8196 | Avg Q-Value: 0.39
I


Training complete.


  3%|▎         | 3071/100000 [1:38:26<22:02:41,  1.22it/s]INFO:root:Updating optimal policy...


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  3.22230D+02    |proj g|=  1.00000D+00

At iterate    1    f=  3.06763D+02    |proj g|=  7.43492D-01

At iterate    2    f=  3.06455D+02    |proj g|=  7.15469D-01

At iterate    3    f=  3.05745D+02    |proj g|=  3.61969D-01

At iterate    4    f=  3.05541D+02    |proj g|=  3.66419D-01

At iterate    5    f=  3.05526D+02    |proj g|=  1.38825D-01


INFO:root:Theta updated. New theta_hat: [0.348 0.    0.163 0.586]



At iterate    6    f=  3.05525D+02    |proj g|=  1.11155D-01

At iterate    7    f=  3.05525D+02    |proj g|=  1.03161D-01

At iterate    8    f=  3.05525D+02    |proj g|=  3.42406D-02

At iterate    9    f=  3.05525D+02    |proj g|=  1.11313D-02

At iterate   10    f=  3.05525D+02    |proj g|=  2.30592D-03

At iterate   11    f=  3.05525D+02    |proj g|=  1.25974D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     11     13     14     0     1   1.260D-03   3.055D+02
  F =   305.52490255896868     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
Using device: mps
Generating 100000 experience samp

100%|██████████| 100000/100000 [01:09<00:00, 1447.67it/s]]



Starting FQI training loop...


INFO:root:Iter 1/20 | Loss: 0.8163 | Avg Q-Value: 0.09
INFO:root:Iter 2/20 | Loss: 0.7898 | Avg Q-Value: 0.12
INFO:root:Iter 3/20 | Loss: 0.7806 | Avg Q-Value: 0.12
INFO:root:Iter 4/20 | Loss: 0.7762 | Avg Q-Value: 0.15
INFO:root:Iter 5/20 | Loss: 0.7728 | Avg Q-Value: 0.15
INFO:root:Iter 6/20 | Loss: 0.7702 | Avg Q-Value: 0.15
INFO:root:Iter 7/20 | Loss: 0.7675 | Avg Q-Value: 0.15
INFO:root:Iter 8/20 | Loss: 0.7651 | Avg Q-Value: 0.16
INFO:root:Iter 9/20 | Loss: 0.7633 | Avg Q-Value: 0.16
INFO:root:Iter 10/20 | Loss: 0.7651 | Avg Q-Value: 0.16
INFO:root:Iter 11/20 | Loss: 0.8002 | Avg Q-Value: 0.26
INFO:root:Iter 12/20 | Loss: 0.7925 | Avg Q-Value: 0.27
INFO:root:Iter 13/20 | Loss: 0.7935 | Avg Q-Value: 0.26
INFO:root:Iter 14/20 | Loss: 0.7918 | Avg Q-Value: 0.26
INFO:root:Iter 15/20 | Loss: 0.7896 | Avg Q-Value: 0.27
INFO:root:Iter 16/20 | Loss: 0.7937 | Avg Q-Value: 0.27
INFO:root:Iter 17/20 | Loss: 0.7888 | Avg Q-Value: 0.28
INFO:root:Iter 18/20 | Loss: 0.7886 | Avg Q-Value: 0.27
I


Training complete.
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  3.48742D+02    |proj g|=  1.00000D+00

At iterate    1    f=  3.33625D+02    |proj g|=  7.61404D-01

At iterate    2    f=  3.33304D+02    |proj g|=  7.20752D-01

At iterate    3    f=  3.32838D+02    |proj g|=  3.58057D-01

At iterate    4    f=  3.32565D+02    |proj g|=  4.48003D-01

At iterate    5    f=  3.32526D+02    |proj g|=  1.61891D-01

At iterate    6    f=  3.32526D+02    |proj g|=  6.15066D-02

At iterate    7    f=  3.32526D+02    |proj g|=  6.41181D-02


INFO:root:Theta updated. New theta_hat: [0.41  0.073 0.059 0.472]



At iterate    8    f=  3.32526D+02    |proj g|=  4.18532D-02

At iterate    9    f=  3.32526D+02    |proj g|=  3.98128D-03

At iterate   10    f=  3.32526D+02    |proj g|=  9.90234D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     10     12     13     0     0   9.902D-04   3.325D+02
  F =   332.52561064185221     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
Using device: mps
Generating 100000 experience samples...


100%|██████████| 100000/100000 [00:46<00:00, 2172.40it/s]]



Starting FQI training loop...


INFO:root:Iter 1/20 | Loss: 0.6933 | Avg Q-Value: 0.23
INFO:root:Iter 2/20 | Loss: 0.6543 | Avg Q-Value: 0.25
INFO:root:Iter 3/20 | Loss: 0.6497 | Avg Q-Value: 0.27
INFO:root:Iter 4/20 | Loss: 0.6454 | Avg Q-Value: 0.29
INFO:root:Iter 5/20 | Loss: 0.6425 | Avg Q-Value: 0.28
INFO:root:Iter 6/20 | Loss: 0.6383 | Avg Q-Value: 0.28
INFO:root:Iter 7/20 | Loss: 0.6370 | Avg Q-Value: 0.30
INFO:root:Iter 8/20 | Loss: 0.6361 | Avg Q-Value: 0.28
INFO:root:Iter 9/20 | Loss: 0.6343 | Avg Q-Value: 0.29
INFO:root:Iter 10/20 | Loss: 0.6336 | Avg Q-Value: 0.30
INFO:root:Iter 11/20 | Loss: 0.6744 | Avg Q-Value: 0.45
INFO:root:Iter 12/20 | Loss: 0.6687 | Avg Q-Value: 0.46
INFO:root:Iter 13/20 | Loss: 0.6677 | Avg Q-Value: 0.48
INFO:root:Iter 14/20 | Loss: 0.6676 | Avg Q-Value: 0.47
INFO:root:Iter 15/20 | Loss: 0.6677 | Avg Q-Value: 0.47
INFO:root:Iter 16/20 | Loss: 0.6648 | Avg Q-Value: 0.45
INFO:root:Iter 17/20 | Loss: 0.6640 | Avg Q-Value: 0.46
INFO:root:Iter 18/20 | Loss: 0.6630 | Avg Q-Value: 0.47
I


Training complete.
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  3.73103D+02    |proj g|=  1.00000D+00

At iterate    1    f=  3.54535D+02    |proj g|=  7.47030D-01

At iterate    2    f=  3.54206D+02    |proj g|=  7.14947D-01

At iterate    3    f=  3.53695D+02    |proj g|=  3.43714D-01

At iterate    4    f=  3.53367D+02    |proj g|=  5.09159D-01

At iterate    5    f=  3.53321D+02    |proj g|=  1.01108D-01

At iterate    6    f=  3.53321D+02    |proj g|=  3.10320D-02

At iterate    7    f=  3.53321D+02    |proj g|=  4.18482D-02

At iterate    8    f=  3.53321D+02    |proj g|=  2.56936D-02


INFO:root:Theta updated. New theta_hat: [0.386 0.097 0.062 0.527]



At iterate    9    f=  3.53321D+02    |proj g|=  1.12204D-03

At iterate   10    f=  3.53321D+02    |proj g|=  2.71823D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     10     12     13     0     0   2.718D-04   3.533D+02
  F =   353.32090758198518     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
Using device: mps
Generating 100000 experience samples...


100%|██████████| 100000/100000 [00:51<00:00, 1923.88it/s]



Starting FQI training loop...


INFO:root:Iter 1/20 | Loss: 0.7374 | Avg Q-Value: 0.36
INFO:root:Iter 2/20 | Loss: 0.6853 | Avg Q-Value: 0.38
INFO:root:Iter 3/20 | Loss: 0.6802 | Avg Q-Value: 0.37
INFO:root:Iter 4/20 | Loss: 0.6779 | Avg Q-Value: 0.37
INFO:root:Iter 5/20 | Loss: 0.6749 | Avg Q-Value: 0.38
INFO:root:Iter 6/20 | Loss: 0.6726 | Avg Q-Value: 0.36
INFO:root:Iter 7/20 | Loss: 0.6783 | Avg Q-Value: 0.37
INFO:root:Iter 8/20 | Loss: 0.6703 | Avg Q-Value: 0.37
INFO:root:Iter 9/20 | Loss: 0.6689 | Avg Q-Value: 0.37
INFO:root:Iter 10/20 | Loss: 0.6683 | Avg Q-Value: 0.39
INFO:root:Iter 11/20 | Loss: 0.7102 | Avg Q-Value: 0.62
INFO:root:Iter 12/20 | Loss: 0.7035 | Avg Q-Value: 0.61
INFO:root:Iter 13/20 | Loss: 0.7063 | Avg Q-Value: 0.59
INFO:root:Iter 14/20 | Loss: 0.7021 | Avg Q-Value: 0.60
INFO:root:Iter 15/20 | Loss: 0.7008 | Avg Q-Value: 0.60
INFO:root:Iter 16/20 | Loss: 0.7008 | Avg Q-Value: 0.59
INFO:root:Iter 17/20 | Loss: 0.7006 | Avg Q-Value: 0.59
INFO:root:Iter 18/20 | Loss: 0.6995 | Avg Q-Value: 0.60
I


Training complete.
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         4 variables are exactly at the bounds

At iterate    0    f=  3.94012D+02    |proj g|=  1.00000D+00

At iterate    1    f=  3.72177D+02    |proj g|=  7.32050D-01

At iterate    2    f=  3.71547D+02    |proj g|=  6.86992D-01

At iterate    3    f=  3.70198D+02    |proj g|=  4.97195D-01

At iterate    4    f=  3.69907D+02    |proj g|=  4.78934D-01

At iterate    5    f=  3.69893D+02    |proj g|=  1.88176D-01

At iterate    6    f=  3.69892D+02    |proj g|=  2.01508D-01


INFO:root:Theta updated. New theta_hat: [0.482 0.055 0.012 0.573]



At iterate    7    f=  3.69891D+02    |proj g|=  1.07304D-01

At iterate    8    f=  3.69891D+02    |proj g|=  7.17678D-03

At iterate    9    f=  3.69891D+02    |proj g|=  2.89093D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4      9     11     12     0     0   2.891D-03   3.699D+02
  F =   369.89089838122032     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
Using device: mps
Generating 100000 experience samples...


100%|██████████| 100000/100000 [00:58<00:00, 1694.99it/s]]



Starting FQI training loop...


INFO:root:Iter 1/20 | Loss: 0.8403 | Avg Q-Value: 0.40
INFO:root:Iter 2/20 | Loss: 0.7567 | Avg Q-Value: 0.41
INFO:root:Iter 3/20 | Loss: 0.7500 | Avg Q-Value: 0.42
INFO:root:Iter 4/20 | Loss: 0.7459 | Avg Q-Value: 0.44
INFO:root:Iter 5/20 | Loss: 0.7426 | Avg Q-Value: 0.45
INFO:root:Iter 6/20 | Loss: 0.7397 | Avg Q-Value: 0.43
INFO:root:Iter 7/20 | Loss: 0.7389 | Avg Q-Value: 0.44
INFO:root:Iter 8/20 | Loss: 0.7363 | Avg Q-Value: 0.44
INFO:root:Iter 9/20 | Loss: 0.7353 | Avg Q-Value: 0.43
INFO:root:Iter 10/20 | Loss: 0.7344 | Avg Q-Value: 0.43
INFO:root:Iter 11/20 | Loss: 0.8120 | Avg Q-Value: 0.59
INFO:root:Iter 12/20 | Loss: 0.7897 | Avg Q-Value: 0.60
INFO:root:Iter 13/20 | Loss: 0.7744 | Avg Q-Value: 0.59
INFO:root:Iter 14/20 | Loss: 0.7733 | Avg Q-Value: 0.59


In [None]:
# Run the simulator for the gap between a hard-coded 10000 and NUM_CUSTOMERS and keep
# whatever simulator.run returns in simulation_data.
# NOTE(review): the configuration cell sets NUM_CUSTOMERS = 10000, which makes this
# argument 0 — confirm whether topping up an earlier, shorter run is still intended.
simulation_data = simulator.run(num_customers=10000-NUM_CUSTOMERS)

## Training policy under perfect information

In [None]:
from policy import DPAgent

class PerfectDegradationLearner:
    """Degradation-learner stand-in that serves known parameters instead of fitting them.

    It returns the supplied theta unchanged and forwards both baseline queries
    to the supplied hazard model. The notebook passes an instance as the
    ``degradation_learner`` argument of ``DPAgent``.
    """

    def __init__(self, d, theta_true, hazard_model):
        """Keep the three constructor arguments as same-named attributes.

        Args:
            d: Stored as ``self.d`` (the notebook passes the context dimension D).
            theta_true: Handed back as-is by ``get_theta``.
            hazard_model: Object exposing ``Lambda_0`` and ``Lambda_0_inverse``;
                ``cum_baseline`` and ``inverse_cum_baseline`` delegate to it.
        """
        self.d, self.theta_true, self.hazard_model = d, theta_true, hazard_model

    def get_theta(self):
        """Return the stored theta (the same object that was passed in, not a copy)."""
        return self.theta_true

    def cum_baseline(self, t):
        """Return ``hazard_model.Lambda_0(t)``."""
        cumulative = self.hazard_model.Lambda_0
        return cumulative(t)

    def inverse_cum_baseline(self, u):
        """Return ``hazard_model.Lambda_0_inverse(u)``."""
        inverse_cumulative = self.hazard_model.Lambda_0_inverse
        return inverse_cumulative(u)
    
# Perfect-information learner: reports THETA_TRUE and uses the exponential usage-hazard model
# for its baseline queries.
perfect_degradation_learner = PerfectDegradationLearner(
    d=D, 
    theta_true=THETA_TRUE,
    hazard_model=usage_exp_hazard_model,
)
# DP agent given the ground-truth utility vector (UTILITY_TRUE) as u_hat instead of a learned estimate.
# customer_gen and mdp_params are not defined in this cell; they must already exist in the kernel.
perfect_dpagent = DPAgent(
    d=D,
    u_hat=UTILITY_TRUE,
    time_normalize=True,
    degradation_learner=perfect_degradation_learner,
    customer_generator=customer_gen,
    params=mdp_params,
)

# Train with 50 iterations, dataset_size=500000 and batch_size=1024.
# NOTE(review): the online runs logged above generate 100000 samples per update, so this
# call is presumably several times more expensive — confirm before re-running.
perfect_dpagent.train(
    num_iterations=50,
    dataset_size=500000,
    batch_size=1024
)

# Request a policy of type 'greedy' from the trained agent.
perfect_policy = perfect_dpagent.get_policy(
    {'type': 'greedy'}
)

### Debugging: Look at experience dataset

In [None]:
from policy import DPAgent

# Show the simulator's current utility estimate next to the ground-truth vector.
print(simulator.projected_volume_learner.get_estimate())
print(UTILITY_TRUE)

# Build a separate DPAgent from the simulator's own components (dimension, utility estimate,
# degradation learner, customer generator, MDP parameters) so its experience generator
# can be inspected.
dp_agent = DPAgent(
    d=simulator.d,
    u_hat=simulator.projected_volume_learner.get_estimate(),
    time_normalize=simulator.time_normalize,
    degradation_learner=simulator.degradation_learner,
    customer_generator=simulator.customer_generator,
    params=simulator.mdp_params
)

# generate(500) — presumably 500 transitions; confirm against the experience generator.
dataset = dp_agent.experience_generator.generate(500)

# Each sample unpacks as (state, action, reward, next_state). The action is translated
# through ACTION_MAP for display, and states are rounded to 3 decimals for readability.
# NOTE(review): this prints every sample, which produces a very long cell output.
for data in dataset:
    state, action, reward, next_state = data
    print("State:", state.round(3))
    print("Action:", dp_agent.experience_generator.ACTION_MAP[action])
    print("Reward:", reward)
    print("Next State:", next_state.round(3))
    print("-----")
    print()

In [None]:
# Fit a fresh DegradationLearner, starting from an all-zero theta, on the degradation
# history recorded by the simulator (converted to a DataFrame first).
# This is a new object; it does not touch simulator.degradation_learner.
degradation_learner = DegradationLearner(d=D, initial_theta=np.zeros(D))
degradation_df = pd.DataFrame(simulator.degradation_history)
degradation_learner.fit(degradation_df)
# Last expression of the cell: displays the fitted theta (presumably to compare with THETA_TRUE).
degradation_learner.get_theta()

## Testing Policy

In [None]:
# Test Case 1: Arrival State
# Assemble one arrival state from logged rows i and i+1, then ask the policy for its action.
i = 50

X_i = df.loc[i, 'sum_of_contexts_after']   # context sum logged on row i
I_i = 3  # df.loc[i, '']                   # set by hand; the column lookup was never filled in
x_i = df.loc[i + 1, 'customer_context']    # context logged on the following row
T_i = df.loc[i + 1, 'rental_duration']     # rental duration logged on the following row

# State layout: [X_i..., x_i..., T_i, I_i, 0.0]
# (the trailing 0.0 presumably marks an arrival — confirm against the policy's state encoding).
arrival_state = np.concatenate((X_i, x_i, [T_i, I_i, 0.0]))
action_arrival = optimal_policy(arrival_state)

# Only the two actions expected at an arrival are labelled; any other action id raises KeyError.
action_map = {0: 'Give Max Acceptable Price', 1: 'Shutdown'}
print(f"Sample Arrival State. Optimal Action: {action_map[action_arrival]}")


In [None]:
# Test Case 2: Departure State
# Relies on X_i, x_i and I_i already being defined in the kernel by the arrival test case.
# State layout: [X_i + 10 * x_i ..., zeros(D)..., 0.0, I_i, 1.0]
# (the factor 10 and the trailing 1.0 are presumably a sample rental length and a
# departure flag — confirm against the policy's state encoding).
departure_state = np.concatenate((X_i + x_i * 10, np.zeros(D), [0.0, I_i, 1.0]))
action_departure = optimal_policy(departure_state)

# action_map is rebound here with the departure-only labels; any other action id raises KeyError.
action_map = {2: 'Replace Machine', 3: 'Do Not Replace'}
print(f"Sample Departure State. Optimal Action: {action_map[action_departure]}")
