In [1]:
import pickle
import numpy as np
import pandas as pd
from scipy.optimize import minimize

from simulation import Simulator, CustomerGenerator
from hazard_models import ExponentialHazard
from utility_learner import ProjectedVolumeLearner, diam
from degradation_learner import DegradationLearner

from utils import unit_ball_rejection_sample, correct_signs
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

import logging
logging.basicConfig(level=logging.INFO)

In [2]:
# --- 2. Define Sampling Functions ---
# def context_sampler() -> np.ndarray:
#     """Samples a customer's context vector from a uniform distribution."""
#     return np.random.uniform(low=0.0, high=1.0, size=D)

def context_sampler() -> np.ndarray:
    """Draw one customer context vector with non-negative components.

    A point is obtained from ``unit_ball_rejection_sample(D)`` (presumably a
    draw from the D-dimensional unit ball -- confirm in ``utils``) and every
    coordinate is then replaced by its absolute value, so the returned vector
    has no negative entries.

    ``D`` is looked up in the notebook globals when the function is called,
    not when it is defined.
    """
    ball_point = unit_ball_rejection_sample(D)
    return np.abs(ball_point)

def rental_sampler(mean_duration: float = 20.0) -> float:
    """Sample a customer's desired rental duration from an exponential distribution.

    Args:
        mean_duration: Mean (NumPy's ``scale`` parameter) of the exponential
            distribution. Defaults to 20.0, the value that used to be
            hard-coded, so zero-argument callers behave exactly as before.

    Returns:
        A non-negative duration drawn from NumPy's global random state.
    """
    return np.random.exponential(scale=mean_duration)

def interarrival_sampler(mean_gap: float = 5.0) -> float:
    """Sample the time until the next customer arrives.

    Args:
        mean_gap: Mean (NumPy's ``scale`` parameter) of the exponential
            inter-arrival distribution. Defaults to 5.0, the value that used
            to be hard-coded, so zero-argument callers behave exactly as
            before.

    Returns:
        A non-negative waiting time drawn from NumPy's global random state.
    """
    return np.random.exponential(scale=mean_gap)

In [3]:
# --- 1. Simulation Configuration ---
# Seed NumPy's global generator before the first random draw below
# (UTILITY_TRUE) so the notebook is reproducible.
np.random.seed(41)

D = 5                   # dimension of the context vectors
LAMBDA_VAL = 0.001      # baseline hazard constant
NUM_CUSTOMERS = 2000    # total number of customers to simulate, i.e. T

# Ground-truth parameter vectors
THETA_TRUE = np.array([0.5, 0.2, 0.1, 0.3, 0.4])   # degradation
UTILITY_TRUE = context_sampler()                    # customer's willingness to pay

# --- Machine's pricing vector 'r' ---
# Fallback pricing vector, used when u_hat is not fed to calculate_price.
PRICING_R = np.zeros(shape=D)

In [None]:
# Exponential hazard model built from the baseline hazard constant; it is
# passed to the Simulator below as `usage_hazard_model`.
usage_exp_hazard_model = ExponentialHazard(lambda_val=LAMBDA_VAL)
# spontaneous_exp_hazard_model = None # ExponentialHazard(lambda_val=0.01)

# Customer generator wired to the three sampler functions defined earlier in
# the notebook (context, rental duration, inter-arrival time).
customer_gen = CustomerGenerator(
    d=D,
    context_sampler=context_sampler,
    rental_sampler=rental_sampler,
    interarrival_sampler=interarrival_sampler
)

# Options passed to ProjectedVolumeLearner as `centroid_params`. Every
# override is currently commented out, so an empty dict is passed.
centroid_params = {
    # 'num_samples': 2000,
    # 'thin': None,
    # 'burn_in': 500 * D ** 2,
    # 'tol': 1e-4,
    # 'rho_target': 0.01
}

# Diameter threshold below which the custom termination rule fires.
TERMINATION_DIAMETER = 0.11


def termination_rule(diameter: float) -> bool:
    """Example custom termination rule: stop once the diameter is small enough.

    Written as a ``def`` rather than a lambda bound to a name (PEP 8), so the
    rule has a real name in tracebacks and can be pickled by reference.

    Args:
        diameter: Current diameter value supplied by the caller.

    Returns:
        True when ``diameter`` is strictly below ``TERMINATION_DIAMETER``.
    """
    return diameter < TERMINATION_DIAMETER

# Utility learner configured with the horizon T, the context dimension d,
# the centroid options and the custom termination rule defined above in
# this cell.
projected_volume_learner = ProjectedVolumeLearner(
    T=NUM_CUSTOMERS, 
    d=D, 
    centroid_params=centroid_params,
    incentive_constant=1.1,  # NOTE(review): meaning of 1.1 is not visible here -- confirm against ProjectedVolumeLearner
    termination_rule=termination_rule,
)

# Cost structure and optimiser settings, handed to the Simulator as `mdp_params`.
mdp_params = dict(
    replacement_cost=1.5,      # cost to replace the machine
    failure_cost=0.75,         # additional penalty for in-service failure
    holding_cost_rate=0.02,    # cost per unit of idle time
    gamma=0.999,               # discount factor
    learning_rate=1e-4,        # learning rate for the Adam optimizer
    target_update_freq=10,     # how often to update the target network (in iterations)
)

# Settings for each policy update, handed to the Simulator as
# `training_hyperparams`.
training_hyperparams = dict(
    num_iterations=10,     # number of training iterations per policy update
    dataset_size=5000,     # number of transitions to generate for the offline dataset
    batch_size=256,        # batch size for training
)

# Instantiate the Simulator from everything configured above: problem size,
# ground-truth vectors, hazard model / customer generator / utility learner,
# and the MDP and training settings.
simulator = Simulator(
    d=D,
    T=NUM_CUSTOMERS,
    
    theta_true=THETA_TRUE,
    utility_true=UTILITY_TRUE,
    pricing_r=PRICING_R,
    
    usage_hazard_model=usage_exp_hazard_model,
    customer_generator=customer_gen,
    projected_volume_learner=projected_volume_learner,  # learner instance configured above (custom termination rule)
    
    mdp_params=mdp_params,
    training_hyperparams=training_hyperparams,
    policy_update_threshold=5,  # NOTE(review): units/meaning not visible here -- confirm against Simulator
    time_normalize=True,
)

In [5]:
# from degradation_learner import breslow_baseline_estimator

# degradation_history = pd.DataFrame(simulator.degradation_history)
# degradation_history['life_id'] = (degradation_history['event'].shift(1).fillna(-99) == 1).cumsum()  # 0 after breakdown

# breslow_df = breslow_baseline_estimator(
#     degradation_history, 
#     simulator.degradation_learner.get_theta()
# )

# breslow_df = breslow_df[breslow_df['delta_t'] > 0]
# times = breslow_df['time'].values
# lambda_step = breslow_df['lambda_0'].values

# times

In [6]:
# Run the simulation for NUM_CUSTOMERS customers and tabulate the returned
# records as a DataFrame.
#
# NOTE(review): the output saved with this cell ends in
#   RuntimeError: linear(): input and weight.T shapes cannot be multiplied
#   (1x13 and 14x64)
# so in that run `simulation_data` and `df` were never assigned, and every
# later cell that reads `df` would fail on a fresh kernel. The same output
# logs "Generating 50000 experience samples" and "Iter k/100", which does not
# match the dataset_size=5000 / num_iterations=10 configured in
# `training_hyperparams`; the saved output looks stale. Re-run after the
# shape mismatch is resolved.
#
# Debug toggle, kept disabled (presumably forces the utility learner into its
# terminated state -- confirm):
# simulator.projected_volume_learner.is_terminated = True
simulation_data = simulator.run(num_customers=NUM_CUSTOMERS)
df = pd.DataFrame(simulation_data)

INFO:root:Starting simulation for 2000 customers...
  0%|          | 0/2000 [00:00<?, ?it/s]

Set parameter Username


INFO:gurobipy:Set parameter Username


Set parameter LicenseID to value 2651514


INFO:gurobipy:Set parameter LicenseID to value 2651514


Academic license - for non-commercial use only - expires 2026-04-14


INFO:gurobipy:Academic license - for non-commercial use only - expires 2026-04-14
INFO:root:Customer 1: Diameter: 1.0038
  0%|          | 1/2000 [00:02<1:34:01,  2.82s/it]INFO:root:Customer 2: Diameter: 0.7896
  0%|          | 2/2000 [00:05<1:38:08,  2.95s/it]INFO:root:Customer 3: Diameter: 0.6292
  0%|          | 3/2000 [00:09<1:43:27,  3.11s/it]INFO:root:Customer 4: Diameter: 0.5915
  0%|          | 4/2000 [00:12<1:48:21,  3.26s/it]INFO:root:Customer 5: Diameter: 0.6412
  0%|          | 5/2000 [00:16<1:54:21,  3.44s/it]INFO:root:Customer 6: Diameter: 0.2740
  0%|          | 6/2000 [00:20<2:01:32,  3.66s/it]INFO:root:Customer 7: Diameter: 0.4080
  0%|          | 7/2000 [00:24<2:08:23,  3.87s/it]INFO:root:Customer 8: Diameter: 0.2810
  0%|          | 8/2000 [00:29<2:14:43,  4.06s/it]INFO:root:Customer 9: Diameter: 0.3840
  0%|          | 9/2000 [00:34<2:23:09,  4.31s/it]INFO:root:Customer 10: Diameter: 0.4161
  0%|          | 10/2000 [00:39<2:29:55,  4.52s/it]INFO:root:Customer 11: Dia

RUNNING THE L-BFGS-B CODE

           * * *

Using device: mps
Machine precision = 2.220D-16
 N =            5     M =           10

At X0         5 variables are exactly at the bounds

At iterate    0    f=  1.09861D+00    |proj g|=  4.06656D-01

At iterate    1    f=  9.43172D-01    |proj g|=  3.58128D-01

At iterate    2    f=  7.50631D-01    |proj g|=  0.00000D+00

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    5      2      3      3     0     5   0.000D+00   7.506D-01
  F =  0.75063115752825205     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            
Generating 50000 experience samples...


100%|██████████| 50000/50000 [00:12<00:00, 4125.25it/s]



Starting FQI training loop...


INFO:root:Iter 1/100 | Loss: 0.9747 | Avg Q-Value: 1.00
INFO:root:Iter 2/100 | Loss: 0.8193 | Avg Q-Value: 1.04
INFO:root:Iter 3/100 | Loss: 0.7593 | Avg Q-Value: 1.09
INFO:root:Iter 4/100 | Loss: 0.7283 | Avg Q-Value: 1.15
INFO:root:Iter 5/100 | Loss: 0.7146 | Avg Q-Value: 1.15
INFO:root:Iter 6/100 | Loss: 0.7047 | Avg Q-Value: 1.13
INFO:root:Iter 7/100 | Loss: 0.6979 | Avg Q-Value: 1.17
INFO:root:Iter 8/100 | Loss: 0.6951 | Avg Q-Value: 1.17
INFO:root:Iter 9/100 | Loss: 0.6908 | Avg Q-Value: 1.19
INFO:root:Iter 10/100 | Loss: 0.6865 | Avg Q-Value: 1.19
INFO:root:Iter 11/100 | Loss: 0.4133 | Avg Q-Value: 1.42
INFO:root:Iter 12/100 | Loss: 0.3567 | Avg Q-Value: 1.45
INFO:root:Iter 13/100 | Loss: 0.3535 | Avg Q-Value: 1.45
INFO:root:Iter 14/100 | Loss: 0.3525 | Avg Q-Value: 1.48
INFO:root:Iter 15/100 | Loss: 0.3513 | Avg Q-Value: 1.47
INFO:root:Iter 16/100 | Loss: 0.3504 | Avg Q-Value: 1.48
INFO:root:Iter 17/100 | Loss: 0.3503 | Avg Q-Value: 1.48
INFO:root:Iter 18/100 | Loss: 0.3508 | A


Training complete.


RuntimeError: linear(): input and weight.T shapes cannot be multiplied (1x13 and 14x64)

In [None]:
# Tabulate the degradation records kept on the simulator as a DataFrame.
degradation_history = pd.DataFrame(simulator.degradation_history)

In [None]:
# Fit a fresh degradation learner (theta initialised at zero) on the
# simulator's degradation history and display the fitted theta.
degradation_learner = DegradationLearner(d=D, initial_theta=np.zeros(D))

degradation_learner.fit(degradation_history)
degradation_learner.get_theta()

In [None]:
# Mean and standard deviation of profit over rental events; the rental
# filter is built once and reused for both statistics.
rental_profit = df.loc[df['event_type'] == 'rental', 'profit']
rental_profit.mean(), rental_profit.std()

In [None]:
# How often each feedback value occurs in the simulation log.
df['feedback'].value_counts()

## Estimating $\theta$

In [None]:
# Re-create the degradation learner (theta initialised at zero), fit it and
# display the fitted theta. This instance replaces the earlier
# `degradation_learner` and is the one handed to DPAgent below.
#
# NOTE(review): the earlier fit cell passes `degradation_history`
# (built from simulator.degradation_history) while this one passes the full
# simulation log `df`. Confirm which frame DegradationLearner.fit expects;
# one of the two calls is probably stale.
degradation_learner = DegradationLearner(d=D, initial_theta=np.zeros(D))

degradation_learner.fit(df)
degradation_learner.get_theta()

In [None]:
from policy import DPAgent

In [None]:
# MDP settings for the standalone DPAgent. The values repeat the
# `mdp_params` given to the Simulator earlier in the notebook.
mdp_params = dict(
    replacement_cost=1.5,      # cost to replace the machine
    failure_cost=0.75,         # additional penalty for in-service failure
    holding_cost_rate=0.02,    # cost per unit of idle time
    gamma=0.999,               # discount factor
    learning_rate=1e-4,        # learning rate for the Adam optimizer
    target_update_freq=10,     # how often to update the target network (in iterations)
)

# Training hyperparameters
TRAINING_ITERATIONS = 100
DATASET_SIZE = 50_000    # number of transitions to generate for the offline dataset
BATCH_SIZE = 256

# Most recent entry of the simulator's centroid list (presumably the latest
# utility estimate -- confirm against Simulator).
u_hat = simulator.centroids[-1]


In [None]:
# Build the agent from the utility estimate `u_hat`, the degradation learner
# fitted above, the customer generator used by the simulator, and the MDP
# settings.
dp_agent = DPAgent(
    d=D,
    u_hat=u_hat,
    degradation_learner=degradation_learner,
    customer_generator=customer_gen,
    params=mdp_params
)

In [None]:
# Train the agent
dp_agent.train(
    num_iterations=TRAINING_ITERATIONS,
    dataset_size=DATASET_SIZE,
    batch_size=BATCH_SIZE
)

In [None]:
# Policy extracted from the trained agent; the cells below call it with a
# state vector and use the result as an action id.
optimal_policy = dp_agent.get_policy()

In [None]:
# Test Case 1: Arrival State
# Ask the trained policy what to do when the customer recorded in row i+1
# arrives after the history recorded in row i.
i = 50

X_i = df.loc[i, 'sum_of_contexts_after']   # context sum logged on row i
I_i = 3 # df.loc[i, '']                    # placeholder value; source column still to be filled in
x_i = df.loc[i+1, 'customer_context']      # context of the next row's customer
T_i = df.loc[i+1, 'rental_duration']       # rental duration of the next row's customer

# State layout: [X_i, x_i, T_i, I_i, flag], with the flag set to 0.0 here
# and to 1.0 in the departure check.
# NOTE(review): assuming X_i and x_i each have D entries, this vector has
# 2*D + 3 = 13 entries for D = 5. The RuntimeError recorded in the
# simulation cell reports a 13-wide input against a 14-wide first layer, so
# confirm the state layout the policy network expects before trusting this.
arrival_state = np.concatenate([
    X_i,
    x_i,
    [T_i, I_i, 0.0]
])
action_arrival = optimal_policy(arrival_state)
action_map = {0: 'Give Max Acceptable Price', 1: 'Shutdown'}
# Report an unexpected action id explicitly instead of failing with a bare
# KeyError when the policy returns something outside the arrival actions.
action_label = action_map.get(action_arrival, f"Unexpected action id {action_arrival!r}")
print(f"Sample Arrival State. Optimal Action: {action_label}")


In [None]:
# Test Case 2: Departure State
departure_state = np.concatenate([
    X_i+x_i*10, 
    np.zeros(D), 
    [0.0, I_i, 1.0]
])
action_departure = optimal_policy(departure_state)
action_map = {2: 'Replace Machine', 3: 'Do Not Replace'}
print(f"Sample Departure State. Optimal Action: {action_map[action_departure]}")
