In [1]:
import pickle
import numpy as np
import pandas as pd
from scipy.optimize import minimize

from simulation import Simulator, CustomerGenerator
from hazard_models import ExponentialHazard
from utility_learner import ProjectedVolumeLearner, diam
from degradation_learner import DegradationLearner

from utils import unit_ball_rejection_sample, correct_signs
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

import logging
logging.basicConfig(level=logging.INFO)

In [2]:
# --- 2. Define Sampling Functions ---
# def context_sampler() -> np.ndarray:
#     """Samples a customer's context vector from a uniform distribution."""
#     return np.random.uniform(low=0.0, high=1.0, size=D)

def context_sampler() -> np.ndarray:
    """Draw one customer context vector with non-negative coordinates.

    A point is obtained from ``unit_ball_rejection_sample(D)`` (presumably a
    point of the D-dimensional unit ball, going by the helper's name) and its
    element-wise absolute value is returned. Because of the ``np.abs`` fold,
    every coordinate of the result is >= 0.

    Reads the module-level ``D``, which therefore has to be defined before the
    first call.
    """
    raw_point = unit_ball_rejection_sample(D)
    return np.abs(raw_point)

def rental_sampler(mean_duration: float = 20.0) -> float:
    """Samples a customer's desired rental duration from an exponential distribution.

    Args:
        mean_duration: Mean of the exponential distribution (NumPy's ``scale``
            argument). Defaults to 20.0, the value that used to be hard-coded,
            so existing zero-argument callers behave exactly as before.

    Returns:
        A single draw taken from NumPy's global random state.
    """
    return np.random.exponential(scale=mean_duration)

def interarrival_sampler(mean_interarrival: float = 5.0) -> float:
    """Samples the time until the next customer arrives.

    Args:
        mean_interarrival: Mean of the exponential distribution (NumPy's
            ``scale`` argument). Defaults to 5.0, the value that used to be
            hard-coded, so existing zero-argument callers behave exactly as
            before.

    Returns:
        A single draw taken from NumPy's global random state.
    """
    return np.random.exponential(scale=mean_interarrival)

In [3]:
# --- 1. Simulation Configuration ---
# NOTE: context_sampler (defined in the sampling-functions cell) reads the global
# D assigned here, so this cell has to run before any context is sampled.
D = 5                                  # Dimension of context vectors
LAMBDA_VAL = 0.001                     # Baseline hazard constant
NUM_CUSTOMERS = 2000                   # Total number of customers to simulate, i.e. T

# Set a random seed for reproducibility.
# This seeds NumPy's global generator and is deliberately placed before the
# context_sampler() call below (assuming the sampler draws from NumPy's global
# generator, this is what makes UTILITY_TRUE repeatable -- TODO confirm).
np.random.seed(41)

# Ground truth vectors
THETA_TRUE = np.array([0.5, 0.2, 0.1, 0.3, 0.4])    # For degradation; written out by hand with 5 entries to match D = 5
UTILITY_TRUE = context_sampler()  # For customer's willingness to pay; a random draw, not a hand-picked vector

# --- Machine's Pricing Vector 'r' ---
# You can change this to test different pricing strategies.
# Case 1: A non-zero pricing strategy
# PRICING_R = np.array([2.0, 2.0, 2.0, 2.0, 2.0])
# Case 2: Zero price (free rentals), guaranteeing 100% acceptance
PRICING_R = np.zeros(D)

In [5]:
# Exponential hazard model built from the baseline constant LAMBDA_VAL; it is
# handed to the Simulator below as `usage_hazard_model`.
usage_exp_hazard_model = ExponentialHazard(lambda_val=LAMBDA_VAL)
# spontaneous_exp_hazard_model = None # ExponentialHazard(lambda_val=0.01)

# Customer stream: plugs the three sampler functions from the sampling cell
# into the generator, together with the context dimension D.
customer_gen = CustomerGenerator(
    d=D,
    context_sampler=context_sampler,
    rental_sampler=rental_sampler,
    interarrival_sampler=interarrival_sampler
)

# Optional overrides for the learner's centroid computation. Every entry is
# currently commented out, so an empty dict is what actually gets passed.
centroid_params = {
    # 'num_samples': 2000,
    # 'thin': None,
    # 'burn_in': 500 * D ** 2,
    # 'tol': 1e-4,
    # 'rho_target': 0.01
}


def termination_rule(diameter: float) -> bool:
    """Example custom termination rule: true once ``diameter`` drops below 0.01.

    Written as a named function instead of a lambda bound to a name (PEP 8),
    which gives it a docstring and a meaningful ``__name__`` and keeps it
    picklable by reference.
    """
    return diameter < 0.01


# Learner handed to the Simulator; T and d mirror the simulation configuration.
projected_volume_learner = ProjectedVolumeLearner(
    T=NUM_CUSTOMERS,
    d=D,
    centroid_params=centroid_params,
    incentive_constant=1.1,
    termination_rule=termination_rule,
)

# Cost structure and optimiser settings of the replacement MDP.
mdp_params = {
    "replacement_cost": 1.5,       # cost to replace the machine
    "failure_cost": 0.75,          # additional penalty for in-service failure
    "holding_cost_rate": 0.02,     # cost per unit of idle time
    "gamma": 0.999,                # discount factor
    "learning_rate": 0.0001,       # learning rate for the Adam optimizer
    "target_update_freq": 10,      # target-network update period (in iterations)
}

# Settings applied each time the policy is (re)trained.
training_hyperparams = {
    "num_iterations": 100,         # training iterations per policy update
    "dataset_size": 50_000,        # transitions generated for the offline dataset
    "batch_size": 256,             # batch size for training
}

# Instantiate the Simulator from the ground-truth vectors, the pricing vector
# and the components constructed earlier in this cell.
simulator = Simulator(
    d=D,
    T=NUM_CUSTOMERS,
    
    theta_true=THETA_TRUE,
    utility_true=UTILITY_TRUE,
    pricing_r=PRICING_R,
    
    usage_hazard_model=usage_exp_hazard_model,
    customer_generator=customer_gen,
    projected_volume_learner=projected_volume_learner,  # The explicitly configured learner built above (custom termination rule, incentive_constant=1.1), not a default one
    
    mdp_params=mdp_params,
    training_hyperparams=training_hyperparams,
    policy_update_threshold=5,
)

In [None]:
# Run the simulation for NUM_CUSTOMERS customers and collect the returned
# records into a DataFrame.
# The progress bar captured in this cell's output shows roughly 5-7 s per
# customer at the start, i.e. an estimated 3-4 hours for all 2000 customers.
# The commented-out line below would set the learner's `is_terminated` flag by
# hand before the run (presumably to skip the utility-learning phase -- TODO confirm).
# simulator.projected_volume_learner.is_terminated = True
simulation_data = simulator.run(num_customers=NUM_CUSTOMERS)
df = pd.DataFrame(simulation_data)

INFO:root:Starting simulation for 2000 customers...
  0%|          | 0/2000 [00:00<?, ?it/s]

Restricted license - for non-production use only - expires 2026-11-23


INFO:gurobipy:Restricted license - for non-production use only - expires 2026-11-23
INFO:root:Customer 1: Diameter: 1.0038
  0%|          | 1/2000 [00:05<2:56:35,  5.30s/it]INFO:root:Customer 2: Diameter: 0.7896
  0%|          | 2/2000 [00:10<2:59:28,  5.39s/it]INFO:root:Customer 3: Diameter: 0.6292
  0%|          | 3/2000 [00:16<3:08:00,  5.65s/it]INFO:root:Customer 4: Diameter: 0.5915
  0%|          | 4/2000 [00:22<3:15:12,  5.87s/it]INFO:root:Customer 5: Diameter: 0.6412
  0%|          | 5/2000 [00:29<3:22:11,  6.08s/it]INFO:root:Customer 6: Diameter: 0.2740
  0%|          | 6/2000 [00:36<3:32:50,  6.40s/it]INFO:root:Customer 7: Diameter: 0.4080
  0%|          | 7/2000 [00:43<3:40:58,  6.65s/it]INFO:root:Customer 8: Diameter: 0.2810
  0%|          | 8/2000 [00:50<3:48:21,  6.88s/it]INFO:root:Customer 9: Diameter: 0.3840
  0%|          | 9/2000 [00:58<3:54:43,  7.07s/it]INFO:root:Customer 10: Diameter: 0.4161
  0%|          | 10/2000 [01:06<4:03:31,  7.34s/it]INFO:root:Customer 11: D

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            5     M =           10

At X0         5 variables are exactly at the bounds

At iterate    0    f=  2.48491D+00    |proj g|=  0.00000D+00

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    5      0      1      0     0     0   0.000D+00   2.485D+00
  F =   2.4849066497880004     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            
Using device: cuda
Generating 50000 experience samples...


100%|██████████| 50000/50000 [00:07<00:00, 6654.62it/s]



Starting FQI training loop...




In [6]:
# Tabulate the degradation records stored on the simulator
# (`simulator.degradation_history`) so they can be passed to a learner.
degradation_history = pd.DataFrame(simulator.degradation_history)

In [None]:
# Fit a fresh degradation learner (theta initialised to zeros) on the
# degradation history and display the resulting theta estimate.
# NOTE(review): a later cell re-creates `degradation_learner` and fits it on
# `df` instead; whichever of the two cells ran last decides which object the
# DPAgent further down receives.
degradation_learner = DegradationLearner(d=D, initial_theta=np.zeros(D))

degradation_learner.fit(degradation_history)
degradation_learner.get_theta()

In [None]:
# Mean and standard deviation of profit over rental events only.
# The rental filter is evaluated once and reused; previously the same boolean
# mask and row selection were computed twice, once per statistic.
rental_profit = df.loc[df.event_type == 'rental', 'profit']
rental_profit.mean(), rental_profit.std()

In [None]:
# Show how often each distinct value occurs in the `feedback` column.
df.feedback.value_counts()

## Estimating $\theta$

In [None]:
# Re-create the degradation learner (theta initialised to zeros), fit it on the
# full simulation DataFrame `df` and display the estimated theta.
# NOTE(review): this rebinds `degradation_learner`, replacing the instance that
# was fitted on `degradation_history` in an earlier cell; this is the object
# passed to DPAgent below. Confirm that `df` is the intended training input.
degradation_learner = DegradationLearner(d=D, initial_theta=np.zeros(D))

degradation_learner.fit(df)
degradation_learner.get_theta()

In [None]:
from policy import DPAgent

In [None]:
# Cost structure and optimiser settings for the stand-alone DP agent.
# These values repeat the `mdp_params` given to the Simulator earlier on.
mdp_params = {
    "replacement_cost": 1.5,       # cost to replace the machine
    "failure_cost": 0.75,          # additional penalty for in-service failure
    "holding_cost_rate": 0.02,     # cost per unit of idle time
    "gamma": 0.999,                # discount factor
    "learning_rate": 0.0001,       # learning rate for the Adam optimizer
    "target_update_freq": 10,      # target-network update period (in iterations)
}

# Training hyperparameters
TRAINING_ITERATIONS = 100
DATASET_SIZE = 50_000              # transitions generated for the offline dataset
BATCH_SIZE = 256

# Last entry of the simulator's recorded centroids; handed to DPAgent as
# `u_hat` (presumably the most recent utility estimate -- TODO confirm).
u_hat = simulator.centroids[-1]


In [None]:
# Build a stand-alone DP agent from the estimate `u_hat`, the fitted
# degradation learner, the same customer generator used by the simulator, and
# the MDP parameters defined above.
dp_agent = DPAgent(
    d=D,
    u_hat=u_hat,
    degradation_learner=degradation_learner,
    customer_generator=customer_gen,
    params=mdp_params
)

In [None]:
# Train the agent
dp_agent.train(
    num_iterations=TRAINING_ITERATIONS,
    dataset_size=DATASET_SIZE,
    batch_size=BATCH_SIZE
)

In [None]:
# Retrieve the trained policy; the cells below call it with a state vector and
# use the returned value as an action id.
optimal_policy = dp_agent.get_policy()

In [None]:
# Test Case 1: Arrival State
# Query the policy at a hand-built arrival state assembled from rows i and i+1
# of the simulation DataFrame.
i = 50

X_i = df.loc[i, 'sum_of_contexts_after']
I_i = 3 # df.loc[i, '']  -- placeholder value: the source column was never filled in
x_i = df.loc[i+1, 'customer_context']
T_i = df.loc[i+1, 'rental_duration']

# State layout used here: X_i, then x_i, then [T_i, I_i, flag]; the trailing
# 0.0 is the value this notebook uses for arrival states (departure uses 1.0).
arrival_state = np.concatenate([
    X_i,
    x_i,
    [T_i, I_i, 0.0]
])
action_arrival = optimal_policy(arrival_state)
action_map = {0: 'Give Max Acceptable Price', 1: 'Shutdown'}
# Fall back to an explicit message instead of raising KeyError when the policy
# returns an action id that is not one of the two arrival actions listed above.
arrival_label = action_map.get(
    action_arrival, f"Unexpected action id {action_arrival!r}"
)
print(f"Sample Arrival State. Optimal Action: {arrival_label}")


In [None]:
# Test Case 2: Departure State
# Reuses X_i, x_i and I_i from the arrival-state cell. The context sum is
# advanced by x_i * 10 (an illustrative usage amount), the customer-context
# slot is zeroed, and the trailing 1.0 is the value used for departure states.
departure_state = np.concatenate([
    X_i+x_i*10, 
    np.zeros(D), 
    [0.0, I_i, 1.0]
])
action_departure = optimal_policy(departure_state)
action_map = {2: 'Replace Machine', 3: 'Do Not Replace'}
# Fall back to an explicit message instead of raising KeyError when the policy
# returns an action id that is not one of the two departure actions listed above.
departure_label = action_map.get(
    action_departure, f"Unexpected action id {action_departure!r}"
)
print(f"Sample Departure State. Optimal Action: {departure_label}")
