In [1]:
import pickle
import numpy as np
import pandas as pd
from scipy.optimize import minimize

from simulation import Simulator, CustomerGenerator
from hazard_models import ExponentialHazard
from utility_learner import ProjectedVolumeLearner, diam
from degradation_learner import DegradationLearner

from utils import unit_ball_rejection_sample, correct_signs
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

import logging
logging.basicConfig(level=logging.INFO)

In [2]:
# --- 2. Define Sampling Functions ---
# def context_sampler() -> np.ndarray:
#     """Samples a customer's context vector from a uniform distribution."""
#     return np.random.uniform(low=0.0, high=1.0, size=D)

def context_sampler() -> np.ndarray:
    """Draw one customer context vector with non-negative entries.

    Takes a draw from ``unit_ball_rejection_sample(D)`` and returns its
    element-wise absolute value, so every coordinate of the result is >= 0.
    ``D`` is a notebook-level global that is looked up at call time, so the
    configuration cell that assigns it must run before the first call.

    NOTE(review): presumably the helper samples uniformly from the
    D-dimensional unit ball, which would make this a sample from the
    non-negative orthant of that ball rather than from the whole ball --
    confirm against ``utils.unit_ball_rejection_sample``.
    """
    raw_draw = unit_ball_rejection_sample(D)
    return np.abs(raw_draw)

def rental_sampler(mean_duration: float = 20.0) -> float:
    """Sample a customer's desired rental duration from an exponential distribution.

    Args:
        mean_duration: Mean of the exponential distribution (NumPy's ``scale``
            argument). Defaults to 20.0, the value that used to be hard-coded
            here, so calling the function with no arguments behaves exactly
            as before.

    Returns:
        A single non-negative draw taken from NumPy's global random state.
    """
    return np.random.exponential(scale=mean_duration)

def interarrival_sampler(mean_gap: float = 5.0) -> float:
    """Sample the time until the next customer arrives.

    Args:
        mean_gap: Mean of the exponential distribution (NumPy's ``scale``
            argument). Defaults to 5.0, the value that used to be hard-coded
            here, so calling the function with no arguments behaves exactly
            as before.

    Returns:
        A single non-negative draw taken from NumPy's global random state.
    """
    return np.random.exponential(scale=mean_gap)

In [3]:
# --- 1. Simulation Configuration ---
# NOTE: context_sampler() (defined in the previous cell) reads D at call time,
# so D must be assigned here before UTILITY_TRUE is sampled below.
D = 5                                  # Dimension of context vectors
LAMBDA_VAL = 0.001                     # Baseline hazard constant (passed to ExponentialHazard)
NUM_CUSTOMERS = 2000                   # Total number of customers to simulate, i.e. T

# Set a random seed for reproducibility. This seeds NumPy's global RNG and must
# stay above the context_sampler() call below, which presumably draws from it.
np.random.seed(41)

# Ground truth vectors
THETA_TRUE = np.array([0.5, 0.2, 0.1, 0.3, 0.4])    # For degradation (five entries, matching D = 5)
UTILITY_TRUE = context_sampler()  # For customer's willingness to pay

# --- Machine's Pricing Vector 'r' ---
# You can change this to test different pricing strategies.
# Case 1: A non-zero pricing strategy
# PRICING_R = np.array([2.0, 2.0, 2.0, 2.0, 2.0])
# Case 2: Zero price (free rentals).
# NOTE(review): this case was described as guaranteeing 100% acceptance, but
# the run output saved in this notebook still contains 'price_rejection'
# events -- confirm how Simulator actually uses pricing_r.
PRICING_R = np.zeros(D)

In [None]:
# Hazard model handed to the Simulator as `usage_hazard_model`, built from the
# baseline hazard constant set in the configuration cell.
usage_exp_hazard_model = ExponentialHazard(lambda_val=LAMBDA_VAL)
# spontaneous_exp_hazard_model = None # ExponentialHazard(lambda_val=0.01)

# Customer stream: bundles the three sampling callbacks defined earlier in the
# notebook together with the context dimension.
customer_gen = CustomerGenerator(
    d=D,
    context_sampler=context_sampler,
    rental_sampler=rental_sampler,
    interarrival_sampler=interarrival_sampler
)

# Options forwarded to ProjectedVolumeLearner as `centroid_params`. The dict is
# empty, so the learner presumably falls back to its own defaults; the
# commented-out entries are kept as a reference for possible overrides
# (their exact meaning is defined in utility_learner -- confirm there).
centroid_params = {
    # 'num_samples': 2000,
    # 'thin': None,
    # 'burn_in': 500 * D ** 2,
    # 'tol': 1e-4,
    # 'rho_target': 0.01
}

def termination_rule(diameter, threshold=0.01):
    """Example custom termination rule for the ProjectedVolumeLearner.

    Args:
        diameter: Value to test. Presumably the diameter of the learner's
            current uncertainty set -- confirm against utility_learner.
        threshold: Strict upper bound below which the rule fires. Defaults to
            0.01, the constant previously hard-coded in the lambda.

    Returns:
        True when ``diameter`` is strictly smaller than ``threshold``.
    """
    return diameter < threshold

# Learner instance that is later passed to Simulator as
# `projected_volume_learner`; T is set to the full simulation horizon.
# Presumably this estimates the customer utility vector (it lives in the
# utility_learner module) -- confirm there.
projected_volume_learner = ProjectedVolumeLearner(
    T=NUM_CUSTOMERS, 
    d=D, 
    centroid_params=centroid_params,
    incentive_constant=1.1,  # NOTE(review): undocumented constant -- confirm its meaning in utility_learner
    termination_rule=termination_rule,
)

# Settings handed to the Simulator as `mdp_params`.
mdp_params = dict(
    replacement_cost=1.5,      # Cost to replace the machine
    failure_cost=0.75,         # Additional penalty for in-service failure
    holding_cost_rate=0.02,    # Cost per unit of idle time
    gamma=0.999,               # Discount factor
    learning_rate=1e-4,        # Learning rate for the Adam optimizer
    target_update_freq=10,     # How often to update the target network (in iterations)
)

# Settings handed to the Simulator as `training_hyperparams`.
training_hyperparams = dict(
    num_iterations=100,        # Number of training iterations per policy update
    dataset_size=50_000,       # Number of transitions to generate for the offline dataset
    batch_size=256,            # Batch size for training
)

# Instantiate the Simulator from the ground-truth vectors, models, learner and
# hyperparameter dicts configured above.
simulator = Simulator(
    d=D,
    T=NUM_CUSTOMERS,
    
    theta_true=THETA_TRUE,
    utility_true=UTILITY_TRUE,
    pricing_r=PRICING_R,
    
    usage_hazard_model=usage_exp_hazard_model,
    customer_generator=customer_gen,
    projected_volume_learner=projected_volume_learner,  # The learner instance configured above (not a default-constructed one)
    
    mdp_params=mdp_params,
    training_hyperparams=training_hyperparams,
    policy_update_threshold=5,  # NOTE(review): presumably controls how often the policy is re-trained -- confirm in simulation.py
)

In [18]:
# Current degradation-parameter estimate held by the simulator's learner.
# NOTE(review): the saved execution count of this cell (18) is later than that
# of the simulator.run(...) cell below (16), and the saved output equals the
# last "New theta_hat" logged by that run. On a fresh top-to-bottom run this
# cell executes before any simulation, so it will not reproduce the saved
# output -- consider moving it below the run cell.
simulator.degradation_learner.get_theta()

array([2.23416165, 0.        , 0.        , 0.16960622, 0.        ])

In [16]:
# Run the simulation for the full horizon and load the returned event records
# into a DataFrame (one row per event, see the `event_type` column).
# NOTE(review): the log saved below shows the FQI loop running "Iter 5/5"
# although training_hyperparams sets num_iterations=100, so the saved output
# appears to come from an earlier configuration -- re-run to refresh it.
simulation_data = simulator.run(num_customers=NUM_CUSTOMERS)
df = pd.DataFrame(simulation_data)

INFO:root:Starting simulation for 2000 customers...
  0%|          | 0/2000 [00:00<?, ?it/s]INFO:root:Updating optimal policy...


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            5     M =           10

At X0         5 variables are exactly at the bounds

At iterate    0    f=  2.07944D+00    |proj g|=  5.42796D-01

At iterate    1    f=  1.74439D+00    |proj g|=  3.22442D-01

At iterate    2    f=  1.35140D+00    |proj g|=  2.63416D-01

At iterate    3    f=  9.24682D-01    |proj g|=  2.19443D-01

At iterate    4    f=  3.11577D-01    |proj g|=  7.36006D-02

At iterate    5    f=  1.56087D-01    |proj g|=  3.57062D-02

At iterate    6    f=  7.32264D-02    |proj g|=  1.60069D-02

At iterate    7    f=  3.68068D-02    |proj g|=  7.84466D-03

At iterate    8    f=  1.83501D-02    |proj g|=  3.85328D-03
Using device: mps

At iterate    9    f=  9.23877D-03    |proj g|=  1.92177D-03

At iterate   10    f=  4.64293D-03    |proj g|=  9.59267D-04

At iterate   11    f=  2.33471D-03    |proj g|=  4.79929D-04

At iterate   12    f=  1.17269D-03    |proj g|=  2.40343D-04

At itera

100%|██████████| 512/512 [00:00<00:00, 11652.30it/s]



Starting FQI training loop...


Iter 5/5 | Loss: 2.0673 | Avg Q-Value: 0.28: 100%|██████████| 5/5 [00:00<00:00,  5.47it/s]
INFO:root:Policy updated. New theta_hat: [ 0.     7.988 28.807 34.041  0.   ]
  0%|          | 1/2000 [00:01<34:51,  1.05s/it]INFO:root:Updating optimal policy...



Training complete.
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            5     M =           10

At X0         5 variables are exactly at the bounds

At iterate    0    f=  1.01628D+01    |proj g|=  8.31880D+00

At iterate    1    f=  5.97322D+00    |proj g|=  1.93614D+00

At iterate    2    f=  5.09937D+00    |proj g|=  1.44370D+00

At iterate    3    f=  4.17034D+00    |proj g|=  6.92179D-01

At iterate    4    f=  3.91506D+00    |proj g|=  2.91733D-01

At iterate    5    f=  3.86228D+00    |proj g|=  1.26244D-01

At iterate    6    f=  3.84518D+00    |proj g|=  6.69278D-02

At iterate    7    f=  3.84191D+00    |proj g|=  5.43685D-02

At iterate    8    f=  3.84010D+00    |proj g|=  3.17990D-02

At iterate    9    f=  3.83932D+00    |proj g|=  1.16486D-02

At iterate   10    f=  3.83916D+00    |proj g|=  3.83514D-03

At iterate   11    f=  3.83914D+00    |proj g|=  4.54563D-04

At iterate   12    f=  3.83914D+00    |proj g|=  2.19510D-05

At ite

100%|██████████| 512/512 [00:00<00:00, 8559.16it/s]



Starting FQI training loop...


Iter 5/5 | Loss: 1.0115 | Avg Q-Value: 0.24: 100%|██████████| 5/5 [00:00<00:00,  5.95it/s]
INFO:root:Policy updated. New theta_hat: [2.737 0.    0.    0.175 0.   ]
  2%|▏         | 37/2000 [00:02<01:33, 21.05it/s]INFO:root:Updating optimal policy...



Training complete.
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            5     M =           10

At X0         5 variables are exactly at the bounds

At iterate    0    f=  3.15703D+01    |proj g|=  9.22022D+00

At iterate    1    f=  2.65293D+01    |proj g|=  2.29690D+00

At iterate    2    f=  2.54175D+01    |proj g|=  1.66166D+00

At iterate    3    f=  2.45466D+01    |proj g|=  6.68840D-01

At iterate    4    f=  2.43862D+01    |proj g|=  2.10580D-01

At iterate    5    f=  2.43591D+01    |proj g|=  9.87088D-02

At iterate    6    f=  2.43553D+01    |proj g|=  4.63981D-02

At iterate    7    f=  2.43544D+01    |proj g|=  1.52687D-02

At iterate    8    f=  2.43543D+01    |proj g|=  7.69952D-03

At iterate    9    f=  2.43543D+01    |proj g|=  4.58239D-04

At iterate   10    f=  2.43543D+01    |proj g|=  4.60765D-05
Using device: mps

At iterate   11    f=  2.43543D+01    |proj g|=  6.85197D-07

           * * *

Tit   = total number of iteratio

100%|██████████| 512/512 [00:00<00:00, 5643.05it/s]



Starting FQI training loop...


Iter 5/5 | Loss: 1.4653 | Avg Q-Value: -0.23: 100%|██████████| 5/5 [00:01<00:00,  4.94it/s]
INFO:root:Policy updated. New theta_hat: [2.234 0.    0.    0.17  0.   ]
 20%|█▉        | 391/2000 [00:03<00:09, 177.36it/s]


Training complete.


100%|██████████| 2000/2000 [00:04<00:00, 485.59it/s] 
INFO:root:Simulation finished.


In [17]:
# Preview the simulation event log (the saved output shows only the first and
# last rows of the frame).
df

Unnamed: 0,event_type,customer_id,calendar_time,observed_duration,feedback,profit
0,rental,1,0.794966,25.701155,0.0,-1.096922
1,rental,2,33.499376,10.797388,0.0,-0.921433
2,rental,3,44.327910,53.085224,0.0,-0.735886
3,rental,4,109.317337,6.289269,0.0,-0.601321
4,rental,5,131.011646,5.680947,0.0,-0.632659
...,...,...,...,...,...,...
2128,price_rejection,1996,14033.500283,,,0.000000
2129,price_rejection,1997,14035.996758,,,0.000000
2130,price_rejection,1998,14039.682561,,,0.000000
2131,price_rejection,1999,14041.112777,,,0.000000


In [None]:
# Mean and standard deviation of profit, restricted to 'rental' events.
rental_profit = df.loc[df.event_type == 'rental', 'profit']
rental_profit.mean(), rental_profit.std()

In [None]:
# Frequency of each feedback value. value_counts() skips NaN by default, so
# rows without feedback (e.g. the 'price_rejection' events) are not counted.
df['feedback'].value_counts()

## Estimating $\theta$

In [None]:
# Fit a fresh DegradationLearner on the simulated event log, starting from an
# all-zero initial theta, then display the resulting estimate.
# For reference, the ground truth set in the configuration cell is
# THETA_TRUE = [0.5, 0.2, 0.1, 0.3, 0.4].
degradation_learner = DegradationLearner(d=D, initial_theta=np.zeros(D))

degradation_learner.fit(df)
degradation_learner.get_theta()

In [None]:
from policy import DPAgent

In [None]:
# Settings for the standalone DPAgent built below.
# NOTE: these repeat the values of the `mdp_params` dict that was passed to the
# Simulator earlier in this notebook; keep the two in sync when editing.
mdp_params = dict(
    replacement_cost=1.5,      # Cost to replace the machine
    failure_cost=0.75,         # Additional penalty for in-service failure
    holding_cost_rate=0.02,    # Cost per unit of idle time
    gamma=0.999,               # Discount factor
    learning_rate=1e-4,        # Learning rate for the Adam optimizer
    target_update_freq=10,     # How often to update the target network (in iterations)
)

# Training hyperparameters (same values as the `training_hyperparams` dict
# passed to the Simulator earlier in this notebook).
BATCH_SIZE = 256
DATASET_SIZE = 50_000          # Number of transitions to generate for the offline dataset
TRAINING_ITERATIONS = 100

# Last entry of the simulator's `centroids` sequence, used as the utility
# estimate (u_hat) for the DPAgent built below.
# NOTE(review): indexing with [-1] fails if `simulator.centroids` is empty,
# presumably the case when the simulation has not been run yet -- confirm.
u_hat = simulator.centroids[-1]


In [None]:
# Build a standalone agent from the latest utility estimate (u_hat), the
# degradation learner fitted on the event log above, the customer generator
# that was also given to the Simulator, and the MDP settings.
dp_agent = DPAgent(
    d=D,
    u_hat=u_hat,
    degradation_learner=degradation_learner,
    customer_generator=customer_gen,
    params=mdp_params
)

In [None]:
# Train the agent
dp_agent.train(
    num_iterations=TRAINING_ITERATIONS,
    dataset_size=DATASET_SIZE,
    batch_size=BATCH_SIZE
)

In [None]:
# Policy callable obtained from the trained agent. The cells below call it with
# a single state vector and use the result as a key into an integer-keyed
# action map, so it presumably returns an integer action code -- confirm in policy.py.
optimal_policy = dp_agent.get_policy()

In [None]:
# Test Case 1: Arrival State
i = 50  # event-log row used as the reference; the arriving customer is read from row i + 1

# NOTE(review): the DataFrame preview saved earlier in this notebook lists only
# event_type, customer_id, calendar_time, observed_duration, feedback and
# profit -- confirm that the three columns read below exist in the current
# event-log schema.
X_i = df.loc[i, 'sum_of_contexts_after']
I_i = 3 # df.loc[i, '']  (placeholder constant: the source column name was never filled in)
x_i = df.loc[i+1, 'customer_context']
T_i = df.loc[i+1, 'rental_duration']

# State layout: X_i, then x_i, then [T_i, I_i, flag]; the flag is 0.0 here and
# 1.0 in the departure-state cell.
arrival_state = np.concatenate((X_i, x_i, [T_i, I_i, 0.0]))
action_arrival = optimal_policy(arrival_state)

# Action codes handled for an arrival state; any other code raises KeyError.
action_map = {0: 'Give Max Acceptable Price', 1: 'Shutdown'}
arrival_label = action_map[action_arrival]
print(f"Sample Arrival State. Optimal Action: {arrival_label}")


In [None]:
# Test Case 2: Departure State
# Reuses X_i, x_i and I_i from the arrival-state cell, so that cell must run
# first. The customer-context slot is zeroed, the duration slot is 0.0 and the
# trailing flag is 1.0 (it is 0.0 in the arrival state).
accumulated_context = X_i + x_i*10
departure_state = np.concatenate((
    accumulated_context,
    np.zeros(D),
    [0.0, I_i, 1.0],
))
action_departure = optimal_policy(departure_state)

# Action codes handled for a departure state. This rebinds the `action_map`
# name used by the arrival-state cell; any other code raises KeyError.
action_map = {2: 'Replace Machine', 3: 'Do Not Replace'}
departure_label = action_map[action_departure]
print(f"Sample Departure State. Optimal Action: {departure_label}")
