In [35]:
%matplotlib inline

In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import random
import gymnasium as gym
from gymnasium import Env, spaces
from scipy.special import comb
from collections import defaultdict, deque

In [37]:
with open('pickles/anonbank_lognorm_params.pkl', 'rb') as f:
    lognorm_params_df = pickle.load(f)
    
lognorm_params_dict = lognorm_params_df.set_index('param_name').T.to_dict()

lognorm_params_dict

{('1', 'REGULAR'): {'mu': 4.832017135463374,
  'sigma': 0.006958469079704907,
  'loc': -18.81216413263904},
 ('1', 'SPECIFIC'): {'mu': 4.6755415208149325,
  'sigma': 0.011323646523065594,
  'loc': -6.43006717763892},
 ('2', 'REGULAR'): {'mu': 5.0125526231624855,
  'sigma': 0.005548307071802354,
  'loc': -8.97654648869145},
 ('2', 'SPECIFIC'): {'mu': 4.617186601308905,
  'sigma': 0.010134016947128502,
  'loc': -4.806412598886724},
 'REGULAR': {'mu': 3.7589339214345716,
  'sigma': 0.024480564633599006,
  'loc': -2.846758403840244},
 'SPECIFIC': {'mu': 4.650110366782741,
  'sigma': 0.01083819523964822,
  'loc': -5.260938954957395},
 'REGULAR_x4': {'mu': 5.145228282554465,
  'sigma': 0.006121000886257698,
  'loc': -11.387033615361277},
 'SPECIFIC_x4': {'mu': 6.036404727902634,
  'sigma': 0.002709623408600959,
  'loc': -21.043755819830228}}

In [38]:
with open('pickles/anonbank_service_rate.pkl', 'rb') as f:
    service_rate = pickle.load(f)

# Convert service_rate to a dictionary for easier access
ser_rate_dict = service_rate.set_index(['group', 'type_group'])[['mean_service_time', 'std']].to_dict('index')

ser_rate_dict

{('1', 'REGULAR'): {'mean_service_time': 12.641068214208262,
  'std': 274.09685054233046},
 ('1', 'SPECIFIC'): {'mean_service_time': 14.229164478664519,
  'std': 447.916748807343},
 ('2', 'REGULAR'): {'mean_service_time': 19.02417781821472,
  'std': 344.9407640049321},
 ('2', 'SPECIFIC'): {'mean_service_time': 15.020757462895316,
  'std': 257.32365209990894}}

In [39]:

# Global variables

# Queue size
MAX_QUEUE_SIZE = 20

# Discount factor for future rewards
DISCOUNT = 0.95

# Penalty for dropping calls (when queue is full)
DROP_PENALTY = 3600.0 # 1 hour in seconds

# State space: (queue 1 size, queue 2 size, call type)
STATE_SPACE = (MAX_QUEUE_SIZE + 1, MAX_QUEUE_SIZE + 1, 2)

# Call type: 0 = regular, 1 = specific
CALL_TYPE = [0, 1]

# Actions: 0 = route to queue 1, 1 = route to queue 2
ACTIONS = [1, 2]

# Arrival parameters for regular and specific calls
ARRIVAL_REGULAR_MU = lognorm_params_dict['REGULAR']['mu']
ARRIVAL_REGULAR_SIGMA = lognorm_params_dict['REGULAR']['sigma']
ARRIVAL_REGULAR_LOC = lognorm_params_dict['REGULAR']['loc']
ARRIVAL_SPECIFIC_MU = lognorm_params_dict['SPECIFIC']['mu']
ARRIVAL_SPECIFIC_SIGMA = lognorm_params_dict['SPECIFIC']['sigma']
ARRIVAL_SPECIFIC_LOC = lognorm_params_dict['SPECIFIC']['loc']

SERVICE_REGULAR_1_MU = lognorm_params_dict[('1', 'REGULAR')]['mu']
SERVICE_REGULAR_1_SIGMA = lognorm_params_dict[('1', 'REGULAR')]['sigma']
SERVICE_REGULAR_1_LOC = lognorm_params_dict[('1', 'REGULAR')]['loc']
SERVICE_SPECIFIC_1_MU = lognorm_params_dict[('1', 'SPECIFIC')]['mu']
SERVICE_SPECIFIC_1_SIGMA = lognorm_params_dict[('1', 'SPECIFIC')]['sigma']
SERVICE_SPECIFIC_1_LOC = lognorm_params_dict[('1', 'SPECIFIC')]['loc']

SERVICE_REGULAR_2_MU = lognorm_params_dict[('2', 'REGULAR')]['mu']
SERVICE_REGULAR_2_SIGMA = lognorm_params_dict[('2', 'REGULAR')]['sigma']
SERVICE_REGULAR_2_LOC = lognorm_params_dict[('2', 'REGULAR')]['loc']
SERVICE_SPECIFIC_2_MU = lognorm_params_dict[('2', 'SPECIFIC')]['mu']
SERVICE_SPECIFIC_2_SIGMA = lognorm_params_dict[('2', 'SPECIFIC')]['sigma']
SERVICE_SPECIFIC_2_LOC = lognorm_params_dict[('2', 'SPECIFIC')]['loc']

def mean_shifted_lognormal(mu, sigma, loc):
    """
    Calculate mean of shifted lognormal distribution:
    E[X] = exp(mu + sigma^2 / 2) + loc
    """
    return np.exp(mu + (sigma**2) / 2) + loc

def std_shifted_lognormal(mu, sigma, loc):
    """
    Calculate standard deviation of shifted lognormal distribution:
    The location (loc) shifts the mean but does not affect the std.
    """
    return np.sqrt((np.exp(sigma**2) - 1) * np.exp(2*mu + sigma**2))

# Arrival rates

ARRIVAL_REGULAR = mean_shifted_lognormal(
    ARRIVAL_REGULAR_MU,
    ARRIVAL_REGULAR_SIGMA,
    ARRIVAL_REGULAR_LOC
)

ARRIVAL_SPECIFIC = mean_shifted_lognormal(
    ARRIVAL_SPECIFIC_MU,
    ARRIVAL_SPECIFIC_SIGMA,
    ARRIVAL_SPECIFIC_LOC
)

ARRIVAL_REGULAR_STD = std_shifted_lognormal(
    ARRIVAL_REGULAR_MU,
    ARRIVAL_REGULAR_SIGMA,
    ARRIVAL_REGULAR_LOC   
)

ARRIVAL_SPECIFIC_STD = std_shifted_lognormal(
    ARRIVAL_SPECIFIC_MU,
    ARRIVAL_SPECIFIC_SIGMA,
    ARRIVAL_SPECIFIC_LOC
)

# Service means and std

SERVICE_REGULAR_1 = ser_rate_dict[('1', 'REGULAR')]['mean_service_time']
SERVICE_SPECIFIC_1 = ser_rate_dict[('1', 'SPECIFIC')]['mean_service_time']
SERVICE_REGULAR_2 = ser_rate_dict[('2', 'REGULAR')]['mean_service_time']
SERVICE_SPECIFIC_2 = ser_rate_dict[('2', 'SPECIFIC')]['mean_service_time']

SERVICE_REGULAR_1_STD = ser_rate_dict[('1', 'REGULAR')]['std']
SERVICE_SPECIFIC_1_STD = ser_rate_dict[('1', 'SPECIFIC')]['std']
SERVICE_REGULAR_2_STD = ser_rate_dict[('2', 'REGULAR')]['std']
SERVICE_SPECIFIC_2_STD = ser_rate_dict[('2', 'SPECIFIC')]['std']

# SERVICE_REGULAR_1 = mean_shifted_lognormal(
#     SERVICE_REGULAR_1_MU,
#     SERVICE_REGULAR_1_SIGMA,
#     SERVICE_REGULAR_1_LOC
# )

# SERVICE_REGULAR_2 = mean_shifted_lognormal(
#     SERVICE_REGULAR_2_MU,
#     SERVICE_REGULAR_2_SIGMA,
#     SERVICE_REGULAR_2_LOC
# )

# SERVICE_SPECIFIC_1 = mean_shifted_lognormal(
#     SERVICE_SPECIFIC_1_MU,
#     SERVICE_SPECIFIC_1_SIGMA,
#     SERVICE_SPECIFIC_1_LOC
# )

# SERVICE_SPECIFIC_2 = mean_shifted_lognormal(
#     SERVICE_SPECIFIC_2_MU,
#     SERVICE_SPECIFIC_2_SIGMA,
#     SERVICE_SPECIFIC_2_LOC
# )

In [40]:
print("MEANS:")

print(f"ARRIVAL_REGULAR: {ARRIVAL_REGULAR}, ARRIVAL_SPECIFIC: {ARRIVAL_SPECIFIC}")
print(f"SERVICE_REGULAR_1: {SERVICE_REGULAR_1}, SERVICE_SPECIFIC_1: {SERVICE_SPECIFIC_1}")
print(f"SERVICE_REGULAR_2: {SERVICE_REGULAR_2}, SERVICE_SPECIFIC_2: {SERVICE_SPECIFIC_2}") 

print("STD:")

print(f"ARRIVAL_REGULAR: {ARRIVAL_REGULAR_STD}, ARRIVAL_SPECIFIC: {ARRIVAL_SPECIFIC_STD}")
print(f"SERVICE_REGULAR_1: {SERVICE_REGULAR_1_STD}, SERVICE_SPECIFIC_1: {SERVICE_SPECIFIC_1_STD}")
print(f"SERVICE_REGULAR_2: {SERVICE_REGULAR_2_STD}, SERVICE_SPECIFIC_2: {SERVICE_SPECIFIC_2_STD}")

MEANS:
ARRIVAL_REGULAR: 40.068763243462236, ARRIVAL_SPECIFIC: 99.34173344076834
SERVICE_REGULAR_1: 12.641068214208262, SERVICE_SPECIFIC_1: 14.229164478664519
SERVICE_REGULAR_2: 19.02417781821472, SERVICE_SPECIFIC_2: 15.020757462895316
STD:
ARRIVAL_REGULAR: 1.050753626188135, ARRIVAL_SPECIFIC: 1.1337374798875093
SERVICE_REGULAR_1: 274.09685054233046, SERVICE_SPECIFIC_1: 447.916748807343
SERVICE_REGULAR_2: 344.9407640049321, SERVICE_SPECIFIC_2: 257.32365209990894


In [41]:
# Lognormal expected value calculation

def expected_waiting_time_binom_kingman(q_size, p_regular, mean_r, mean_s, rho, ca, cs):
    """
    Calculate expected waiting time for a queue with q_size calls,
    each call being regular with probability p_regular,
    using direct mean service times for regular and specific calls.
    """
    # Expected waiting time
    EWq = 0.0
    for k in range(q_size + 1):
        # Binomial probability of k regular calls in the queue
        prob = comb(q_size, k) * (p_regular ** k) * ((1 - p_regular) ** (q_size - k))
        # Waiting time if k regular and (q_size - k) specific ahead
        wait_k = k * mean_r + (q_size - k) * mean_s
        EWq += prob * wait_k
        
    # Kingman scaling
    Km = rho / (1 - rho) * (ca ** 2 + cs ** 2) / 2

    return EWq * Km

def expected_waiting_time_binom(q_size, p_regular, mean_r, mean_s):
    """
    Calculate expected waiting time for a queue with q_size calls,
    each call being regular with probability p_regular,
    using direct mean service times for regular and specific calls.
    """
    # Expected waiting time
    wait = 0.0
    for k in range(q_size + 1):
        # Binomial probability of k regular calls in the queue
        prob = comb(q_size, k) * (p_regular ** k) * ((1 - p_regular) ** (q_size - k))
        # Waiting time if k regular and (q_size - k) specific ahead
        wait_k = k * mean_r + (q_size - k) * mean_s
        wait += prob * wait_k

    return wait

def rho(rate_regular, rate_specific, mean_r, mean_s):
    """
    Calculate the traffic intensity (rho) for the queue.
    """
    rate_total = rate_regular + rate_specific
    mean_comb = (rate_regular / rate_total * mean_r) + (rate_specific / rate_total * mean_s)
    
    rhoo = rate_total * mean_comb
    if rhoo >= 1:
        return float('inf')  # Avoid division by zero or negative values
    return rhoo

def cv2(var, mean):
    return var / (mean ** 2)

def combined_ca2(rate_regular, rate_specific, ca_regular, ca_specific):
    """
    Calculate the combined coefficient of variation squared for the arrival process.
    """
    rate_total = rate_regular + rate_specific
    
    return (rate_regular / rate_total) * ca_regular + (rate_specific / rate_total) * ca_specific

def combined_cs2(mean_regular, mean_specific, cs_regular, cs_specific):
    """
    Calculate the combined coefficient of variation squared for the service process.
    """
    rate_regular = 1 / mean_regular
    rate_specific = 1 / mean_specific
    rate_total = rate_regular + rate_specific
    mean_comb = (rate_regular / rate_total * mean_regular) + (rate_specific / rate_total * mean_specific)
    
    term1 = (rate_regular / rate_total) * cs_regular
    term2 = (rate_specific / rate_total) * cs_specific
    term3 = rate_regular / rate_total * ((mean_regular - mean_comb) / mean_comb) ** 2
    term4 = rate_specific / rate_total * ((mean_specific - mean_comb) / mean_comb) ** 2
    
    return term1 + term2 + term3 + term4

In [42]:
# Example usage to calculate expected waiting time for queue 1:

rate_regular = 1 / ARRIVAL_REGULAR
rate_specific = 1 / ARRIVAL_SPECIFIC
rate_total = rate_regular + rate_specific

p_regular = rate_regular / rate_total

mean_r1 = SERVICE_REGULAR_1
mean_s1 = SERVICE_SPECIFIC_1

rho1 = rho(rate_regular, rate_specific, mean_r1, mean_s1)

meana_r = ARRIVAL_REGULAR
meana_s = ARRIVAL_SPECIFIC
vara_r = ARRIVAL_REGULAR_STD ** 2
vara_s = ARRIVAL_SPECIFIC_STD ** 2

ca_r = cv2(vara_r, meana_r)
ca_s = cv2(vara_s, meana_s)

vars_r1 = SERVICE_REGULAR_1_STD ** 2
vars_s1 = SERVICE_SPECIFIC_1_STD ** 2

cs_r1 = cv2(vars_r1, mean_r1)
cs_s1 = cv2(vars_s1, mean_s1)

ca = combined_ca2(rate_regular, rate_specific, ca_r, ca_s)
cs_1 = combined_cs2(mean_r1, mean_s1, cs_r1, cs_s1)

q_size = 10 # Example queue size

expected_wait = expected_waiting_time_binom_kingman(q_size, p_regular, mean_r1, mean_s1, rho1, ca, cs_1)
print(f"Expected waiting time at queue 1 (Kingman): {expected_wait:.4f}")

# Compare with queue 2

mean_r2 = SERVICE_REGULAR_2
mean_s2 = SERVICE_SPECIFIC_2

rho2 = rho(rate_regular, rate_specific, mean_r2, mean_s2)

vars_r2 = SERVICE_REGULAR_2_STD ** 2
vars_s2 = SERVICE_SPECIFIC_2_STD ** 2

cs_r2 = cv2(vars_r2, mean_r2)
cs_s2 = cv2(vars_s2, mean_s2)

cs_2 = combined_cs2(mean_r2, mean_s2, cs_r2, cs_s2)

expected_wait2 = expected_waiting_time_binom_kingman(q_size, p_regular, mean_r2, mean_s2, rho2, ca, cs_2)
print(f"Expected waiting time at queue 2 (Kingman): {expected_wait2:.4f}")

Expected waiting time at queue 1 (Kingman): 28384089.2543
Expected waiting time at queue 2 (Kingman): 14287191.6796


In [43]:
expected_wait = expected_waiting_time_binom(q_size, p_regular, mean_r1, mean_s1)
print(f"Expected waiting time at queue 1 (Binom): {expected_wait:.4f}")
expected_wait2 = expected_waiting_time_binom(q_size, p_regular, mean_r2, mean_s2)
print(f"Expected waiting time at queue 2 (Binom): {expected_wait2:.4f}")

Expected waiting time at queue 1 (Binom): 130.9751
Expected waiting time at queue 2 (Binom): 178.7353


In [44]:
# Calculate departure probability
r1_rate = 1 / mean_r1
r2_rate = 1 / mean_r2
s1_rate = 1 / mean_s1
s2_rate = 1 / mean_s2

total_rate = rate_regular + rate_specific + r1_rate + r2_rate + s1_rate + s2_rate

print(f"Departure probability for REGULAR, Queue 1: {r1_rate / total_rate:.4f}")
print(f"Departure probability for REGULAR, Queue 2: {r2_rate / total_rate:.4f}")
print(f"Departure probability for SPECIFIC, Queue 1: {s1_rate / total_rate:.4f}")
print(f"Departure probability for SPECIFIC, Queue 2: {s2_rate / total_rate:.4f}")

Departure probability for REGULAR, Queue 1: 0.2606
Departure probability for REGULAR, Queue 2: 0.1732
Departure probability for SPECIFIC, Queue 1: 0.2315
Departure probability for SPECIFIC, Queue 2: 0.2193


### Use binomial expected waiting time

In [45]:
def plot_results(costs, Q_table, max_queue_size):
    """Plot learning curve and learned policies."""
    
    # Plot learning curve
    plt.figure(figsize=(15, 5))
    
    plt.subplot(1, 3, 1)
    plt.plot(costs, alpha=0.3, label="Episode Cost")
    window = 100
    if len(costs) > window:
        smoothed = pd.Series(costs).rolling(window).mean()
        plt.plot(smoothed, color='red', linewidth=2, label=f"Smoothed (window={window})")
    plt.title("Q-Learning: Cost vs Episodes")
    plt.xlabel("Episode")
    plt.ylabel("Cost")
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    # Plot learned policies
    policy_regular = np.zeros((max_queue_size + 1, max_queue_size + 1), dtype=int)
    policy_specific = np.zeros((max_queue_size + 1, max_queue_size + 1), dtype=int)

    for q1 in range(max_queue_size + 1):
        for q2 in range(max_queue_size + 1):
            # Regular call policy
            s_regular = (q1, q2, 0)
            if s_regular in Q_table:
                policy_regular[q1, q2] = np.argmax(Q_table[s_regular])
            
            # Specific call policy
            s_specific = (q1, q2, 1)
            if s_specific in Q_table:
                policy_specific[q1, q2] = np.argmax(Q_table[s_specific])

    # Plot policies
    plt.subplot(1, 3, 2)
    ax = sns.heatmap(np.flipud(policy_regular), cmap="RdYlBu", annot=True, 
                     cbar_kws={'label': '0=Queue1, 1=Queue2'})
    plt.title('Learned Policy: Regular Calls')
    plt.ylabel('Queue 1 Length')
    plt.xlabel('Queue 2 Length')
    ax.invert_yaxis()
    
    plt.subplot(1, 3, 3)
    ax = sns.heatmap(np.flipud(policy_specific), cmap="RdYlBu", annot=True,
                cbar_kws={'label': '0=Queue1, 1=Queue2'})
    plt.title('Learned Policy: Specific Calls')
    plt.ylabel('Queue 1 Length') 
    plt.xlabel('Queue 2 Length')
    ax.invert_yaxis()
    
    plt.tight_layout()
    plt.show()

In [46]:
class EventBasedCallCentreEnv(gym.Env):
    """
    Event-based RL environment for a two-queue call centre.
    Each step represents a call arrival requiring a routing decision.
    State: (queue1_length, queue2_length, call_type)
    Action: 0 = route to queue 1, 1 = route to queue 2
    Reward: Negative expected waiting time (cost to minimize)
    """
    
    def __init__(self, 
                 max_queue_size=10,
                 drop_penalty=1000.0,
                 arr_scaler = 1.0,
                 arrival_regular_mean=None,
                 arrival_specific_mean=None,
                 service_regular_1=None,
                 service_specific_1=None,
                 service_regular_2=None,
                 service_specific_2=None,
                 seed=None):
        """
        Initialize the event-based call centre environment.
        """
        super().__init__()
        
        self.max_queue_size = max_queue_size
        self.drop_penalty = drop_penalty
        self.arr_scaler = arr_scaler
        
        # Use provided parameters or defaults from your original code
        self.arrival_params = {
            'regular': {
                'mean': arrival_regular_mean / self.arr_scaler,  # Default values
                'mu': ARRIVAL_REGULAR_MU,      # These would come from your lognorm_params
                'sigma': ARRIVAL_REGULAR_SIGMA,
                'loc': ARRIVAL_REGULAR_LOC
            },
            'specific': {
                'mean': arrival_specific_mean / self.arr_scaler,
                'mu': ARRIVAL_SPECIFIC_MU,
                'sigma': ARRIVAL_SPECIFIC_SIGMA,
                'loc': ARRIVAL_SPECIFIC_LOC
            }
        }
        
        # Service parameters
        self.service_times = {
            (1, 'regular'): service_regular_1,
            (1, 'specific'): service_specific_1,
            (2, 'regular'): service_regular_2,
            (2, 'specific'): service_specific_2,
        }
        
        # Calculate arrival rates and probabilities
        arrival_regular_rate = 1.0 / self.arrival_params['regular']['mean']
        arrival_specific_rate = 1.0 / self.arrival_params['specific']['mean']
        self.total_arrival_rate = arrival_regular_rate + arrival_specific_rate
        self.p_regular = arrival_regular_rate / self.total_arrival_rate
        
        # Define observation and action spaces
        self.observation_space = spaces.Box(
            low=np.array([0, 0, 0]), 
            high=np.array([max_queue_size, max_queue_size, 1]), 
            dtype=np.int32
        )
        self.action_space = spaces.Discrete(2)
        
        # Initialize state
        self.queue_lengths = [0, 0]
        self.current_time = 0.0
        self.episode_step = 0
        
        self.queue_contents = [[], []]  # Contents of each queue (list of tuples: (call_type, arrival_time))
        
        if seed is not None:
            self.seed(seed)
            
        # Initialize next arrival times for each call type
        self.next_regular_arrival = 0.0
        self.next_specific_arrival = 0.0
        
        self.current_call_type, _ = self._get_next_call_info()
        
        # Generate initial arrival times
        self._generate_next_arrivals()
        
    def seed(self, seed=None):
        """Set random seed for reproducibility."""
        if seed is not None:
            np.random.seed(seed)
            random.seed(seed)
            self.np_random = np.random.RandomState(seed)
    
    def _generate_lognormal_interarrival(self, call_type):
        """Generate next inter-arrival time using lognormal distribution."""
        params = self.arrival_params[call_type]
        
        # Generate from lognormal distribution
        sample = np.random.lognormal(params['mu'], params['sigma'])
        # Add location parameter (shift)
        interarrival_time = sample + params['loc']
        # Scale the interarrival
        interarrival_time /= self.arr_scaler
        
        return max(interarrival_time, 0.1)
    
    def _generate_next_arrivals(self):
        """Generate next arrival times for both call types."""
        # Always generate next regular arrival if current one is used/passed
        if self.next_regular_arrival <= self.current_time or self.next_regular_arrival == float('inf'):
            regular_interarrival = self._generate_lognormal_interarrival('regular')
            self.next_regular_arrival = self.current_time + regular_interarrival
        
        # Always generate next specific arrival if current one is used/passed  
        if self.next_specific_arrival <= self.current_time or self.next_specific_arrival == float('inf'):
            specific_interarrival = self._generate_lognormal_interarrival('specific')
            self.next_specific_arrival = self.current_time + specific_interarrival

    def _get_next_call_info(self):
        """Determine which call arrives next and when."""
        # Ensure we have valid next arrival times
        self._generate_next_arrivals()
        
        if self.next_regular_arrival <= self.next_specific_arrival:
            # Regular call arrives next
            elapsed_time = self.next_regular_arrival - self.current_time
            call_type = 0  # regular
            # Mark this arrival as used
            self.next_regular_arrival = float('inf')
        else:
            # Specific call arrives next
            elapsed_time = self.next_specific_arrival - self.current_time
            call_type = 1  # specific
            # Mark this arrival as used
            self.next_specific_arrival = float('inf')
        
        return call_type, elapsed_time

    
    def _process_service_completions(self, elapsed_time):
        """
        Process service completions for two independent servers.
        Each queue has its own server with different service times.
        """
        for queue_idx in range(2):  # Two independent servers
            if len(self.queue_contents[queue_idx]) == 0:
                continue
                
            # Process the first call in each queue (FIFO within each server)
            call_type, arrival_time = self.queue_contents[queue_idx][0]
            call_type_str = 'regular' if call_type == 0 else 'specific'
            
            # Time this call has been in the system
            time_in_system = self.current_time - arrival_time + elapsed_time
            
            # Get service time for THIS specific server
            mean_service_time = self.service_times[(queue_idx + 1, call_type_str)]
            
            # Exponential service completion check
            service_rate = 1.0 / mean_service_time
            completion_prob = 1.0 - np.exp(-service_rate * elapsed_time)  # Use elapsed_time, not cumulative
            
            if np.random.random() < completion_prob:
                # Call completed at this server
                self.queue_contents[queue_idx].pop(0)
                self.queue_lengths[queue_idx] = len(self.queue_contents[queue_idx])
    
    def expected_waiting_time_binomial(self, queue_idx, call_type):
        """
        Calculate expected waiting time using binomial mixture of call types.
        This is the same calculation as in your original code.
        """
        queue_size = self.queue_lengths[queue_idx]
        
        if queue_size == 0:
            return 0.0
        
        # Service times for this queue
        mean_regular = self.service_times[(queue_idx + 1, 'regular')]
        mean_specific = self.service_times[(queue_idx + 1, 'specific')]
        
        # Calculate expected waiting time using binomial mixture
        expected_wait = 0.0
        for k in range(queue_size + 1):
            prob = comb(queue_size, k) * (self.p_regular ** k) * ((1 - self.p_regular) ** (queue_size - k))
            wait_time = k * mean_regular + (queue_size - k) * mean_specific
            expected_wait += prob * wait_time
        
        return expected_wait
    
    def reset(self, seed=None, options=None):
        """Reset environment to initial state."""
        if seed is not None:
            self.seed(seed)
        
        self.queue_lengths = [0, 0]
        self.queue_contents = [[], []]
        self.current_time = 0.0
        self.episode_step = 0
        
        # Reset arrival times
        self.next_regular_arrival = 0.0
        self.next_specific_arrival = 0.0
        self._generate_next_arrivals()
        
        self.current_call_type, _ = self._get_next_call_info()
        
        return self._get_state(self.current_call_type), {}
    
    def _get_state(self, call_type):
        """Get current state as numpy array."""
        return np.array([
            self.queue_lengths[0], 
            self.queue_lengths[1], 
            call_type
        ], dtype=np.int32)
    
    def step(self, action):
        """Execute action and return next state, reward, done, info."""
        # Use the current call that was determined in reset() or previous step
        call_type = self.current_call_type
        call_type_str = 'regular' if call_type == 0 else 'specific'
        
        # Get the elapsed time for this call
        _, elapsed_time = self._get_current_call_timing()
        
        # Update current time
        self.current_time += elapsed_time
        
        # Process service completions during this inter-arrival time
        self._process_service_completions(elapsed_time)
        
        reward = 0.0
        dropped = False
        expected_wait = 0.0
        
        # Check if queue is full
        if self.queue_lengths[action] >= self.max_queue_size:
            reward = -self.drop_penalty
            dropped = True
            expected_wait = None
        else:
            # Calculate expected waiting time for this call
            expected_wait = self.expected_waiting_time_binomial(action, call_type)
            reward = -expected_wait
            # Add call to chosen queue
            self.queue_contents[action].append((call_type, self.current_time))
            self.queue_lengths[action] += 1
        
        # Update step counter
        self.episode_step += 1
        
        # NOW generate the NEXT call for the next step
        self.current_call_type, _ = self._get_next_call_info()
        
        # Determine if episode should continue
        terminated = False
        truncated = self.episode_step >= 1000  # Max episode length
        
        info = {
            'queue_1_length': self.queue_lengths[0],
            'queue_2_length': self.queue_lengths[1],
            'call_type': call_type_str,
            'chosen_queue': action + 1 if not dropped else None,
            'expected_wait': expected_wait,
            'dropped': dropped,
            'reward': reward,
            'current_time': self.current_time,
            'episode_step': self.episode_step,
            'elapsed_time': elapsed_time,
            'next_call_type': 'regular' if self.current_call_type == 0 else 'specific',
            'queue_1_contents': len(self.queue_contents[0]),
            'queue_2_contents': len(self.queue_contents[1]),
            'next_regular_arrival': self.next_regular_arrival,
            'next_specific_arrival': self.next_specific_arrival
        }
        
        return self._get_state(self.current_call_type), reward, terminated, truncated, info
    
    def _get_current_call_timing(self):
        """Get timing for the current call without consuming arrival streams."""
        self._generate_next_arrivals()
        
        if self.current_call_type == 0:  # regular
            elapsed_time = self.next_regular_arrival - self.current_time
            # Mark this arrival as used
            self.next_regular_arrival = float('inf')
        else:  # specific
            elapsed_time = self.next_specific_arrival - self.current_time
            # Mark this arrival as used
            self.next_specific_arrival = float('inf')
        
        return self.current_call_type, max(elapsed_time, 0.01)
    
    def render(self, mode='human'):
        """Render current state."""
        if hasattr(self, '_current_call_type'):
            call_type_str = 'Regular' if self._current_call_type == 0 else 'Specific'
        else:
            call_type_str = 'Unknown'
        
        print(f"Step {self.episode_step} | Time: {self.current_time:.1f}s | Queue 1: {self.queue_lengths[0]:2d} | Queue 2: {self.queue_lengths[1]:2d} | Current Call: {call_type_str}")
    
    def get_state_info(self):
        """Get detailed state information for analysis."""
        call_type = self.current_call_type
        call_type_str = 'Regular' if call_type == 0 else 'Specific'
        
        wait_q1 = self.expected_waiting_time_binomial(0, call_type)
        wait_q2 = self.expected_waiting_time_binomial(1, call_type)
        
        return {
            'queue_1_length': self.queue_lengths[0],
            'queue_2_length': self.queue_lengths[1],
            'call_type': call_type_str,
            'p_regular': self.p_regular,
            'expected_wait_q1': wait_q1,
            'expected_wait_q2': wait_q2,
            'optimal_choice': 1 if wait_q1 <= wait_q2 else 2,
            'current_time': self.current_time,
            'episode_step': self.episode_step
        }


In [47]:
# Testing the environment
# if __name__ == "__main__":
#     print("Call Centre MDP Environment")
    
#     # Set parameters for the environment
#     max_queue_size = MAX_QUEUE_SIZE
#     drop_penalty = DROP_PENALTY
#     scaler = 10.0

#     # Arrival parameters
#     arrival_means = {
#         0: ARRIVAL_REGULAR,    # Regular calls
#         1: ARRIVAL_SPECIFIC    # Specific calls
#     }
    
#     # Service parameters
#     service_means = {
#         (1, 0): SERVICE_REGULAR_1,
#         (1, 1): SERVICE_SPECIFIC_1,
#         (2, 0): SERVICE_REGULAR_2,
#         (2, 1): SERVICE_SPECIFIC_2
#     }

#     print(f"Regular call probability: {p_regular:.3f}")
#     print(f"Service times (mean): Q1 Regular = {service_means[(1,0)]:.1f} s, Specific = {service_means[(1,1)]:.1f} s")
#     print(f"                     Q2 Regular = {service_means[(2,0)]:.1f} s, Specific = {service_means[(2,1)]:.1f} s")
#     print(f"Arrival times (mean inter-arrival time): Regular = {arrival_means[0] / scaler:.1f} s, Specific = {arrival_means[1] / scaler:.1f} s")
#     print("="*60)

#     # Set random seeds for reproducibility
#     random.seed(1901448)
#     np.random.seed(1901448)
    
#     # Create environment with your original parameters
#     env = EventBasedCallCentreEnv(
#         max_queue_size=max_queue_size,
#         drop_penalty=drop_penalty,
#         arr_scaler=scaler,
#         arrival_regular_mean=arrival_means[0],  # Your ARRIVAL_REGULAR
#         arrival_specific_mean=arrival_means[1],  # Your ARRIVAL_SPECIFIC
#         service_regular_1=service_means[(1, 0)],     # Your SERVICE_REGULAR_1
#         service_specific_1=service_means[(1, 1)],    # Your SERVICE_SPECIFIC_1
#         service_regular_2=service_means[(2, 0)],     # Your SERVICE_REGULAR_2
#         service_specific_2=service_means[(2, 1)],    # Your SERVICE_SPECIFIC_2
#         seed=1901448
#     )
    
#     print("Testing Event-Based Call Centre Environment")
#     print("=" * 50)
    
#     # Test environment
#     state, _ = env.reset()
#     print(f"Initial state: Queue1={state[0]}, Queue2={state[1]}, CallType={'Regular' if state[2]==0 else 'Specific'}")
    
#     # Run a few test steps
#     total_reward = 0
#     for step in range(1000):
#         # Get state information
#         info = env.get_state_info()
        
#         # Choose action (0 = Queue 1, 1 = Queue 2)
#         # Simple policy: choose queue with shorter expected wait
#         if info.get('expected_wait_q1', float('inf')) <= info.get('expected_wait_q2', float('inf')):
#             action = 0
#         else:
#             action = 1
        
#         # Take step
#         next_state, reward, terminated, truncated, step_info = env.step(action)
#         total_reward += reward
        
#         print(f"Step {step+1}:")
#         print(f"  Action: Queue {action+1}")
#         print(f"  Call Type: {step_info['call_type']}")
#         print(f"  Expected Wait: {step_info['expected_wait']:.1f}s" if step_info['expected_wait'] is not None else "  DROPPED")
#         print(f"  New State: Queue1={next_state[0]}, Queue2={next_state[1]}")
#         print(f"  Elapsed Time: {step_info['elapsed_time']:.1f}s")
#         print(f"  Reward: {reward:.1f}")
#         print()
        
#         if terminated or truncated:
#             break
    
#     print(f"Total Reward: {total_reward:.1f}")
#     print(f"Final Time: {step_info['current_time']:.1f}s")

In [48]:
def shortest_queue_baseline(env, num_episodes=100):
    """Baseline: always choose the shorter queue."""
    total_costs = []
    dropped_calls = 0
    total_calls = 0

    for episode in range(num_episodes):
        state, _ = env.reset()
        episode_cost = 0

        for step in range(200):
            q1, q2, call_type = state

            # Choose shorter queue, with tie-breaking
            if q1 < q2:
                action = 0  # queue 1
            elif q2 < q1:
                action = 1  # queue 2
            else:
                # Tie-break: regular calls to queue 1, specific to queue 2
                action = 0 if call_type == 0 else 1

            state, reward, terminated, truncated, info = env.step(action)
            episode_cost += -reward
            total_calls += 1

            if info.get("dropped", False):
                dropped_calls += 1

            if terminated or truncated:
                break

        total_costs.append(episode_cost)

    avg_cost = np.mean(total_costs)
    std_cost = np.std(total_costs)
    drop_rate = dropped_calls / total_calls if total_calls > 0 else 0

    print("Shortest Queue Baseline:")
    print(f"  Average cost per episode: {avg_cost:.2f} ± {std_cost:.2f}")
    print(f"  Drop rate: {drop_rate:.4f} ({dropped_calls}/{total_calls})")

    return avg_cost, std_cost, drop_rate

In [49]:
class AdvancedQLearning:
    """
    Advanced Q-Learning with problem-specific improvements for call center routing.
    """
    
    def __init__(self, 
                 env,
                 initial_alpha=0.5,
                 min_alpha=0.01,
                 gamma=0.99,
                 initial_epsilon=1.0,
                 min_epsilon=0.02,
                 epsilon_decay=0.999,
                 use_prioritized_sweeping=True,
                 use_experience_replay=True,
                 replay_buffer_size=10000,
                 batch_size=32,
                 target_update_freq=1000,
                 exploration_strategy='combined'):
        
        self.env = env
        self.gamma = gamma
        self.initial_alpha = initial_alpha
        self.min_alpha = min_alpha
        self.current_alpha = initial_alpha
        self.initial_epsilon = initial_epsilon
        self.min_epsilon = min_epsilon
        self.current_epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        
        # Advanced features
        self.use_prioritized_sweeping = use_prioritized_sweeping
        self.use_experience_replay = use_experience_replay
        self.replay_buffer_size = replay_buffer_size
        self.batch_size = batch_size
        self.target_update_freq = target_update_freq
        self.exploration_strategy = exploration_strategy
        
        # Q-tables: main and target (for stability)
        self.Q = defaultdict(lambda: np.zeros(2))
        self.Q_target = defaultdict(lambda: np.zeros(2))
        
        # Experience replay buffer
        self.experience_buffer = deque(maxlen=replay_buffer_size)
        
        # State-action visit counts and recency
        self.visit_counts = defaultdict(lambda: np.zeros(2))
        self.last_visited = defaultdict(lambda: np.zeros(2))
        self.state_visits = defaultdict(int)
        
        # Priority queue for prioritized sweeping
        self.priority_queue = {}
        
        # Performance tracking
        self.episode_rewards = []
        self.episode_costs = []
        self.epsilon_history = []
        self.alpha_history = []
        self.td_errors = []
        self.exploration_stats = {'random': 0, 'greedy': 0, 'ucb': 0, 'count_based': 0}
        
        # Problem-specific parameters
        self.drop_penalty_factor = 2.0  # Extra penalty for dropping calls
        self.balance_bonus = 10.0  # Bonus for keeping queues balanced
        
    def get_exploration_bonus(self, state, action, episode):
        """Calculate exploration bonus based on visit count and recency."""
        state_tuple = tuple(state)
        visits = self.visit_counts[state_tuple][action]
        
        if visits == 0:
            return 1000.0  # High bonus for never-visited state-actions
        
        # Count-based exploration bonus
        count_bonus = 100.0 / np.sqrt(visits + 1)
        
        # Recency bonus (encourage revisiting old states)
        episodes_since_last = episode - self.last_visited[state_tuple][action]
        recency_bonus = min(50.0, episodes_since_last * 0.1)
        
        return count_bonus + recency_bonus
    
    def select_action(self, state, episode):
        """Advanced action selection with multiple exploration strategies."""
        state_tuple = tuple(state)
        q1, q2, call_type = state
        
        # Problem-specific heuristics
        if q1 >= self.env.max_queue_size and q2 >= self.env.max_queue_size:
            # Both queues full - this should rarely happen with good policy
            return 0  # Default to queue 1
        elif q1 >= self.env.max_queue_size:
            return 1  # Must choose queue 2
        elif q2 >= self.env.max_queue_size:
            return 0  # Must choose queue 1
        
        # Exploration strategies
        if np.random.random() < self.current_epsilon:
            if self.exploration_strategy == 'combined':
                # 50% random, 50% count-based exploration
                if np.random.random() < 0.5:
                    action = np.random.randint(0, 2)
                    self.exploration_stats['random'] += 1
                else:
                    # Count-based: choose less visited action
                    visits = self.visit_counts[state_tuple]
                    if visits[0] <= visits[1]:
                        action = 0
                    else:
                        action = 1
                    self.exploration_stats['count_based'] += 1
            else:
                action = np.random.randint(0, 2)
                self.exploration_stats['random'] += 1
        else:
            # Exploitation with exploration bonuses
            q_values = self.Q[state_tuple].copy()
            
            # Add exploration bonuses
            for a in range(2):
                q_values[a] += self.get_exploration_bonus(state, a, episode) * 0.01
            
            # Add problem-specific bonuses
            # Bonus for queue balancing
            queue_diff = abs(q1 - q2)
            if queue_diff > 3:  # Significant imbalance
                if q1 > q2:
                    q_values[1] += self.balance_bonus  # Prefer queue 2
                else:
                    q_values[0] += self.balance_bonus  # Prefer queue 1
            
            # Call type specific bonuses based on service times
            if call_type == 0:  # Regular calls
                # Queue 1 is faster for regular calls
                q_values[0] += 20.0
            else:  # Specific calls
                # Queue 2 might be better for specific calls in some cases
                if q2 < q1:  # If queue 2 is shorter
                    q_values[1] += 15.0
            
            action = int(np.argmax(q_values))
            self.exploration_stats['greedy'] += 1
        
        return action
    
    def store_experience(self, state, action, reward, next_state, done):
        """Store experience for replay."""
        experience = (tuple(state), action, reward, tuple(next_state), done)
        self.experience_buffer.append(experience)
    
    def sample_experience_batch(self):
        """Sample a batch of experiences for learning."""
        if len(self.experience_buffer) < self.batch_size:
            return list(self.experience_buffer)
        return random.sample(self.experience_buffer, self.batch_size)
    
    def update_q_value(self, state, action, reward, next_state, episode, done=False):
        """Enhanced Q-value update with experience replay and target network."""
        state_tuple = tuple(state)
        next_state_tuple = tuple(next_state)
        
        # Adaptive learning rate based on visit count
        visits = self.visit_counts[state_tuple][action]
        adaptive_alpha = max(self.min_alpha, self.initial_alpha / (1 + visits * 0.01))
        
        # Modified reward for better learning
        modified_reward = reward
        
        # Extra penalty for drops
        if reward <= -self.env.drop_penalty * 0.8:  # Dropped call
            modified_reward *= self.drop_penalty_factor
        
        # Standard Q-learning update using target network
        current_q = self.Q[state_tuple][action]
        
        if done:
            target_q = modified_reward
        else:
            # Use target network for stability
            next_q_max = np.max(self.Q_target[next_state_tuple])
            target_q = modified_reward + self.gamma * next_q_max
        
        td_error = target_q - current_q
        self.Q[state_tuple][action] += adaptive_alpha * td_error
        
        # Update visit counts and recency
        self.visit_counts[state_tuple][action] += 1
        self.last_visited[state_tuple][action] = episode
        self.state_visits[state_tuple] += 1
        
        return abs(td_error)
    
    def experience_replay(self, episode):
        """Perform experience replay learning."""
        if not self.use_experience_replay or len(self.experience_buffer) < self.batch_size:
            return []
        
        batch = self.sample_experience_batch()
        td_errors = []
        
        for state, action, reward, next_state, done in batch:
            td_error = self.update_q_value(
                state, action, reward, next_state, episode, done
            )
            td_errors.append(td_error)
        
        return td_errors
    
    def update_target_network(self):
        """Update target network."""
        for state_tuple in self.Q:
            self.Q_target[state_tuple] = self.Q[state_tuple].copy()
    
    def decay_parameters(self, episode):
        """Update exploration and learning parameters."""
        # More aggressive epsilon decay early, then slow down
        if episode < 5000:
            decay_rate = 0.9995
        elif episode < 15000:
            decay_rate = 0.999
        else:
            decay_rate = 0.9999
            
        self.current_epsilon = max(
            self.min_epsilon, 
            self.current_epsilon * decay_rate
        )
        
        # Learning rate decay
        self.current_alpha = max(
            self.min_alpha,
            self.current_alpha * 0.9999
        )
        
        self.epsilon_history.append(self.current_epsilon)
        self.alpha_history.append(self.current_alpha)
    
    def train(self, num_episodes=50000, max_steps_per_episode=200, 
              print_every=5000, warmup_episodes=1000):
        """Enhanced training loop with warmup period."""
        print("Starting Advanced Q-Learning Training...")
        print(f"Episodes: {num_episodes}, Warmup: {warmup_episodes}")
        print(f"Experience replay: {self.use_experience_replay}")
        print(f"Target network updates every: {self.target_update_freq} episodes")
        print("=" * 60)
        
        recent_rewards = deque(maxlen=1000)
        
        for episode in range(num_episodes):
            state, _ = self.env.reset()
            episode_reward = 0
            episode_td_errors = []
            
            # Warmup phase: more exploration
            if episode < warmup_episodes:
                self.current_epsilon = max(0.5, self.current_epsilon)
            
            for step in range(max_steps_per_episode):
                action = self.select_action(state, episode)
                next_state, reward, terminated, truncated, info = self.env.step(action)
                
                # Store experience
                self.store_experience(state, action, reward, next_state, 
                                    terminated or truncated)
                
                # Update Q-value
                td_error = self.update_q_value(state, action, reward, next_state, 
                                             episode, terminated or truncated)
                episode_td_errors.append(td_error)
                
                state = next_state
                episode_reward += reward
                
                if terminated or truncated:
                    break
            
            # Experience replay
            if episode > warmup_episodes:
                replay_td_errors = self.experience_replay(episode)
                episode_td_errors.extend(replay_td_errors)
            
            # Update target network
            if episode % self.target_update_freq == 0:
                self.update_target_network()
            
            # Store episode statistics
            self.episode_rewards.append(episode_reward)
            self.episode_costs.append(-episode_reward)
            recent_rewards.append(episode_reward)
            
            if episode_td_errors:
                self.td_errors.append(np.mean(episode_td_errors))
            
            # Parameter decay
            self.decay_parameters(episode)
            
            # Progress reporting
            if (episode + 1) % print_every == 0:
                avg_reward = np.mean(recent_rewards) if recent_rewards else 0
                avg_cost = -avg_reward
                avg_td_error = np.mean(self.td_errors[-print_every:]) if self.td_errors else 0
                states_explored = len(self.Q)
                
                print(f"Episode {episode + 1}/{num_episodes}")
                print(f"  Avg Cost (last {len(recent_rewards)}): {avg_cost:.2f}")
                print(f"  Avg TD Error: {avg_td_error:.4f}")
                print(f"  Epsilon: {self.current_epsilon:.4f}")
                print(f"  Alpha: {self.current_alpha:.4f}")
                print(f"  States explored: {states_explored}")
                print(f"  Experience buffer: {len(self.experience_buffer)}")
                
                # Exploration statistics
                total_actions = sum(self.exploration_stats.values())
                if total_actions > 0:
                    print("  Exploration breakdown:")
                    for strategy, count in self.exploration_stats.items():
                        pct = (count / total_actions) * 100
                        print(f"    {strategy}: {pct:.1f}%")
                print()
        
        print("Training completed!")
        return self.Q, self.episode_costs
    
    def get_policy_analysis(self, max_queue_size=20):
        """Analyze the learned policy in detail."""
        policy_regular = np.zeros((max_queue_size + 1, max_queue_size + 1), dtype=int)
        policy_specific = np.zeros((max_queue_size + 1, max_queue_size + 1), dtype=int)
        q_value_diff = np.zeros((max_queue_size + 1, max_queue_size + 1))
        
        analysis = {
            'total_states': 0,
            'states_learned': 0,
            'avg_q_value_diff': 0,
            'strong_preferences': 0
        }
        
        for q1 in range(max_queue_size + 1):
            for q2 in range(max_queue_size + 1):
                analysis['total_states'] += 1
                
                # Regular call policy
                state_regular = (q1, q2, 0)
                if state_regular in self.Q:
                    analysis['states_learned'] += 1
                    q_values = self.Q[state_regular]
                    policy_regular[q1, q2] = np.argmax(q_values)
                    diff = abs(q_values[0] - q_values[1])
                    q_value_diff[q1, q2] = diff
                    
                    if diff > 100:  # Strong preference
                        analysis['strong_preferences'] += 1
                
                # Specific call policy
                state_specific = (q1, q2, 1)
                if state_specific in self.Q:
                    q_values = self.Q[state_specific]
                    policy_specific[q1, q2] = np.argmax(q_values)
        
        if analysis['states_learned'] > 0:
            analysis['coverage'] = analysis['states_learned'] / analysis['total_states']
            analysis['avg_q_value_diff'] = np.mean(q_value_diff[q_value_diff > 0])
        
        return policy_regular, policy_specific, q_value_diff, analysis


In [50]:
def run_advanced_qlearning(env, num_episodes=60000):
    """Run the advanced Q-learning algorithm with extended training."""
    
    agent = AdvancedQLearning(
        env=env,
        initial_alpha=0.3,
        min_alpha=0.005,
        gamma=0.99,  # Higher discount for longer-term planning
        initial_epsilon=1.0,
        min_epsilon=0.01,
        epsilon_decay=0.999,
        use_prioritized_sweeping=False,  # Keep it simple for now
        use_experience_replay=True,
        replay_buffer_size=50000,  # Larger buffer
        batch_size=64,  # Larger batch
        target_update_freq=500,  # More frequent updates
        exploration_strategy='combined'
    )
    
    # Extended training
    Q_table, episode_costs = agent.train(
        num_episodes=num_episodes,
        max_steps_per_episode=300,  # Longer episodes
        print_every=5000,
        warmup_episodes=2000  # Longer warmup
    )
    
    # Analyze the learned policy
    policy_reg, policy_spec, q_diff, analysis = agent.get_policy_analysis()
    
    print("\n" + "="*60)
    print("POLICY ANALYSIS:")
    print(f"State space coverage: {analysis['coverage']:.2%}")
    print(f"States with strong preferences: {analysis['strong_preferences']}")
    print(f"Average Q-value difference: {analysis.get('avg_q_value_diff', 0):.2f}")
    
    return agent, Q_table, episode_costs


# Enhanced evaluation function
def advanced_evaluation(env, Q_table, num_episodes=1000, policy_name="Advanced Q-Learning"):
    """Comprehensive policy evaluation."""
    total_costs = []
    dropped_calls = 0
    total_calls = 0
    queue_stats = {'q1_lengths': [], 'q2_lengths': [], 'total_lengths': []}
    action_stats = {'regular_to_q1': 0, 'regular_to_q2': 0, 
                   'specific_to_q1': 0, 'specific_to_q2': 0}
    wait_time_stats = []
    
    for episode in range(num_episodes):
        state, _ = env.reset()
        episode_cost = 0
        
        for step in range(300):  # Longer evaluation episodes
            state_tuple = tuple(state)
            q1, q2, call_type = state
            
            queue_stats['q1_lengths'].append(q1)
            queue_stats['q2_lengths'].append(q2)
            queue_stats['total_lengths'].append(q1 + q2)
            
            # Policy decision
            if state_tuple in Q_table:
                q_values = Q_table[state_tuple]
                
                # Add small amount of intelligent tie-breaking
                if abs(q_values[0] - q_values[1]) < 1:
                    # Use domain knowledge for ties
                    if call_type == 0 and q1 <= q2:  # Regular calls prefer Q1 if not worse
                        action = 0
                    elif call_type == 1 and q2 <= q1:  # Specific calls can prefer Q2
                        action = 1
                    else:
                        action = 0 if q1 < q2 else 1
                else:
                    action = int(np.argmax(q_values))
            else:
                # Default policy for unvisited states
                if q1 < q2:
                    action = 0
                elif q2 < q1:
                    action = 1
                else:
                    action = 0 if call_type == 0 else 1
            
            # Track actions
            if call_type == 0:
                if action == 0:
                    action_stats['regular_to_q1'] += 1
                else:
                    action_stats['regular_to_q2'] += 1
            else:
                if action == 0:
                    action_stats['specific_to_q1'] += 1
                else:
                    action_stats['specific_to_q2'] += 1
            
            state, reward, terminated, truncated, info = env.step(action)
            episode_cost += -reward
            total_calls += 1
            
            if info.get("dropped", False):
                dropped_calls += 1
            elif info.get("expected_wait") is not None:
                wait_time_stats.append(info["expected_wait"])
            
            if terminated or truncated:
                break
        
        total_costs.append(episode_cost)
    
    # Calculate comprehensive statistics
    avg_cost = np.mean(total_costs)
    std_cost = np.std(total_costs)
    drop_rate = dropped_calls / total_calls if total_calls > 0 else 0
    
    avg_wait = np.mean(wait_time_stats) if wait_time_stats else 0
    avg_q1_util = np.mean(queue_stats['q1_lengths'])
    avg_q2_util = np.mean(queue_stats['q2_lengths'])
    avg_total_util = np.mean(queue_stats['total_lengths'])
    
    print(f"\n{policy_name} Comprehensive Evaluation:")
    print(f"{'='*60}")
    print(f"Average cost per episode:    {avg_cost:.2f} ± {std_cost:.2f}")
    print(f"Drop rate:                   {drop_rate:.4f} ({dropped_calls}/{total_calls})")
    print(f"Average waiting time:        {avg_wait:.2f}s")
    print("Queue utilization:")
    print(f"  Queue 1 avg length:        {avg_q1_util:.2f}")
    print(f"  Queue 2 avg length:        {avg_q2_util:.2f}")
    print(f"  Total system load:         {avg_total_util:.2f}")
    
    print("Action distribution:")
    total_actions = sum(action_stats.values())
    for action_type, count in action_stats.items():
        percentage = (count / total_actions) * 100 if total_actions > 0 else 0
        print(f"  {action_type.replace('_', ' ').title()}: {percentage:.1f}%")
    
    return {
        'avg_cost': avg_cost,
        'std_cost': std_cost,
        'drop_rate': drop_rate,
        'avg_wait': avg_wait,
        'queue_util': (avg_q1_util, avg_q2_util),
        'action_distribution': action_stats
    }

In [51]:
if __name__ == "__main__":
    print("Enhanced Q-Learning for Call Centre Environment")
    print("=" * 60)
    
    # Use the same environment setup as your original code
    max_queue_size = MAX_QUEUE_SIZE
    drop_penalty = DROP_PENALTY
    scaler = 10.0

    arrival_means = {
        0: ARRIVAL_REGULAR,
        1: ARRIVAL_SPECIFIC
    }
    
    service_means = {
        (1, 0): SERVICE_REGULAR_1,
        (1, 1): SERVICE_SPECIFIC_1,
        (2, 0): SERVICE_REGULAR_2,
        (2, 1): SERVICE_SPECIFIC_2
    }

    print(f"Regular call probability: {p_regular:.3f}")
    print(f"Service times (mean): Q1 Regular = {service_means[(1,0)]:.1f} s, Specific = {service_means[(1,1)]:.1f} s")
    print(f"                     Q2 Regular = {service_means[(2,0)]:.1f} s, Specific = {service_means[(2,1)]:.1f} s")
    print(f"Arrival times (mean): Regular = {arrival_means[0] / scaler:.1f} s, Specific = {arrival_means[1] / scaler:.1f} s")
    print("=" * 60)

    # Set seeds for reproducibility
    random.seed(1901448)
    np.random.seed(1901448)
    
    env = EventBasedCallCentreEnv(
        max_queue_size=max_queue_size,
        drop_penalty=drop_penalty,
        arr_scaler=scaler,
        arrival_regular_mean=arrival_means[0],
        arrival_specific_mean=arrival_means[1],
        service_regular_1=service_means[(1, 0)],
        service_specific_1=service_means[(1, 1)],
        service_regular_2=service_means[(2, 0)],
        service_specific_2=service_means[(2, 1)],
        seed=1901448
    )

    # Evaluate baseline first
    print("\n1. BASELINE EVALUATION:")
    baseline_cost, _, _ = shortest_queue_baseline(env, num_episodes=1000)
    
    # Train and evaluate enhanced Q-learning
    print("\n3. ENHANCED Q-LEARNING:")
    print("Training enhanced Q-learning...")
    
    # Reset environment seed for fair comparison
    env.seed(1901448)
    
    print("Advanced Q-Learning with Extended Training")
    print("="*60)

    # Run advanced Q-learning with more episodes
    advanced_agent, advanced_Q, advanced_costs = run_advanced_qlearning(env, num_episodes=60000)

    # Comprehensive evaluation
    advanced_results = advanced_evaluation(env, advanced_Q, num_episodes=1000)

    # Compare with baseline
    print("\nFINAL COMPARISON:")
    print(f"Baseline cost:     {baseline_cost:.2f}")
    print(f"Advanced Q cost:   {advanced_results['avg_cost']:.2f}")
    improvement = baseline_cost - advanced_results['avg_cost']
    print(f"Improvement:       {improvement:.2f} ({improvement/baseline_cost*100:.2f}%)")

    if advanced_results['avg_cost'] < baseline_cost:
        print("✅ Enhanced Q-Learning beats baseline")
    else:
        print("❌ Enhanced Q-Learning loses to baseline")
    
    
    print("\nTraining completed! Check the comprehensive plots above for detailed analysis.")

Enhanced Q-Learning for Call Centre Environment
Regular call probability: 0.713
Service times (mean): Q1 Regular = 12.6 s, Specific = 14.2 s
                     Q2 Regular = 19.0 s, Specific = 15.0 s
Arrival times (mean): Regular = 4.0 s, Specific = 9.9 s

1. BASELINE EVALUATION:
Shortest Queue Baseline:
  Average cost per episode: 145797.99 ± 32213.89
  Drop rate: 0.1578 (31550/200000)

3. ENHANCED Q-LEARNING:
Training enhanced Q-learning...
Advanced Q-Learning with Extended Training
Starting Advanced Q-Learning Training...
Episodes: 60000, Warmup: 2000
Experience replay: True
Target network updates every: 500 episodes
Episode 5000/60000
  Avg Cost (last 1000): 310386.04
  Avg TD Error: 1184.6006
  Epsilon: 0.1115
  Alpha: 0.1820
  States explored: 881
  Experience buffer: 50000
  Exploration breakdown:
    random: 21.1%
    greedy: 57.8%
    ucb: 0.0%
    count_based: 21.1%

Episode 10000/60000
  Avg Cost (last 1000): 310514.57
  Avg TD Error: 1191.7922
  Epsilon: 0.0100
  Alpha: 0.

KeyboardInterrupt: 

### What to implement next:

* Implement Gym Environment
* Implement Q-Learning
* Implement PPO