In [14]:
%matplotlib inline

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import random
import gymnasium as gym
from gymnasium import Env, spaces
from scipy.special import comb
from collections import defaultdict

In [16]:
# Load the parameters from the pickle file
with open('pickles/anonbank_best_fit_params_report.pkl', 'rb') as f:
    best_fit_params_df = pickle.load(f)

# Convert to dictionary for easier access
best_fit_params_dict = best_fit_params_df.set_index('param_name').T.to_dict()

best_fit_params_dict

{('1', 'REGULAR'): {'shape': 0.8324764719822447,
  'loc': -3.115567764829924,
  'scale': 123.58267067008317},
 ('1', 'SPECIFIC'): {'shape': 1.0752011949530333,
  'loc': -1.9054138290538103,
  'scale': 126.33203131875452},
 ('2', 'REGULAR'): {'shape': 0.8540932024632348,
  'loc': -8.785773518703614,
  'scale': 148.1498746014255},
 ('2', 'SPECIFIC'): {'shape': 1.0827964542219308,
  'loc': -1.0635409275577619,
  'scale': 96.9210654259884},
 'REGULAR': {'shape': 1.045274579529165,
  'loc': -3.0642813856887496,
  'scale': 44.96983117392873},
 'SPECIFIC': {'shape': 1.1207108865662982,
  'loc': -5.97020631347698,
  'scale': 111.61143284668836}}

In [17]:
with open('pickles/anonbank_lognorm_params_report.pkl', 'rb') as f:
    lognorm_params_df = pickle.load(f)
    
lognorm_params_dict = lognorm_params_df.set_index('param_name').T.to_dict()

lognorm_params_dict

{('1', 'REGULAR'): {'mu': 4.816910330259358,
  'sigma': 0.006736114570356179,
  'loc': -3.115567764829924},
 ('1', 'SPECIFIC'): {'mu': 4.838913610177206,
  'sigma': 0.008510760998310192,
  'loc': -1.9054138290538103},
 ('2', 'REGULAR'): {'mu': 4.9982244275894185,
  'sigma': 0.005765014032185277,
  'loc': -8.785773518703614},
 ('2', 'SPECIFIC'): {'mu': 4.57389688872724,
  'sigma': 0.011171592730385803,
  'loc': -1.0635409275577619},
 'REGULAR': {'mu': 3.805991846582301,
  'sigma': 0.0232407685000318,
  'loc': -3.0642813856887496},
 'SPECIFIC': {'mu': 4.715023489562893,
  'sigma': 0.010040930489862581,
  'loc': -5.97020631347698}}

In [18]:
# Global variables

# Queue size
MAX_QUEUE_SIZE = 20

# Discount factor for future rewards
DISCOUNT = 0.95

# Penalty for dropping calls (when queue is full)
DROP_PENALTY = 3600.0 # 1 hour in seconds

# State space: (queue 1 size, queue 2 size, call type)
STATE_SPACE = (MAX_QUEUE_SIZE + 1, MAX_QUEUE_SIZE + 1, 2)

# Call type: 0 = regular, 1 = specific
CALL_TYPE = [0, 1]

# Actions: 0 = route to queue 1, 1 = route to queue 2
ACTIONS = [1, 2]

# Arrival parameters for regular and specific calls
ARRIVAL_REGULAR_MU = lognorm_params_dict['REGULAR']['mu']
ARRIVAL_REGULAR_SIGMA = lognorm_params_dict['REGULAR']['sigma']
ARRIVAL_REGULAR_LOC = lognorm_params_dict['REGULAR']['loc']

ARRIVAL_SPECIFIC_MU = lognorm_params_dict['SPECIFIC']['mu']
ARRIVAL_SPECIFIC_SIGMA = lognorm_params_dict['SPECIFIC']['sigma']
ARRIVAL_SPECIFIC_LOC = lognorm_params_dict['SPECIFIC']['loc']

SERVICE_REGULAR_1_MU = lognorm_params_dict[('1', 'REGULAR')]['mu']
SERVICE_REGULAR_1_SIGMA = lognorm_params_dict[('1', 'REGULAR')]['sigma']
SERVICE_REGULAR_1_LOC = lognorm_params_dict[('1', 'REGULAR')]['loc']
SERVICE_SPECIFIC_1_MU = lognorm_params_dict[('1', 'SPECIFIC')]['mu']
SERVICE_SPECIFIC_1_SIGMA = lognorm_params_dict[('1', 'SPECIFIC')]['sigma']
SERVICE_SPECIFIC_1_LOC = lognorm_params_dict[('1', 'SPECIFIC')]['loc']

SERVICE_REGULAR_2_MU = lognorm_params_dict[('2', 'REGULAR')]['mu']
SERVICE_REGULAR_2_SIGMA = lognorm_params_dict[('2', 'REGULAR')]['sigma']
SERVICE_REGULAR_2_LOC = lognorm_params_dict[('2', 'REGULAR')]['loc']
SERVICE_SPECIFIC_2_MU = lognorm_params_dict[('2', 'SPECIFIC')]['mu']
SERVICE_SPECIFIC_2_SIGMA = lognorm_params_dict[('2', 'SPECIFIC')]['sigma']
SERVICE_SPECIFIC_2_LOC = lognorm_params_dict[('2', 'SPECIFIC')]['loc']

def mean_shifted_lognormal(mu, sigma, loc):
    """
    Calculate mean of shifted lognormal distribution:
    E[X] = exp(mu + sigma^2 / 2) + loc
    """
    return np.exp(mu + (sigma**2) / 2) + loc

# Arrival rates

ARRIVAL_REGULAR = mean_shifted_lognormal(
    ARRIVAL_REGULAR_MU,
    ARRIVAL_REGULAR_SIGMA,
    ARRIVAL_REGULAR_LOC
)

ARRIVAL_SPECIFIC = mean_shifted_lognormal(
    ARRIVAL_SPECIFIC_MU,
    ARRIVAL_SPECIFIC_SIGMA,
    ARRIVAL_SPECIFIC_LOC
)

# Service rates

SERVICE_REGULAR_1 = mean_shifted_lognormal(
    SERVICE_REGULAR_1_MU,
    SERVICE_REGULAR_1_SIGMA,
    SERVICE_REGULAR_1_LOC
)

SERVICE_REGULAR_2 = mean_shifted_lognormal(
    SERVICE_REGULAR_2_MU,
    SERVICE_REGULAR_2_SIGMA,
    SERVICE_REGULAR_2_LOC
)

SERVICE_SPECIFIC_1 = mean_shifted_lognormal(
    SERVICE_SPECIFIC_1_MU,
    SERVICE_SPECIFIC_1_SIGMA,
    SERVICE_SPECIFIC_1_LOC
)

SERVICE_SPECIFIC_2 = mean_shifted_lognormal(
    SERVICE_SPECIFIC_2_MU,
    SERVICE_SPECIFIC_2_SIGMA,
    SERVICE_SPECIFIC_2_LOC
)

In [19]:
print(f"ARRIVAL_REGULAR: {ARRIVAL_REGULAR}, ARRIVAL_SPECIFIC: {ARRIVAL_SPECIFIC}")
print(f"SERVICE_REGULAR_1: {SERVICE_REGULAR_1}, SERVICE_REGULAR_2: {SERVICE_REGULAR_2}")
print(f"SERVICE_SPECIFIC_1: {SERVICE_SPECIFIC_1}, SERVICE_SPECIFIC_2: {SERVICE_SPECIFIC_2}") 

ARRIVAL_REGULAR: 41.917696280463964, ARRIVAL_SPECIFIC: 105.64685302326639
SERVICE_REGULAR_1: 120.46990673369942, SERVICE_REGULAR_2: 139.36656301237034
SERVICE_SPECIFIC_1: 124.43119287989809, SERVICE_SPECIFIC_2: 95.86357277892792


In [13]:
# Lognormal expected value calculation

def expected_waiting_time_lognormal(q_size, p_regular, mean_s, mean_c):
    """
    Calculate expected waiting time for a queue with q_size calls,
    each call being regular with probability p_regular,
    using direct mean service times for regular and specific calls.
    """
    wait = 0.0
    for k in range(q_size + 1):
        # Binomial probability of k regular calls in the queue
        prob = comb(q_size, k) * (p_regular ** k) * ((1 - p_regular) ** (q_size - k))
        # Waiting time if k regular and (q_size - k) specific ahead
        wait_k = k * mean_s + (q_size - k) * mean_c
        wait += prob * wait_k

    # Add mean service time for the current arriving call (assumed regular here)
    wait += mean_s
    return wait


In [14]:
# Example usage to calculate expected waiting time for queue 1:

rate_regular = 1 / ARRIVAL_REGULAR
rate_specific = 1 / ARRIVAL_SPECIFIC

p_regular = rate_regular / (rate_regular + rate_specific)

mean_s1 = SERVICE_REGULAR_1

mean_c1 = SERVICE_SPECIFIC_1

q1 = 10 # Example queue size

expected_wait = expected_waiting_time_lognormal(q1, p_regular, mean_s1, mean_c1)
print(f"Expected waiting time at queue 1: {expected_wait:.4f}")

# Compare with queue 2

mean_s2 = SERVICE_REGULAR_2

mean_c2 = SERVICE_SPECIFIC_2

expected_wait = expected_waiting_time_lognormal(q1, p_regular, mean_s2, mean_c2) # Assuming same queue size for simplicity
print(f"Expected waiting time at queue 2: {expected_wait:.4f}")

Expected waiting time at queue 1: 1156.5679
Expected waiting time at queue 2: 1425.3833


In [15]:
class CallCentreMDPEnv:
    """
    RL environment for a two-queue call centre.
    State: (q1_length, q2_length, call_type)
    Action: 1 = route to queue 1, 2 = queue 2
    Reward: Negative expected waiting time (cost to minimize)
    """
    def __init__(self, max_queue=MAX_QUEUE_SIZE, drop_penalty=DROP_PENALTY, seed=1901448):
        self.max_queue = max_queue
        self.drop_penalty = drop_penalty
        self.state = None

        # Precompute arrival rates and squared coefficients of variation (cv^2)
        self.lam_arrival = {
            0: 1 / lognormal_mean(ARRIVAL_REGULAR_MU, ARRIVAL_REGULAR_SIGMA, ARRIVAL_REGULAR_LOC),
            1: 1 / lognormal_mean(ARRIVAL_SPECIFIC_MU, ARRIVAL_SPECIFIC_SIGMA, ARRIVAL_SPECIFIC_LOC)
        }
        self.ca2_arrival = {
            0: lognormal_cv2(ARRIVAL_REGULAR_MU, ARRIVAL_REGULAR_SIGMA, ARRIVAL_REGULAR_LOC),
            1: lognormal_cv2(ARRIVAL_SPECIFIC_MU, ARRIVAL_SPECIFIC_SIGMA, ARRIVAL_SPECIFIC_LOC)
        }

        # Store service lognormal parameters per queue and call type
        self.service_params = {
            (1, 0): (SERVICE_REGULAR_1_MU, SERVICE_REGULAR_1_SIGMA, SERVICE_REGULAR_1_LOC),
            (1, 1): (SERVICE_SPECIFIC_1_MU, SERVICE_SPECIFIC_1_SIGMA, SERVICE_SPECIFIC_1_LOC),
            (2, 0): (SERVICE_REGULAR_2_MU, SERVICE_REGULAR_2_SIGMA, SERVICE_REGULAR_2_LOC),
            (2, 1): (SERVICE_SPECIFIC_2_MU, SERVICE_SPECIFIC_2_SIGMA, SERVICE_SPECIFIC_2_LOC),
        }
        self.service_means = {}
        self.service_cv2 = {}

        for key, (mu, sigma, loc) in self.service_params.items():
            self.service_means[key] = lognormal_mean(mu, sigma, loc)
            self.service_cv2[key] = lognormal_cv2(mu, sigma, loc)

        # Initialize queues and tracked service times
        self.queue1_length = 0
        self.queue2_length = 0
        self.queue1_service_times = []
        self.queue2_service_times = []

        self.seed(seed)

    def seed(self, seed=None):
        if seed is not None:
            np.random.seed(seed)
            random.seed(seed)

    def reset(self):
        self.queue1_length = 0
        self.queue2_length = 0
        self.queue1_service_times = []
        self.queue2_service_times = []

        total_lam = self.lam_arrival[0] + self.lam_arrival[1]
        call_type = np.random.choice([0, 1], p=[self.lam_arrival[0]/total_lam, self.lam_arrival[1]/total_lam])
        self.state = (0, 0, call_type)
        return self.state

    def _kingman_expected_waiting(self, lam, mu_s, ca2, cs2):
        if lam <= 0:
            return 0.0
        rho = lam * mu_s
        if rho >= 1.0:
            return float('inf')  # unstable system
        return (rho / (1 - rho)) * (ca2 + cs2) / 2 * mu_s

    def _expected_waiting_time(self, queue_num, queue_length, call_type):
        if queue_length == 0:
            return 0.0
        lam = self.lam_arrival[call_type]
        ca2 = self.ca2_arrival[call_type]
        mu_s = self.service_means[(queue_num, call_type)]
        cs2 = self.service_cv2[(queue_num, call_type)]
        expected_wait_one_call = self._kingman_expected_waiting(lam, mu_s, ca2, cs2)
        return queue_length * expected_wait_one_call

    def _sample_service_time(self, queue_num, call_type):
        mu, sigma, loc = self.service_params[(queue_num, call_type)]
        return np.random.lognormal(mean=mu, sigma=sigma) + loc

    def step(self, action):
        q1_len, q2_len, call_type = self.state
        dropped = False

        if action == 1 and q1_len >= self.max_queue:
            reward = -self.drop_penalty
            dropped = True
        elif action == 2 and q2_len >= self.max_queue:
            reward = -self.drop_penalty
            dropped = True
        else:
            if action == 1:
                expected_wait = self._expected_waiting_time(1, q1_len, call_type)
                self.queue1_length = min(q1_len + 1, self.max_queue)
                new_service = self._sample_service_time(1, call_type)
                self.queue1_service_times.append(new_service)
            elif action == 2:
                expected_wait = self._expected_waiting_time(2, q2_len, call_type)
                self.queue2_length = min(q2_len + 1, self.max_queue)
                new_service = self._sample_service_time(2, call_type)
                self.queue2_service_times.append(new_service)

            reward = -expected_wait
            dropped = False

        # Advance time by 1 unit for all calls, remove completed calls
        time_step = 1.0
        self.queue1_service_times = [t - time_step for t in self.queue1_service_times if t - time_step > 0]
        self.queue2_service_times = [t - time_step for t in self.queue2_service_times if t - time_step > 0]

        self.queue1_length = len(self.queue1_service_times)
        self.queue2_length = len(self.queue2_service_times)

        total_lam = self.lam_arrival[0] + self.lam_arrival[1]
        p_regular = self.lam_arrival[0] / total_lam
        next_call_type = np.random.choice([0, 1], p=[p_regular, 1 - p_regular])

        self.state = (self.queue1_length, self.queue2_length, next_call_type)

        done = False
        info = {"dropped": dropped}

        return self.state, reward, done, info


# Helper functions for lognormal calculations

def lognormal_mean(mu, sigma, loc=0):
    return np.exp(mu + sigma**2 / 2) + loc

def lognormal_var(mu, sigma):
    return (np.exp(sigma**2) - 1) * np.exp(2 * mu + sigma**2)

def lognormal_cv2(mu, sigma, loc=0):
    m = lognormal_mean(mu, sigma, loc)
    v = lognormal_var(mu, sigma)
    return v / m**2

In [16]:
def q_learning_with_debugging(
    env,
    num_episodes=5000,
    max_steps_per_episode=200,
    alpha=0.1,
    gamma=0.9,  # Lower discount for immediate rewards
    epsilon_start=0.9,  # Start with less exploration
    epsilon_min=0.05,
    epsilon_decay=0.995
):
    """
    Q-learning with debugging information
    """
    Q = defaultdict(lambda: np.zeros(2))
    
    episode_costs = []  # Track actual costs (positive values)
    epsilon = epsilon_start
    
    for episode in range(num_episodes):
        state = env.reset()
        state_tuple = tuple(state)
        
        total_reward = 0
        steps = 0
        
        for step in range(max_steps_per_episode):
            # Epsilon-greedy action selection
            if random.random() < epsilon:
                action = random.choice([1, 2])
            else:
                best_action_idx = np.argmax(Q[state_tuple])
                action = best_action_idx + 1
            
            next_state, reward, done, info = env.step(action)
            next_state_tuple = tuple(next_state)
            
            # Q-learning update
            best_next_action = np.argmax(Q[next_state_tuple])
            td_target = reward + gamma * Q[next_state_tuple][best_next_action]
            td_error = td_target - Q[state_tuple][action - 1]
            Q[state_tuple][action - 1] += alpha * td_error
            
            state_tuple = next_state_tuple
            total_reward += reward
            steps += 1
            
            if done:
                break
        
        # Decay epsilon
        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        episode_costs.append(-total_reward)  # Convert to positive cost
        
        # Progress logging
        if (episode + 1) % 1000 == 0:
            avg_cost = np.mean(episode_costs[-1000:])  # Average cost over last 1000 episodes
            print(f"Episode {episode+1}/{num_episodes}")
            print(f"  Avg Cost (last 1000): {avg_cost:.2f}")
            print(f"  Epsilon: {epsilon:.4f}")
            print(f"  States explored: {len(Q)}")
            print()
    
    return Q, episode_costs

In [17]:
def evaluate_policy(env, Q_table, num_episodes=100, policy_name="Q-Learning"):
    """Evaluate a policy without exploration"""
    total_costs = []
    dropped_calls = 0
    total_calls = 0
    
    for episode in range(num_episodes):
        state = env.reset()
        episode_cost = 0
        
        for step in range(200):  # Max steps per episode
            state_tuple = tuple(state)
            
            # Use learned policy (no exploration)
            if state_tuple in Q_table:
                action_idx = np.argmax(Q_table[state_tuple])
                action = action_idx + 1  # Convert zero-based index to action (1 or 2)
            else:
                action = 1  # Default to queue 1 if state not seen
            
            state, reward, done, info = env.step(action)
            episode_cost += (-reward)  # Convert reward to positive cost
            total_calls += 1
            
            if info["dropped"]:
                dropped_calls += 1
            
            if done:
                break
                
        total_costs.append(episode_cost)
    
    avg_cost = np.mean(total_costs)
    std_cost = np.std(total_costs)
    drop_rate = dropped_calls / total_calls if total_calls > 0 else 0
    
    print(f"{policy_name} Policy Evaluation:")
    print(f"  Average cost per episode: {avg_cost:.2f} ± {std_cost:.2f}")
    print(f"  Drop rate: {drop_rate:.4f} ({dropped_calls}/{total_calls})")
    
    return avg_cost, std_cost, drop_rate

In [18]:
def shortest_queue_baseline(env, num_episodes=100):
    """Baseline: always choose the shorter queue"""
    total_costs = []
    dropped_calls = 0
    total_calls = 0
    
    for episode in range(num_episodes):
        state = env.reset()
        episode_cost = 0
        
        for step in range(200):
            q1, q2, call_type = state
            if q1 < q2:
                action = 1
            elif q2 < q1:
                action = 2
            else:
                # Tie-break based on call type
                action = 1 if call_type == 0 else 2
            
            state, reward, done, info = env.step(action)
            episode_cost += (-reward)  # Convert to positive cost
            total_calls += 1
            
            if info["dropped"]:
                dropped_calls += 1
                
            if done:
                break
                
        total_costs.append(episode_cost)
    
    avg_cost = np.mean(total_costs)
    std_cost = np.std(total_costs)
    drop_rate = dropped_calls / total_calls if total_calls > 0 else 0
    
    print("Shortest Queue Baseline:")
    print(f"  Average cost per episode: {avg_cost:.2f} ± {std_cost:.2f}")
    print(f"  Drop rate: {drop_rate:.4f} ({dropped_calls}/{total_calls})")
    
    return avg_cost, std_cost, drop_rate

In [20]:
if __name__ == "__main__":
    print("Call Centre MDP Environment")

    # Compute mean service times for printing
    reg_q1_mean = lognormal_mean(SERVICE_REGULAR_1_MU, SERVICE_REGULAR_1_SIGMA, SERVICE_REGULAR_1_LOC)
    reg_q2_mean = lognormal_mean(SERVICE_REGULAR_2_MU, SERVICE_REGULAR_2_SIGMA, SERVICE_REGULAR_2_LOC)
    spec_q1_mean = lognormal_mean(SERVICE_SPECIFIC_1_MU, SERVICE_SPECIFIC_1_SIGMA, SERVICE_SPECIFIC_1_LOC)
    spec_q2_mean = lognormal_mean(SERVICE_SPECIFIC_2_MU, SERVICE_SPECIFIC_2_SIGMA, SERVICE_SPECIFIC_2_LOC)
    arr_reg_mean = lognormal_mean(ARRIVAL_REGULAR_MU, ARRIVAL_REGULAR_SIGMA, ARRIVAL_REGULAR_LOC)
    arr_spec_mean = lognormal_mean(ARRIVAL_SPECIFIC_MU, ARRIVAL_SPECIFIC_SIGMA, ARRIVAL_SPECIFIC_LOC)

    total_lambda = (1 / lognormal_mean(ARRIVAL_REGULAR_MU, ARRIVAL_REGULAR_SIGMA, ARRIVAL_REGULAR_LOC) +
                    1 / lognormal_mean(ARRIVAL_SPECIFIC_MU, ARRIVAL_SPECIFIC_SIGMA, ARRIVAL_SPECIFIC_LOC))
    p_regular = (1 / lognormal_mean(ARRIVAL_REGULAR_MU, ARRIVAL_REGULAR_SIGMA, ARRIVAL_REGULAR_LOC)) / total_lambda

    print(f"Regular call probability: {p_regular:.3f}")
    print(f"Service times (mean): Q1 Regular = {reg_q1_mean:.1f} s, Specific = {spec_q1_mean:.1f} s")
    print(f"                      Q2 Regular = {reg_q2_mean:.1f} s, Specific = {spec_q2_mean:.1f} s")
    print(f"Arrival times (mean inter-arrival time): Regular = {arr_reg_mean:.1f} s, Specific = {arr_spec_mean:.1f} s")
    print("="*60)

    # Set random seeds for reproducibility
    random.seed(1901448)
    np.random.seed(1901448)

    # Create environment with consistent drop penalty
    env = CallCentreMDPEnv(max_queue=MAX_QUEUE_SIZE, drop_penalty=DROP_PENALTY)

    # Test environment with a few random steps
    print("\nTesting environment:")
    state = env.reset()
    print(f"Initial state: {state}")

    for i in range(3):
        action = random.choice([1, 2])
        state, reward, done, info = env.step(action)
        cost = -reward
        print(f"Step {i+1}: Action={action}, State={state}, Cost={cost:.1f}, Dropped={info['dropped']}")

    print("\n" + "="*60)

    # Evaluate baseline policy
    print("BASELINE EVALUATION:")
    baseline_cost, _, _ = shortest_queue_baseline(env, num_episodes=1000)

    print("\n" + "="*60)

    # Train Q-learning agent
    print("Q-LEARNING TRAINING:")
    Q_table, costs = q_learning_with_debugging(env, num_episodes=10000, epsilon_decay=0.9995, alpha=0.3)

    print("="*60)

    # Evaluate learned policy
    print("LEARNED POLICY EVALUATION:")
    learned_cost, _, _ = evaluate_policy(env, Q_table, num_episodes=1000)

    print("="*60)
    print("FINAL COMPARISON:")
    print(f"Baseline (Shortest Queue): {baseline_cost:.2f}")
    print(f"Q-Learning Policy:         {learned_cost:.2f}")
    improvement = baseline_cost - learned_cost
    print(f"Improvement:               {improvement:.2f}")

    if improvement > 0:
        print("✅ Q-Learning is BETTER (lower cost)")
    else:
        print("❌ Q-Learning is WORSE (higher cost)")

    percent_impr = (improvement / baseline_cost) * 100 if baseline_cost > 0 else 0.0
    print(f"Percentage improvement:    {percent_impr:.2f}%")


Call Centre MDP Environment
Regular call probability: 0.713
Service times (mean): Q1 Regular = 106.7 s, Specific = 100.9 s
                      Q2 Regular = 141.3 s, Specific = 96.4 s
Arrival times (mean inter-arrival time): Regular = 40.1 s, Specific = 99.3 s

Testing environment:
Initial state: (0, 0, np.int64(1))
Step 1: Action=2, State=(0, 1, np.int64(0)), Cost=0.0, Dropped=False
Step 2: Action=2, State=(0, 2, np.int64(0)), Cost=inf, Dropped=False
Step 3: Action=2, State=(0, 3, np.int64(0)), Cost=inf, Dropped=False

BASELINE EVALUATION:


  x = asanyarray(arr - arrmean)
  td_error = td_target - Q[state_tuple][action - 1]
  Q[state_tuple][action - 1] += alpha * td_error


Shortest Queue Baseline:
  Average cost per episode: inf ± nan
  Drop rate: 0.5978 (119555/200000)

Q-LEARNING TRAINING:


KeyboardInterrupt: 

In [None]:
plt.plot(costs, alpha=0.5, label="Cost")
window = 50
smoothed = pd.Series(costs).rolling(window).mean()
plt.plot(smoothed, color='red', label=f"Smoothed (window={window})")
plt.title("Q-Learning Costs Over Episodes")
plt.xlabel("Episode")
plt.ylabel("Cost (Negative Waiting Time)")
plt.legend()
plt.show()

In [None]:
Q_table

In [None]:
# Plot the Q-table
# Initialize policy arrays
policy_regular = np.zeros((MAX_QUEUE_SIZE + 1, MAX_QUEUE_SIZE + 1), dtype=int)
policy_specific = np.zeros((MAX_QUEUE_SIZE + 1, MAX_QUEUE_SIZE + 1), dtype=int)

# Fill arrays with best actions from Q_table
for q1 in range(MAX_QUEUE_SIZE + 1):
    for q2 in range(MAX_QUEUE_SIZE + 1):
        # Simple call policy
        s_regular = (q1, q2, 0)
        if s_regular in Q_table:
            policy_regular[q1, q2] = np.argmax(Q_table[s_regular])
        # Complex call policy
        s_specific = (q1, q2, 1)
        if s_specific in Q_table:
            policy_specific[q1, q2] = np.argmax(Q_table[s_specific])

fig, axes = plt.subplots(1, 2, figsize=(18, 10))

# Plot Final Simple Call Policy
sns.heatmap(np.flipud(policy_regular), cmap="YlGnBu", ax=axes[0], annot=np.flipud(policy_regular),
            cbar=False)
axes[0].set_title('Final Simple Call Policy')
axes[0].set_ylabel('Queue 1 Size')
axes[0].set_xlabel('Queue 2 Size')
axes[0].set_yticks(list(reversed(range(0, MAX_QUEUE_SIZE + 1, 5))))
axes[0].set_yticklabels(range(0, MAX_QUEUE_SIZE + 1, 5))

# Plot Final Complex Call Policy
sns.heatmap(np.flipud(policy_specific), cmap="YlOrBr", ax=axes[1], annot=np.flipud(policy_specific),
            cbar=False)
axes[1].set_title('Final Complex Call Policy')
axes[1].set_ylabel('Queue 1 Size')
axes[1].set_xlabel('Queue 2 Size')
axes[1].set_yticks(list(reversed(range(0, MAX_QUEUE_SIZE + 1, 5))))
axes[1].set_yticklabels(range(0, MAX_QUEUE_SIZE + 1, 5))

plt.tight_layout()
plt.show()

### What to implement next:

* Implement Gym Environment
* Implement Q-Learning
* Implement PPO