# Finite Inventory Pricing

## 0. Imports

In [None]:
import argparse
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
from scipy.ndimage import gaussian_filter, gaussian_filter1d

from algorithms import policy_iteration, q_learning, sarsa_lambda, value_iteration
from constants import K, T, GAMMA, PRICES, NUM_EPISODES, TRACE_DECAY, GAUSSIAN_SMOOTHING_STD
from model import sample_demand

# Turn on matplotlib's automatic layout adjustment for every figure created below.
plt.rcParams.update({"figure.autolayout": True})

In [None]:
def value_function(Q):
    """ Derive the value function from the state action value function.

    Every (inventory, time) pair with inventory in 0..K and time in 0..T
    starts at 0.0 and is raised to the largest Q value stored for that
    state, so no entry of the result is below 0.0.
    """

    V = {(inv, t): 0.0 for inv in range(K + 1) for t in range(T + 1)}

    for (inv, t, _action), q_value in Q.items():
        state = (inv, t)
        if q_value > V[state]:
            V[state] = q_value

    return V

def dict_to_array(dict):
    """ Build a K x T numpy array from a dictionary keyed by (inventory, time).

    Entry [inv - 1, t - 1] of the result holds the value stored under
    (inv, t), for inventory levels 1..K and time steps 1..T.
    """

    grid = np.zeros((K, T))
    for row, inv in enumerate(range(1, K + 1)):
        for col, t in enumerate(range(1, T + 1)):
            grid[row, col] = dict[(inv, t)]
    return grid

def state_value_function_to_array(dict):
    """ Build a K x T x len(PRICES) numpy array from a dictionary keyed by (inventory, time, price).

    Entry [inv - 1, t - 1, i] of the result holds the value stored under
    (inv, t, PRICES[i]), for inventory levels 1..K and time steps 1..T.
    """

    column_of = {price: col for col, price in enumerate(PRICES)}
    cube = np.zeros((K, T, len(PRICES)))
    for inv in range(1, K + 1):
        for t in range(1, T + 1):
            for price, col in column_of.items():
                cube[inv - 1, t - 1, col] = dict[(inv, t, price)]
    return cube

def plot_matrix(data, title = '', fig = None, ax = None, sigma = 0.0, value_range=(PRICES[0], PRICES[-1]), cbar_label='Price'):
    """ Plots the given (inventory, time)-keyed dictionary as a heatmap.

    data        : dictionary accepted by dict_to_array.
    title       : title of the plot.
    fig, ax     : figure and axes to draw on. Without `ax`, a new figure is
                  created and shown. With `ax` but without `fig`, the figure
                  owning `ax` hosts the colorbar.
    sigma       : standard deviation of the Gaussian smoothing applied to the matrix.
    value_range : (min, max) limits of the color scale.
    cbar_label  : label of the colorbar ('Price' by default; pass another
                  label when the data is not a price).
    """

    matrix = dict_to_array(data)
    matrix = gaussian_filter(matrix, sigma=sigma)

    if ax is None:
        plt.figure()
        plt.imshow(matrix, cmap='viridis', aspect='auto')
        plt.colorbar(label=cbar_label)
        plt.clim(*value_range)
        plt.title(title)
        plt.ylabel('Remaining Seats')
        plt.xlabel('Days Left')
        # Ascending limits: with the default image origin row 0 is drawn at the
        # top, so this makes the y-axis increase upwards.
        plt.ylim(1, K - 1)
        plt.xlim(1, T - 1)
        plt.show()

    else:
        if fig is None:
            # An axes passed without its figure used to fail on fig.colorbar.
            fig = ax.get_figure()

        im = ax.imshow(matrix, cmap='viridis', aspect='auto')
        cbar = fig.colorbar(im, ax=ax)
        cbar.set_label(cbar_label, fontsize=12)
        im.set_clim(*value_range)
        ax.set_title(title, fontsize=14)
        ax.set_ylabel('Remaining Seats', fontsize=12)
        ax.set_xlabel('Days Left', fontsize=12)
        ax.set_ylim(1, K - 1)
        ax.set_xlim(1, T - 1)

def generate_trajectory(policy, num_runs=500):
    """ Simulates sales trajectories following the given policy.

    policy   : dictionary mapping (inventory, time) to the price to charge.
    num_runs : number of independent episodes to simulate.

    Returns (trajectory, trajectory_reward). trajectory[i] is the list of
    inventory levels of run i: it starts at K and gets one entry appended per
    time step. trajectory_reward[i] is the total discounted reward of run i.
    """

    trajectory = []
    trajectory_reward = []

    for _ in range(num_runs):

        inventory = K
        levels = [inventory]
        episode_reward = 0
        discount = 1.0

        for time in range(T, 0, -1):

            price = policy[(inventory, time)]
            demand = sample_demand(price)
            sales = min(demand, inventory)

            # The k-th reward of an episode is weighted by GAMMA ** k. The old
            # recursion `reward + GAMMA * episode_reward` applied the weights in
            # reverse, discounting the earliest sale the most.
            episode_reward += discount * price * sales
            discount *= GAMMA

            inventory -= sales
            levels.append(inventory)

        trajectory.append(levels)
        trajectory_reward.append(episode_reward)

    return trajectory, trajectory_reward

def plot_episode_rewards(metrics, title = 'Total Discounted Rewards per Episode', ax = None, label = None):
    """ Plots the smoothed total discounted rewards per episode.

    metrics : dictionary; the sequence stored under 'episode_rewards' is plotted.
    title   : title of the plot.
    ax      : axes to draw on; defaults to the current axes.
    label   : legend label of the curve, if any.
    """

    if ax is None:
        # The default used to crash with an AttributeError on None.
        ax = plt.gca()

    smoothed = gaussian_filter1d(metrics['episode_rewards'], sigma=GAUSSIAN_SMOOTHING_STD)
    # label=None is what matplotlib uses when no label is given, so a single
    # call covers both the labelled and the unlabelled case.
    ax.plot(smoothed, label=label)
    ax.set_title(title, fontsize=14)
    ax.set_ylabel('Total Discounted Reward', fontsize=12)
    ax.set_xlabel('Episode', fontsize=12)
    ax.set_xlim(0, NUM_EPISODES)
    ax.grid(linewidth=0.35)

def plot_episode_regrets(metrics, title = 'Instantaneous Episodic Regret', ax = None, label = None):
    """ Plots the smoothed instantaneous episodic regret.

    metrics : dictionary; the sequence stored under 'episode_regrets' is plotted.
    title   : title of the plot.
    ax      : axes to draw on; defaults to the current axes.
    label   : legend label of the curve, if any.
    """

    if ax is None:
        # The default used to crash with an AttributeError on None.
        ax = plt.gca()

    smoothed = gaussian_filter1d(metrics['episode_regrets'], sigma=GAUSSIAN_SMOOTHING_STD)
    # label=None is what matplotlib uses when no label is given, so a single
    # call covers both the labelled and the unlabelled case.
    ax.plot(smoothed, label=label)
    ax.set_title(title, fontsize=14)
    ax.set_ylabel('Regret', fontsize=12)
    ax.set_xlabel('Episode', fontsize=12)
    ax.set_xlim(0, NUM_EPISODES)
    ax.grid(linewidth=0.35)

def plot_cumulative_rewards(metrics, title = 'Cumulative Discounted Rewards over Episodes', ax = None, label = None):
    """ Plots the smoothed cumulative discounted rewards over episodes.

    metrics : dictionary; the sequence stored under 'cumulative_rewards' is plotted.
    title   : title of the plot.
    ax      : axes to draw on; defaults to the current axes.
    label   : legend label of the curve, if any.
    """

    if ax is None:
        # The default used to crash with an AttributeError on None.
        ax = plt.gca()

    smoothed = gaussian_filter1d(metrics['cumulative_rewards'], sigma=GAUSSIAN_SMOOTHING_STD)
    # label=None is what matplotlib uses when no label is given, so a single
    # call covers both the labelled and the unlabelled case.
    ax.plot(smoothed, label=label)
    ax.set_title(title, fontsize=14)
    ax.set_ylabel('Cumulative Discounted Reward', fontsize=12)
    ax.set_xlabel('Episode', fontsize=12)
    ax.set_xlim(0, NUM_EPISODES)
    ax.grid(linewidth=0.35)

def plot_cumulative_regrets(metrics, title = 'Cumulative Regrets over Episodes', ax = None, label = None):
    """ Plots the smoothed cumulative regrets over episodes.

    metrics : dictionary; the sequence stored under 'cumulative_regrets' is plotted.
    title   : title of the plot.
    ax      : axes to draw on; defaults to the current axes.
    label   : legend label of the curve, if any.
    """

    if ax is None:
        # The default used to crash with an AttributeError on None.
        ax = plt.gca()

    smoothed = gaussian_filter1d(metrics['cumulative_regrets'], sigma=GAUSSIAN_SMOOTHING_STD)
    # label=None is what matplotlib uses when no label is given, so a single
    # call covers both the labelled and the unlabelled case.
    ax.plot(smoothed, label=label)
    ax.set_title(title, fontsize=14)
    ax.set_ylabel('Cumulative Regret', fontsize=12)
    ax.set_xlabel('Episode', fontsize=12)
    ax.set_xlim(0, NUM_EPISODES)
    ax.grid(linewidth=0.35)

def plot_state_value_evolution(metrics, title = '', ax = None):
    """ Plots the evolution of the state value function over episodes.

    metrics : dictionary; metrics['starting_values'] is converted to a 2-D
              array and column i is drawn as the curve labelled with PRICES[i].
    title   : title of the plot.
    ax      : axes to draw on; defaults to the current axes.
    """

    if ax is None:
        # The default used to crash with an AttributeError on None.
        ax = plt.gca()

    # Convert once; the original rebuilt this array for every price.
    starting_values = np.array(metrics['starting_values'])

    for idx, price in enumerate(PRICES):
        smoothed = gaussian_filter1d(starting_values[:, idx], sigma=GAUSSIAN_SMOOTHING_STD)
        ax.plot(smoothed, label=f'p={price}')
    ax.set_title(title, fontsize=14)
    ax.set_ylabel('Value Function', fontsize=12)
    ax.set_xlabel('Episode', fontsize=12)
    ax.set_xlim(0, NUM_EPISODES)
    ax.grid(linewidth=0.35)

def plot_policy_evolution(metrics, title = '', ax = None):
    """ Plots the evolution of the policy over episodes.

    metrics : dictionary; the sequence stored under 'starting_policies' is plotted.
    title   : title of the plot.
    ax      : axes to draw on; defaults to the current axes.
    """

    if ax is None:
        # The default used to crash with an AttributeError on None.
        ax = plt.gca()

    # Draw the series once. A copied loop over PRICES used to draw the very
    # same data len(PRICES) times on top of itself.
    ax.plot(metrics['starting_policies'])
    ax.set_title(title, fontsize=14)
    ax.set_ylabel('Price', fontsize=12)
    ax.set_xlabel('Episode', fontsize=12)
    ax.set_xlim(0, NUM_EPISODES)
    ax.grid(linewidth=0.35)

def save_plot(dest_path):
    """ Saves the current figure to dest_path, trimming the surrounding whitespace. """

    save_options = {'bbox_inches': 'tight'}
    plt.savefig(dest_path, **save_options)

## 1. Simulation of Sales

In [None]:
# Run value iteration (imported from `algorithms`) and simulate sales under the
# policy it returns.
V_vi, policy_vi = value_iteration()
trajectory_vi, reward_vi = generate_trajectory(policy_vi)

In [None]:
# Run Q-learning (imported from `algorithms`) and simulate sales under the
# policy it returns; the metrics are plotted in the later sections.
Q_ql, policy_ql, metrics_ql = q_learning()
trajectory_ql, reward_ql = generate_trajectory(policy_ql)

In [None]:
# Run SARSA(λ) (imported from `algorithms`) and simulate sales under the
# policy it returns; the metrics are plotted in the later sections.
Q_sarsa, policy_sarsa, metrics_sarsa = sarsa_lambda()
trajectory_sarsa, reward_sarsa = generate_trajectory(policy_sarsa)

In [None]:
# Overlay the first 5 simulated inventory paths of each method. Only the first
# path of a method carries a legend label, so the legend has three entries.
methods = [
    (trajectory_vi, 'b', 'Value Iteration'),
    (trajectory_ql, 'r', 'Q Learning'),
    (trajectory_sarsa, 'g', 'SARSA(λ)'),
]
for run_idx in range(5):
    for paths, line_color, method in methods:
        plt.plot(paths[run_idx], c=line_color, linewidth=0.5, label=method if run_idx == 0 else None)

plt.ylabel('Seats Left in Inventory')
plt.xlabel('Number of Days Elapsed')
plt.ylim(0, K)
plt.xlim(0, T)
plt.legend()
plt.grid(linewidth=0.35)
plt.savefig('../slides/figures/trajectory_1.png')
plt.show()

In [None]:
# Overlay the first 10 simulated inventory paths of each method. Only the first
# path of a method carries a legend label, so the legend has three entries.
methods = [
    (trajectory_vi, 'b', 'Value Iteration'),
    (trajectory_ql, 'r', 'Q Learning'),
    (trajectory_sarsa, 'g', 'SARSA(λ)'),
]
for run_idx in range(10):
    for paths, line_color, method in methods:
        plt.plot(paths[run_idx], c=line_color, linewidth=0.5, label=method if run_idx == 0 else None)

plt.ylabel('Seats Left in Inventory')
plt.xlabel('Number of Days Elapsed')
plt.ylim(0, K)
plt.xlim(0, T)
plt.legend()
plt.grid(linewidth=0.35)
plt.savefig('../slides/figures/trajectory_2.png')
plt.show()

In [None]:
# Plot, for each method, the inventory path averaged over all simulated runs.
methods = [
    (trajectory_vi, 'b', 'Value Iteration'),
    (trajectory_ql, 'r', 'Q Learning'),
    (trajectory_sarsa, 'g', 'SARSA(λ)'),
]
for paths, line_color, method in methods:
    mean_path = np.mean(np.array(paths), axis=0)
    plt.plot(mean_path, c=line_color, linewidth=1, label=method)

plt.ylabel('Seats Left in Inventory')
plt.xlabel('Number of Days Elapsed')
plt.ylim(0, K)
plt.xlim(0, T)
plt.legend()
plt.grid(linewidth=0.35)
plt.savefig('../slides/figures/average_trajectory.png')
plt.show()

## 2. Variation in Value Function

In [None]:
# Reduce both learned state action value functions to value functions.
V_ql, V_sarsa = map(value_function, (Q_ql, Q_sarsa))

In [None]:
# Convert each value function to an array once, instead of once per plotted
# inventory level as before.
value_arrays = [
    (dict_to_array(V_vi), 'b', 'Value Iteration'),
    (dict_to_array(V_ql), 'r', 'Q Learning'),
    (dict_to_array(V_sarsa), 'g', 'SARSA(λ)'),
]

# One figure per inventory level: value function against the days left.
for inventory in [K // 2, K]:
    for values, line_color, method in value_arrays:
        plt.plot(values[inventory - 1], c=line_color, linewidth=0.75, label=method)
    plt.ylabel('Value Function', fontsize=12)
    plt.xlabel('Days Left', fontsize=12)
    plt.ylim(0, 250000)
    plt.xlim(0, T)
    plt.legend(loc='upper left')
    plt.grid(linewidth=0.35)
    plt.savefig(f'../slides/figures/inventory_{inventory}.png')
    plt.show()

In [None]:
# Convert each value function to an array once, instead of once per plotted
# time step as before.
value_arrays = [
    (dict_to_array(V_vi), 'b', 'Value Iteration'),
    (dict_to_array(V_ql), 'r', 'Q Learning'),
    (dict_to_array(V_sarsa), 'g', 'SARSA(λ)'),
]

# One figure per time step: value function against the remaining seats.
for timestep in [T // 2, T]:
    for values, line_color, method in value_arrays:
        plt.plot(values[:, timestep - 1], c=line_color, linewidth=0.75, label=method)
    plt.ylabel('Value Function', fontsize=12)
    plt.xlabel('Remaining Seats', fontsize=12)
    plt.ylim(0, 250000)
    plt.xlim(0, K)
    plt.legend(loc='upper left')
    plt.grid(linewidth=0.35)
    plt.savefig(f'../slides/figures/timestep_{timestep}.png')
    plt.show()

In [None]:
# Convert the Q-learning state action values once, not once per plotted state.
Q_ql_array = state_value_function_to_array(Q_ql)

for (inventory, time, color) in [(20, 20, 'b'), (20, 30, 'darkorange'), (30, 20, 'g'), (30, 30, 'r')]:
    plt.plot(PRICES, Q_ql_array[inventory - 1, time - 1], c=color, label=f'K={inventory}, T={time}')
    # V_vi is a dictionary keyed by the (inventory, time) state itself. The
    # zero-based array indices used before looked up the reference value of
    # state (inventory - 1, time - 1) instead.
    plt.axhline(y=V_vi[(inventory, time)], c=color, linestyle=':')
plt.ylabel('State Value Function', fontsize=12)
plt.xlabel('Price', fontsize=12)
plt.ylim(100000, 250000)
plt.xlim(PRICES[0], PRICES[-1])
plt.legend(loc='upper left')
plt.grid(linewidth=0.35)
plt.savefig(f'../slides/figures/state_action_ql.png')
plt.show()

In [None]:
# Convert the SARSA(λ) state action values once, not once per plotted state.
Q_sarsa_array = state_value_function_to_array(Q_sarsa)

for (inventory, time, color) in [(20, 20, 'b'), (20, 30, 'darkorange'), (30, 20, 'g'), (30, 30, 'r')]:
    plt.plot(PRICES, Q_sarsa_array[inventory - 1, time - 1], c=color, label=f'K={inventory}, T={time}')
    # V_vi is a dictionary keyed by the (inventory, time) state itself. The
    # zero-based array indices used before looked up the reference value of
    # state (inventory - 1, time - 1) instead.
    plt.axhline(y=V_vi[(inventory, time)], c=color, linestyle=':')
plt.ylabel('State Value Function', fontsize=12)
plt.xlabel('Price', fontsize=12)
plt.ylim(100000, 250000)
plt.xlim(PRICES[0], PRICES[-1])
plt.legend(loc='upper left')
plt.grid(linewidth=0.35)
plt.savefig(f'../slides/figures/state_action_sarsa.png')
plt.show()

In [None]:
# Side by side: evolution of the starting state values for both learners.
fig, axs = plt.subplots(1, 2, figsize=(18, 5))

learners = [(metrics_ql, 'Q-Learning'), (metrics_sarsa, 'SARSA(λ)')]
for panel, (run_metrics, method) in zip(axs, learners):
    plot_state_value_evolution(run_metrics, ax=panel, title=method)
    panel.legend(fontsize=12, loc='lower right')

save_plot('../slides/figures/value_evolution.png')
plt.show()

In [None]:
# Side by side: evolution of the starting policy for both learners.
fig, axs = plt.subplots(1, 2, figsize=(18, 5))

learners = [(metrics_ql, 'Q-Learning'), (metrics_sarsa, 'SARSA(λ)')]
for panel, (run_metrics, method) in zip(axs, learners):
    plot_policy_evolution(run_metrics, ax=panel, title=method)

save_plot('../slides/figures/policy_evolution.png')
plt.show()

## 3. Learned Policies, Rewards and Regrets

In [None]:
# Heatmaps of the three value functions on a shared color scale.
fig, axs = plt.subplots(1, 3, figsize=(20, 5))

plot_matrix(V_vi, 'Value/Policy Iteration', fig, axs[0], 0.0, (0, 250000))
plot_matrix(V_ql, 'Q-Learning', fig, axs[1], 0.0, (0, 250000))
plot_matrix(V_sarsa, 'SARSA(λ)', fig, axs[2], 0.0, (0, 250000))

# plot_matrix labels its colorbar 'Price' by default, but these heatmaps show
# the value function, so relabel the colorbar attached to each panel's image.
for panel in axs:
    panel.images[-1].colorbar.set_label('Value Function', fontsize=12)

save_plot('../slides/figures/value_function.png')
plt.show()

In [None]:
# Gaussian-smoothed (sigma = 1.0) heatmaps of the three value functions on a
# shared color scale.
fig, axs = plt.subplots(1, 3, figsize=(20, 5))

plot_matrix(V_vi, 'Value/Policy Iteration', fig, axs[0], 1.0, (0, 250000))
plot_matrix(V_ql, 'Q-Learning', fig, axs[1], 1.0, (0, 250000))
plot_matrix(V_sarsa, 'SARSA(λ)', fig, axs[2], 1.0, (0, 250000))

# plot_matrix labels its colorbar 'Price' by default, but these heatmaps show
# the value function, so relabel the colorbar attached to each panel's image.
for panel in axs:
    panel.images[-1].colorbar.set_label('Value Function', fontsize=12)

save_plot('../slides/figures/value_function_smooth.png')
plt.show()

In [None]:
# Heatmaps of the three policies on plot_matrix's default price color scale.
fig, axs = plt.subplots(1, 3, figsize=(20, 5))

policies = [
    (policy_vi, 'Value/Policy Iteration'),
    (policy_ql, 'Q-Learning'),
    (policy_sarsa, 'SARSA(λ)'),
]
for panel, (learned_policy, method) in zip(axs, policies):
    plot_matrix(learned_policy, method, fig, panel)

save_plot('../slides/figures/policy.png')
plt.show()

In [None]:
# Gaussian-smoothed (sigma = 1.0) heatmaps of the three policies.
fig, axs = plt.subplots(1, 3, figsize=(20, 5))

policies = [
    (policy_vi, 'Value/Policy Iteration'),
    (policy_ql, 'Q-Learning'),
    (policy_sarsa, 'SARSA(λ)'),
]
for panel, (learned_policy, method) in zip(axs, policies):
    plot_matrix(learned_policy, method, fig, panel, 1.0)

save_plot('../slides/figures/policy_smooth.png')
plt.show()

In [None]:
# Left: per-episode rewards; right: cumulative rewards, for both learners.
fig, axs = plt.subplots(1, 2, figsize=(18, 5))
learners = [(metrics_ql, 'Q-Learning'), (metrics_sarsa, 'SARSA(λ)')]

for run_metrics, method in learners:
    plot_episode_rewards(run_metrics, ax=axs[0], label=method)
axs[0].legend(fontsize=12, loc='upper right')

for run_metrics, method in learners:
    plot_cumulative_rewards(run_metrics, ax=axs[1], label=method)
axs[1].legend(fontsize=12)

save_plot('../slides/figures/reward.png')
plt.show()

In [None]:
# Left: per-episode regrets; right: cumulative regrets, for both learners.
fig, axs = plt.subplots(1, 2, figsize=(18, 5))
learners = [(metrics_ql, 'Q-Learning'), (metrics_sarsa, 'SARSA(λ)')]

for run_metrics, method in learners:
    plot_episode_regrets(run_metrics, ax=axs[0], label=method)
axs[0].legend(fontsize=12)

for run_metrics, method in learners:
    plot_cumulative_regrets(run_metrics, ax=axs[1], label=method)
axs[1].legend(fontsize=12)

save_plot('../slides/figures/regret.png')
plt.show()

## 4. Effect of Exploration

For simplicity, the hyperparameters cannot be changed on the fly: each value has to be edited manually, and the kernel restarted, before the corresponding cell below is run.

In [None]:
# Create the cache directory before training, so the results are not lost to a
# FileNotFoundError when they are dumped. Judging by the plot labels further
# down, this run is meant for ε = 1.00.
os.makedirs('.tmp', exist_ok=True)
_, policy_100, metrics_100 = q_learning()
with open('.tmp/epsilon_100.pkl', 'wb') as f:
    pickle.dump([policy_100, metrics_100], f)

In [None]:
# Create the cache directory before training, so the results are not lost to a
# FileNotFoundError when they are dumped. Judging by the plot labels further
# down, this run is meant for ε = 0.60.
os.makedirs('.tmp', exist_ok=True)
_, policy_60, metrics_60 = q_learning()
with open('.tmp/epsilon_60.pkl', 'wb') as f:
    pickle.dump([policy_60, metrics_60], f)

In [None]:
# Create the cache directory before training, so the results are not lost to a
# FileNotFoundError when they are dumped. Judging by the plot labels further
# down, this run is meant for ε = 0.20.
os.makedirs('.tmp', exist_ok=True)
_, policy_20, metrics_20 = q_learning()
with open('.tmp/epsilon_20.pkl', 'wb') as f:
    pickle.dump([policy_20, metrics_20], f)

In [None]:
# Reload the three cached Q-learning runs written by the cells above.
with open('.tmp/epsilon_100.pkl', 'rb') as cache_file:
    policy_100, metrics_100 = pickle.load(cache_file)

with open('.tmp/epsilon_60.pkl', 'rb') as cache_file:
    policy_60, metrics_60 = pickle.load(cache_file)

with open('.tmp/epsilon_20.pkl', 'rb') as cache_file:
    policy_20, metrics_20 = pickle.load(cache_file)

In [None]:
# Gaussian-smoothed (sigma = 1.0) policy heatmaps, one panel per cached ε run.
fig, axs = plt.subplots(1, 3, figsize=(20, 5))

runs = [(policy_20, 'ε=0.20'), (policy_60, 'ε=0.60'), (policy_100, 'ε=1.00')]
for panel, (learned_policy, setting) in zip(axs, runs):
    plot_matrix(learned_policy, setting, fig, panel, 1.0)

save_plot('../slides/figures/epsilon_policy.png')
plt.show()

In [None]:
# Left: per-episode rewards; right: per-episode regrets, one curve per ε run.
fig, axs = plt.subplots(1, 2, figsize=(18, 5))
runs = [(metrics_20, 'ε=0.20'), (metrics_60, 'ε=0.60'), (metrics_100, 'ε=1.00')]

for run_metrics, setting in runs:
    plot_episode_rewards(run_metrics, ax=axs[0], label=setting)
axs[0].legend(fontsize=12, loc='upper left')

for run_metrics, setting in runs:
    plot_episode_regrets(run_metrics, ax=axs[1], label=setting)
axs[1].legend(fontsize=12)

save_plot('../slides/figures/epsilon_metrics.png')
plt.show()

## 5. Effect of Trace Decay Parameter

For simplicity, the hyperparameters cannot be changed on the fly: each value has to be edited manually, and the kernel restarted, before the corresponding cell below is run.

In [None]:
# Create the cache directory before training, so the results are not lost to a
# FileNotFoundError when they are dumped. Judging by the plot labels further
# down, this run is meant for λ = 1.00.
os.makedirs('.tmp', exist_ok=True)
_, policy_100, metrics_100 = sarsa_lambda()
with open('.tmp/lambda_100.pkl', 'wb') as f:
    pickle.dump([policy_100, metrics_100], f)

In [None]:
# Create the cache directory before training, so the results are not lost to a
# FileNotFoundError when they are dumped. Judging by the plot labels further
# down, this run is meant for λ = 0.90.
os.makedirs('.tmp', exist_ok=True)
_, policy_90, metrics_90 = sarsa_lambda()
with open('.tmp/lambda_90.pkl', 'wb') as f:
    pickle.dump([policy_90, metrics_90], f)

In [None]:
# Create the cache directory before training, so the results are not lost to a
# FileNotFoundError when they are dumped. Judging by the plot labels further
# down, this run is meant for λ = 0.80.
os.makedirs('.tmp', exist_ok=True)
_, policy_80, metrics_80 = sarsa_lambda()
with open('.tmp/lambda_80.pkl', 'wb') as f:
    pickle.dump([policy_80, metrics_80], f)

In [None]:
# Reload the three cached SARSA(λ) runs written by the cells above.
with open('.tmp/lambda_100.pkl', 'rb') as cache_file:
    policy_100, metrics_100 = pickle.load(cache_file)

with open('.tmp/lambda_90.pkl', 'rb') as cache_file:
    policy_90, metrics_90 = pickle.load(cache_file)

with open('.tmp/lambda_80.pkl', 'rb') as cache_file:
    policy_80, metrics_80 = pickle.load(cache_file)

In [None]:
# Gaussian-smoothed (sigma = 1.0) policy heatmaps, one panel per cached λ run.
fig, axs = plt.subplots(1, 3, figsize=(20, 5))

runs = [(policy_80, 'λ=0.80'), (policy_90, 'λ=0.90'), (policy_100, 'λ=1.00')]
for panel, (learned_policy, setting) in zip(axs, runs):
    plot_matrix(learned_policy, setting, fig, panel, 1.0)

save_plot('../slides/figures/lambda_policy.png')
plt.show()

In [None]:
# Left: per-episode rewards; right: per-episode regrets, one curve per λ run.
fig, axs = plt.subplots(1, 2, figsize=(18, 5))
runs = [(metrics_80, 'λ=0.80'), (metrics_90, 'λ=0.90'), (metrics_100, 'λ=1.00')]

for run_metrics, setting in runs:
    plot_episode_rewards(run_metrics, ax=axs[0], label=setting)
axs[0].legend(fontsize=12, loc='upper left')

for run_metrics, setting in runs:
    plot_episode_regrets(run_metrics, ax=axs[1], label=setting)
axs[1].legend(fontsize=12)

save_plot('../slides/figures/lambda_metrics.png')
plt.show()