# Finite Inventory Pricing

## 0. Imports

In [None]:
import argparse
import numpy as np
import matplotlib.pyplot as plt
import pickle
from algorithms import policy_iteration, q_learning, sarsa_lambda, value_iteration
from constants import K, T, PRICES, TRACE_DECAY, GAUSSIAN_SMOOTHING_STD
from scipy.ndimage import gaussian_filter, gaussian_filter1d

# Enable matplotlib's automatic layout adjustment for every figure created
# below, so that titles, labels and colorbars are fitted inside the canvas.
plt.rcParams["figure.autolayout"] = True

In [None]:
def plot_policy(policy, title = '', fig = None, ax = None, sigma = 0.0):
    """ Displays a policy as a heatmap with one cell per (inventory, time) state.

    Args:
        policy: Mapping indexed by ``(inv, t)`` tuples for every ``inv`` in
            ``1..K`` and ``t`` in ``1..T``; each value becomes the colour of
            one heatmap cell (labelled 'Price' on the colorbar).
        title: Title drawn above the heatmap.
        fig: Figure that receives the colorbar when ``ax`` is given. When
            omitted, the figure that owns ``ax`` is used.
        ax: Axes to draw on. When ``None``, a new figure is created and shown
            immediately; otherwise the caller is responsible for showing it.
        sigma: Standard deviation of the Gaussian filter applied to the
            matrix before plotting.
    """

    # Row ``inv - 1`` / column ``t - 1`` of the matrix holds the policy value
    # for state ``(inv, t)``.
    policy_matrix = np.zeros((K, T))
    for inv in range(1, K + 1):
        for t in range(1, T + 1):
            policy_matrix[inv - 1, t - 1] = policy[(inv, t)]
    policy_matrix = gaussian_filter(policy_matrix, sigma=sigma)

    # NOTE(review): the axis limits below are expressed in matrix-index
    # coordinates, so a tick value ``i`` corresponds to state value ``i + 1``
    # and the outermost cells are cropped. Confirm this is intended.
    if ax is None:
        plt.figure()
        plt.imshow(policy_matrix, cmap='viridis', aspect='auto')
        plt.colorbar(label='Price')
        plt.title(title)
        plt.ylabel('Remaining Seats')
        plt.xlabel('Days Left')
        plt.ylim(1, K - 1)
        plt.xlim(1, T - 1)
        plt.show()

    else:
        # ``fig`` defaults to None; without this fallback a caller passing only
        # ``ax`` would fail on ``None.colorbar``.
        if fig is None:
            fig = ax.figure

        im = ax.imshow(policy_matrix, cmap='viridis', aspect='auto')
        cbar = fig.colorbar(im, ax=ax)
        cbar.set_label('Price', fontsize=12)
        ax.set_title(title, fontsize=14)
        ax.set_ylabel('Remaining Seats', fontsize=12)
        ax.set_xlabel('Days Left', fontsize=12)
        ax.set_ylim(1, K - 1)
        ax.set_xlim(1, T - 1)

def plot_episode_rewards(metrics, title = 'Total Discounted Rewards per Episode', ax = None, label = None):
    """ Plots the Gaussian-smoothed total discounted reward of each episode.

    Args:
        metrics: Mapping whose ``'episode_rewards'`` entry is the per-episode
            sequence to plot.
        title: Title of the axes.
        ax: Axes to draw on. Defaults to the current axes (``plt.gca()``).
        label: Legend label of the curve; ``None`` leaves the curve unlabelled.
    """

    # The signature advertises ``ax = None``; use the current axes in that
    # case instead of failing on ``None.plot``.
    if ax is None:
        ax = plt.gca()

    smoothed = gaussian_filter1d(metrics['episode_rewards'], sigma=GAUSSIAN_SMOOTHING_STD)
    # Passing ``label=None`` behaves like omitting the argument, so a single
    # call covers both the labelled and the unlabelled case.
    ax.plot(smoothed, label=label)
    ax.set_title(title, fontsize=14)
    ax.set_ylabel('Total Discounted Reward', fontsize=12)
    ax.set_xlabel('Episode', fontsize=12)
    ax.grid(linewidth=0.35)

def plot_episode_regrets(metrics, title = 'Instantaneous Episodic Regret', ax = None, label = None):
    """ Plots the Gaussian-smoothed instantaneous regret of each episode.

    Args:
        metrics: Mapping whose ``'episode_regrets'`` entry is the per-episode
            sequence to plot.
        title: Title of the axes.
        ax: Axes to draw on. Defaults to the current axes (``plt.gca()``).
        label: Legend label of the curve; ``None`` leaves the curve unlabelled.
    """

    # The signature advertises ``ax = None``; use the current axes in that
    # case instead of failing on ``None.plot``.
    if ax is None:
        ax = plt.gca()

    smoothed = gaussian_filter1d(metrics['episode_regrets'], sigma=GAUSSIAN_SMOOTHING_STD)
    # Passing ``label=None`` behaves like omitting the argument, so a single
    # call covers both the labelled and the unlabelled case.
    ax.plot(smoothed, label=label)
    ax.set_title(title, fontsize=14)
    ax.set_ylabel('Regret', fontsize=12)
    ax.set_xlabel('Episode', fontsize=12)
    ax.grid(linewidth=0.35)

def plot_cumulative_rewards(metrics, title = 'Cumulative Discounted Rewards over Episodes', ax = None, label = None):
    """ Plots the Gaussian-smoothed cumulative discounted rewards over episodes.

    Args:
        metrics: Mapping whose ``'cumulative_rewards'`` entry is the sequence
            to plot against the episode index.
        title: Title of the axes.
        ax: Axes to draw on. Defaults to the current axes (``plt.gca()``).
        label: Legend label of the curve; ``None`` leaves the curve unlabelled.
    """

    # The signature advertises ``ax = None``; use the current axes in that
    # case instead of failing on ``None.plot``.
    if ax is None:
        ax = plt.gca()

    smoothed = gaussian_filter1d(metrics['cumulative_rewards'], sigma=GAUSSIAN_SMOOTHING_STD)
    # Passing ``label=None`` behaves like omitting the argument, so a single
    # call covers both the labelled and the unlabelled case.
    ax.plot(smoothed, label=label)
    ax.set_title(title, fontsize=14)
    ax.set_ylabel('Cumulative Discounted Reward', fontsize=12)
    ax.set_xlabel('Episode', fontsize=12)
    ax.grid(linewidth=0.35)

def plot_cumulative_regrets(metrics, title = 'Cumulative Regrets over Episodes', ax = None, label = None):
    """ Plots the Gaussian-smoothed cumulative regrets over episodes.

    Args:
        metrics: Mapping whose ``'cumulative_regrets'`` entry is the sequence
            to plot against the episode index.
        title: Title of the axes.
        ax: Axes to draw on. Defaults to the current axes (``plt.gca()``).
        label: Legend label of the curve; ``None`` leaves the curve unlabelled.
    """

    # The signature advertises ``ax = None``; use the current axes in that
    # case instead of failing on ``None.plot``.
    if ax is None:
        ax = plt.gca()

    smoothed = gaussian_filter1d(metrics['cumulative_regrets'], sigma=GAUSSIAN_SMOOTHING_STD)
    # Passing ``label=None`` behaves like omitting the argument, so a single
    # call covers both the labelled and the unlabelled case.
    ax.plot(smoothed, label=label)
    ax.set_title(title, fontsize=14)
    ax.set_ylabel('Cumulative Regret', fontsize=12)
    ax.set_xlabel('Episode', fontsize=12)
    ax.grid(linewidth=0.35)

def save_plot(dest_path):
    """ Saves the current figure as an image file at ``dest_path``.

    Args:
        dest_path: Destination of the image. May be an absolute path or a
            path relative to the working directory; anything else accepted by
            ``plt.savefig`` (e.g. a file object) is passed through untouched.
    """
    import os

    # ``savefig`` fails if the target directory is missing, so create it first
    # when the destination is a filesystem path.
    if isinstance(dest_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(dest_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # 'tight' crops the saved image to the drawn content.
    plt.savefig(dest_path, bbox_inches='tight')

## 1. Learned Policies, Rewards and Regrets

In [None]:
# Run each of the four algorithms once, with no arguments. Every call is
# unpacked into a value estimate (V_* / Q_*, presumably the state-value or
# action-value table -- confirm in `algorithms`) and a policy; the two learning
# algorithms additionally return a metrics mapping that the reward and regret
# plots below read by key.
# Of the two dynamic-programming results, only policy_vi is plotted below
# (under the shared title 'Value/Policy Iteration').
V_vi, policy_vi = value_iteration()
V_pi, policy_pi = policy_iteration()
Q_ql, policy_ql, metrics_ql = q_learning()
Q_sarsa, policy_sarsa, metrics_sarsa = sarsa_lambda()

In [None]:
# Draw the three policies side by side, without smoothing.
fig, axs = plt.subplots(1, 3, figsize=(20, 5))

policy_panels = (
    (policy_vi, 'Value/Policy Iteration'),
    (policy_ql, 'Q-Learning'),
    (policy_sarsa, 'SARSA(λ)'),
)
for panel_ax, (panel_policy, panel_title) in zip(axs, policy_panels):
    plot_policy(panel_policy, panel_title, fig, panel_ax)

save_plot('../slides/figures/policy.png')
plt.show()

In [None]:
# Same three policies as above, each smoothed with a Gaussian filter (sigma 1.0).
fig, axs = plt.subplots(1, 3, figsize=(20, 5))

smoothed_panels = (
    (policy_vi, 'Value/Policy Iteration'),
    (policy_ql, 'Q-Learning'),
    (policy_sarsa, 'SARSA(λ)'),
)
for panel_ax, (panel_policy, panel_title) in zip(axs, smoothed_panels):
    plot_policy(panel_policy, panel_title, fig, panel_ax, 1.0)

save_plot('../slides/figures/policy_smooth.png')
plt.show()

In [None]:
# Left panel: per-episode rewards; right panel: cumulative rewards.
fig, axs = plt.subplots(1, 2, figsize=(18, 5))
episode_ax, cumulative_ax = axs

learner_runs = (
    (metrics_ql, 'Q-Learning'),
    (metrics_sarsa, 'SARSA(λ)'),
)

for run_metrics, run_label in learner_runs:
    plot_episode_rewards(run_metrics, ax=episode_ax, label=run_label)
episode_ax.legend(fontsize=12, loc='upper right')

for run_metrics, run_label in learner_runs:
    plot_cumulative_rewards(run_metrics, ax=cumulative_ax, label=run_label)
cumulative_ax.legend(fontsize=12)

save_plot('../slides/figures/reward.png')
plt.show()

In [None]:
# Left panel: per-episode regret; right panel: cumulative regret.
fig, axs = plt.subplots(1, 2, figsize=(18, 5))
episode_ax, cumulative_ax = axs

learner_runs = (
    (metrics_ql, 'Q-Learning'),
    (metrics_sarsa, 'SARSA(λ)'),
)

for run_metrics, run_label in learner_runs:
    plot_episode_regrets(run_metrics, ax=episode_ax, label=run_label)
episode_ax.legend(fontsize=12)

for run_metrics, run_label in learner_runs:
    plot_cumulative_regrets(run_metrics, ax=cumulative_ax, label=run_label)
cumulative_ax.legend(fontsize=12)

save_plot('../slides/figures/regret.png')
plt.show()

## 2. Effect of Exploration

For simplicity, this notebook does not support changing hyperparameters on the fly. To compare different settings, each hyperparameter value has to be edited manually, and the kernel must be restarted for the change to take effect.

In [None]:
# _, policy_10, metrics_10 = q_learning()
# with open('.tmp/epsilon_10.pkl', 'wb') as f:
#     pickle.dump([policy_10, metrics_10], f)

In [None]:
# _, policy_20, metrics_20 = q_learning()
# with open('.tmp/epsilon_20.pkl', 'wb') as f:
#     pickle.dump([policy_20, metrics_20], f)

In [None]:
# _, policy_30, metrics_30 = q_learning()
# with open('.tmp/epsilon_30.pkl', 'wb') as f:
#     pickle.dump([policy_30, metrics_30], f)

In [None]:
# Reload the cached [policy, metrics] pair of each exploration setting. The
# files are expected to have been written beforehand by the commented-out
# q_learning() cells above, one kernel session per setting.
# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# load cache files that were generated locally.
with open('.tmp/epsilon_10.pkl', 'rb') as f:
    policy_10, metrics_10 = pickle.load(f)

with open('.tmp/epsilon_20.pkl', 'rb') as f:
    policy_20, metrics_20 = pickle.load(f)

with open('.tmp/epsilon_30.pkl', 'rb') as f:
    policy_30, metrics_30 = pickle.load(f)

In [None]:
# Smoothed policy heatmaps, one panel per exploration setting.
fig, axs = plt.subplots(1, 3, figsize=(20, 5))

epsilon_panels = (
    (policy_10, 'ε=0.1'),
    (policy_20, 'ε=0.2'),
    (policy_30, 'ε=0.3'),
)
for panel_ax, (panel_policy, panel_title) in zip(axs, epsilon_panels):
    plot_policy(panel_policy, panel_title, fig, panel_ax, 1.0)

save_plot('../slides/figures/epsilon_policy.png')
plt.show()

In [None]:
# Left panel: per-episode rewards; right panel: per-episode regrets,
# with one curve per exploration setting.
fig, axs = plt.subplots(1, 2, figsize=(18, 5))
rewards_ax, regrets_ax = axs

epsilon_runs = (
    (metrics_10, 'ε=0.1'),
    (metrics_20, 'ε=0.2'),
    (metrics_30, 'ε=0.3'),
)

for run_metrics, run_label in epsilon_runs:
    plot_episode_rewards(run_metrics, ax=rewards_ax, label=run_label)
rewards_ax.legend(fontsize=12)

for run_metrics, run_label in epsilon_runs:
    plot_episode_regrets(run_metrics, ax=regrets_ax, label=run_label)
regrets_ax.legend(fontsize=12)

save_plot('../slides/figures/epsilon_metrics.png')
plt.show()

## 3. Effect of Trace Decay Parameter

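For simplicity, this notebook does not support changing hyperparameters on the fly. To compare different settings, each hyperparameter value (here the trace decay, `TRACE_DECAY`) has to be edited manually, and the kernel must be restarted for the change to take effect.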

In [None]:
# _, policy_100, metrics_100 = sarsa_lambda()
# with open('.tmp/lambda_100.pkl', 'wb') as f:
#     pickle.dump([policy_100, metrics_100], f)

In [None]:
# _, policy_90, metrics_90 = sarsa_lambda()
# with open('.tmp/lambda_90.pkl', 'wb') as f:
#     pickle.dump([policy_90, metrics_90], f)

In [None]:
# _, policy_80, metrics_80 = sarsa_lambda()
# with open('.tmp/lambda_80.pkl', 'wb') as f:
#     pickle.dump([policy_80, metrics_80], f)

In [None]:
# Reload the cached [policy, metrics] pair of each trace-decay setting. The
# files are expected to have been written beforehand by the commented-out
# sarsa_lambda() cells above, one kernel session per setting.
# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# load cache files that were generated locally.
with open('.tmp/lambda_100.pkl', 'rb') as f:
    policy_100, metrics_100 = pickle.load(f)

with open('.tmp/lambda_90.pkl', 'rb') as f:
    policy_90, metrics_90 = pickle.load(f)

with open('.tmp/lambda_80.pkl', 'rb') as f:
    policy_80, metrics_80 = pickle.load(f)

In [None]:
# Smoothed policy heatmaps, one panel per trace-decay setting.
fig, axs = plt.subplots(1, 3, figsize=(20, 5))

lambda_panels = (
    (policy_80, 'λ=0.80'),
    (policy_90, 'λ=0.90'),
    (policy_100, 'λ=1.00'),
)
for panel_ax, (panel_policy, panel_title) in zip(axs, lambda_panels):
    plot_policy(panel_policy, panel_title, fig, panel_ax, 1.0)

save_plot('../slides/figures/lambda_policy.png')
plt.show()

In [None]:
# Left panel: per-episode rewards; right panel: per-episode regrets,
# with one curve per trace-decay setting.
fig, axs = plt.subplots(1, 2, figsize=(18, 5))
rewards_ax, regrets_ax = axs

lambda_runs = (
    (metrics_80, 'λ=0.80'),
    (metrics_90, 'λ=0.90'),
    (metrics_100, 'λ=1.00'),
)

for run_metrics, run_label in lambda_runs:
    plot_episode_rewards(run_metrics, ax=rewards_ax, label=run_label)
rewards_ax.legend(fontsize=12, loc='upper left')

for run_metrics, run_label in lambda_runs:
    plot_episode_regrets(run_metrics, ax=regrets_ax, label=run_label)
regrets_ax.legend(fontsize=12)

save_plot('../slides/figures/lambda_metrics.png')
plt.show()