In [None]:
import numpy as np
import matplotlib.pyplot as plt
from agent import KTDV
import pandas as pd
from environment import SimpleMDP
from tqdm import tqdm_notebook as tqdm
%matplotlib notebook

In [None]:
# KTDV agent on a SimpleMDP environment with 3 states.
ag = KTDV(environment=SimpleMDP(nr_states=3))

# Run 50 training episodes with fixed_policy=True and keep whatever each
# episode returns, keyed by episode index. Later cells index each entry as
# all_results[trial][step], so it is expected to be a per-step mapping.
all_results = {}
for ep in range(50):
    results = ag.train_one_episode(fixed_policy=True)
    all_results[ep] = results
 

In [None]:
# Flatten the nested {trial: {step: record}} structure into a single table
# with one row per (trial, step) pair.
per_step_records = {}
for trial in all_results:
    episode = all_results[trial]
    for step in episode.keys():
        per_step_records[(trial, step)] = episode[step]

results = pd.DataFrame.from_dict(per_step_records, orient='index')

In [None]:
# Name the two index levels so rows can be selected with .xs(..., level='trial' / 't').
results.index.names = ['trial', 't']

In [None]:
# Preview the first rows of the per-step results table.
results.head()

In [None]:
# rhat recorded at step t=1 of every trial, in trial order.
rhat_at_t1 = np.array(results.xs(1, level='t')['rhat'])

plt.figure()
plt.plot(rhat_at_t1)
plt.title('Predicted reward for transition 2-3 (R hat)')

In [None]:
plt.figure()

# One curve per step index; t=1 is drawn first so the legend order stays
# 'State 2', 'State 1'.
for step_idx, state_name in [(1, 'State 2'), (0, 'State 1')]:
    plt.plot(np.array(results.xs(step_idx, level='t')['V']), label=state_name)

plt.legend()

plt.title('Predicted value (V)')

In [None]:
# Show the 'cov' entry of the last recorded step (last row of the table) as an image.
plt.figure()
plt.imshow(results.iloc[-1]['cov']);plt.colorbar()

In [None]:
# 'weights' entry of the last recorded step.
results.iloc[-1]['weights']

In [None]:
# 'K' entry of the last recorded step (presumably the Kalman gain; confirm in agent.KTDV).
results.iloc[-1].K

In [None]:
# Stack the per-row 'cov' entries along a new leading axis, one slice per recorded step.
all_covs = np.stack(results['cov'].values)

In [None]:
plt.figure()

# Diagonal entries of each stacked covariance, plotted across all recorded
# steps; index 1 is drawn first so the legend order stays 'State 2', 'State 1'.
for diag_idx, state_name in [(1, 'State 2'), (0, 'State 1')]:
    plt.plot(all_covs[:, diag_idx, diag_idx], label=state_name)
plt.legend()

plt.ylim([0, 1])
plt.title('Uncertainty (variance)')

## Now try a 2D GridWorld

In [None]:
# NOTE(review): these imports would normally live in the first cell with the others.
from environment import GridWorld
# `plt` is already imported in the first cell; `patches` is needed by plot_maze below.
from matplotlib import pyplot as plt, patches as patches
# make_axes_locatable is not used in any cell visible in this notebook.
from mpl_toolkits.axes_grid1 import make_axes_locatable
# Second KTDV agent, this time on a GridWorld loaded from the 10x10 MDP file.
ag2 = KTDV(environment=GridWorld('./mdps/10x10.mdp'))

In [None]:
# Inspect the environment's reward_func attribute.
ag2.env.reward_func

In [None]:
def plot_maze(ax, agent=None):
    """Draw the grid-world layout of an agent's environment on ``ax``.

    Cells whose ``matrix_MDP[x][y]`` entry is -1 are filled gray, dotted grid
    lines are drawn on every cell boundary, and the goal and start positions
    are labelled 'G' and 'S'.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on. All drawing goes through this object, so it does not
        have to be the current pyplot axes.
    agent : optional
        Object exposing the grid world as ``agent.env``. Defaults to the
        notebook-level ``ag2``, so existing ``plot_maze(ax)`` calls are
        unchanged.
    """
    env = (ag2 if agent is None else agent).env

    for idx in range(env.num_cols * env.num_rows):
        # (x, y) indexes matrix_MDP[x][y]; on the plot, y runs horizontally and
        # x is flipped so that matrix row 0 ends up at the top.
        x, y = env.get_state_position(idx)
        if env.matrix_MDP[x][y] == -1:
            ax.add_patch(
                patches.Rectangle(
                    (y, env.num_rows - x - 1),  # lower-left corner in plot coordinates
                    1.0,  # width
                    1.0,  # height
                    facecolor="gray"
                )
            )

    # Vertical lines sit on column boundaries 0..num_cols. The closing line
    # was previously drawn at num_rows, which is wrong for non-square grids.
    for i in range(env.num_cols + 1):
        ax.axvline(i, color='k', linestyle=':')

    # Horizontal lines sit on row boundaries 0..num_rows.
    for j in range(env.num_rows + 1):
        ax.axhline(j, color='k', linestyle=':')

    # NOTE(review): the labels use goal_x/goal_y and start_x/start_y directly as
    # plot coordinates, without the (y, num_rows - x - 1) transform applied to
    # the wall cells above. Confirm which convention these attributes use.
    ax.text(env.goal_x + .1, env.goal_y + .1, 'G', fontsize=20, color='green')
    ax.text(env.start_x + .1, env.start_y + .1, 'S', fontsize=20, color='black')

    # Fixed view limits kept from the original; they are not derived from the grid size.
    ax.set_xlim([0, 12])
    ax.set_ylim([0, 12])
    ax.set_frame_on(False)

    aspect_ratio = np.diff(ax.get_xlim())[0] / np.diff(ax.get_ylim())[0]
    ax.set_aspect(aspect_ratio)
    


In [None]:
# Draw the maze on its own, with no overlay, to check the layout.
fig, ax = plt.subplots()
plot_maze(ax)

In [None]:
# Run 50 episodes on the grid world with random_policy=True. all_results is
# reset first, so it only holds this batch of episodes.
all_results = {}
for ep in tqdm(range(50)):
    results = ag2.train_one_episode(random_policy=True)
    all_results[ep] = results


In [None]:
# Flatten {trial: {step: record}} into one row per (trial, step) and name the
# two index levels so .xs(..., level=...) can be used on them.
per_step_records = {}
for trial in all_results:
    episode = all_results[trial]
    for step in episode.keys():
        per_step_records[(trial, step)] = episode[step]

results = pd.DataFrame.from_dict(per_step_records, orient='index')

results.index.names = ['trial', 't']

In [None]:
# Preview the last rows of the per-step results table.
results.tail()

In [None]:
# All steps of trial 0, indexed by step 't'.
first_episode = results.xs(0,level='trial')

In [None]:
# Grid position (x, y) as returned by get_state_position for every state
# visited in the first episode: one row per step, columns x and y.
locations = np.array([ag2.env.get_state_position(t) for t in first_episode.state.values])

In [None]:
fig, ax = plt.subplots()

plot_maze(ax)

# get_state_position returns (x, y) such that matrix_MDP[x][y] is the cell, and
# plot_maze draws that cell with its lower-left corner at (y, num_rows - x - 1).
# Apply the same transform here so the visited cells line up with the walls;
# plotting (x, y) directly transposes and flips them relative to the maze.
# NOTE(review): plot_maze places the 'S'/'G' labels at (start_x, start_y) and
# (goal_x, goal_y) without this transform. Confirm which convention those use.
ax.scatter(locations[:, 1] + .5,
           ag2.env.num_rows - locations[:, 0] - .5,
           alpha=.1)

ax.set_title('Occupancy during first trial')


In [None]:

# Run 100 further episodes on the same ag2 object, now with random_policy=False.
# all_results is reset, so the earlier random-policy episodes are dropped from it.
all_results = {}
for ep in tqdm(range(100)):
    results = ag2.train_one_episode(random_policy=False)
    all_results[ep] = results


In [None]:
# Flatten {trial: {step: record}} into one row per (trial, step) and name the
# two index levels so .xs(..., level=...) can be used on them.
per_step_records = {}
for trial in all_results:
    episode = all_results[trial]
    for step in episode.keys():
        per_step_records[(trial, step)] = episode[step]

results = pd.DataFrame.from_dict(per_step_records, orient='index')

results.index.names = ['trial', 't']

In [None]:
from plotting import GridWorldPlotter

In [None]:
# Plotting helper (from plotting.py, not shown here) built around the trained ag2 agent.
b = GridWorldPlotter(ag2)

In [None]:
# Maze drawn by the plotter, with its plot_value overlay on the same axes.
fig, ax = plt.subplots()
b.plot_maze(ax)

b.plot_value(ax)

In [None]:
# Maze drawn by the plotter, with its plot_uncertainty overlay on the same axes.
fig, ax = plt.subplots()
b.plot_maze(ax)

b.plot_uncertainty(ax)

In [None]:
# Pick the final recorded trial from the index instead of a hard-coded trial
# number. The previous value, 80, was not the last of the 100 trials (0..99)
# and raises KeyError whenever fewer than 81 episodes are run.
last_trial = results.index.get_level_values('trial').max()
last_episode = results.xs(last_trial, level='trial')

# Grid position (x, y) from get_state_position for every state visited in that episode.
locations = np.array([ag2.env.get_state_position(t) for t in last_episode.state.values])

In [None]:
fig, ax = plt.subplots()

plot_maze(ax)

# get_state_position returns (x, y) such that matrix_MDP[x][y] is the cell, and
# plot_maze draws that cell with its lower-left corner at (y, num_rows - x - 1).
# Apply the same transform here so the visited cells line up with the walls;
# plotting (x, y) directly transposes and flips them relative to the maze.
# NOTE(review): plot_maze places the 'S'/'G' labels at (start_x, start_y) and
# (goal_x, goal_y) without this transform. Confirm which convention those use.
ax.scatter(locations[:, 1] + .5,
           ag2.env.num_rows - locations[:, 0] - .5,
           alpha=.3)



In [None]:
# Fresh KTDV agent on a new GridWorld instance loaded from the same 10x10 MDP file.
ag3 = KTDV(environment=GridWorld('./mdps/10x10.mdp'))

In [None]:
# Overwrite the environment's start coordinates before training.
# NOTE(review): presumably this moves where episodes begin; confirm that
# GridWorld reads start_x/start_y when an episode is reset.
ag3.env.start_x = 1
ag3.env.start_y = 1

In [None]:
# Run 100 episodes on ag3 with random_policy=False. The per-episode results are
# collected in all_results, but no later visible cell rebuilds the `results`
# table from them; after this loop `results` holds the last episode's return value.
all_results = {}
for ep in tqdm(range(100)):
    results = ag3.train_one_episode(random_policy=False)
    all_results[ep] = results

In [None]:
# Plotting helper (from plotting.py, not shown here) built around the trained ag3 agent.
plotter = GridWorldPlotter(ag3)

In [None]:
# Maze for ag3 with the plotter's plot_uncertainty overlay on the same axes.
fig, ax = plt.subplots()
plotter.plot_maze(ax)

plotter.plot_uncertainty(ax)

In [None]:
# Maze for ag3 on its own; no overlay is drawn in this cell.
fig, ax = plt.subplots()
plotter.plot_maze(ax)



In [None]:
# Maze for ag3 with the plotter's plot_value overlay on the same axes.
fig, ax = plt.subplots()
plotter.plot_maze(ax)

plotter.plot_value(ax)