In [None]:
from agent import KalmanSR
from environment import SimpleMDP
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from dynamic_programming import value_iteration
import seaborn as sns
from tqdm import tqdm_notebook as tqdm
from itertools import product
%matplotlib notebook

In [None]:
# Instantiate the MDP with argument 5 and build its graph.
env = SimpleMDP(5)
env.create_graph()

# Draw the graph on a fresh figure, placing node i at (i, 0) so the five
# nodes sit on one horizontal line.
plt.figure()
positions = {node: (node, 0) for node in range(5)}
env.show_graph(layout=positions)


In [None]:
# Hyperparameters for the Kalman filter over the flattened matrix M
# (one parameter per entry, env.nr_states ** 2 in total).
transition_noise = .005 * np.eye(env.nr_states ** 2)  # added to the covariance in every prediction step
gamma = .9  # discount applied to the next-state features
kappa = 1.  # scaling parameter for the sigma points and their weights
prior_M = np.eye(env.nr_states).flatten()
prior_covariance = np.eye(env.nr_states ** 2)  # np.ones((env.nr_states**2, env.nr_states**2))
observation_noise_variance = np.eye(env.nr_states)  # np.ones([env.nr_states, env.nr_states])

# Start the running estimates from *copies* of the priors. Binding the same
# array objects would let any in-place update of M (e.g. `M += ...`) silently
# overwrite prior_M, so the prior could not be inspected or reused afterwards.
M = prior_M.copy()
covariance = prior_covariance.copy()


In [None]:
def get_feature_representation(state_idx):
    """Return the feature vector for the state with index ``state_idx``.

    Non-terminal states map to the one-hot vector of length ``env.nr_states``
    with a 1 at position ``state_idx``; states for which ``env.is_terminal``
    is true map to the all-zero vector of the same length.
    """
    n_states = env.nr_states
    if not env.is_terminal(state_idx):
        return np.eye(n_states)[state_idx]
    return np.zeros(n_states)



In [None]:
# Kalman temporal-difference estimation of the flattened matrix M.
#
# Every environment step is treated as one filter update: the observation is
# the current feature vector, and the predicted observation is the weighted
# mean of the sigma points pushed through the linear map built from the
# temporal-difference features H = features - gamma * next_features.
identity = np.eye(env.nr_states)  # loop-invariant block used in the Kronecker product

for episode in tqdm(range(100)):
    env.reset()
    t = 0
    s = env.get_current_state()
    features = get_feature_representation(s)

    # Cap each episode at 1000 steps in case the terminal state is never reached.
    while not env.is_terminal(env.get_current_state()) and t < 1000:
        a = 1  # fixed policy: always take action 1

        next_state, reward = env.act(a)
        next_features = get_feature_representation(next_state)
        H = features - gamma * next_features  # Temporal difference features
        # Maps a flattened M (length nr_states ** 2) to sum_k H[k] * M[k, :].
        feature_block_matrix = np.kron(H, identity).T

        # Prediction step: the parameter mean is unchanged, the covariance
        # grows by the transition noise.
        a_priori_covariance = covariance + transition_noise

        # Sigma points are drawn around M from the *predicted* covariance, so
        # that the statistics below agree with the a-priori covariance that the
        # correction step subtracts from.
        n = len(M)
        cholesky = np.linalg.cholesky((kappa + n) * a_priori_covariance)
        X = np.empty((2 * n + 1, n))
        X[:] = M  # every row starts at the current mean
        X[1:n + 1] += cholesky.T   # row j + 1     = M + j-th Cholesky column
        X[n + 1:] -= cholesky.T    # row j + n + 1 = M - j-th Cholesky column
        weights = np.full(2 * n + 1, 1. / (2 * (kappa + n)))
        weights[0] = kappa / (kappa + n)

        # Push every sigma point through the observation map.
        Y = np.matmul(X, feature_block_matrix)

        # Statistics of interest: predicted observation and (cross-)covariances.
        phi_hat = weights @ Y
        X_dev = X - M
        weighted_Y_dev = weights[:, np.newaxis] * (Y - phi_hat)
        param_error_cov = X_dev.T @ weighted_Y_dev
        # Regularise the innovation covariance with the observation noise
        # instead of clamping it element-wise: an element-wise floor overwrites
        # legitimate negative cross-covariances and breaks the matrix structure.
        residual_cov = (Y - phi_hat).T @ weighted_Y_dev + observation_noise_variance

        delta_t = features - phi_hat

        # Correction step. residual_cov is symmetric, so
        # K = P_xy @ inv(P_r) can be obtained from a linear solve.
        kalman_gain = np.linalg.solve(residual_cov, param_error_cov.T).T
        delta_M = np.matmul(kalman_gain, delta_t)

        # Rebind rather than update in place so that an array aliased to M
        # (such as the prior it was initialised from) is left untouched.
        M = M + delta_M

        covariance = a_priori_covariance - np.matmul(np.matmul(kalman_gain, residual_cov), kalman_gain.T)

        s = next_state
        features = get_feature_representation(s)

        t += 1

# Show the learned estimate as an nr_states-row matrix.
np.around(M.reshape(env.nr_states, -1), decimals=3)


In [None]:
# Heat map of the parameter covariance with the last env.nr_states rows and
# columns cropped off (presumably the entries belonging to the final state --
# TODO confirm).
n_shown = env.nr_states ** 2 - env.nr_states
m_labels = ['{}-{}'.format(i, j) for i, j in product(range(env.nr_states), repeat=2)]
# One label per displayed row/column: matplotlib requires the number of tick
# labels to match the number of tick positions.
shown_labels = m_labels[:n_shown]
tick_positions = list(range(n_shown))

plt.figure()
plt.imshow(covariance[:n_shown, :n_shown])
plt.xticks(ticks=tick_positions, labels=shown_labels, rotation=90)
plt.yticks(ticks=tick_positions, labels=shown_labels, rotation=0)

plt.colorbar()

In [None]:
# Diagonal of the covariance matrix, i.e. the variance of each flattened entry of M.
np.diag(covariance)

In [None]:
# Predicted observation from the most recent update of the training loop;
# only defined if that loop executed at least one step.
phi_hat

In [None]:
# Full parameter covariance matrix as left by the training loop.
covariance