In [122]:
import numpy as np
import json

In [123]:
with open("input.json", "r") as read_file:
    data = json.load(read_file)
data

{'disc': 1.0,
 'reward_function': {'r': 0.5, 'r_B': [], 'r_T': [-1.0, 1.0]},
 'transition_rates': {'p1': 0.8, 'p2': 0.1, 'p3': 0.1},
 'world': {'size': {'M': 3, 'N': 4},
  'states': {'B': [], 'F': [[2, 2]], 'S': [1, 1], 'T': [[4, 2], [4, 3]]}}}

In [124]:
class MarkovDecisionProcess:
    def __init__(self, *, N, M, S, T, F, B, p1, p2, p3, r, r_T, r_B, disc):
        self.N, self.M = N, M
        self.S = S
        self.T = self.lists_to_tuples(T)
        self.F = self.lists_to_tuples(F)
        self.B = self.lists_to_tuples(B)
        self.p1, self.p2, self.p3 = p1, p2, p3
        self.r, self.r_T, self.r_B = r, r_T, r_B
        self.disc = disc  # discounting
        assert len(self.T) == len(self.r_T) 
        assert len(self.B) == len(self.r_B)
        
    def lists_to_tuples(self, I):
        return [(a, b) for a, b in I]


MDP = MarkovDecisionProcess(**data['world']['size'],
                            **data['world']['states'],
                            **data['transition_rates'],
                            **data['reward_function'],
                            disc=data['disc'])

In [189]:
class World:
    def __init__(self):  # S = (1,1), T = [(1,4), (5,2)], F = [(4,2)], B
        self.grid = [[State(x, y) for y in range(1, MDP.M + 1)] for x in range(1, MDP.N + 1)]
        
        self.get_state(*MDP.S).kind = 'S'
    
        for t, rt in zip(MDP.T, MDP.r_T):
            self.get_state(*t).kind = 'T'
            self.get_state(*t).reward = rt
        for b, rb in zip(MDP.B, MDP.r_B):
            self.get_state(*b).kind = 'B'
            self.get_state(*b).reward = rb
        for f in MDP.F:
            self.get_state(*f).kind = 'F'
            
    def get_state(self, x, y):
        return self.grid[x-1][y-1]
        
    def print_utility(self):
        pass
    
    def print_policy(self):
        pass
    
    
class State:
    def __init__(self, x, y, kind="N", reward=MDP.r):
        self.x, self.y = x, y
        self.kind = kind  # S/T/F/B
        self.reward = reward
        self.utility = -0.04
        self.policy = ""
        
    def neighbours(self):
        neigh_coordinates = [(self.x + i, self.y + j)
                             for i in [-1, 0, 1] for j in [-1, 0, 1]
                             if (i * j == 0 and i + j != 0)]
        return [c for c in neigh_coordinates
                if self.is_valid(*c) and c not in MDP.F]

    def move(self, d):
        p1, p2, p3 = MDP.p1, MDP.p2, MDP.p3
        if d == "U":
            np.random.choice(["U", "L", "R", "D"], p=[p1, p2, p3, 1-p1-p2-p3])
        
        if d == "D":
            pass
        
        if d == "L":
            pass
        
        if d == "R":
            pass
    
    def out_of_world(self, x, y):
        return 
        
    def invalid_state(self, x, y):
        out_of_world = x < 1 or x > MDP.N or y < 1 or y > MDP.M
        return (x, y) in MDP.F or out_of_world
        
        
    def transitions(self):
#         neigh_coordinates = [(self.x + i, self.y + j)
#                              for i in [-1, 0, 1] for j in [-1, 0, 1]
#                              if (i * j == 0 and i + j != 0)]
        probabilities = [MDP.p1, MDP.p2, MDP.p3, 1 - MDP.p1 - MDP.p2 - MDP.p3]
        neigh_coordinates = [(self.x - 1, self.y), (self.x, self.y + 1),
                             (self.x + 1, self.y), (self.x, self.y - 1)]
        transitions = []
        for move in ['U', 'L', 'R', 'D']:
            if move == 'U':
                destination = (self.x, self.y + 1)
                left = (self.x - 1, self.y)
                right = (self.x + 1, self.y)
                opposite = (self.x, self.y - 1)
            
            if move == 'L':
                destination = (self.x - 1, self.y)
                left = (self.x, self.y - 1)
                right = (self.x, self.y + 1)
                opposite = (self.x + 1, self.y)
                
            if move == 'D':
                destination = (self.x, self.y - 1)
                left = (self.x + 1, self.y)
                right = (self.x - 1, self.y)
                opposite = (self.x, self.y + 1)
            
            if move == 'R':
                destination = (self.x + 1, self.y)
                left = (self.x, self.y + 1)
                right = (self.x, self.y - 1)
                opposite = (self.x - 1, self.y)
                
            for coordinates, transition_rate in zip([destination, left, right, opposite], probabilities):
                if self.invalid_state(*coordinates):
                    coordinates = self.x, self.y
                transitions.append((move, coordinates, transition_rate))
        return transitions
                
import copy
from itertools import groupby
        
class ValueIteration:
    def __init__(self, stopping_condition=0.0001):
        self.stop_cond = stopping_condition
        self.t = 0
        self.W = World()
        self.converged = False
        
    def iterate(self):
        while not self.converged:
            self.converged = True
            self.t += 1
            next_W = copy.deepcopy(self.W)
            for x in range(1, MDP.N + 1):
                for y in range(1, MDP.M + 1):
                    if (x, y) not in MDP.F:
                        s = next_W.get_state(x, y)
                        prev_utility = s.utility
                        max_sum = self.select_action(self.W, s.transitions())
                        s.utility = s.reward + MDP.disc * max_sum
                        print(s.utility, prev_utility, f"({x}, {y})")
                        if abs(s.utility - prev_utility) < self.stop_cond:
                            self.converged = False
            self.W = next_W
    
    def select_action(self, world, transitions):
        sums = []
        for move, group in groupby(transitions, lambda x: x[0]):
            sum_ = 0
            for transition in group:
                sum_ += world.get_state(*transition[1]).utility * transition[2]
            sums.append((sum_, move))
        return max(sums, key=lambda x: x[1])[0]
                

In [190]:
w = World()
t = w.get_state(4,3).transitions()
vi = ValueIteration()
vi.iterate()

0.45999999999999996 -0.04 (1, 1)
0.45999999999999996 -0.04 (1, 2)
0.45999999999999996 -0.04 (1, 3)
0.45999999999999996 -0.04 (2, 1)
0.45999999999999996 -0.04 (2, 3)
0.45999999999999996 -0.04 (3, 1)
0.45999999999999996 -0.04 (3, 2)
0.45999999999999996 -0.04 (3, 3)
0.45999999999999996 -0.04 (4, 1)
-1.04 -0.04 (4, 2)
0.96 -0.04 (4, 3)


In [132]:
from itertools import cycle, islice

In [139]:
a = [1,2,3,4]
ac = cycle(a)

In [143]:
list(islice(ac, 1, 5))

[2, 3, 4, 1]

In [164]:
World().get_state(4,3).utility

-0.04