# Imports and Installs

In [56]:
#Installs & Imports
!pip install scipy
!pip install matplotlib
!pip install numpy
!pip install torchvision
import gym 
from gym import spaces
import collections
import pprint
import torch
import numpy as np
import random
import operator



#Initialising MDP

In [104]:
GAMMA = 0.9
TEST_EPISODES = 20
REWARD_GOAL = 0.8
N =1000
class MDP:
    def init(self):
        self.S = [0, 1, 2]
        self.endstate = self.S[-1]
        self.gamma = 0.99
        self.actions = [0, 1, 2] # 0 = BACK, 1 = FORWARD, 2 = STAY
        self.currentstate = self.actions[0]
        self.rewards = {0: 0.0,
                        1: 0.2,
                        2: 1.0}
        self.R = collections.defaultdict(float)
        self.T = {
            (0, 0):{0: 0.8, 1: 0.1, 2: 0.1},  
            (0, 1): {0: 0.1, 1: 0.7, 2: 0.2}, 
            (0, 2): {0: 0.8, 1: 0.1}, 
            (1, 0): {0: 0.7, 1: 0.2, 2: 0.1}, 
            (1, 1): {0: 0.1, 1: 0.1, 2: 0.8}, 
            (1, 2): {0: 0.1, 1: 0.8, 2: 0.1},
            (2, 0): {0: 0.1, 1: 0.8, 2: 0.1}, 
            (2, 1): {0: 0.8, 1: 0.1, 2: 0.1},
            (2, 2): {0: 0.1, 1: 0.1, 2: 0.8}}

        self.values = collections.defaultdict(float)
    


    def step(self, action):
        isdone = False
        new_state = max(self.T[(self.currentstate, action)].items(), key=operator.itemgetter(1))[0]
        if new_state == self.endstate:
            isdone = True
        else:
            isdone = False

        return new_state, self.rewards[new_state], isdone
       
    def select_action(self, state):
        #helper function to select action with highest utility
        best_action, best_value = None, None
        for action in range(len(self.actions)):
            action_value = self.values[(state, action)]
            if best_value is None or best_value < action_value:
                best_value = action_value
                best_action = action
        return best_action

    def get_state_utility(self, state):
        utility = 0
        for action in self.actions:
            utility += self.values[state, action]

        return utility/len(agent.actions)

        

    def play_n_random_steps(self, count):
        for _ in range(count):
            action = random.choice(self.actions)
            new_state, reward, is_done = self.step(action)
            #print("State: {}".format(self.currentstate), "Action: {}".format(action), "New State{}".format(new_state), "Reward {}".format(reward) )
            self.R[(self.currentstate, action, new_state)] = reward
            #self.T[(self.currentstate, action)][new_state] += 1 
            
            self.currentstate = new_state

            """
            if is_done:
                self.currentstate = self.S[0]
            else: 
                self.currentstate = new_state
            """
            

            

    def value_iteration_for_Q(self):
        for state in self.S:
            for action in self.actions:
                action_value = 0.0
                target_probs = self.T[(state, action)]
                total = sum(target_probs.values())
                for tgt_state, probability in target_probs.items():
                    key = (state, action, tgt_state)
                    reward = self.R[key]
                    best_action = self.select_action(tgt_state)
                    val = reward + GAMMA * self.values[(tgt_state, best_action)]
                    action_value += (probability / total) * val
                self.values[(state, action)] = action_value

    def find_optimal_policy(self):
        policy = [None] * len(self.S)
        for state in self.S:
            best_action = self.select_action(state)
            policy[state] = best_action
        return policy

#Solving MDP

In [106]:
agent = MDP()
agent.init()
agent.play_n_random_steps(N)
agent.value_iteration_for_Q()
print(agent.find_optimal_policy())

[1, 1, 2]


#Likelihood Function

In [None]:
#for i:
  #for t:
    #s = state(i)
    #a = action(i)
    #likelihood[i] = Q(s,a) - V(s)
#return likelihood

def likelihood(policy, rewardfunc):
    likelihoodList = [None] * len(policy)
    for i in range(len(policy)):
        state = int(i)
        action = int(policy[i])
        likelihoodList[i] = rewardfunc[state,action] - np.argmax(rewardfunc[state,:])
    return sum(likelihoodList) / len(likelihoodList) 

In [None]:
#new likelihood function
def likelihood1(policy, agent):
    likelihoodList = [None] * len(policy)
    for i in range(len(policy)):
        state = int(i)
        action = int(policy[i])
    #likelihoodList[i] = agent.R[(state, action, max(agent.T[(0,1)].items(), key=operator.itemgetter(1))[0])] - np.argmax(agent.R[state,:,:])
    
        likelihoodList[i] = agent.values[state,action]  - agent.get_state_utility(state) #change this to ... sum probabilites of all actions in state / number of actions
    #print(agent.get_state_utility(state))
    return sum(likelihoodList) / len(likelihoodList)

pprint.pprint(agent.values)
print(likelihood1([1,1,2], agent))
print(likelihood1([1,2,2], agent))
print(likelihood1([2,2,2], agent))
print(likelihood1([1,1,2], agent))




defaultdict(<class 'float'>,
            {(0, 0): 0.1,
             (0, 1): 0.20900000000000002,
             (0, 2): 0.16720000000000002,
             (1, 0): 0.23167000000000004,
             (1, 1): 0.8396603,
             (1, 2): 0.723365416,
             (2, 0): 0.723365416,
             (2, 1): 0.39115231444000004,
             (2, 2): 1.4152025265200001})
0.2878858340666667
0.24912087273333336
0.2351875394
0.2878858340666667


#Testing likelihood function

In [None]:
#create random policies for testing
actions = [0, 1, 2, 3]
nonoptimal1 = [None] * 16
nonoptimal2 = [None] * 16
nonoptimal3 = [None] * 16
policies = [nonoptimal1, nonoptimal2, nonoptimal3]
for policy in policies:
    for i in range(len(policy)):
        policy[i] = random.choice(actions)

#create random reward functions for testing
nonoptimal_rewardfunc1 = np.random.rand(env.observation_space.n,env.action_space.n)
nonoptimal_rewardfunc2 = np.random.rand(env.observation_space.n,env.action_space.n)
nonoptimal_rewardfunc3 = np.random.rand(env.observation_space.n,env.action_space.n)


#get optimal reward function
q_table = create_q_table(env) 


print("---------------------------------------")
print("\n*** Testing likelihood function using optimal policy with random reward functions ***\n")
print("---------------------------------------")
print("\n")
print("Likelihood for optimal reward function is {}".format(likelihood(optimal_policy, q_table)))
print("Likelihood for non optimal reward function 1 is {}".format(likelihood(optimal_policy, nonoptimal_rewardfunc1)))
print("Likelihood for non optimal reward function 2 is {}".format(likelihood(optimal_policy, nonoptimal_rewardfunc2)))
print("Likelihood for non optimal reward function 3 is {}".format(likelihood(optimal_policy, nonoptimal_rewardfunc3)))
print("\n")


print("---------------------------------------")
print("\n*** Testing likelihood function using optimal reward function with random policies ***\n")
print("---------------------------------------")
print("\n")
print("Likelihood for optimal policy  is {}".format(likelihood(optimal_policy, q_table)))
print("Likelihood for non optimal policy 1 is {}".format(likelihood(nonoptimal1, q_table)))
print("Likelihood for non optimal policy 2 is {}".format(likelihood(nonoptimal2, q_table)))
print("Likelihood for non optimal policy 3 is {}".format(likelihood(nonoptimal3, q_table)))
print("\n")








NameError: name 'env' is not defined

# **ignore** *Copy Paste Dump* 

In [None]:
"""
threshold = 0.0001
action = 0
TEST_EPISODES = 20
REWARD_GOAL = 0.8
N =100

        #hard coded R
        self.R_hardcoded = {(0, 0, 0): 0.0, 
                  (0, 0, 1): 0.0, 
                  (0, 0, 2): 1.0,
                  (0, 1, 0): 0.0,
                  (0, 1, 1): 0.0, 
                  (0, 1, 2): 1.0,
                  (0, 2, 0): 0.0,
                  (0, 2, 1): 0.0,
                  (1, 0, 0): 0.0, 
                  (1, 0, 1): 0.0, 
                  (1, 0, 2): 1.0,
                  (1, 1, 0): 0.0,
                  (1, 1, 1): 0.0, 
                  (1, 1, 2): 1.0,
                  (1, 2, 0): 0.0,
                  (1, 2, 1): 0.0,
                  (1, 2, 2): 1.0,
                  (2, 0, 0): 0.0, 
                  (2, 0, 1): 0.0, 
                  (2, 0, 2): 1.0,
                  (2, 1, 0): 0.0,
                  (2, 1, 1): 0.0, 
                  (2, 1, 2): 1.0,
                  (2, 2, 0): 0.0,
                  (2, 2, 1): 0.0,
                  (2, 2, 2): 1.0}

        #hard coded T

def step(self, action):
        isdone = False
        if action == 2:
            currentstate = self.currentstate #remain at current state
        elif action == 1:
            try:
                currentstate = self.actions[self.actions.index(self.currentstate)+1] #take step to right
            except IndexError:
                currentstate = self.actions[0]
        else:
            try:
                currentstate = self.actions[self.actions.index(self.currentstate)-1] #take step to left
            except IndexError:
                currentstate = self.currentstate
        if currentstate == self.endstate:
            isdone = True
        else:
            isdone = False

        return currentstate, self.rewards[currentstate], isdone
        

        
"""