In [2]:
from numpy.random import choice
import numpy as np

In [72]:
### Converting an MDP into an MRP

state_names = ["C1", "C2", "C3", "FB", "Sleep"]

# Probabilities changed to reflect uniform random policy

# Notice Class 3 probabilities reflect possible pub choice:
# Row 3 Column 1:
# (.5 * .2) = .1 = probability of picking pub action (.5) AND
# probability of being sent to class 1 (.2) as a result

# Together they mean a .1 probability 
# of ending up back in C1 from C3

p_matrix = [[0, .5, 0, .5, 0],
            [0, 0, .5, 0, .5],
            [.1, .2, .2, 0, .5],
            [.5, 0, 0, .5, 0],
            [0, 0, 0, 0, 0]]

# Action rewards are weighted and summed by probability of being chosen
# I.E: 5.5 = (.5 * 10) + (.5 * 1)
_rewards = [-1.5, -1, 5.5, -.5, 0]


gamma = 1
R = np.array(_rewards)
P = np.matrix(p_matrix)
I = np.identity(len(p_matrix))

solution = np.dot(np.linalg.inv((I-gamma*P)), R)
solution = solution.tolist()[0]

solutions = {}
for state in range(len(state_names)):
    solutions[state_names[state]] = solution[state]

solutions
    

{'C1': -1.3076923076923084,
 'C2': 2.6923076923076925,
 'C3': 7.384615384615385,
 'FB': -2.3076923076923075,
 'Sleep': 0.0}

In [67]:


state_names = ["C1", "C2", "C3", "FB", "Sleep"]

_transitions = {
    "C1": {"Study": [0, 1, 0, 0, 0], "FB": [0, 0, 0, 1, 0]},
    "C2": {"Study": [0, 0, 1, 0, 0], "Sleep": [0, 0, 0, 0, 1]},
    "C3": {"Study": [0, 0, 0, 0, 1], "Pub": [.2, .4, .4, 0, 0]},
    "FB": {"Quit": [1, 0, 0, 0, 0], "FB": [0, 0, 0, 1, 0]},
    "Sleep": {"Sleep": [0, 0, 0, 0, 1]}
}

_rewards = {
    "Study": [-2, -2, 10, None, None],
    "Sleep": [None, 0, None, None, 0],
    "FB": [-1, None, None, -1, None],
    "Quit": [None, None, None, 0, None],
    "Pub": [None, None, 1, None, None]
}

_policy = {
    "C1": {"Study": .5, "FB": .5},
    "C2": {"Study": .5, "Sleep": .5},
    "C3": {"Study": .5, "Pub": .5},
    "FB": {"Quit": .5, "FB": .5},
    "Sleep": {"Sleep": 1}
}



# Just cus
class ProbabilityMatrixException(Exception):
    def __init__(self, message):
        self.message = message
    
    
class MDP:
    def __init__(self, transitions, rewards, policy, state_names, terminal_state):
        self.transitions = transitions
        self.rewards = rewards
        self.state_names = state_names
        self.policy = policy
        self.terminal_state = terminal_state
        
    def sample_policy(self, state):
        probabilities = self.policy[state]
        action = choice(list(probabilities.keys()), p=list(probabilities.values()))
        return action
    
    def act(self, state, action):
        possible_actions = self.transitions[state]
        P = possible_actions[action]
        next_state = choice(self.state_names, p=P)
        return next_state
        
        
    def sample(self, start):
        states = []
        actions = []
        state = start
        while state != self.terminal_state:
            states.append(state)
            action = self.sample_policy(state)
            actions.append(action)
            next_state = self.act(state, action)
            state = next_state
            
        states.append(self.terminal_state)
        return states, actions
    
    
    def G(self, sample):
        states, actions = sample
        states.pop()
        rewards = []
        for i in range(len(states)):
            reward_list = self.rewards[actions[i]]
            state_index = self.state_names.index(states[i])
            assert reward_list[state_index] is not None
            rewards.append(reward_list[state_index])
            
        return np.sum(rewards)
        

mdp = MDP(_transitions, _rewards, _policy, state_names, "Sleep")



In [74]:
### Monte-Carlo State-Value Estimation
estimates = {}
for state in mdp.state_names:
    rewards = []
    for i in range(0, 5000):
        reward = mdp.G(mdp.sample(state))
        rewards.append(reward)
    estimates[state] = np.mean(rewards)
estimates


{'C1': -1.2836000000000001,
 'C2': 2.7210000000000001,
 'C3': 7.319,
 'FB': -2.3532000000000002,
 'Sleep': 0.0}

In [97]:
# Monte-Carlo Action-Value Estimation

action_estimates = {"C1": {}, "C2": {}, "C3": {}, "FB": {}}

action_rewards = {
    "C1": {"Study": [], "FB": []},
    "C2": {"Study": [], "Sleep": []},
    "C3": {"Study": [], "Pub": []},
    "FB": {"Quit": [], "FB": []},
    "Sleep": {"Sleep": []}
}

min_iterations = 5000

# Yeesh this is ugly
for state in mdp.state_names:
    if state == "Sleep":
        continue
        
    possible_actions = list(action_rewards[state].keys())
    while possible_actions:
        sample = mdp.sample(state)
        _, actions = sample
        reward = mdp.G(sample)
        action_rewards[state][actions[0]].append(reward)
        
        for action in possible_actions:
            if len(action_rewards[state][action]) > min_iterations:
                i = possible_actions.index(action)
                possible_actions.pop(i)

action_estimates = action_rewards
for state in action_rewards:
    for action in action_rewards[state]:
        action_estimates[state][action] = np.mean(action_rewards[state][action])
action_estimates

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


{'C1': {'FB': -3.400354121581743, 'Study': 0.6522695460907818},
 'C2': {'Sleep': 0.0, 'Study': 5.4152127033914921},
 'C3': {'Pub': 4.7754449110177966, 'Study': 10.0},
 'FB': {'FB': -3.3111243307555025, 'Quit': -1.4095180963807239},
 'Sleep': {'Sleep': nan}}