Markov Decision Process (MDP)
===
<ul>
<li>States</li>
</ul>


Actions

Rewards

Probabilities

Gamma

In [27]:
import random
import numpy as np
from numpy.random import choice

In [28]:
def normalizedList(size):
    l = [random.random() for i in range(size)]
    s = sum(l)
    lnorm = [x/s for x in l]
    return lnorm

In [29]:
class MDP:
    
    def __init__(self, numStates, numActions, gamma = 0.9):
        self.rewards = { (s,a,sNext) : random.randint(-20,20) \
                            for s in range(numStates) \
                            for a in range(numActions) \
                            for sNext in range(numStates)}
        self.probs = [[normalizedList(numStates) \
                       for a in range(numActions)] \
                       for s in range(numStates)]
        self.states = range(numStates)
        self.state = 0
        self.gamma = gamma
        self.numStates = numStates
        self.numActions = numActions
    
    def takeAction(self, action):
        sNext = choice( self.states, p = self.probs[self.state][action])
        reward = self.rewards[ (self.state, action, sNext) ]
        self.state = sNext
        return reward

In [30]:
class Qlearner():
    def __init__(self, mdp, alpha = 0.1):
        self.q = np.random.rand(mdp.numStates, mdp.numActions)
        self.alpha = alpha
        self.mdp = mdp
        self.score = 0
        self.prevAction = 0
    
    def explore(self):
        """"Take random action and learn from them"""
        #Redundant, use exploit
        
        # Save reward, gamma from mdp, S_t a_t, S_t+1
        #Save current state
        prevState = self.mdp.state
        #Take a random action
        randAction = np.choice(range(self.numActions)) # Picks a random value from array
        reward = self.mdp.takeAction(randAction)
        self.q[prevState, randAction] = self.q[prevState, randAction] + self.alpha*(reward + \
                                     self.mdp.gamma*np.max(self.q[self.mdp.state]) - self.q[prevState, randAction])
        
    def exploit(self, epsilon = 0.1):
        """"Figure out what to do based upon value function"""
        random = np.random.random()
        prevState = self.mdp.state
        #Epsilon greedy
        if(random < epsilon):
            #Behave randomly
            actionTaken = choice(range(self.mdp.numActions)) # Picks a random value from array
            
        else:
            actionTaken = np.argmax(self.q[self.mdp.state])
            # print(actionTaken)
        
        reward = self.mdp.takeAction(actionTaken)
        self.score += reward
        self.q[prevState][actionTaken] += self.alpha*(reward + self.mdp.gamma*(self.q[self.mdp.state][actionTaken]) \
                                                      - self.q[prevState][self.prevAction])
        self.prevAction = actionTaken
        
        return reward
            
        
    

In [31]:
mdp = MDP(2,2)

qlearner = Qlearner(mdp)

for i in range(1000):
    rew = qlearner.exploit(epsilon=1)
    
print(qlearner.score)

qlearner.score = 0

for i in range(1000):
    rew = qlearner.exploit(epsilon=0.01)
    
print(qlearner.score)

# rew = qlearner.exploit()
# print(rew)

4456
7735


Next Step
 * Apply this to openAI gym
 * SARSA - State action reward state action - policy based
 * Look at continuous space problems and apply SARSA/Q-learning by making a discrete model
 * Look at continuous state and action spaces (i.e. CEM, Policy Gradient)