![](q_learning.png)

In [227]:
import gym
import numpy as np
import random
# Shared environment instance; run_episode uses it when no env is passed.
ENV = gym.make('CartPole-v0')

# Discount applied to the bootstrapped next-state value in get_training.
GAMMA = 0.5
# Step size used for the gradient steps in update.
ALPHA = 0.005

In [219]:
def random_policy(observation, env):
    """Sample an action from the environment's action space.

    Args:
        observation: Current observation; accepted for interface
            compatibility with other policies but not used.
        env: Object exposing ``action_space.sample()``.

    Returns:
        Whatever ``env.action_space.sample()`` returns.
    """
    action_space = env.action_space
    return action_space.sample()

    

def run_episode(policy, render=False, env=None):
    """Roll out a single episode and return the transitions it produced.

    Args:
        policy: Callable ``policy(state, env)`` returning the action to take.
        render: When true, ``env.render()`` is called before every step.
        env: Environment exposing ``reset``, ``step``, ``render`` and
            ``close``. When omitted, the module-level ``ENV`` is used; it is
            looked up at call time so a re-created ``ENV`` is picked up.

    Returns:
        A list of ``(state, action, reward, new_state)`` tuples, exactly one
        per environment step. The step that ends the episode is stored with a
        reward of -1 in place of the reward reported by the environment.
    """
    if env is None:
        env = ENV

    episode = []
    try:
        # Start the environment and record the start state.
        state = env.reset()

        while True:
            if render:
                env.render()

            action = policy(state, env)
            new_state, new_reward, done, _ = env.step(action)

            # One transition per step: the terminal step gets the -1 penalty
            # *instead of* (not in addition to) the environment's reward.
            reward = -1 if done else new_reward
            episode.append((state, action, reward, new_state))

            if done:
                break

            # Current state is now the new state.
            state = new_state
    finally:
        # Always release the environment, even if the policy or env raised.
        env.close()

    return episode

# Smoke run: roll out one rendered episode with the random policy and display
# how many transitions were recorded.
len(run_episode(random_policy, render=True, env=ENV))

17

In [24]:
# Sanity check: a rollout must return a non-empty list of transitions.
assert run_episode(random_policy, render=False)

In [233]:
def get_feature(state, action):
    """Build the feature vector for a state-action pair.

    The result is ``state`` followed by the action and a constant 1.0, all
    joined with ``np.append``.
    """
    tail = [action, 1.0]
    return np.append(state, tail)

class linear_model:
    """Action-value estimate that is linear in the ``get_feature`` vector."""

    def __init__(self, m):
        """Draw ``m`` initial weights from a standard normal distribution."""
        self.w = np.random.normal(size=m)

    def action_value(self, state, action):
        """Return the dot product of the (state, action) features and ``w``."""
        features = get_feature(state, action)
        return np.dot(features, self.w)
    
    
class epsilon_greedy_policy:
    """Epsilon-greedy action selection over the two actions 0 and 1.

    Attributes are set in ``__init__``:
        epsilon: Exploration threshold; a random action is taken whenever
            ``random.random() <= epsilon``.
        model: Object exposing ``action_value(state, action)``.
    """

    def __init__(self, epsilon, model):
        self.epsilon = epsilon
        self.model = model

    def policy(self, state, env):
        """Return the action to take in ``state``.

        Args:
            state: Current observation, passed through to the model.
            env: Unused; present so the method matches the
                ``policy(state, env)`` signature used by ``run_episode``.

        Returns:
            A random choice from ``[0, 1]`` when exploring, otherwise the
            action with the highest ``self.model.action_value``.
        """
        if random.random() <= self.epsilon:
            return random.choice([0, 1])
        # Use the model this policy was constructed with, not a global name.
        return max(
            (self.model.action_value(state, action), action) for action in [0, 1]
        )[1]

    
class greedy_policy:
    """Pure greedy action selection over the two actions 0 and 1."""

    def __init__(self, model):
        # Object exposing ``action_value(state, action)``.
        self.model = model

    def policy(self, state, env):
        """Return the action with the highest value under ``self.model``.

        Args:
            state: Current observation, passed through to the model.
            env: Unused; present so the method matches the
                ``policy(state, env)`` signature used by ``run_episode``.
        """
        # Use the model this policy was constructed with, not a global name.
        return max(
            (self.model.action_value(state, action), action) for action in [0, 1]
        )[1]

    
def get_training(episode, action_value):
    """Turn an episode into ``(features, target)`` pairs using a TD(0) target.

    Args:
        episode: Iterable of ``(state, action, reward, new_state)`` tuples.
        action_value: Callable ``action_value(state, action)`` used to
            bootstrap the value of ``new_state``.

    Returns:
        A list of ``(X, y)`` tuples where ``X = get_feature(state, action)``
        and ``y = reward + GAMMA * max_a action_value(new_state, a)`` over the
        actions 0 and 1.

    Note:
        Every transition is bootstrapped from ``new_state``; the last
        transition of an episode is not special-cased.
    """

    def td_target(reward, next_state):
        best_next = max(action_value(next_state, a) for a in [0, 1])
        return reward + GAMMA * best_next

    return [
        (get_feature(state, action), td_target(reward, next_state))
        for state, action, reward, next_state in episode
    ]
    

def get_experience(model, policy, experience_size=10):
    """Collect training pairs from several freshly generated episodes.

    Args:
        model: Object whose ``action_value`` method is used to build targets.
        policy: Object whose ``policy`` method drives ``run_episode``.
        experience_size: Number of episodes to run.

    Returns:
        One flat list holding the ``get_training`` output of every episode,
        in the order the episodes were run.
    """
    collected = []
    for _ in range(experience_size):
        transitions = run_episode(policy.policy)
        collected += get_training(
            episode=transitions,
            action_value=model.action_value,
        )
    return collected


def update(model, experience, minibatches=100, minibatch_size=64):
    """Run minibatch gradient steps on ``model.w`` over ``experience``.

    Args:
        model: Object with a weight vector ``w``; ``w`` is rebound after
            every minibatch.
        experience: List of ``(X, y)`` pairs. It is shuffled in place.
        minibatches: Number of full passes made over the shuffled experience.
        minibatch_size: Number of pairs per gradient step.
    """
    for _ in range(minibatches):
        # Reshuffle before each pass so the batches differ between passes.
        random.shuffle(experience)

        for start in range(0, len(experience), minibatch_size):
            stop = start + minibatch_size
            batch = experience[start:stop]
            step = ALPHA * get_gradient(model, batch)
            model.w = model.w - step

        
def mse(experience, model):
    """Mean squared error of the model's predictions over ``experience``.

    Args:
        experience: Non-empty sequence of ``(X, y)`` pairs.
        model: Object with a weight vector ``w``; the prediction for ``X`` is
            ``model.w.dot(X)``.

    Returns:
        The average of ``(model.w.dot(X) - y) ** 2`` over all pairs.

    Raises:
        ZeroDivisionError: If ``experience`` is empty.
    """
    # Square each residual so positive and negative errors cannot cancel.
    squared_errors = [(model.w.dot(X) - y) ** 2 for X, y in experience]
    return sum(squared_errors) / len(experience)

def get_gradient(model, experience):
    """Average of ``(model.w.dot(X) - y) * X`` over the given pairs.

    Args:
        model: Object with a weight vector ``w``.
        experience: Non-empty sequence of ``(X, y)`` pairs.

    Returns:
        The per-pair terms summed in order and divided by the number of pairs.
    """
    terms = (
        (model.w.dot(features) - target) * features
        for features, target in experience
    )
    return sum(terms) / len(experience)


# Training driver: alternate between collecting experience with an
# epsilon-greedy policy and fitting the linear model to the resulting targets.

# get_feature appends the action and a constant 1.0 to the state, so m=6
# corresponds to a 4-dimensional observation (as in the rollouts shown below).
model = linear_model(m=6)
# Explore whenever random.random() <= 0.9, otherwise act greedily on `model`.
policy = epsilon_greedy_policy(0.9, model)
# Number of episodes collected per training round.
experience_length=10

print(f'starting w {model.w}')

for _ in range(100):
    
    # Fresh (features, target) pairs computed with the current weights.
    experience = get_experience(model, policy, experience_length)
    
    # Logged before the update; len(experience) / experience_length is the
    # mean number of transitions per episode in this round.
    print(f'mse: {mse(experience, model)}, average experience length: {len(experience) / experience_length}')
    
    # Minibatch gradient steps on model.w; shuffles `experience` in place.
    update(model, experience)
    
    
    #print(f'experience length: {len(experience)}')

print(f'ending w {model.w}')

# Greedy policy over the trained model, used for the final rollout.
final_policy = greedy_policy(model)

starting w [ 1.07271902 -0.15666873 -0.06019664 -0.42025472 -1.97097622 -0.26678551]


ValueError: shapes (8,) and (6,) not aligned: 8 (dim 0) != 6 (dim 0)

In [99]:
# Display the model's current weight vector.
model.w

array([-1343.02079992,  -650.79698822,  2538.25079922, -1419.68784436])

In [209]:
# Roll out one rendered episode with the greedy policy and display its
# transitions. run_episode calls env.reset() itself before stepping.
ENV.reset()
run_episode(final_policy.policy, render=True, env=ENV)

[(array([ 0.00359954,  0.00246077, -0.03854915,  0.02889756]),
  0,
  1.0,
  array([ 0.00364876, -0.19208776, -0.0379712 ,  0.30917292])),
 (array([ 0.00364876, -0.19208776, -0.0379712 ,  0.30917292]),
  0,
  1.0,
  array([-1.92997400e-04, -3.86648699e-01, -3.17877451e-02,  5.89643105e-01])),
 (array([-1.92997400e-04, -3.86648699e-01, -3.17877451e-02,  5.89643105e-01]),
  0,
  1.0,
  array([-0.00792597, -0.58131147, -0.01999488,  0.87214572])),
 (array([-0.00792597, -0.58131147, -0.01999488,  0.87214572]),
  0,
  1.0,
  array([-0.0195522 , -0.77615588, -0.00255197,  1.15847581])),
 (array([-0.0195522 , -0.77615588, -0.00255197,  1.15847581]),
  0,
  1.0,
  array([-0.03507532, -0.97124448,  0.02061755,  1.45035748])),
 (array([-0.03507532, -0.97124448,  0.02061755,  1.45035748]),
  0,
  1.0,
  array([-0.05450021, -1.16661365,  0.0496247 ,  1.74941008])),
 (array([-0.05450021, -1.16661365,  0.0496247 ,  1.74941008]),
  0,
  1.0,
  array([-0.07783248, -1.36226279,  0.0846129 ,  2.05710618