![](q_learning.png)

In [1]:
import gym
import numpy as np
import random


# Shared CartPole environment; run_episode uses it as its default `env`.
ENV = gym.make('CartPole-v0')

# Discount factor applied to the bootstrapped next-state value in get_training.
GAMMA = 0.5
# Step size that update() multiplies the minibatch gradient by.
ALPHA = 0.005

In [2]:
def random_policy(observation, env):
    """Sample an action from the environment's action space.

    Args:
        observation: Current state; accepted for interface compatibility
            with other policies but not used.
        env: Environment whose ``action_space.sample()`` supplies the action.

    Returns:
        Whatever ``env.action_space.sample()`` returns.
    """
    sampled_action = env.action_space.sample()
    return sampled_action

    

def run_episode(policy, render=False, env=None):
    """Roll out one episode and return its transitions.

    Args:
        policy: Callable ``policy(state, env)`` returning the action to take.
        render: When true, ``env.render()`` is called before every step.
        env: Environment to step through. When omitted, the module-level
            ``ENV`` is used (looked up at call time).

    Returns:
        A list with exactly one ``(state, action, reward, new_state)`` tuple
        per environment step. The step that ends the episode is stored with
        its reward replaced by -1.
    """
    if env is None:
        # Resolve the default lazily so a re-created ENV is picked up without
        # having to re-run this cell.
        env = ENV

    # Start the environment and remember the start state.
    state = env.reset()
    episode = []

    while True:
        if render:
            env.render()

        action = policy(state, env)
        new_state, new_reward, done, _ = env.step(action)

        # Record each step once. Previously the terminal step was appended
        # twice (once with -1 and once with the environment's reward), which
        # gave the learner two contradictory targets for the same transition.
        reward = -1 if done else new_reward
        episode.append((state, action, reward, new_state))

        if done:
            break

        # Current state is now the new state.
        state = new_state

    env.close()
    return episode

# Smoke run: play one rendered episode with the random policy and display
# how many transitions it recorded.
len(run_episode(random_policy, render=True, env=ENV))

18

In [24]:
# Sanity check: an episode played with the random policy returns a non-empty
# list of transitions.
assert run_episode(random_policy, render=False)

In [247]:
import itertools


def get_feature(state, action):
    """Build the flat feature vector for a state-action pair.

    The state values come first, followed by the action and a constant 1.0.

    Args:
        state: Array-like of state values.
        action: The action taken in that state.

    Returns:
        A 1-D numpy array ``[*state, action, 1.0]``.
    """
    extras = [action, 1.0]
    return np.append(state, extras)

class linear_model:
    """Linear action-value approximator: value = features . w."""

    def __init__(self, m):
        """Initialise ``m`` weights by sampling ``np.random.normal``."""
        self.w = np.random.normal(size=m)

    def action_value(self, state, action):
        """Return the estimated value of taking ``action`` in ``state``.

        The estimate is the dot product of the feature vector produced by
        ``get_feature`` with the weight vector ``w``.
        """
        features = get_feature(state, action)
        return np.dot(features, self.w)
    
    
class epsilon_greedy_policy:
    """Epsilon-greedy action selection over the two actions 0 and 1."""

    def __init__(self, epsilon, model):
        """Store the exploration threshold and the value model.

        Args:
            epsilon: Threshold compared against ``random.random()``; a random
                action is returned whenever the draw is <= epsilon.
            model: Object exposing ``action_value(state, action)``.
        """
        self.epsilon = epsilon
        self.model = model

    def policy(self, state, env):
        """Choose an action for ``state``.

        Args:
            state: Current state, forwarded to ``model.action_value``.
            env: Unused; present so the method matches the
                ``policy(state, env)`` call made by ``run_episode``.

        Returns:
            A random choice from ``[0, 1]`` when exploring, otherwise the
            action with the highest estimated value (ties go to the larger
            action because ``(value, action)`` tuples are compared).
        """
        if random.random() <= self.epsilon:
            return random.choice([0, 1])

        # Score with the model this policy was constructed with. The previous
        # code read a global named `model`, which ignored the injected model
        # and raised NameError when no such global existed.
        scored = [(self.model.action_value(state, action), action) for action in [0, 1]]
        return max(scored)[1]

    
class greedy_policy:
    """Always pick the action (0 or 1) with the highest estimated value."""

    def __init__(self, model):
        """Store the value model.

        Args:
            model: Object exposing ``action_value(state, action)``.
        """
        self.model = model

    def policy(self, state, env):
        """Return the greedy action for ``state``.

        Args:
            state: Current state, forwarded to ``model.action_value``.
            env: Unused; present so the method matches the
                ``policy(state, env)`` call made by ``run_episode``.

        Returns:
            The action whose ``(value, action)`` tuple is largest, so ties in
            value resolve to the larger action.
        """
        # Use the injected model rather than a global named `model`, which the
        # previous code depended on.
        scored = [(self.model.action_value(state, action), action) for action in [0, 1]]
        return max(scored)[1]

    
def get_training(episode, action_value):
    """Convert transitions into ``(features, target)`` training pairs.

    For each ``(state, action, reward, new_state)`` tuple the target is
    ``reward + GAMMA * max_a action_value(new_state, a)`` over actions 0 and 1.

    NOTE(review): every transition is bootstrapped this way, including the
    last one of an episode, because the tuples carry no terminal flag.

    Args:
        episode: Iterable of ``(state, action, reward, new_state)`` tuples.
        action_value: Callable ``action_value(state, action)`` giving the
            current value estimate.

    Returns:
        A list of ``(X, y)`` tuples, one per transition, in input order.
    """
    def bootstrapped_target(reward, next_state):
        best_next = max(action_value(next_state, candidate) for candidate in [0, 1])
        return reward + GAMMA * best_next

    return [
        (get_feature(state, action), bootstrapped_target(new_reward, new_state))
        for state, action, new_reward, new_state in episode
    ]
    

def get_experience(model, policy, experience_size=10):
    """Play several episodes and pool their transitions into one list.

    Args:
        model: Not used by this function.
        policy: Object whose ``policy`` method is passed to ``run_episode``.
        experience_size: Number of episodes to play.

    Returns:
        A flat list containing the transitions of all episodes, in the order
        they were played.
    """
    transitions = []
    for _ in range(experience_size):
        transitions += run_episode(policy.policy)
    return transitions


def update(model, experience, minibatches=100, minibatch_size=64):
    """Fit ``model.w`` to ``experience`` with minibatch gradient steps.

    Despite its name, ``minibatches`` is the number of full passes over the
    data: each pass shuffles ``experience`` in place (the caller's list is
    reordered) and then takes one step per slice of ``minibatch_size`` items.

    Args:
        model: Object with a weight vector ``w`` that is reassigned each step.
        experience: List of ``(X, y)`` training pairs.
        minibatches: Number of shuffled passes over ``experience``.
        minibatch_size: Maximum number of pairs per gradient step.
    """
    for _pass in range(minibatches):
        random.shuffle(experience)

        batch_starts = range(0, len(experience), minibatch_size)
        for start in batch_starts:
            batch = experience[start:start + minibatch_size]
            step = ALPHA * get_gradient(model, batch)
            model.w = model.w - step

        
def mse(experience, model):
    """Mean squared error of the model's predictions over ``experience``.

    Args:
        experience: Sequence of ``(X, y)`` pairs, where ``X`` is a feature
            vector and ``y`` the target value.
        model: Object with a weight vector ``w``; the prediction for ``X`` is
            ``model.w.dot(X)``.

    Returns:
        The average of ``(prediction - y) ** 2`` over all pairs.

    Raises:
        ZeroDivisionError: If ``experience`` is empty.
    """
    squared_error = 0.0
    for X, y in experience:
        residual = model.w.dot(X) - y
        # Square the residual: summing signed residuals (as before) lets
        # positive and negative errors cancel and is not an MSE.
        squared_error += residual ** 2
    return squared_error / len(experience)

def get_gradient(model, experience):
    """Average of ``(prediction - y) * X`` over the given ``(X, y)`` pairs.

    Args:
        model: Object with a weight vector ``w``; the prediction for ``X`` is
            ``model.w.dot(X)``.
        experience: Sequence of ``(X, y)`` pairs.

    Returns:
        The mean residual-weighted feature vector, used by ``update`` as the
        descent direction.
    """
    total = 0
    for features, target in experience:
        residual = model.w.dot(features) - target
        total += residual * features

    batch_size = len(experience)
    return total / batch_size


model = linear_model(m=6)
policy = epsilon_greedy_policy(0.9, model)
experience_length = 10

experience = []

print(f'starting w {model.w}')

for e in range(100):
    # Play a fresh batch of episodes with the current exploratory policy.
    # The printed figure is transitions per episode for this batch.
    new_experience = get_experience(model, policy, experience_length)
    print(f"Length of new experience: {len(new_experience) / experience_length}")
    experience.append(new_experience)

    # Rebuild the targets for every stored transition using the current
    # weights. `experience` keeps growing, so each iteration trains on more
    # data than the one before.
    training = [
        pair
        for batch in experience
        for pair in get_training(batch, action_value=model.action_value)
    ]

    update(model, training)

print(f'ending w {model.w}')

final_policy = greedy_policy(model)

starting w [-0.80752339 -1.16306764 -0.09984934 -0.60706882 -0.09559868 -0.77572489]
Length of new experience: 22.1
Length of new experience: 17.5
Length of new experience: 21.3
Length of new experience: 23.8
Length of new experience: 25.7
Length of new experience: 21.6
Length of new experience: 25.3
Length of new experience: 24.1
Length of new experience: 25.1
Length of new experience: 28.5
Length of new experience: 15.7
Length of new experience: 29.0
Length of new experience: 25.3
Length of new experience: 18.6
Length of new experience: 27.0
Length of new experience: 18.9
Length of new experience: 21.9
Length of new experience: 21.1
Length of new experience: 23.3
Length of new experience: 18.4
Length of new experience: 22.5
Length of new experience: 16.3
Length of new experience: 18.5
Length of new experience: 24.1
Length of new experience: 23.4
Length of new experience: 23.0
Length of new experience: 21.9
Length of new experience: 22.2
Length of new experience: 22.1
Length of new ex

KeyboardInterrupt: 

In [244]:
import itertools
# Flatten `experience` by one level and display the resulting list.
list(itertools.chain.from_iterable(experience))

[(array([0.03240678, 0.01357878, 0.04576226, 0.0378292 ]),
  0,
  1.0,
  array([ 0.03267836, -0.18216852,  0.04651885,  0.34459215])),
 (array([ 0.03267836, -0.18216852,  0.04651885,  0.34459215]),
  0,
  1.0,
  array([ 0.02903499, -0.37792029,  0.05341069,  0.65157378])),
 (array([ 0.02903499, -0.37792029,  0.05341069,  0.65157378]),
  1,
  1.0,
  array([ 0.02147658, -0.18358129,  0.06644216,  0.37617624])),
 (array([ 0.02147658, -0.18358129,  0.06644216,  0.37617624]),
  0,
  1.0,
  array([ 0.01780496, -0.37958093,  0.07396569,  0.68904663])),
 (array([ 0.01780496, -0.37958093,  0.07396569,  0.68904663]),
  1,
  1.0,
  array([ 0.01021334, -0.18555919,  0.08774662,  0.42053586])),
 (array([ 0.01021334, -0.18555919,  0.08774662,  0.42053586]),
  1,
  1.0,
  array([0.00650216, 0.00821703, 0.09615734, 0.15675421])),
 (array([0.00650216, 0.00821703, 0.09615734, 0.15675421]),
  0,
  1.0,
  array([ 0.0066665 , -0.18814074,  0.09929242,  0.47815694])),
 (array([ 0.0066665 , -0.18814074,  0.0