In [1]:
import gym
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np
import random

#
# solved using fitted q-iteration over randomforest
#

# Current Q-function estimate. It stays None until the first model has been
# fitted; while it is None the training loop samples random actions instead.
clf = None
# Gym environment that the training loop resets, steps and renders.
env = gym.make('CartPole-v0')

def get_point(pre, action, reward, post):
  """Pack one transition into a single-row 2-D array.

  Args:
    pre: state before the action (any 1-D array-like: ndarray, list, tuple).
    action: action taken in ``pre``.
    reward: reward received for the transition.
    post: state after the action (any 1-D array-like).

  Returns:
    ``np.ndarray`` of shape ``(1, len(pre) + 2 + len(post))`` laid out as
    ``[pre..., action, reward, post...]``, so rows can be stacked along
    axis 0 to build a training set.
  """
  # np.asarray lets callers pass plain lists/tuples as well as ndarrays;
  # tolist() converts numpy scalars to Python numbers before re-packing.
  pre_values = np.asarray(pre).tolist()
  post_values = np.asarray(post).tolist()
  return np.array([pre_values + [action, reward] + post_values])

def get_action(clf, p_feat, exploration, space):
  """Choose an action epsilon-greedily from a Q-function estimate.

  Args:
    clf: fitted regressor; ``clf.predict`` is called on a single row made of
      the state features followed by a candidate action (0 or 1).
    p_feat: 1-D array of state features.
    exploration: probability threshold; when ``random.random()`` falls below
      it, a random action is drawn from ``space``.
    space: action space providing ``sample()``.

  Returns:
    0 or 1 when one candidate has a strictly higher predicted value,
    otherwise (exploration step or tie) the result of ``space.sample()``.
  """
  # Predicted value of each candidate action, queried in the order 0 then 1.
  q_zero, q_one = (
    clf.predict(np.append(p_feat, candidate).reshape(1, -1))[0]
    for candidate in (0, 1)
  )

  # Exploration step: ignore the estimates and act randomly.
  if random.random() < exploration:
    return space.sample()

  # Greedy step: take the strictly better action.
  if q_zero > q_one:
    return 0
  if q_one > q_zero:
    return 1

  # Neither estimate is strictly greater: break the tie randomly.
  return space.sample()


def get_tree(obs, nb_it, d_rate=0.95):
  """Fit a Q-function estimate with fitted Q-iteration over random forests.

  Ref: "Tree-Based Batch Mode Reinforcement Learning", Ernst, Geurts and
  Wehenkel (JMLR, 2005).

  Args:
    obs: array-like of shape ``(n, 10)``; each row is
      ``[state (4), action, reward, post-state (4)]`` as built by
      ``get_point``.
    nb_it: number of fitted Q-iteration rounds. Round 1 regresses the
      immediate reward; every later round regresses the reward plus the
      discounted best predicted value of the post-state.
    d_rate: discount applied to the bootstrapped future value.

  Returns:
    The ``RandomForestRegressor`` fitted in the last round, mapping
    ``[state (4), action]`` to an estimated return, or ``None`` when
    ``nb_it`` is not positive.

  Raises:
    ValueError: if ``obs`` is not a 2-D array with 10 columns.
  """
  data = np.asarray(obs, dtype=float)
  if data.ndim != 2 or data.shape[1] != 10:
    raise ValueError("obs must have shape (n, 10), got %s" % (data.shape,))

  # Plain ndarrays are used for both fit and predict so the regressor never
  # records column names; mixing differently-named DataFrames between fit
  # and predict is rejected by recent scikit-learn releases.
  state_action = data[:, :5]
  reward = data[:, 5]
  post_state = data[:, 6:10]

  # Post-states paired with each candidate action; invariant across rounds.
  n_rows = data.shape[0]
  post_with_0 = np.column_stack([post_state, np.zeros(n_rows)])
  post_with_1 = np.column_stack([post_state, np.ones(n_rows)])

  # Only rows whose reward is exactly 1 are allowed to bootstrap: any other
  # reward gets no expected future value added on top of it.
  bootstrap = reward == 1

  reg = None
  for _ in range(nb_it):
    if reg is None:
      # First round: the target is the immediate reward.
      target = reward
    else:
      # Later rounds: use the previous regressor as the estimate of Q_{n-1}.
      best_next = np.maximum(reg.predict(post_with_0), reg.predict(post_with_1))
      target = reward + bootstrap * (best_next * d_rate)
    reg = RandomForestRegressor().fit(state_action, target)
  return reg


#
# getting initial trajectories
#
# All transitions collected so far, one row per step (rows come from get_point).
tmp = None
# Number of consecutive episodes that lasted until t >= 199.
k = 0
for e in range(2000):
  # Every episode starts from the freshly reset observation; nothing is
  # carried over from the previous episode's final state.
  pre_observation = env.reset()
  # Exploration probability halves every two episodes.
  exploration = 1.0 / 2 ** (e / 2)
  print("exploration = %s" % exploration)
  for t in range(1000):
    env.render()
    if clf is None:
      # No model fitted yet: act randomly.
      action = env.action_space.sample()
    else:
      action = get_action(clf, pre_observation, exploration, env.action_space)
    post_observation, reward, done, info = env.step(action)
    if done:
      print("Episode %s finished after timestamp %s" % (e, t))
      if t < 199:
        # simple reward shaping
        reward = -50
    point = get_point(pre_observation, action, reward, post_observation)
    # The first transition ever seeds the data set; later ones are appended.
    tmp = point if tmp is None else np.append(tmp, point, 0)
    if done:
      break
    # The state just reached is the starting state of the next step.
    pre_observation = post_observation
  if t < 199:
    # Episode ended early: refit the Q-function on all data and reset the streak.
    clf = get_tree(tmp, max(e + 10, 30))
    k = 0
  else:
    k = k + 1
  print("nb hits %s" % k)
  # Stop once more than 100 consecutive episodes have reached t >= 199.
  if k > 100:
    break

[2016-11-15 21:50:23,207] Making new env: CartPole-v0


exploration = 1.0
Episode 0 finished after timestamp 12
nb hits 0
exploration = 0.7071067811865475
Episode 1 finished after timestamp 10
nb hits 0
exploration = 0.5
Episode 2 finished after timestamp 27
nb hits 0
exploration = 0.35355339059327373
Episode 3 finished after timestamp 13
nb hits 0
exploration = 0.25
Episode 4 finished after timestamp 46
nb hits 0
exploration = 0.17677669529663687
Episode 5 finished after timestamp 26
nb hits 0
exploration = 0.125
Episode 6 finished after timestamp 18
nb hits 0
exploration = 0.08838834764831843
Episode 7 finished after timestamp 96
nb hits 0
exploration = 0.0625
Episode 8 finished after timestamp 110
nb hits 0
exploration = 0.044194173824159216
Episode 9 finished after timestamp 109
nb hits 0
exploration = 0.03125
Episode 10 finished after timestamp 68
nb hits 0
exploration = 0.022097086912079608
Episode 11 finished after timestamp 352
nb hits 1
exploration = 0.015625
Episode 12 finished after timestamp 218
nb hits 2
exploration = 0.0110485

KeyboardInterrupt: 