# OpenAI gym: Copy-v0 with Simple Q-learning  
- Observation dim = [1]  
- Action dim = [2,2,5]. The action tuple is flattened to a single index 0-19 (2*2*5 = 20 discrete actions)  
- The rest is standard Q-learning with a replay memory  
- Exploration starts fully random and becomes greedy linearly over the first episodes. I found that 100 exploration episodes were not enough, so I settled on 1000 episodes.

In [1]:
import datetime
# Stamp the notebook with the wall-clock time at which this run started.
run_started_at = datetime.datetime.now()
print(run_started_at.isoformat())

2017-08-27T00:30:05.059532


In [2]:
import tensorflow as tf
import numpy as np
import gym
from gym import wrappers

In [3]:
# Build the Q-network graph (TensorFlow 1.x graph mode).
tf.reset_default_graph()
k_init = tf.contrib.layers.variance_scaling_initializer()

# Observation: one integer per step, one-hot encoded over 6 classes
# (presumably the Copy-v0 alphabet plus blank -- confirm against env.observation_space).
X = tf.placeholder(tf.int32,shape=[None,1])
hid1 = tf.reshape(tf.one_hot(X,6),[-1,6])
hid2 = tf.layers.dense(hid1,64,activation=tf.nn.elu,kernel_initializer=k_init)

# Q head: one linear output per flattened action (2*2*5 = 20), read directly
# from the 64-unit hidden layer. An extra 128-unit layer used to be built here
# but was never connected to Q, so it only added untrained variables; it has
# been dropped to keep the graph equal to what is actually trained.
Q = tf.layers.dense(hid2,20)

# Q-value of the action actually taken, selected with a one-hot mask.
action_ph = tf.placeholder(tf.int32,shape=[None,])
Q_action = tf.reduce_sum(Q*tf.one_hot(action_ph,20),axis=1,keep_dims=True)

# Regress Q(s,a) towards the externally computed bootstrap target.
Qexpected = tf.placeholder(tf.float32,shape=[None,1])
loss = tf.losses.mean_squared_error(Qexpected,Q_action)
optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
train_op = optimizer.minimize(loss)

In [4]:
from collections import deque
class Memory(object):
    """Bounded FIFO store of transitions used as a replay memory.

    Once ``memory_size`` items are held, appending a new item discards the
    oldest one (behaviour inherited from ``deque(maxlen=...)``).
    """

    def __init__(self,memory_size=10000):
        """Create an empty memory holding at most ``memory_size`` items."""
        self.memory_size = memory_size
        self.memory = deque(maxlen=memory_size)

    def __len__(self):
        """Return the number of items currently stored."""
        stored = self.memory
        return len(stored)

    def append(self,item):
        """Add ``item`` as the newest entry."""
        self.memory.append(item)

    def sample_batch(self,batch_size=256):
        """Return up to ``batch_size`` stored items, drawn without replacement.

        The items come back in random order; when fewer than ``batch_size``
        items are stored, all of them are returned.
        """
        shuffled_positions = np.random.permutation(len(self.memory))
        chosen = []
        for position in shuffled_positions[:batch_size]:
            chosen.append(self.memory[position])
        return chosen

In [5]:
# Hyper-parameters.
gamma = 0.99                  # discount factor for the bootstrap target
max_episode = 5000            # last episode index; the loop bound is inclusive, so max_episode+1 episodes run
batch_size = 256
memory_warmup = batch_size*3  # environment steps to collect before the first training step
memory = Memory()
explore_eps = 1000            # number of episodes over which random exploration fades out

env = gym.make('Copy-v0')
env = wrappers.Monitor(env,'./tmp/',force=True)
try:
    obs = env.reset()
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        init.run()
        iteration = 0
        episode = 0
        episode_reward = 0
        episode_steps = 0
        while episode <= max_episode:
            print('\riter {}, ep {}'.format(iteration,episode),end='')

            # Act randomly with probability 1 - episode/explore_eps, so the
            # policy is fully random at episode 0 and fully greedy from
            # episode explore_eps onwards.
            p = episode/explore_eps
            if np.random.rand() > p:
                action = env.action_space.sample()
            else:
                Q_val = Q.eval(feed_dict={X:np.reshape(obs,[1,-1])})
                action = np.argmax(Q_val,axis=1)
                # Map the flat index 0-19 back to the env's 3-part action tuple.
                action = np.unravel_index(action[0],[2,2,5])
            next_obs,reward,done,_ = env.step(action)

            # Store the action as its flat index so it can be fed to action_ph.
            memory.append([obs,np.ravel_multi_index(action,[2,2,5]),reward,next_obs,done])

            if iteration > memory_warmup:
                memory_batch = memory.sample_batch(batch_size)
                # Transpose the list of transitions into one array per field.
                obs_batch,action_batch,reward_batch,next_obs_batch,done_batch = (
                    np.array(column) for column in zip(*memory_batch))
                Qnext_val = Q.eval(feed_dict={X:np.expand_dims(next_obs_batch,axis=1)})
                Qnext_val = np.max(Qnext_val,axis=1,keepdims=True)
                # (1-done) zeroes the bootstrap term for terminal transitions.
                Qexpected_batch = reward_batch + gamma*(1-done_batch)*Qnext_val.ravel()
                Qexpected_batch = np.expand_dims(Qexpected_batch,1)
                train_op.run(feed_dict={X:np.expand_dims(obs_batch,axis=1),action_ph:action_batch,Qexpected:Qexpected_batch})

            iteration += 1
            episode_reward += reward
            episode_steps += 1
            if done:
                # Report every 100th episode once training has started.
                if iteration>memory_warmup and episode%100==0:
                    print(', ep reward {}, ep steps {}'.format(episode_reward, episode_steps))
                episode_reward = 0
                episode_steps = 0
                episode += 1
                obs = env.reset()
            else:
                obs = next_obs
finally:
    # Close the monitored env even if training is interrupted or raises, so
    # the Monitor is not left open and the recording gets finalized.
    env.close()

[2017-08-27 00:30:10,892] Making new env: Copy-v0
[2017-08-27 00:30:10,985] Clearing 18 monitor files from previous run (because force=True was provided)
[2017-08-27 00:30:10,988] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.0.1673.video000000.json
[2017-08-27 00:30:11,094] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.0.1673.video000001.json
[2017-08-27 00:30:11,099] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.0.1673.video000008.json
[2017-08-27 00:30:11,114] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.0.1673.video000027.json
[2017-08-27 00:30:11,140] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.0.1673.video000064.json
[2017-08-27 00:30:11,173] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/open

iter 0, ep 0iter 1, ep 0iter 2, ep 0iter 3, ep 1iter 4, ep 1iter 5, ep 1iter 6, ep 1iter 7, ep 1iter 8, ep 2iter 9, ep 2iter 10, ep 3iter 11, ep 4iter 12, ep 5iter 13, ep 6iter 14, ep 6iter 15, ep 7iter 16, ep 7iter 17, ep 8iter 18, ep 9iter 19, ep 9iter 20, ep 9iter 21, ep 10iter 22, ep 10iter 23, ep 10iter 24, ep 10iter 25, ep 11iter 26, ep 12iter 27, ep 12iter 28, ep 12iter 29, ep 12iter 30, ep 12iter 31, ep 12iter 32, ep 13iter 33, ep 13iter 34, ep 13iter 35, ep 13iter 36, ep 14iter 37, ep 14iter 38, ep 14iter 39, ep 14iter 40, ep 14iter 41, ep 14iter 42, ep 14iter 43, ep 15iter 44, ep 16iter 45, ep 16iter 46, ep 17iter 47, ep 17iter 48, ep 17iter 49, ep 17iter 50, ep 18iter 51, ep 18iter 52, ep 19iter 53, ep 19iter 54, ep 19iter 55, ep 19iter 56, ep 19iter 57, ep 19iter 58, ep 19iter 59, ep 19iter 60, ep 19iter 61, ep 20iter 62, ep 20iter 63, ep 21iter 64, ep 21iter 65, ep 22iter 66, ep 22iter 67, ep 23iter 68, e

[2017-08-27 00:30:11,433] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.0.1673.video000343.json


iter 1018, ep 400, ep reward 0.5, ep steps 8
iter 1361, ep 495

[2017-08-27 00:30:12,899] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.0.1673.video000512.json


iter 1383, ep 500, ep reward -0.5, ep steps 1
iter 1780, ep 600, ep reward -0.5, ep steps 6
iter 2130, ep 700, ep reward 0.5, ep steps 4
iter 2213, ep 722

[2017-08-27 00:30:14,838] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.0.1673.video000729.json


iter 2530, ep 800, ep reward -0.5, ep steps 4
iter 2981, ep 900, ep reward -1.0, ep steps 11
iter 3410, ep 989

[2017-08-27 00:30:17,720] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.0.1673.video001000.json


iter 3466, ep 1000, ep reward 3.0, ep steps 3
iter 3911, ep 1100, ep reward -0.5, ep steps 2
iter 4298, ep 1200, ep reward 0.5, ep steps 3
iter 4730, ep 1300, ep reward -1.0, ep steps 9
iter 5099, ep 1400, ep reward -0.5, ep steps 3
iter 5484, ep 1500, ep reward 0.5, ep steps 3
iter 5900, ep 1600, ep reward 4.0, ep steps 4
iter 6333, ep 1700, ep reward 0.5, ep steps 3
iter 6775, ep 1800, ep reward 2.0, ep steps 13
iter 7154, ep 1900, ep reward 5.0, ep steps 5
iter 7946, ep 1996

[2017-08-27 00:30:29,364] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.0.1673.video002000.json


iter 7983, ep 2000, ep reward -0.5, ep steps 2
iter 8536, ep 2100, ep reward -0.5, ep steps 3
iter 8932, ep 2200, ep reward 5.5, ep steps 8
iter 9557, ep 2300, ep reward 14.0, ep steps 14
iter 11332, ep 2400, ep reward 24.0, ep steps 24
iter 13512, ep 2500, ep reward 0.5, ep steps 3
iter 14469, ep 2600, ep reward 4.5, ep steps 7
iter 16248, ep 2700, ep reward 31.0, ep steps 31
iter 19349, ep 2800, ep reward 32.0, ep steps 32
iter 22447, ep 2900, ep reward 32.0, ep steps 32
iter 25479, ep 2998

[2017-08-27 00:31:17,244] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.0.1673.video003000.json


iter 25543, ep 3000, ep reward 32.0, ep steps 32
iter 28633, ep 3100, ep reward 32.0, ep steps 32
iter 31742, ep 3200, ep reward 32.0, ep steps 32
iter 34853, ep 3300, ep reward 30.0, ep steps 30
iter 37933, ep 3400, ep reward 30.0, ep steps 30
iter 41039, ep 3500, ep reward 31.0, ep steps 31
iter 44139, ep 3600, ep reward 32.0, ep steps 32
iter 47246, ep 3700, ep reward 32.0, ep steps 32
iter 50344, ep 3800, ep reward 31.0, ep steps 31
iter 53448, ep 3900, ep reward 31.0, ep steps 31
iter 56517, ep 3999

[2017-08-27 00:32:41,871] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.0.1673.video004000.json


iter 56556, ep 4000, ep reward 30.0, ep steps 30
iter 59677, ep 4100, ep reward 31.0, ep steps 31
iter 62776, ep 4200, ep reward 32.0, ep steps 32
iter 65881, ep 4300, ep reward 31.0, ep steps 31
iter 68980, ep 4400, ep reward 32.0, ep steps 32
iter 72094, ep 4500, ep reward 30.0, ep steps 30
iter 75179, ep 4600, ep reward 31.0, ep steps 31
iter 78282, ep 4700, ep reward 32.0, ep steps 32
iter 81372, ep 4800, ep reward 32.0, ep steps 32
iter 84463, ep 4900, ep reward 30.0, ep steps 30
iter 87530, ep 4999

[2017-08-27 00:34:07,657] Starting new video recorder writing to /Users/winter/Google Drive/handson-ml/tmp/openaigym.video.0.1673.video005000.json


iter 87531, ep 5000iter 87532, ep 5000iter 87533, ep 5000iter 87534, ep 5000iter 87535, ep 5000iter 87536, ep 5000iter 87537, ep 5000iter 87538, ep 5000iter 87539, ep 5000iter 87540, ep 5000iter 87541, ep 5000iter 87542, ep 5000iter 87543, ep 5000iter 87544, ep 5000iter 87545, ep 5000iter 87546, ep 5000iter 87547, ep 5000iter 87548, ep 5000iter 87549, ep 5000iter 87550, ep 5000iter 87551, ep 5000iter 87552, ep 5000iter 87553, ep 5000iter 87554, ep 5000iter 87555, ep 5000iter 87556, ep 5000iter 87557, ep 5000iter 87558, ep 5000iter 87559, ep 5000iter 87560, ep 5000iter 87561, ep 5000iter 87562, ep 5000, ep reward 32.0, ep steps 32


[2017-08-27 00:34:07,873] Finished writing results. You can upload them to the scoreboard via gym.upload('/Users/winter/Google Drive/handson-ml/tmp')


In [6]:
import os

# The API key is a secret and must not be committed with the notebook: read it
# from the OPENAI_GYM_API_KEY environment variable (a missing variable raises
# KeyError instead of uploading unauthenticated).
# NOTE: any key that was previously hard-coded here should be treated as
# compromised and rotated.
gym.upload('./tmp/',api_key=os.environ['OPENAI_GYM_API_KEY'])

[2017-08-27 00:34:07,916] [Copy-v0] Uploading 5001 episodes of training data
[2017-08-27 00:34:10,513] [Copy-v0] Uploading videos of 15 training episodes (3807 bytes)
[2017-08-27 00:34:10,859] [Copy-v0] Creating evaluation object from ./tmp/ with learning curve and training video
[2017-08-27 00:34:11,425] 
****************************************************
You successfully uploaded your evaluation on Copy-v0 to
OpenAI Gym! You can find it at:

    https://gym.openai.com/evaluations/eval_qdzwvK4RNa1Dx71JtH2g

****************************************************
