This code is for training a neural network into learning how to play the atari game from the gym API.

The process will consist on loading multiple datasets that must be in the working directory of this notebook, concatenating them to create a dataset and parsing the dataset to the network.

The user is free to change the number of epochs and the parameters of the neural network. However, I do not advise to change the parameters of network since the training accuracy is most likely to reduce considerably. The most sensitive parameter is the loss. All the possible options of loss were tried and none of them gave a result as good as binary_crossentropy.

In [0]:
import os
import gym
import cv2
import argparse
import sys, glob
import numpy as np
import pandas as pd
import pdb
import keras
from keras import backend as K
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.layers import Input, Dense, Reshape
from keras.layers.wrappers import TimeDistributed
from keras.optimizers import Adam, Adamax, RMSprop
from keras.layers.advanced_activations import PReLU
from keras.layers.normalization import BatchNormalization
from keras.layers.core import Activation, Dropout, Flatten
from keras.layers.convolutional import UpSampling2D, Convolution2D
IMAGE_LENGTH = int(33600)
epochs = 1


Using TensorFlow backend.


In [0]:
# This code is for loading the images of the dataset given a path.
# IMPORTANT: The dataset most contain flattenned image of leng 33600 and
# before the first element of this vector there must be the feature of that
# sample. That feature is the keyboard pressed by the user while playing and 
# it has to be an integer between 0 and 8.
def load_data(path):
    #pdb.set_trace()
    df = pd.read_csv(path, sep = '.', header = None)
    letters = df[(df.index % (IMAGE_LENGTH+1)==0)].values.tolist()
    images = df[(df.index % (IMAGE_LENGTH+1)!=0)].values.tolist()
    n =IMAGE_LENGTH
    final = [images[i * n:(i + 1) * n] for i in range((len(images) + n - 1) // n )]  
    #pdb.set_trace()
    return letters, final

In [0]:
# Prepare first dataset 
letter, final = load_data('/content/drive/My Drive/DATOS/EIT/NICE ACADEMIC/AI/project/dataset_thomas_v1.txt')
for i in range (0, len(final)):
    final[i] = np.concatenate(final[i])
#pdb.set_trace()

letter = np.concatenate(letter)
final = np.array(final)
letter=letter.reshape(1, len(letter))
letter = letter[0, :]

In [0]:
# Prepare  second dataset 
letter2, final2 = load_data('/content/drive/My Drive/DATOS/EIT/NICE ACADEMIC/AI/project/dataset_thomas_v2.txt')
for i in range (0, len(final2)):
    final2[i] = np.concatenate(final2[i])
#pdb.set_trace()

letter2 = np.concatenate(letter2)
final2 = np.array(final2)
letter2=letter2.reshape(1, len(letter2))
letter2 = letter2[0, :]

In [0]:
# Prepare third dataset
# Prepare  second dataset 
letter3, final3 = load_data('/content/drive/My Drive/DATOS/EIT/NICE ACADEMIC/AI/project/dataset_thomas_v3.txt')
for i in range (0, len(final3)):
    final3[i] = np.concatenate(final3[i])
#pdb.set_trace()

letter3 = np.concatenate(letter3)
final3 = np.array(final3)
letter3=letter3.reshape(1, len(letter3))
letter3 = letter3[0, :]

In [0]:
# Prepare third dataset
# Prepare  second dataset 
letter4, final4 = load_data('/content/drive/My Drive/DATOS/EIT/NICE ACADEMIC/AI/project/dataset_haider_v1.txt')
for i in range (0, len(final4)):
    final4[i] = np.concatenate(final4[i])
#pdb.set_trace()

letter4 = np.concatenate(letter4)
final4 = np.array(final4)
letter4=letter4.reshape(1, len(letter4))
letter4 = letter4[0, :]

In [0]:
final = np.concatenate((final, final2, final3, final4))

In [0]:
letter = np.concatenate((letter, letter2, letter3, letter4))

In [0]:
# This is printing the number of images we have of gameplay. 
# This number should be much higher to properly train the network however 13 thousand
# already correspond to above 30 minutes of game play.
print('We have a total of {} frames ready for training!'.format(final.shape[0]))

We have a total of 13233 frames ready for training!


In [0]:
# Split dataset using a rule of 0.7
train_ratio = 0.7

n_train_samples = int(len(final) * train_ratio)
x_train, y_train = final[:n_train_samples], letter[:n_train_samples]
x_val, y_val = final[n_train_samples:], letter[n_train_samples:]


In [0]:
# The number of classes are all the possible keys a user can press during the game
# Even not pressing any key is allowed.
num_classes = 9
y_train = keras.utils.to_categorical(y_train, num_classes)
y_val = keras.utils.to_categorical(y_val, num_classes)


In [0]:
# Input shape will be 33600 since that is the image size.
# This neural network has only 2 layers. The user is free to try new layers.
# An experiment with eleven layers was performed but no improvement was observed.
model = Sequential()
model.add(Dense(150, activation='tanh', input_shape=(33600,)))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='binary_crossentropy',
              optimizer='nadam',
              metrics=['accuracy'])
#pdb.set_trace()
history = model.fit(x_train, y_train,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_val, y_val))
score = model.evaluate(x_val, y_val, verbose=0)
print('Validation loss:', score[0])
print('Validation accuracy:', score[1])
# This code saves the model which will be needed for letting the network play the game
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
model.save_weights("model_weights.h5")
print("Saved model to disk")


Train on 9263 samples, validate on 3970 samples
Epoch 1/1
Validation loss: 0.6235493505631646
Validation accuracy: 0.8205432891845703
Saved model to disk


# New Section

In [0]:
# Based on the excellent
# https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5
# and uses Keras.
import os
import pdb
import gym
import cv2
import argparse
import sys, glob
import numpy as np
from keras import backend as K
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.layers import Input, Dense, Reshape
from keras.layers.wrappers import TimeDistributed
from keras.optimizers import Adam, Adamax, RMSprop
from keras.layers.advanced_activations import PReLU
from keras.layers.normalization import BatchNormalization
from keras.layers.core import Activation, Dropout, Flatten
from keras.layers.convolutional import UpSampling2D, Conv2D

Using TensorFlow backend.


In [0]:
#Script Parameters
input_dim = 210 * 160
gamma = 0.99
update_frequency = 1
learning_rate = 0.001
resume = False
render = False

In [0]:
#Initialize
env = gym.make('Enduro-v0')
#pdb.set_trace()
number_of_inputs = env.action_space.n #This is incorrect for Pong (?)
#number_of_inputs = 1
observation = env.reset()
prev_x = None
xs, dlogps, drs, probs = [],[],[],[]
running_reward = None
reward_sum = 0
episode_number = 0
train_X = []
train_y = []

In [0]:
def pong_preprocess_screen(I):
  I=np.dot(I[..., :3], [0.2989, 0.5870, 0.1140])
  #pdb.set_trace()
  return I.astype(np.float).ravel()
def discount_rewards(r):
  discounted_r = np.zeros_like(r)
  running_add = 0
  for t in reversed(range(0, r.size)):
    if r[t] != 0: running_add = 0
    running_add = running_add * gamma + r[t]
    discounted_r[t] = running_add
  return discounted_r

#Define the main model (WIP)
def learning_model(input_dim = input_dim, model_type=1):
  model = Sequential()
  if model_type==0:
    model.add(Reshape((1,210,160), input_shape=(input_dim,)))
    model.add(Flatten())
    model.add(Dense(200, activation = 'relu'))
    model.add(Dense(number_of_inputs, activation='softmax'))
    opt = RMSprop(lr=learning_rate)
  else:
    model.add(Reshape((1,210,160), input_shape=(input_dim,)))
    model.add(Conv2D(32, (9, 9), activation="relu", strides=(4, 4), padding="same", kernel_initializer="he_uniform"))
    model.add(Flatten())
    model.add(Dense(16, activation="relu", kernel_initializer="he_uniform"))
    model.add(Dense(number_of_inputs, activation='softmax'))
    opt = Adam(lr=learning_rate)
  model.compile(loss='categorical_crossentropy', optimizer=opt)
  if resume == True:
    model.load_weights('atari_model_checkpoint.h5')
  return model


In [0]:
model = learning_model()

In [0]:
#Begin training
while True:
  if render: 
    env.render()
  #Preprocess, consider the frame difference as features
  cur_x = pong_preprocess_screen(observation)
  #pdb.set_trace()
  x = cur_x - prev_x if prev_x is not None else np.zeros(input_dim)
  prev_x = cur_x
  #Predict probabilities from the Keras model
  aprob = ((model.predict(x.reshape([1,x.shape[0]]), batch_size=1).flatten()))
  #aprob = aprob/np.sum(aprob)
  #Sample action
  #action = np.random.choice(number_of_inputs, 1, p=aprob)
  #Append features and labels for the episode-batch
  xs.append(x)
  probs.append((model.predict(x.reshape([1,x.shape[0]]), batch_size=1).flatten()))
  aprob = aprob/np.sum(aprob)
  action = np.random.choice(number_of_inputs, 1, p=aprob)[0]
  y = np.zeros([number_of_inputs])
  y[action] = 1
  #print action
  dlogps.append(np.array(y).astype('float32') - aprob)
  observation, reward, done, info = env.step(action)
  reward_sum += reward
  drs.append(reward) 
  if done:
    #pdb.set_trace()
    episode_number += 1
    # Puts as a matrix the photos
    epx = np.vstack(xs)
    epdlogp = np.vstack(dlogps)
    epr = np.vstack(drs)
    discounted_epr = discount_rewards(epr)
    mean = np.mean(discounted_epr)
    std = np.std(discounted_epr) if np.std(discounted_epr) > 0 else 1
    discounted_epr = (discounted_epr-mean)/std
    epdlogp *= discounted_epr
    #Slowly prepare the training batch
    train_X.append(xs) 
    train_y.append(epdlogp)
    xs,dlogps,drs = [],[],[]
    #Periodically update the model
    if episode_number % update_frequency == 0: 
      y_train = probs + learning_rate * np.squeeze(np.vstack(train_y)) #Hacky WIP
      #y_train[y_train<0] = 0
      #y_train[y_train>1] = 1
      #y_train = y_train / np.sum(np.abs(y_train), axis=1, keepdims=True)
      print ('Training Snapshot:')
      #pdb.set_trace()
      print (y_train)
      model.train_on_batch(np.squeeze(np.vstack(train_X)), y_train)
      #Clear the batch
      train_X = []
      train_y = []
      probs = []
      #Save a checkpoint of the model
      os.remove('atari_model_checkpoint.h5') if os.path.exists('atari_model_checkpoint.h5') else None
      model.save_weights('atari_model_checkpoint.h5')
    #Reset the current environment nad print the current results
    running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
    print('Environment reset imminent. Total Episode Reward: {}. Running Mean: {}'.format(reward_sum, running_reward))
    reward_sum = 0
    observation = env.reset()
    prev_x = None
  if reward != 0:
    print(('Episode {} \n '.format(episode_number)))
    if reward == -1:
        print('Defeat')
    else:
        print('VICTORY!')


Training Snapshot:
[[1.11111112e-01 1.11111112e-01 1.11111112e-01 ... 1.11111112e-01
  1.11111112e-01 1.11111112e-01]
 [1.05619133e-02 4.12464171e-04 1.74888261e-02 ... 9.08532739e-01
  3.76470659e-08 8.31604877e-04]
 [3.19967367e-04 4.29355123e-06 8.62146087e-04 ... 9.38069522e-01
  7.04219332e-04 5.63389971e-04]
 ...
 [2.64926784e-05 1.80292773e-05 2.89394520e-02 ... 1.03733875e-02
  5.42349136e-03 8.94116893e-05]
 [8.71038139e-02 9.23015978e-06 3.55539260e-06 ... 4.82413918e-01
  1.66023136e-04 5.04706707e-03]
 [3.86817845e-07 1.61061858e-11 4.17597583e-08 ... 9.85582709e-01
  1.98356375e-07 3.93617083e-05]]
Environment reset imminent. Total Episode Reward: 0.0. Running Mean: 0.0
Training Snapshot:
[[1.1111106e-01 1.1111106e-01 1.1111109e-01 ... 1.1111121e-01
  1.1111117e-01 1.1111110e-01]
 [1.0492862e-02 4.0883818e-04 1.7401118e-02 ... 9.0880764e-01
  3.7388567e-08 8.2685269e-04]
 [2.0325261e-04 2.7156866e-06 1.0205087e-03 ... 9.9242097e-01
  1.1633394e-05 1.3451895e-04]
 ...
 [5.7

KeyboardInterrupt: ignored

# New Section

In [0]:
from keras.layers import Dense, Activation, Input
from keras.models import Model, load_model
from keras.optimizers import Adam
import keras.backend as K
import numpy as np


In [0]:
from keras.layers import Dense, Activation, Input
from keras.models import Model, load_model
from keras.optimizers import Adam
import keras.backend as K
import numpy as np
import pdb


class Agent(object):
  def __init__(self, ALPHA, GAMMA = 0.99, n_actions = 9, layer1_size = 16,
                 layer2_size = 16, input_dims = 210*160, fname = 'reinforce.h5'):
    self.gamma = GAMMA
    self.lr = ALPHA
    self.G = 0
    self.input_dims = input_dims
    self.fc1_dims = layer1_size
    self.fc2_dims = layer2_size
    self.n_actions = n_actions
    self.state_memory = []
    self.action_memory = []
    self.reward_memory = []
    self.probabilities = np.zeros(9)

    self.policy, self.predict = self.build_policy_network()
    self.action_space = [ i for i in range(n_actions)]
    self.mode_file = fname

  def build_policy_network(self):
    input = Input(shape=(self.input_dims,))
    advantages = Input(shape=[1])
    dense1 = Dense(self.fc1_dims, activation = 'relu')(input)
    dense2 = Dense(self.fc2_dims, activation = 'relu')(dense1)
    probs = Dense(self.n_actions, activation='softmax')(dense2)

    def custom_loss(y_true, y_pred):
      out = K.clip(y_pred, 1e-8, 1-1e-8) # To avoid deividing by 0
      log_lik = y_true*K.log(out)

      return K.sum(-log_lik*advantages)

    policy = Model(input=[input, advantages], output = [probs])
    policy.compile(optimizer = Adam(lr = self.lr), loss = custom_loss)

    predict = Model(input=[input], output = [probs])

    return policy, predict

  
  def choose_action (self, observation):
    #pdb.set_trace()
    state = observation[np.newaxis, :]
    probabilities = self.predict.predict(state)[0]
    self.probabilities = np.concatenate((probabilities, self.probabilities), axis = 0)
    action = np.random.choice(self.action_space, p = probabilities)
    return action

  def store_observation(self, observation, action, reward):
    self.action_memory.append(action)
    self.state_memory.append(observation)
    self.reward_memory.append(reward)

  def learn(self):
    #pdb.set_trace()
    state_memory = np.array(self.state_memory)
    action_memory = np.array(self.action_memory)
    reward_memory = np.array(self.reward_memory)

    actions = np.zeros([len(action_memory), self.n_actions])
    actions[np.arange(len(action_memory)), action_memory] = 1

    G = np.zeros_like(reward_memory)
    for t in range(len(reward_memory)):
      G_sum = 0
      discount = 1
      for k in range (t, len(reward_memory)):
        G_sum += reward_memory[k]*discount
        discount *=self.gamma

      G[t] = G_sum
    mean = np.mean(G)
    std = np.std(G) if np.std(G)>0 else 1
    self.G = (G-mean)/std

    cost = self.policy.train_on_batch([state_memory, self.G], actions) # y_pred y_true

    self.state_memory=[]
    self.action_memory = []
    self.reward_memory = []

    return cost

  def save_model(self):
    self.policy.save(self.model_file)

  def load_model(self):
    self.policy = load_model(self.model_file)



In [0]:
def pong_preprocess_screen(I):
  I=np.dot(I[..., :3], [0.2989, 0.5870, 0.1140])
  #pdb.set_trace()
  return I.astype(np.float).ravel()



agent = Agent(ALPHA = 0.0005, input_dims=210*160, GAMMA = 0.99, n_actions = 9,
            layer1_size = 64, layer2_size = 64)

env = gym.make('Enduro-v0')
score_history = []

n_episodes = 2000
for i in range(n_episodes):
    done = False
    score = 0
    observation = pong_preprocess_screen(env.reset())

    while (not done):
        #pdb.set_trace()
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        observation_=pong_preprocess_screen(observation_)
        agent.store_observation(observation, action, reward)
        observation=observation_
        score += reward
    score_history.append(score)

    cost = agent.learn()

    #pdb.set_trace()
    agent.probabilities = np.reshape(agent.probabilities, (9,-1))
    #plt.plot(agent.probabilities.sum(axis = 1))
    #plt.show()
    agent.probabilities=np.zeros(9)
    print('Episode {}, score {}, average_score {}, cost {}'.format(i, score, np.mean(score_history[-100:]), int(cost)))




Episode 0, score 0.0, average_score 0.0, cost 0
Episode 1, score 0.0, average_score 0.0, cost 0
Episode 2, score 0.0, average_score 0.0, cost 21


In [0]:
a=np.zeros([1,9])
b = np.ones([1,9])
c = np.concatenate((a,b), axis = 1)

In [0]:
a=np.array([[1,2,3], [1,2,3]])

In [33]:
a.shape

(1, 9)