In [1]:
from keras.models import Sequential
from keras.layers import Dense, Activation

import sys

import os

sys.path.append(os.path.join(os.getcwd(), 'python'))
from keras.optimizers import Adam
import random
import gym
import pylos_env
import numpy as np
from collections import deque

Using TensorFlow backend.


In [2]:
# Build the Pylos game environment from the Gym registry id 'Pylos-v0'.
# NOTE(review): the id is presumably registered as a side effect of
# `import pylos_env` in the imports cell - confirm against that module.
env = gym.envs.make('Pylos-v0')

In [3]:
class Agent():
    """Deep Q-learning agent with an epsilon-greedy policy and a replay buffer.

    The network maps a flat state vector of ``STATE_SIZE`` values to one
    Q-value per action (``NUM_ACTIONS`` outputs).

    Attributes:
        gamma: discount factor applied to the bootstrapped next-state value.
        exploration_rate: probability of picking a random action; lowered by
            ``EXPLORATION_DECAY`` on every ``train`` call that fits the model,
            never below ``MIN_EXPLORATION_RATE``.
        memory: bounded replay buffer of
            ``(state, action, reward, next_state, done)`` tuples.
        model: the compiled Keras network.
    """

    STATE_SIZE = 96              # length of the flat observation vector
    NUM_ACTIONS = 31             # size of the discrete action space
    MEMORY_SIZE = 200            # capacity of the replay buffer
    BATCH_SIZE = 100             # max transitions sampled per train() call
    PREDICT_BATCH_SIZE = 50      # max batch size handed to model.predict
    MIN_EXPLORATION_RATE = 0.05
    EXPLORATION_DECAY = 0.01

    def __init__(self):
        self.gamma = 0.95
        self.exploration_rate = 1
        self.memory = deque([], self.MEMORY_SIZE)

        self.model = self.create_model()

    def choose_action(self, state, explore=True):
        """Return an action index for ``state``.

        Args:
            state: a single observation (not batched).
            explore: when True, a uniformly random action is returned with
                probability ``exploration_rate``; when False the choice is
                always greedy.

        Returns:
            An integer in ``[0, NUM_ACTIONS - 1]``.
        """
        if explore and (random.random() < self.exploration_rate):
            return random.randint(0, self.NUM_ACTIONS - 1)
        # The model expects a batch, so wrap the single state.
        return np.argmax(self.model.predict(np.array([state])))

    def create_model(self):
        """Build and compile the Q-network (three ReLU layers of 200 units)."""
        model = Sequential([Dense(200, input_shape=(self.STATE_SIZE,))])
        model.add(Activation('relu'))

        # Hidden layers
        model.add(Dense(200))
        model.add(Activation('relu'))
        model.add(Dense(200))
        model.add(Activation('relu'))

        # Q-values are unbounded regression targets (e.g. +10 for a win), so
        # the output must be linear: a softmax would squash them into [0, 1]
        # and force them to sum to 1, which MSE can never fit.
        model.add(Dense(self.NUM_ACTIONS, activation='linear'))

        optimizer = Adam()
        model.compile(optimizer, loss='mse')
        return model

    def train(self, memory):
        """Store new transitions and run one Q-learning update on a sample.

        Args:
            memory: iterable of ``(state, action, reward, next_state, done)``
                tuples collected since the last call. May be empty.

        The regression target for each sampled transition is the current
        prediction with only the entry of the action actually taken replaced
        by ``reward + gamma * max_a Q(next_state, a)``; the bootstrap term is
        dropped when ``done`` is true.
        """
        self.memory.extend(memory)
        if not self.memory:
            # Nothing to learn from yet; predicting on an empty batch fails.
            return

        # Bound the sample by the buffer we draw from, not by the size of
        # the latest episode.
        num_samples = min(self.BATCH_SIZE, len(self.memory))
        samples = random.sample(self.memory, num_samples)

        states = np.array([state for state, _, _, _, _ in samples])
        actions = np.array([action for _, action, _, _, _ in samples], dtype=int)
        rewards = np.array([reward for _, _, reward, _, _ in samples], dtype=float)
        next_states = np.array([next_state for _, _, _, next_state, _ in samples])
        # 0 for terminal transitions so no future value is bootstrapped.
        not_done = np.array([0.0 if done else 1.0 for _, _, _, _, done in samples])

        predict_batch = min(num_samples, self.PREDICT_BATCH_SIZE)
        # Copy so the targets can be edited even if predict returns a
        # read-only or shared buffer.
        q = np.array(self.model.predict(states, batch_size=predict_batch), dtype=float)
        q_next = self.model.predict(next_states, batch_size=predict_batch)

        # NOTE(review): in the two-player loop next_state may be the
        # opponent's turn; confirm that bootstrapping from it is intended.
        targets = rewards + self.gamma * not_done * np.amax(q_next, axis=1)
        q[np.arange(num_samples), actions] = targets

        self.model.fit(states, q, epochs=1, verbose=0)
        self.exploration_rate = max(self.MIN_EXPLORATION_RATE,
                                    self.exploration_rate - self.EXPLORATION_DECAY)

In [4]:
# One independent agent per player; the list index is the player index.
agents = [Agent() for _ in range(2)]


In [5]:
# We'll play the game, and after each win or loss
# we'll train 2 agents, and let them play against each other
# Rewards: winning = 10, invalid move = -1
# To help the agent learn valid moves more quickly, we adjust the reward by -1 for having made an invalid move

# It should be possible to train without adjusting for valid/invalid moves,
# TODO: check how much impact this has on training speed

num_episodes = 1000

for episode in range(num_episodes):
    # Per-episode bookkeeping: one transition list per player.
    memories = [list(), list()]
    invalid_moves = 0
    done = False
    state = env.reset()

    while not done:
        # state[1] is used as the index of the player whose turn it is.
        current = state[1]
        agent, memory = agents[current], memories[current]

        action = agent.choose_action(state)
        next_state, reward, done, info = env.step(action)
        # last move is always the winning move (ball on top of the pyramid)
        reward = 10 if done else reward

        memory.append((state, action, reward, next_state, done))
        state = next_state

    # Each agent learns only from the transitions it produced itself.
    for player, experience in enumerate(memories):
        agents[player].train(experience)

    if not episode % 10:
        print("episode %d" % (episode,))


episode 0
episode 10
episode 20
episode 30
episode 40
episode 50
episode 60
episode 70
episode 80
episode 90


KeyboardInterrupt: 

In [None]:
# Persist the first agent's network weights. The previous `m` was never
# defined in this notebook, so the cell failed on a fresh kernel.
agents[0].model.save_weights('agent0_weights')

In [None]:
# Visualize one game played greedily (no exploration), for at most 10 moves.
# The loop stops as soon as the environment reports the episode is over;
# stepping a finished environment without reset() is not meaningful.
# NOTE(review): the original intent was also to stop on invalid moves, but how
# the environment signals an invalid move is not visible here - confirm
# against pylos_env and add that check.

env = gym.envs.make('Pylos-v0')
state = env.reset()
for i in range(10):
    current = state[1]
    agent = agents[current]

    print(env.pylos.render())
    action = agent.choose_action(state, False)
    print (action)
    state, _reward, done, _info = env.step(action)
    if done:
        # Show the final board before leaving the loop.
        print(env.pylos.render())
        break