In [103]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adam
import random
import gym
import pylos_env
import numpy as np
from collections import deque

In [104]:
env = gym.envs.make('Pylos-v0')

# action space consists of 4 numbers: 
# Discrete(31): softmax indicating which ball (0-29) to move or (30) use player's reserve
# Discrete(30): softmax indicating where to place the ball
# Discrete(31): softmax indicating 1st ball to retract (0-29) or (30) for None
# Discrete(31): softmax indicating 2nd ball to retract (0-29) or (30) for None

In [135]:
class Agent():
    def __init__(self):
        self.gamma = 0.95
        self.exploration_rate = 1
        self.memory = deque([], 200)
        
        # Each move has four components (ball source, ball target, retract1, retract2)
        # we'll train a separate model for each component, and
        # hope they play together well;
        self.models = (self.create_model(31), self.create_model(30), self.create_model(31), self.create_model(31))
    
    def choose_action(self, state):
        if random.random() < self.exploration_rate:
            return (random.randint(0, 30), random.randint(0, 29), random.randint(0, 30), random.randint(0, 30))
        else:
            # make prediction based on each of the four models
            return [np.argmax(self.models[part].predict(state)) for part in range(4)]
            
    def create_model(self, num_outputs):
        model = Sequential([Dense(200, input_shape = (90,))])
        model.add(Activation('relu'))
        
        # Hidden layers
        model.add(Dense(200))
        model.add(Activation('relu'))
        model.add(Dense(200))
        model.add(Activation('relu'))
        
        # Output: 4 components to each move: (from, to, retract1, retract2)
        # from has 30 + 1 possibilities, to has 30 possibilities, retract1&2 have 31 possibilities
        model.add(Dense(num_outputs, activation='softmax'))

        optimizer = Adam()
        model.compile(optimizer, loss='mse')
        return model
    
    def train(self, memory):
        for m in memory:
            self.memory.append(m)

        num_samples = min(100, len(memory))
        samples = random.sample(self.memory, num_samples)

        inputs = np.array([ state['board'].reshape(90) for state,_,_,_,_ in samples ])
        next_inputs = np.array([ next_state['board'].reshape(90) for _,_,_,next_state,_ in samples ])
        rewards = np.array([ np.array([reward]) for _, _, reward, _, _ in samples ])
        dones = np.array([ np.array([0 if done else 1])  for _, _, _, _, done in samples ])

        num_actions = [31, 30, 31, 31]
        for i in range(len(num_actions)):
            self.train_model(self.models[i], inputs, next_inputs, rewards, dones, i, num_actions[i])

            #state, action, reward, next_state, done = experience
            #state = state['board']
            
            #print(action, state)
            # train 'from' network
            #self.learn(self.models[0], state, action[0], 31, reward, next_state, done)

            # train 'to' network
            #self.learn(self.models[1], state, action[1], 30, reward, next_state, done)

            # train 'retract1' network
            #self.learn(self.models[2], state, action[2], 31, reward, next_state, done)
            
            # train 'retract2' network
            #self.learn(self.models[3], state, action[3], 31, reward, next_state, done)
            
    def train_model(self, model, states, next_states, rewards, dones, action_index, num_actions):
        num_samples = states.shape[0]
        
        q = model.predict(states, batch_size=min(num_samples, 50))
        
        update_indices = q.argmax(1)
        updates = np.eye(num_actions)[update_indices]
        
        q_next = model.predict(next_states, batch_size=min(num_samples, 50))
        best_next = np.amax(q_next, 1).reshape((num_samples, 1))
        
        q = q * (1 - updates)
        q = q + best_next * updates # for selected actions, subtract current value, add new value
        
        # THIS DOES NOT WORK, network will not learn unless each model happens to produce valid input at the same time
        model.fit(states, q, epochs = 1, verbose=1)
            
            # old approach (learn 1 sample at a time)
#    def learn(self, model, state, action, num_actions, reward, next_state, done):
#        net_input = state.reshape((1,90))
#        q = model.predict(net_input, batch_size=1)[0]
#        q_target = reward
#        if not done:
#            q_target += self.gamma * np.amax(model.predict(next_state['board'].reshape(1,90))[0])
#            
#        q[action] = q_target
#        print (net_input.shape)
#        print (q.shape)
#        model.fit(net_input, q.reshape((1, num_actions)), epochs = 1, verbose = False)
        

In [136]:
# We'll play the game, and after each win or loss
# we'll train 2 agents, and let them play against each other
# Rewards: winning = +10, losing = -10
# To help the agent learn valid moves more quicly, we adjust the reward with -1 for having made an invalid move

# It should be possible to train without adjusting for valid/invalid moves,
# TODO: check how much impact this has on training speed

num_episodes = 1000
agents = [ Agent(), Agent() ]

for episode in range(num_episodes):
    state = env.reset()
    #print(state)
    done = False
    invalid_moves = 0
    memories = [ [], [] ]
    while not done:
        #print(env.pylos.current_player)
        current = state['current_player']
        #print ("next move", current)
        agent = agents[current]
        memory = memories[current]
        
        action = agent.choose_action(state)
        next_state, reward, done, info = env.step(action)
        #print(next_state, info)
        if done:
            reward = 10 # last move is always the winning move (ball on top of the pyramid)
        if 'invalid_move' in info:
            invalid_moves += 1
            reward -= -1
        
        memory.append((state, action, reward, next_state, done))

        if done:
            # update opponent memory (subtract 10 from last reward)
            opponent_memory = memories[1-current]
            s,a,r,n,d = opponent_memory[-1]
            opponent_memory[-1] = s,a,r-10,n,d
        
        state = next_state
    # train both agents with experience
    agents[0].train(memories[0])
    agents[1].train(memories[1])

    print ("episode %d / invalid moves %d"%(episode, invalid_moves))


(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 0 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 1 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 2 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 3 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 4 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 5 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 6 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 7 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 8 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 9 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15,

(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 85 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 86 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 87 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 88 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 89 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 90 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 91 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 92 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 93 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 94 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(1

(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 169 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 170 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 171 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 172 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 173 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 174 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 175 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 176 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 177 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(15, 90)
(15, 1)
(15, 1)
episode 178 / invalid moves 30
(15, 90)
(15, 90)
(15, 1)
(15, 1)
(15, 90)
(

KeyboardInterrupt: 

In [107]:
m = agent.models[0]

In [108]:
i = m.layers[0]

In [109]:
i.input_shape

(None, 90)