In [1]:
#multi_agent.ipynb
#
#by Joe Hahn
#jmh.datasciences@gmail.com
#12 February 2018
#
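#This notebook uses Q-learning to train multiple agents that move among buckets,
#then plays games with random and trained ('smart') strategies and plots the resulting rewards.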

In [2]:
#game settings
#sizes of the simulated system
N_agents = 3        #number of agents
N_buckets = 7       #number of buckets
#game length and starting point
max_turns = 400     #max number of moves in single game
turn = 0            #starting turn
#reproducibility
rn_seed = 14        #seed for random number generator

In [3]:
#import game
#NOTE(review): the wildcard import presumably supplies the game helpers called in later cells
#(initialize_environment, play_one_game, train, ...) and also np, which this notebook never
#imports itself -- confirm against the multi_agent module
from multi_agent import *
import time
#record the wall-clock start so the last cell can report total execution time
time_start = time.time()

In [4]:
#import plotting libraries
#IPython magic: render matplotlib figures inline in the notebook
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
#apply seaborn styling with the given font scale and font family
sns.set(font_scale=1.5, font='DejaVu Sans')

In [15]:
#initialize system
#Each print passes one pre-formatted string, so the text is identical under the Python 2
#print statement and the Python 3 print function (two spaces after '=' match the old output).
environment = initialize_environment(rn_seed, max_turns, N_buckets, N_agents)
print('environment =  %s' % (environment,))
#starting state derived from the environment
state = initialize_state(environment)
print('state =  %s' % (state,))
#reward associated with the starting state
reward = get_reward(state)
print('reward =  %s' % (reward,))
#array form of the state; its second dimension is later used as the model's input count
state_vector = state2vector(state, environment)
print('state_vector =  %s' % (state_vector,))
game_state = get_game_state(turn, environment)
print('game_state =  %s' % (game_state,))

environment =  {'max_turns': 400, 'acts': ['move to 0', 'move to 1', 'move to 2', 'move to 3', 'move to 4', 'move to 5', 'move to 6'], 'actions': array([0, 1, 2, 3, 4, 5, 6]), 'N_agents': 3, 'rn_seed': 14, 'N_buckets': 7, 'bucket_params': {'p0': array([ 0.5       ,  0.66666667,  0.83333333,  1.        ,  1.16666667,
        1.33333333,  1.5       ]), 'sigma': array([ 0.25      ,  0.70833333,  1.16666667,  1.625     ,  2.08333333,
        2.54166667,  3.        ])}}
state =  {'bucket_productivity': array([ 0.41722224,  1.38916186, -0.25692234, -0.2798226 ,  2.30735468,
       -1.35731516, -1.48156123]), 'previous_bucket_productivity': array([ 0.11870634, -0.27661734,  0.6219043 ,  0.35564602, -2.8927732 ,
        2.96604474,  0.68625791]), 'agent_health': array([ 1.,  1.,  1.]), 'agent_locations': array([2, 3, 1])}
reward =  0.85241692463
state_vector =  [[ 0.          1.          1.          1.          0.          0.          0.
   0.11870634 -0.27661734  0.6219043   0.35564602 -2.892

In [12]:
#illustrate moving an agent
#Each print passes one pre-formatted string, so the text is identical under the Python 2
#print statement and the Python 3 print function.
new_locations = np.array([6, 4, 2])    #one destination per agent
state_next = move_agent(state, new_locations)
print('new_locations =  %s' % (new_locations,))
print('state_next =  %s' % (state_next,))
#array form of the post-move state
state_vector_next = state2vector(state_next, environment)
print('state_vector_next =  %s' % (state_vector_next,))

new_locations =  [6 4 2]
state_next =  {'bucket_productivity': array([ 0.56596566,  1.29025915, -0.21805664,  2.48086245, -0.09405033,
        0.27678673,  0.56844213]), 'previous_bucket_productivity': array([ 0.33454106,  1.08174899,  1.23182195,  0.35917174,  1.79781868,
       -4.80692034,  5.87749537]), 'agent_health': array([ 1.,  1.,  1.]), 'agent_locations': array([6, 4, 2])}
state_vector_next =  [[ 0.          0.          1.          0.          1.          0.          1.
   0.33454106  1.08174899  1.23182195  0.35917174  1.79781868 -4.80692034
   5.87749537]]
(3,)
(3,)


In [None]:
#play 1 game of randomly-moving agents and stash bucket rewards in dataframe
turn = 0                 #restart from the first turn
strategy = 'random'
memories = play_one_game(environment, turn, strategy)
game_history = memories2timeseries(memories, environment)
#single-argument print calls behave the same under Python 2 and Python 3
print('number of memories generated during 1 game =  %s' % (len(memories),))
#show the final memory of the game
print(memories[-1])

In [None]:
#plot bucket rewards versus turn
#pick out the per-bucket reward columns, then keep one row per distinct (turn, rewards) record
reward_cols = [name for name in game_history.columns if 'reward_' in name]
df = game_history[['turn'] + reward_cols].drop_duplicates()
cols = reward_cols
fig, ax = plt.subplots(1, 1, figsize=(15, 11))
ax.set_title('bucket rewards')
ax.set_xlabel('turn')
ax.set_ylabel('reward')
#leave room to the right of the last turn
ax.set_xlim(0, df['turn'].max() + 40)
#one trace per reward column
for col in cols:
    ax.plot(df['turn'], df[col], alpha=0.8, linewidth=1, label=col)
#assigning the result keeps the legend repr out of the cell output
p = ax.legend(loc='lower right')

In [None]:
#play N_games games making random actions, and stash moves in memories queue
N_games = 5              #number of random games played
strategy = 'random'
memories = play_N_games(environment, strategy, N_games)
#single-argument print call behaves the same under Python 2 and Python 3
print('number of memories =  %s' % (len(memories),))

In [None]:
#build model
N_inputs = state_vector.shape[1]       #one input per element of the state vector
N_outputs = N_buckets                  #one output per bucket
N_neurons = N_inputs*N_outputs
model = build_model(N_inputs, N_neurons, N_outputs)
#single-argument print call behaves the same under Python 2 and Python 3
#NOTE(review): if model.summary() prints its own report and returns None (as Keras models do),
#this line also emits a trailing 'None' -- confirm and drop the print if so
print(model.summary())

In [None]:
#train model
N_games = 20         #number of games played during training
gamma = 0.85         #discount for future rewards
batch_size = 100     #number of memories used during experience-replay
debug = False        #set True to see stats about each game's final turn
#single-argument print calls behave the same under Python 2 and Python 3
print('batch_size =  %s' % (batch_size,))
print('training model')
#training starts from the memories gathered during the random games
trained_model, game, rewards, epsilon = train(environment, model, N_games, gamma, memories, batch_size, debug=debug)
print('\ntraining done')

In [None]:
#plot rewards vs training game
xp, yp = game, rewards
fig, ax = plt.subplots(1, 1, figsize=(15, 6))
ax.plot(xp, yp)
ax.set_title('reward vs training game')
ax.set_xlabel('game')
#assigning the result keeps the label repr out of the cell output
p = ax.set_ylabel('final reward')

In [None]:
#plot epsilon vs game_number
xp, yp = game, epsilon
fig, ax = plt.subplots(1, 1, figsize=(15, 6))
ax.plot(xp, yp)
ax.set_title('epsilon vs game number')
ax.set_xlabel('game number')
#assigning the result keeps the label repr out of the cell output
p = ax.set_ylabel('epsilon')

In [None]:
#play one smart game
strategy = 'smart'
memories = play_one_game(environment, turn, strategy, model=trained_model)
game_history = memories2timeseries(memories, environment)
#plot the reward recorded at each turn of that game
fig, ax = plt.subplots(1, 1, figsize=(15, 6))
xp, yp = game_history['turn'], game_history['reward']
ax.plot(xp, yp)
ax.set_title('reward vs turn')
ax.set_xlabel('turn')
p = ax.set_ylabel('reward')
#trailing expression: the first rows of the history become the cell output
game_history.head()

In [None]:
#show grayscale of agent locations
#keep only the columns whose names contain 'agents_'
cols = [col for col in game_history.columns if ('agents_' in col)]
df = game_history[cols]
fig, ax = plt.subplots(1,1, figsize=(15, 6))
#draw on the axes created above (explicit-axes style, matching the other plotting cells);
#the transpose puts one image row per 'agents_' column
ax.imshow(df.T, aspect='auto', cmap='gray')
#trailing expression: the last 10 rows become the cell output
df.tail(10)

In [None]:
#plot bucket rewards versus turn
#pick out the per-bucket reward columns, then keep one row per distinct (turn, rewards) record
reward_cols = [name for name in game_history.columns if 'reward_' in name]
df = game_history[['turn'] + reward_cols].drop_duplicates()
cols = reward_cols
fig, ax = plt.subplots(1, 1, figsize=(15, 11))
ax.set_title('bucket rewards')
ax.set_xlabel('turn')
ax.set_ylabel('reward')
#leave room to the right of the last turn
ax.set_xlim(0, df['turn'].max() + 40)
#one trace per reward column
for col in cols:
    ax.plot(df['turn'], df[col], alpha=0.8, linewidth=1, label=col)
#assigning the result keeps the legend repr out of the cell output
p = ax.legend(loc='lower right')

In [None]:
#done!
#report wall-clock time elapsed since time_start was recorded in the import cell
time_stop = time.time()
#single-argument print call behaves the same under Python 2 and Python 3
print('execution time (minutes) =  %s' % ((time_stop - time_start)/60.0,))