In [1]:
from __future__ import division
import numpy as np

# import MalmoPython
import malmo.MalmoPython as MalmoPython
import os
import random
import sys
import time
import json
import random
import math
import errno
import logging
import pickle
from builtins import range
from past.utils import old_div
from collections import defaultdict, deque
from timeit import default_timer as timer
from pprint import pprint

if sys.version_info[0] == 2:
    # Workaround for https://github.com/PythonCharmers/python-future/issues/262
    import Tkinter as tk
else:
    import tkinter as tk

# Maze Generation

In [2]:
def saveMaze(fname=0):
    random.seed()

    water_xml = ""
    logs_xml = "\n"
    z_used = []
    for i in range(4):
        x1 = random.randint(0, 2)        
        x2 = x1 + random.randint(1, 2)
        
        z = -5
        while True:
            z = random.randint(-5, 12)
            if z not in z_used:
                z_used.append(z)
                break
        
        water_xml = water_xml + '''
                <DrawCuboid x1="{x1}"   y1="45"  z1="{z1}"   x2="{x2}"  y2="45"  z2="{z2}" type="water"/>'''.format(x1=x1, x2=x2, z1=z, z2=z)
        
        x1 = random.randint(0, 2)
        x2 = x1 + random.randint(0, 1)
        
        z = -5
        while True:
            z = random.randint(-5, 12)
            if z not in z_used:
                z_used.append(z)
                break
                
        logs_xml = logs_xml + '''
                <DrawCuboid x1="{x1}"  y1="46"  z1="{z1}"   x2="{x2}"  y2="47"  z2="{z2}" type="log" />'''.format(x1=x1, x2=x2, z1=z, z2=z)
                
#     print("water_xml: ", water_xml)
#     print("logs_xml: ", logs_xml)

    
    xml_str = '''<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
        <Mission xmlns="http://ProjectMalmo.microsoft.com" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">

          <About>
            <Summary>Racing track simulation with reinforcement learning agent!</Summary>
          </About>

          <ServerSection>
            <ServerInitialConditions>
                <Time><StartTime>1</StartTime></Time>
            </ServerInitialConditions>
            <ServerHandlers>
              <FlatWorldGenerator generatorString="3;7,2;1;"/>
              <DrawingDecorator>
                <!-- coordinates for cuboid are inclusive -->
                <!-- limits of our arena -->
                <DrawCuboid x1="-4"  y1="46"  z1="-13"  x2="9"  y2="50"  z2="13" type="air" />           
                
                <!-- lava floor -->
                <DrawCuboid x1="-4"  y1="45"  z1="-13"  x2="9"  y2="45"  z2="13" type="lava" />        
                
                <!-- floor of the arena -->
                <DrawCuboid x1="-1"  y1="45"  z1="-8"   x2="5"  y2="45"  z2="13" type="packed_ice" />
                
                <!-- water area -->''' + water_xml + logs_xml + '''         
                
                <!-- the starting marker -->
                <DrawBlock x="2"  y="45" z="-8" type="cobblestone" />                  
                
                <!-- the destination markers -->
                <DrawBlock x="5"  y="45" z="13" type="diamond_block" />                
                <DrawBlock x="4"  y="45" z="13" type="diamond_block" />                
                <DrawBlock x="3"  y="45" z="13" type="diamond_block" />                                 
                <DrawBlock x="2"  y="45" z="13" type="diamond_block" />                                  
                <DrawBlock x="1"  y="45" z="13" type="diamond_block" />                                  
                <DrawBlock x="0"  y="45" z="13" type="diamond_block" />                                  
                <DrawBlock x="-1"  y="45" z="13" type="diamond_block" />                                 
              </DrawingDecorator>
              <ServerQuitFromTimeUp timeLimitMs="15000"/>
              <ServerQuitWhenAnyAgentFinishes/>
            </ServerHandlers>
          </ServerSection>

          <AgentSection mode="Survival">
            <Name>Agent</Name>
            <AgentStart>
              <Placement x="2.5" y="46.0" z="-7.5" pitch="30" yaw="0"/>
            </AgentStart>
            <AgentHandlers>
              <DiscreteMovementCommands/>
              <ObservationFromFullStats/>
              <RewardForTouchingBlockType>
                <Block reward="250.0" type="diamond_block" behaviour="onceOnly"/>
                <Block reward="-100.0" type="lava" behaviour="onceOnly"/>
                <Block reward="-50.0" type="water" behaviour="onceOnly"/>
                <Block reward="-25.0" type="log" behaviour="constant"/>
                <Block reward="2" type="packed_ice" behaviour="constant"/>
              </RewardForTouchingBlockType>
              <RewardForSendingCommand reward="-1" />
              <AgentQuitFromTouchingBlockType>
                  <Block type="lava" description="fail"/>
                  <Block type="water" description="fail"/>
                  <Block type="diamond_block" description="complete"/>
              </AgentQuitFromTouchingBlockType>
            </AgentHandlers>
          </AgentSection>

        </Mission>
        '''
    
    with open("xmls/discrete/world_{}.xml".format(fname), "w") as f:
        f.write(xml_str)

In [3]:
num_files = 20
for i in range(num_files):
    print("creating file ", i)
    saveMaze(i)
    print("saving file ", i)

creating file  0
saving file  0
creating file  1
saving file  1
creating file  2
saving file  2
creating file  3
saving file  3
creating file  4
saving file  4
creating file  5
saving file  5
creating file  6
saving file  6
creating file  7
saving file  7
creating file  8
saving file  8
creating file  9
saving file  9
creating file  10
saving file  10
creating file  11
saving file  11
creating file  12
saving file  12
creating file  13
saving file  13
creating file  14
saving file  14
creating file  15
saving file  15
creating file  16
saving file  16
creating file  17
saving file  17
creating file  18
saving file  18
creating file  19
saving file  19


# RL Racer Agent (DiscreteMovementCommands)

In [9]:
rewards_map =  {'diamond_block': 250, 'packed_ice': 2, 'log': -25, 'water': -50, 'lava': -100}
actions_space = ["movenorth 1", "movesouth 1", "movewest 1", "moveeast 1", "jumpsouth 1", "jumpsouth 0"] 
# REMINDER: VALUE OF EACH ACTION IS THE SPEED, NOT NUMBER OF TIMES
# move  1    full speed ahead
# move -1    full speed backwards
# strafe 1   moves right at full speed
# strafe -1  moves left at full speed
# turn 1     turns full speed right
# turn -1    turns full speed left
# jump 1/0   starts/stops jumping


class Racer(object):
    # initializing the parameters for the agent
    def __init__(self, epsilon=0.01, n=1):
        """Tabular Q-learning agent for discrete state/action spaces."""
        
        self.epsilon = epsilon  # chance of taking a random action instead of the best

        self.logger = logging.getLogger(__name__)
        if False: # True if you want to see more information
            self.logger.setLevel(logging.DEBUG)
        else:
            self.logger.setLevel(logging.INFO)
        self.logger.handlers = []
        self.logger.addHandler(logging.StreamHandler(sys.stdout))
        
        self.actions = actions_space
        self.q_table = {}
        self.canvas = None
        self.root = None
        
    def updateQTable( self, reward, current_state ):
        """Change q_table to reflect what we have learnt."""
        
        # retrieve the old action value from the Q-table (indexed by the previous state and the previous action)
        old_q = self.q_table[self.prev_s][self.prev_a]
        
        # TODO: what should the new action value be?
        new_q = old_q
        
        # assign the new action value to the Q-table
        self.q_table[self.prev_s][self.prev_a] = new_q
        
    def updateQTableFromTerminatingState( self, reward ):
        """Change q_table to reflect what we have learnt, after reaching a terminal state."""
        
        # retrieve the old action value from the Q-table (indexed by the previous state and the previous action)
        old_q = self.q_table[self.prev_s][self.prev_a]
        
        # TODO: what should the new action value be?
        new_q = old_q
        
        # assign the new action value to the Q-table
        self.q_table[self.prev_s][self.prev_a] = new_q
        
    def act(self, world_state, agent_host, current_r ):
        """take 1 action in response to the current world state"""
        
        obs_text = world_state.observations[-1].text
        obs = json.loads(obs_text) # most recent observation
        self.logger.debug(obs)

        if not u'XPos' in obs or not u'ZPos' in obs:
            self.logger.error("Incomplete observation received: %s" % obs_text)
            return 0
        current_s = "%d:%d" % (int(obs[u'XPos']), int(obs[u'ZPos']))
        self.logger.debug("State: %s (x = %.2f, z = %.2f)" % (current_s, float(obs[u'XPos']), float(obs[u'ZPos'])))
        print("State: %s (x = %.2f, z = %.2f)" % (current_s, float(obs[u'XPos']), float(obs[u'ZPos'])))
        
        if current_s not in self.q_table:
            self.q_table[current_s] = ([0] * len(self.actions))

        # update Q values
        if self.prev_s is not None and self.prev_a is not None:
            self.updateQTable( current_r, current_s )

        self.drawQ( curr_x = int(obs[u'XPos']), curr_y = int(obs[u'ZPos']) )

        # select the next action
        rnd = random.random()
        if rnd < self.epsilon:
            a = random.randint(0, len(self.actions) - 1)
            self.logger.info("Random action: %s" % self.actions[a])
        else:
            m = max(self.q_table[current_s])
            self.logger.debug("Current values: %s" % ",".join(str(x) for x in self.q_table[current_s]))
            l = list()
            for x in range(0, len(self.actions)):
                if self.q_table[current_s][x] == m:
                    l.append(x)
            y = random.randint(0, len(l)-1)
            a = l[y]
       
            self.logger.info("Taking q action: %s with current reward: %s" % (self.actions[a], current_r))

        # try to send the selected action, only update prev_s if this succeeds
        try:
            agent_host.sendCommand(self.actions[a])
            
            if self.actions[a] == "jumpsouth 1":
                agent_host.sendCommand("movesouth 1")
            
            self.prev_s = current_s
            self.prev_a = a

        except RuntimeError as e:
            self.logger.error("Failed to send command: %s" % e)

        return current_r
    
    def clear_actions(self):
        """Resets the actions taken in case of a new attempt to fetch."""
        self.q_table = {}
    
    def run(self, agent_host):
        """run the agent on the world"""

        total_reward = 0
        current_r = 0
        
        self.prev_s = None
        self.prev_a = None
        
        is_first_action = True
        
        # main loop:
        world_state = agent_host.getWorldState()
        while world_state.is_mission_running:

            current_r = 0
            
            if is_first_action:
                # wait until have received a valid observation
                while True:
                    time.sleep(0.1)
                    world_state = agent_host.getWorldState()
                    for error in world_state.errors:
                        self.logger.error("Error: %s" % error.text)
                    for reward in world_state.rewards:
                        current_r += reward.getValue()
                    if world_state.is_mission_running and len(world_state.observations)>0 and not world_state.observations[-1].text=="{}":
                        total_reward += self.act(world_state, agent_host, current_r)
                        break
                    if not world_state.is_mission_running:
                        break
                is_first_action = False
            else:
                # wait for non-zero reward
                while world_state.is_mission_running and current_r == 0:
                    time.sleep(0.1)
                    world_state = agent_host.getWorldState()
                    for error in world_state.errors:
                        self.logger.error("Error: %s" % error.text)
                    for reward in world_state.rewards:
                        current_r += reward.getValue()
                # allow time to stabilise after action
                while True:
                    time.sleep(0.1)
                    world_state = agent_host.getWorldState()
                    for error in world_state.errors:
                        self.logger.error("Error: %s" % error.text)
                    for reward in world_state.rewards:
                        current_r += reward.getValue()
                    if world_state.is_mission_running and len(world_state.observations)>0 and not world_state.observations[-1].text=="{}":
                        total_reward += self.act(world_state, agent_host, current_r)
                        break
                    if not world_state.is_mission_running:
                        break

        # process final reward
        self.logger.debug("Final reward: %d" % current_r)
        total_reward += current_r

        # update Q values
        if self.prev_s is not None and self.prev_a is not None:
            self.updateQTableFromTerminatingState( current_r )
            
        self.drawQ()
    
        return total_reward
    
    # modify this to for world dimensions on Q-table
    def drawQ( self, curr_x=None, curr_y=None ):
        scale = 40
        world_x = 6
        world_y = 14
        if self.canvas is None or self.root is None:
            self.root = tk.Tk()
            self.root.wm_title("Q-table")
            self.canvas = tk.Canvas(self.root, width=world_x*scale, height=world_y*scale, borderwidth=0, highlightthickness=0, bg="black")
            self.canvas.grid()
            self.root.update()
        self.canvas.delete("all")
        action_inset = 0.1
        action_radius = 0.1
        curr_radius = 0.2
        action_positions = [ ( 0.5, action_inset ), ( 0.5, 1-action_inset ), ( action_inset, 0.5 ), ( 1-action_inset, 0.5 ) ]
        # (NSWE to match action order)
        min_value = -20
        max_value = 20
        for x in range(world_x):
            for y in range(world_y):
                s = "%d:%d" % (x,y)
                self.canvas.create_rectangle( x*scale, y*scale, (x+1)*scale, (y+1)*scale, outline="#fff", fill="#000")
                for action in range(4):
                    if not s in self.q_table:
                        continue
                    value = self.q_table[s][action]
                    color = int( 255 * ( value - min_value ) / ( max_value - min_value )) # map value to 0-255
                    color = max( min( color, 255 ), 0 ) # ensure within [0,255]
                    color_string = '#%02x%02x%02x' % (255-color, color, 0)
                    self.canvas.create_oval( (x + action_positions[action][0] - action_radius ) *scale,
                                             (y + action_positions[action][1] - action_radius ) *scale,
                                             (x + action_positions[action][0] + action_radius ) *scale,
                                             (y + action_positions[action][1] + action_radius ) *scale, 
                                             outline=color_string, fill=color_string )
        if curr_x is not None and curr_y is not None:
            self.canvas.create_oval( (curr_x + 0.5 - curr_radius ) * scale, 
                                     (curr_y + 0.5 - curr_radius ) * scale, 
                                     (curr_x + 0.5 + curr_radius ) * scale, 
                                     (curr_y + 0.5 + curr_radius ) * scale, 
                                     outline="#fff", fill="#fff" )
        self.root.update()

# Run 

In [11]:
if sys.version_info[0] == 2:
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)  # flush print output immediately
else:
    import functools
    print = functools.partial(print, flush=True)
    

my_client_pool = MalmoPython.ClientPool()
my_client_pool.add(MalmoPython.ClientInfo("127.0.0.1", 10000))


# Create default Malmo objects:
agent_host = MalmoPython.AgentHost()
try:
    agent_host.parse( sys.argv )
except RuntimeError as e:
    print('ERROR:',e)
    print(agent_host.getUsage())
    exit(1)
#     sys.exit()
if agent_host.receivedArgument("help"):
    print(agent_host.getUsage())
    exit(0)
#     sys.exit()


# load in world map
world_num = random.randint(0, 19)
# mission_file = "xmls/discrete/discreteTestMap.txt"  # test map
mission_file = 'xmls/discrete/world_{world_num}.xml'.format(world_num=world_num)
with open(mission_file, 'r') as f:
    print("Loading mission from %s" % mission_file)
    missionXML = f.read()

    
# initiate the Racer object
num_reps = 5
n = 10
racer = Racer(n=n)
print("n =", n)
racer.clear_actions()


cumulative_rewards = []
for iRepeat in range(num_reps):
    my_mission = MalmoPython.MissionSpec(missionXML, True)
    my_mission_record = MalmoPython.MissionRecordSpec()  # Records nothing by default
    my_mission.requestVideo(1260, 960)
    my_mission.setViewpoint(0)

    # Attempt to start a mission:
    max_retries = 3
    for retry in range(max_retries):
        try:
            agent_host.startMission( my_mission, my_client_pool, my_mission_record, 0, "Racer" )
            break
        except RuntimeError as e:
            if retry == max_retries - 1:
                print("Error starting mission:",e)
                exit(1)
                # sys.exit()
            else:
                time.sleep(2)

    # Loop until mission starts:
    print("\nWaiting for the mission to start on trial", iRepeat+1)
    world_state = agent_host.getWorldState()
    
    while not world_state.has_mission_begun:
        time.sleep(0.1)
        world_state = agent_host.getWorldState()
        for error in world_state.errors:
            print("Error:",error.text)
    
    print("Mission running...")
    
    # WORK ON THIS
    # UPDATE XML FOR REFERENCE
    # DiscreteMovements
    cumulative_reward = racer.run(agent_host)
    print('Cumulative reward: %d' % cumulative_reward)
    cumulative_rewards += [ cumulative_reward ]
    
    racer.clear_actions()  # clear list of actions for next run
    time.sleep(1)
    

print("\n\nMission ended.")
print("Cumulative rewards for all %d runs:" % num_reps)
print(cumulative_rewards)
# Mission has ended.

ERROR: Caught std::exception: unrecognised option '-f'

Malmo version: 0.36.0

Allowed options:
  -h [ --help ]         show description of allowed options
  --test                run this as an integration test


Loading mission from xmls/discrete/world_18.xml
n = 10

Waiting for the mission to start on trial 1
Mission running...
State: 2:-7 (x = 2.50, z = -7.50)
Taking q action: jumpsouth 0 with current reward: 0
State: 2:-6 (x = 2.50, z = -6.50)
Taking q action: movenorth 1 with current reward: -1.0
State: 2:-7 (x = 2.50, z = -7.50)
Taking q action: movewest 1 with current reward: -1.0
State: 1:-7 (x = 1.50, z = -7.50)
Taking q action: movesouth 1 with current reward: 5.0
State: 1:-6 (x = 1.50, z = -6.50)
Taking q action: movewest 1 with current reward: 7.0
State: 0:-6 (x = 0.50, z = -6.50)
Taking q action: jumpsouth 1 with current reward: 5.0
State: 0:-4 (x = 0.50, z = -4.50)
Taking q action: movesouth 1 with current reward: 6.0
State: 0:-3 (x = 0.50, z = -3.50)
Taking q action: ju