In [1]:
from __future__ import division
import numpy as np

# import MalmoPython
import malmo.MalmoPython as MalmoPython
import os
import random
import sys
import time
import json
import random
import math
import errno
from collections import defaultdict, deque
from timeit import default_timer as timer


In [19]:
# Action set for ContinuousMovementCommands.
# (Earlier experiments used DiscreteMovementCommands such as
#  "movesouth 1", "movewest 1", "moveeast 1" and "jumpsouth 1".)
actions_space = ['move 1', 'strafe 1', 'strafe -1', 'jump 1']

# Location reported while no observation is available yet, as (x, y, z).
INITIAL_LOCATION = (1, 2, -3)
# z coordinate used as the far end when measuring progress.
DIAMOND_WALL_Z = 25
# Distance along z between the initial location and DIAMOND_WALL_Z.
INITIAL_DISTANCE = DIAMOND_WALL_Z - INITIAL_LOCATION[2]

# Reward contribution looked up by block name.
reward_map = {
    'diamond_block': 100,
    'packed_ice': 1,
    'log': -10,
    'water': -5,
}

class Racer(object):
    def __init__(self, alpha=0.3, gamma=1, n=1):
        """Constructing an RL agent.

        Args
            alpha:  <float>  learning rate      (default = 0.3)
            gamma:  <float>  value decay rate   (default = 1)
            n:      <int>    number of back steps to update (default = 1)
        """
        self.epsilon = 0.2  # chance of taking a random action instead of the best
        self.q_table = {}
        self.n, self.alpha, self.gamma = n, alpha, gamma
        self.actions_taken = defaultdict(lambda: 0, {})
        self.num_actions = 0
        self.reward = 0
        self.diamond_reached = False
        self.obstacles_hit = False
        self.currentState = {
            'agent_x': 1,
            'agent_z': -4,
            'goal_z': 25,
            'has_water_left': False,
            'has_water_right': False,
            'has_water_forward': False,
            'has_wood_left': False,
            'has_wood_right': False,
            'has_wood_forward': False,
        }
        
    def clear_actions(self):
        """Resets the actions in case of a new iteration to fetch. """
        self.actions_taken = defaultdict(lambda: 0, {})
        self.num_actions = 0
        self.reward = 0
        self.diamond_reached = False
        self.obstacles_hit = False
        self.currentState = {
            'agent_x': 1,
            'agent_z': -4,
            'goal_z': 25,
            'has_water_left': False,
            'has_water_right': False,
            'has_water_forward': False,
            'has_wood_left': False,
            'has_wood_right': False,
            'has_wood_forward': False,
        }
        #resetDefault
        
    def get_curr_location(self, agent_host):
        # get the world state
        world_state = agent_host.peekWorldState()
        location = tuple()
#         print("get curr location")
        if world_state.number_of_observations_since_last_state > 0:
            msg = world_state.observations[-1].text
#             print(msg)
            observations = json.loads(msg)

            # get curr location from json
            xpos = observations.get(u'XPos',0)
            ypos = observations.get(u'YPos',0)
            zpos = observations.get(u'ZPos',0)
            location = (xpos, ypos, zpos)
            print("LOCATION: ", location)
            return location
        else:
            return INITIAL_LOCATION
    
    def is_solution(self, reward):
        """If the reward equals to the maximum reward possible returns True, False otherwise. """
        return reward == 100
        
    def calculate_dist_reward(self, agent_host):
        """Return the floored percentage of INITIAL_DISTANCE covered along z.

        The agent's z coordinate comes from get_curr_location(). The result is
        not clamped, so it is negative behind the initial location and exceeds
        100 past DIAMOND_WALL_Z.
        """
        location = self.get_curr_location(agent_host)
        remaining = DIAMOND_WALL_Z - location[2]
        covered = INITIAL_DISTANCE - remaining
        return math.floor((covered * 100) / INITIAL_DISTANCE)
        
    def get_obj_locations(self, agent_host):
        """Queries for the object's location in the world.

        As a side effect it also returns Racer's location.
        """
        nearby_obs = []
        while True:
            world_state = agent_host.getWorldState()
            if world_state.number_of_observations_since_last_state > 0:
                msg = world_state.observations[-1].text
                ob = json.loads(msg)
                grid = ob.get(u'floorAll',0)
                end = 0
                x1, y1, z1 = -10, -1, -10
                x2, y2, z2 = 10, -1, 10
                for i, block in enumerate(grid):
                    if block == 'log' or block == 'water' or block == 'diamond_block':
                        x = x1 + (i % (x2 - x1 + 1))
                        y = y1 + (i // ((x2 - x1 + 1) * (z2 - z1 + 1)))
                        z = z1 + ((i // (x2 - x1 + 1)) % (z2 - z1 + 1))
                        nearby_obs.append((block, (x, y, z)))
                        print("block:", block)
                        print("block location:", (x, y, z))
                return nearby_obs
            
    def eval_current_state(self, agent_host, current_state):
        """Score the current situation and update the terminal flags.

        self.reward is first set to the distance reward. Every object reported
        by get_obj_locations() whose z lies within [agent z, agent z + 1] then
        adjusts it: a diamond block adds its reward and sets diamond_reached,
        while a log or water block counts only when its x is within 1.5 of the
        agent's x. A log also sets obstacles_hit. The two flags are only ever
        set here, never cleared.

        Args
            agent_host:     object handed to the location / grid helpers
            current_state:  <tuple> (x, y, z) the objects are compared against

        Returns
            <tuple> (reward, diamond_reached, obstacles_hit)
        """
        print("current_state passed in eval:", current_state)
        location = self.get_curr_location(agent_host)
        print("current location in eval:", location)
        nearby_objects = self.get_obj_locations(agent_host)
        self.reward = self.calculate_dist_reward(agent_host)

        agent_x, agent_z = current_state[0], current_state[2]
        # NOTE(review): object coordinates are compared directly with
        # current_state; confirm both use the same coordinate frame.
        # Every object is examined: the previous `len(...) > 1` guard skipped
        # the case of exactly one nearby object.
        for block, (obj_x, _, obj_z) in nearby_objects:
            if not agent_z <= obj_z <= agent_z + 1:
                continue
            in_lane = agent_x - 1.5 <= obj_x <= agent_x + 1.5
            if block == 'diamond_block':
                self.reward += reward_map['diamond_block']
                self.diamond_reached = True
            elif block == 'log' and in_lane:
                self.reward += reward_map['log']
                self.obstacles_hit = True
                print("hit obstacles!! obstacles hit:", (block, (obj_x, _, obj_z)))
            elif block == 'water' and in_lane:
                self.reward += reward_map['water']

        return self.reward, self.diamond_reached, self.obstacles_hit
        
        
    def get_possible_actions(self, agent_host, term_flag, is_first_action=False):
        """Returns all possible actions that can be done at the current state. """
#         print("in get possible actions")
        action_list = []
        if not term_flag:
            action_list.extend(actions_space)
        return action_list
    
    def choose_action(self, curr_state, possible_actions, eps):
        """Chooses an action according to eps-greedy policy. """
#         print("in choose action")
        if curr_state not in self.q_table:
            self.q_table[curr_state] = {}
        for action in possible_actions:
            if action not in self.q_table[curr_state]:
                self.q_table[curr_state][action] = 0
        
        rnd = random.random()
        if rnd <= eps:
            a = random.randint(0, len(possible_actions) - 1)
            return possible_actions[a]
        else:
            # copy dict{actions: q-values} of q_table[curr_state]
            state_actions =self.q_table[curr_state]
            # find the max q-value
            max_q = max(state_actions.values())
            # find the list of actions that return the maximum q-value
            max_actions = [action for action, value in state_actions.items() if value == max_q]
            # pick a random action from the max_actions list
            max_rand = random.randint(0, len(max_actions) - 1)
            return max_actions[max_rand]
        
    def act(self, agent_host, action):
        """Execute one action for 0.3 s, then re-evaluate the state.

        The command is sent, held for 0.3 seconds and cancelled by sending the
        same verb with value 0. A jump is bracketed by 'move 1' / 'move 0' so
        the agent jumps forwards.

        TODO: an action queue drained by a loop that sleeps once per game
        update would remove the hard-coded jump+move pairing.

        Returns
            <tuple> (reward, diamond_reached, obstacles_hit)
        """
        print(action + ",", end = " ")
        verb, _ = action.split()
        is_jump = verb == 'jump'
        stop_command = verb + ' 0'

        if is_jump:
            agent_host.sendCommand('move 1')
        agent_host.sendCommand(action)  # GO
        time.sleep(0.3)
        if is_jump:
            agent_host.sendCommand('move 0')
        agent_host.sendCommand(stop_command)  # STOP

        # Register the state reached so later Q-table lookups cannot miss.
        curr_state = self.get_curr_location(agent_host)
        state_q = self.q_table.setdefault(curr_state, {})
        for candidate in self.get_possible_actions(agent_host, False):
            state_q.setdefault(candidate, 0)

        # Refreshes self.reward and the terminal flags as a side effect.
        self.eval_current_state(agent_host, curr_state)

        return self.reward, self.diamond_reached, self.obstacles_hit
    
    def update_q_table(self, tau, S, A, R, T): # THIS OR THE COPY OF IT NEEDS WORK!
        """Performs relevant updates for state tau.

        Args
            tau: <int>  state index to update
            S:   <dequqe>   states queue
            A:   <dequqe>   actions queue
            R:   <dequqe>   rewards queue
            T:   <int>      terminating state index
            
        """
        curr_state, curr_action, curr_reward = S.popleft(), A.popleft(), R.popleft()
        G = sum([self.gamma ** i * R[i] for i in range(len(S))])
        if tau + self.n < T:
            G += self.gamma ** self.n * self.q_table[S[-1]][A[-1]]

        old_q = self.q_table[curr_state][curr_action]
        self.q_table[curr_state][curr_action] = old_q + self.alpha * (G - old_q)
    
        
    def update_q_table2(self, S, A, R, T):
        """Performs relevant updates for the Q-values.
        TD(0) implementation of SARSA algorithm. Better for this use case.
        Args
            S:   <dequqe>   states queue
            A:   <dequqe>   actions queue
            R:   <dequqe>   rewards queue
            T:   <int>      terminating state index

        """
        G = 0
        for i in range(len(S)-1, -1, -1):
            G = self.gamma * G + R[i+1]
            old_q = self.q_table[S[i]][A[i]]
            self.q_table[S[i]][A[i]] = old_q + self.alpha * (G - old_q)
            
    def best_policy(self, agent_host):
        """Reconstructs the best action list according to the greedy policy. """
        self.clear_actions()
        policy = []
        current_r = 0
        is_first_action = True
        term_flag = self.diamond_reached or self.obstacles_hit
        while not term_flag:
            curr_state = self.get_curr_location(agent_host)
            possible_actions = self.get_possible_actions(agent_host, term_flag, is_first_action)
            next_a = self.choose_action(curr_state, possible_actions, 0)
            policy.append(next_a)
            is_first_action = False
            result = self.act(agent_host, next_a)
            current_r = result[0]
            term_flag = self.diamond_reached or self.obstacles_hit
        print(' with reward %.1f' % (current_r))
        return self.is_solution(current_r)
        #print 'Best policy so far is %s with reward %.1f' % (policy, current_r)

    def old_best_policy(self, agent_host):
        """Reconstructs the best action list according to the greedy policy. """
        self.clear_actions()
        policy = []
        current_r = 0
        is_first_action = True
        next_a = ""
        while next_a != "present_actions":
            curr_state = self.get_curr_state()
            possible_actions = self.get_possible_actions(agent_host, is_first_action)
            next_a = self.choose_action(curr_state, possible_actions, 0)
            policy.append(next_a)
            is_first_action = False
            current_r = self.act(agent_host, next_a)
        print(' with reward %.1f' % (current_r))
        return is_solution(current_r)
        #print 'Best policy so far is %s with reward %.1f' % (policy, current_r)
        
    def run(self, agent_host): 
        """Learns the process to reach the diamonds"""
        S, A, R = deque(), deque(), deque()
        present_reward = 0
        done_update = False
        while not done_update:
#             s0 = self.get_curr_state()
            s0 = self.get_curr_location(agent_host)
            term_flag = self.diamond_reached or self.obstacles_hit
            possible_actions = self.get_possible_actions(agent_host, term_flag, True)
            a0 = self.choose_action(s0, possible_actions, self.epsilon)
            S.append(s0)
            A.append(a0)
            R.append(0)

            T = sys.maxsize
            for t in range(sys.maxsize):
                time.sleep(0.1)
                if t < T:
                    print("action: ", A[-1])
                    
                    time.sleep(0.5)
                    
                    result = self.act(agent_host, A[-1])
                    current_r = result[0]
                    term_flag = self.diamond_reached or self.obstacles_hit
                    R.append(current_r)

                    if term_flag:
                        # Terminating state
                        T = t + 1
#                         S.append('Term State')
                        
                        present_reward = current_r
                        print("Reward:", present_reward)
                    else:
#                         s = self.get_curr_state()
                        s = self.get_curr_location(agent_host)
                        S.append(s)
                        possible_actions = self.get_possible_actions(agent_host, term_flag)
                        next_a = self.choose_action(s, possible_actions, self.epsilon)
                        A.append(next_a)

                tau = t - self.n + 1
                if tau >= 0:
                    self.update_q_table(tau, S, A, R, T)

                if tau == T - 1:
                    while len(S) > 1:
                        tau = tau + 1
                        self.update_q_table(tau, S, A, R, T)
                    done_update = True
                    break
                
                if term_flag:
                    agent_host.sendCommand('quit')
        
        
    def old_run(self, agent_host): ###THIS NEEDS WORK ALONG W THE Q TABLE!
        """Learns the process to reach the diamonds"""
        S, A, R = deque(), deque(), deque()
        present_reward = 0
        done_update = False
        while not done_update:
            s0 = self.get_curr_state()
            possible_actions = self.get_possible_actions(agent_host, True)
            a0 = self.choose_action(s0, possible_actions, self.epsilon)
            S.append(s0)
            A.append(a0)
            R.append(0)

            T = sys.maxsize
            for t in range(sys.maxsize):
                time.sleep(0.1)
                if t < T:
                    print("action: ", A[-1])
                    
                    time.sleep(1)
                    
                    current_r = self.act(agent_host, A[-1])
                    R.append(current_r)

                    if A[-1] == "present_actions": #We do not needs this.
                        # Terminating state
                        T = t + 1
                        S.append('Term State')
                        
                        present_reward = current_r
                        print("Reward:", present_reward)
                    else:
                        s = self.get_curr_state()
                        S.append(s)
                        possible_actions = self.get_possible_actions(agent_host)
                        next_a = self.choose_action(s, possible_actions, self.epsilon)
                        A.append(next_a)

                tau = t - self.n + 1
                if tau >= 0:
                    self.update_q_table(tau, S, A, R, T)

                if tau == T - 1:
                    while len(S) > 1:
                        tau = tau + 1
                        self.update_q_table(tau, S, A, R, T)
                    done_update = True
                    break




In [20]:
if __name__ == '__main__':
    random.seed(0)
    print('Starting...', flush=True)

    expected_reward = 3390
    # Longest time (seconds) to wait for a finished mission to be reported as
    # stopped before trying to start the next one.
    mission_end_timeout = 10

    # Only needed by the startMission overload that takes an explicit client
    # pool (see the commented-out call in the retry loop below).
    my_client_pool = MalmoPython.ClientPool()
    my_client_pool.add(MalmoPython.ClientInfo("127.0.0.1", 10000))

    agent_host = MalmoPython.AgentHost()
    try:
        agent_host.parse(sys.argv)
    except RuntimeError as e:
        print('ERROR:', e)
        print(agent_host.getUsage())
        exit(1)
    if agent_host.receivedArgument("help"):
        print(agent_host.getUsage())
        exit(0)

    # One world is picked per run; every episode replays the same mission XML.
    world_num = random.randint(0, 25)
    mission_file = 'xmls/world_{world_num}.txt'.format(world_num = world_num)
    with open(mission_file, 'r') as f:
        print("Loading mission from %s" % mission_file)
        missionXML = f.read()
        my_mission = MalmoPython.MissionSpec(missionXML, True)

    num_reps = 30000
    n=10
    racer = Racer(n=n)
    print("n=",n)
    racer.clear_actions()
    for iRepeat in range(num_reps):
        # A fresh mission spec and record are built for every episode.
        my_mission = MalmoPython.MissionSpec(missionXML, True)
        my_mission_record = MalmoPython.MissionRecordSpec()  # Records nothing by default
        my_mission.requestVideo(800, 500)
        my_mission.setViewpoint(0)
        max_retries = 3
        for retry in range(max_retries):
            try:
                # Attempt to start the mission:
#                 agent_host.startMission(my_mission, my_client_pool, my_mission_record, 0, "Racer")
                agent_host.startMission(my_mission, my_mission_record)
                break
            except RuntimeError as e:
                if retry == max_retries - 1:
                    print("Error starting mission", e)
                    print("Is the game running?")
                    exit(1)
                else:
                    time.sleep(2)

        world_state = agent_host.getWorldState()
        while not world_state.has_mission_begun:
            time.sleep(0.1)
            world_state = agent_host.getWorldState()

        # Every fifth episode replays the greedy policy learned so far.
        if (iRepeat + 1) % 5 == 0:
            print((iRepeat+1), 'Showing best policy:', end = " ")
            found_solution = racer.best_policy(agent_host)
            if found_solution:
                print('Found solution')
                print('Done')
                break
        else:
            print((iRepeat+1), 'Learning Q-Table:', end = " ")
            racer.run(agent_host)

        racer.clear_actions()

        # Give the finished mission time to stop. Starting the next one while
        # it is still running fails with "A mission is already running". The
        # wait is bounded so a mission that never ends cannot hang the loop.
        deadline = time.time() + mission_end_timeout
        world_state = agent_host.getWorldState()
        while world_state.is_mission_running and time.time() < deadline:
            time.sleep(0.1)
            world_state = agent_host.getWorldState()

        time.sleep(1)

Starting...
ERROR: Caught std::exception: unrecognised option '-f'

Malmo version: 0.36.0

Allowed options:
  -h [ --help ]         show description of allowed options
  --test                run this as an integration test


Loading mission from xmls/world_12.txt
n= 10
1 Learning Q-Table: action:  jump 1
jump 1, LOCATION:  (1.0, 3.001335979112148, -3.959072924676537)
current_state passed in eval: (1.0, 3.001335979112148, -3.959072924676537)
LOCATION:  (1.0, 3.001335979112148, -3.959072924676537)
current location in eval: (1.0, 3.001335979112148, -3.959072924676537)
block: log
block location: (-7, -1, -3)
block: log
block location: (-6, -1, -3)
block: log
block location: (-5, -1, -3)
block: log
block location: (-4, -1, -3)
block: log
block location: (-3, -1, -3)
block: log
block location: (-2, -1, -3)
block: log
block location: (-1, -1, -3)
block: log
block location: (0, -1, -3)
block: log
block location: (1, -1, -3)
block: log
block location: (2, -1, -3)
block: log
block location: (3,

strafe -1, LOCATION:  (2.2363692930333006, 2.0, -1.648967799527985)
current_state passed in eval: (2.2363692930333006, 2.0, -1.648967799527985)
LOCATION:  (2.2363692930333006, 2.0, -1.648967799527985)
current location in eval: (2.2363692930333006, 2.0, -1.648967799527985)
block: log
block location: (-8, -1, -5)
block: log
block location: (-7, -1, -5)
block: log
block location: (-6, -1, -5)
block: log
block location: (-5, -1, -5)
block: log
block location: (-4, -1, -5)
block: log
block location: (-3, -1, -5)
block: log
block location: (-2, -1, -5)
block: log
block location: (-1, -1, -5)
block: log
block location: (0, -1, -5)
block: log
block location: (1, -1, -5)
block: log
block location: (2, -1, -5)
block: log
block location: (3, -1, -5)
block: log
block location: (4, -1, -5)
block: log
block location: (-8, -1, -4)
block: log
block location: (-7, -1, -4)
block: log
block location: (-6, -1, -4)
block: log
block location: (-5, -1, -4)
block: log
block location: (-4, -1, -4)
block: log
b

Error starting mission A mission is already running.
Is the game running?
5 Showing best policy: jump 1, LOCATION:  (3.467105930892764, 3.001335979112148, 0.3422673051270336)
current_state passed in eval: (3.467105930892764, 3.001335979112148, 0.3422673051270336)
LOCATION:  (3.467105930892764, 3.001335979112148, 0.3422673051270336)
current location in eval: (3.467105930892764, 3.001335979112148, 0.3422673051270336)
block: log
block location: (-9, -1, -7)
block: log
block location: (-8, -1, -7)
block: log
block location: (-7, -1, -7)
block: log
block location: (-6, -1, -7)
block: log
block location: (-5, -1, -7)
block: log
block location: (-4, -1, -7)
block: log
block location: (-3, -1, -7)
block: log
block location: (-2, -1, -7)
block: log
block location: (-1, -1, -7)
block: log
block location: (0, -1, -7)
block: log
block location: (1, -1, -7)
block: log
block location: (2, -1, -7)
block: log
block location: (3, -1, -7)
block: log
block location: (-9, -1, -6)
block: log
block location

jump 1, LOCATION:  (-0.12658641447017902, 3.166109260938214, -2.742908190132354)
current_state passed in eval: (-0.12658641447017902, 3.166109260938214, -2.742908190132354)
LOCATION:  (-0.12658641447017902, 3.166109260938214, -2.742908190132354)
current location in eval: (-0.12658641447017902, 3.166109260938214, -2.742908190132354)
block: log
block location: (-5, -1, -4)
block: log
block location: (-4, -1, -4)
block: log
block location: (-3, -1, -4)
block: log
block location: (-2, -1, -4)
block: log
block location: (-1, -1, -4)
block: log
block location: (0, -1, -4)
block: log
block location: (1, -1, -4)
block: log
block location: (2, -1, -4)
block: log
block location: (3, -1, -4)
block: log
block location: (4, -1, -4)
block: log
block location: (5, -1, -4)
block: log
block location: (6, -1, -4)
block: log
block location: (7, -1, -4)
block: log
block location: (-5, -1, -3)
block: log
block location: (7, -1, -3)
block: log
block location: (-5, -1, -2)
block: log
block location: (7, -1, 

strafe 1, LOCATION:  (0.6152261620113807, 2.0, 0.7611552597695194)
current_state passed in eval: (0.6152261620113807, 2.0, 0.7611552597695194)
LOCATION:  (0.6152261620113807, 2.0, 0.7611552597695194)
current location in eval: (0.6152261620113807, 2.0, 0.7611552597695194)
block: log
block location: (-6, -1, -7)
block: log
block location: (-5, -1, -7)
block: log
block location: (-4, -1, -7)
block: log
block location: (-3, -1, -7)
block: log
block location: (-2, -1, -7)
block: log
block location: (-1, -1, -7)
block: log
block location: (0, -1, -7)
block: log
block location: (1, -1, -7)
block: log
block location: (2, -1, -7)
block: log
block location: (3, -1, -7)
block: log
block location: (4, -1, -7)
block: log
block location: (5, -1, -7)
block: log
block location: (6, -1, -7)
block: log
block location: (-6, -1, -6)
block: log
block location: (-5, -1, -6)
block: log
block location: (-4, -1, -6)
block: log
block location: (-3, -1, -6)
block: log
block location: (-2, -1, -6)
block: log
bloc

KeyboardInterrupt: 