In [1]:
import retro
import torch
import numpy as np
import sys
import datetime
from pathlib import Path
from gym.wrappers import FrameStack, GrayScaleObservation, TransformObservation
from wrappers import ResizeObservation, SkipFrame, Discretizer
from metrics import MetricLogger

In [2]:
import torch
import random, numpy as np

In [3]:
from torch import nn
import copy

class NeuralNet(nn.Module):
    """mini cnn structure
    input -> (conv2d + relu) x 3 -> flatten -> (dense + relu) x 2 -> output
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        c, h, w = input_dim

        if h != 84:
            raise ValueError(f"Expecting input height: 84, got: {h}")
        if w != 84:
            raise ValueError(f"Expecting input width: 84, got: {w}")

        self.online = nn.Sequential(
            nn.Conv2d(in_channels=c, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(3136, 512),
            nn.ReLU(),
            nn.Linear(512, output_dim),
        )

        self.target = copy.deepcopy(self.online)

        # Q_target parameters are frozen.
        for p in self.target.parameters():
            p.requires_grad = False

    def forward(self, input, model):
        if model == "online":
            return self.online(input)
        elif model == "target":
            return self.target(input)

In [4]:

from collections import deque

class Hank:
    def __init__(self, state_dim, action_dim, save_dir, checkpoint=None):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.save_dir = save_dir
        self.memory = deque(maxlen=10000)
        self.batch_size = 32
        self.gamma = 0.9

        self.loss_fn = torch.nn.SmoothL1Loss()
        self.burnin = 1e4  # min. experiences before training
        self.learn_every = 3  # no. of experiences between updates to Q_online
        self.sync_every = 1e4  # no. of experiences between Q_target & Q_online sync

        self.use_cuda = torch.cuda.is_available()

        # Hank's DNN to predict the most optimal action - we implement this in the Learn section
        self.net = NeuralNet(self.state_dim, self.action_dim).double()
        if self.use_cuda:
            self.net = self.net.to(device="cuda")
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=0.00025)
        self.exploration_rate = 1
        self.exploration_rate_decay = 0.99999975
        #self.exploration_rate_decay = 0.999999
        self.exploration_rate_min = 0.1
        self.curr_step = 0

        self.save_every = 5e4  # no. of experiences between saving Hank Net

        if checkpoint:
            try:
                self.load(checkpoint)
            except:
                print(f"{checkpoint} not found! Initializing Hank anyway...")

    def act(self, state):
        """
        Given a state, choose an epsilon-greedy action and update value of step.

        Inputs:
        state(LazyFrame): A single observation of the current state, dimension is (state_dim)
        Outputs:
        action_idx (int): An integer representing which action Hank will perform
        """
        # EXPLORE
        if np.random.rand() < self.exploration_rate:
            action_idx = np.random.randint(self.action_dim)

        # EXPLOIT
        else:
            state = state.__array__()
            if self.use_cuda:
                state = torch.tensor(state).cuda()
            else:
                state = torch.tensor(state)
            state = state.unsqueeze(0)
            action_values = self.net(state, model="online")
            action_idx = torch.argmax(action_values, axis=1).item()

        # decrease exploration_rate
        self.exploration_rate *= self.exploration_rate_decay
        self.exploration_rate = max(self.exploration_rate_min, self.exploration_rate)

        # increment step
        self.curr_step += 1
        return action_idx

    def cache(self, state, next_state, action, reward, done):
        """
        Store the experience to self.memory (replay buffer)

        Inputs:
        state (LazyFrame),
        next_state (LazyFrame),
        action (int),
        reward (float),
        done(bool))
        """
        state = state.__array__()
        next_state = next_state.__array__()

        if self.use_cuda:
            state = torch.tensor(state).cuda()
            next_state = torch.tensor(next_state).cuda()
            action = torch.tensor([action]).cuda()
            reward = torch.tensor([reward]).cuda()
            done = torch.tensor([done]).cuda()
        else:
            state = torch.tensor(state)
            next_state = torch.tensor(next_state)
            action = torch.tensor([action])
            reward = torch.tensor([reward])
            done = torch.tensor([done])

        self.memory.append((state, next_state, action, reward, done,))

    def recall(self):
        """
        Retrieve a batch of experiences from memory
        """
        batch = random.sample(self.memory, self.batch_size)
        state, next_state, action, reward, done = map(torch.stack, zip(*batch))
        return state, next_state, action.squeeze(), reward.squeeze(), done.squeeze()

    def update_Q_online(self, td_estimate, td_target):
        loss = self.loss_fn(td_estimate, td_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def sync_Q_target(self):
        self.net.target.load_state_dict(self.net.online.state_dict())

    def save(self):
        save_path = (
                self.save_dir / f"Hank_net_{int(self.curr_step // self.save_every)}.chkpt"
        )
        torch.save(
            dict(model=self.net.state_dict(), exploration_rate=self.exploration_rate),
            save_path,
        )
        print(f"HankNet saved to {save_path} at step {self.curr_step}")

    def load(self, load_path):
        if not load_path.exists():
            raise ValueError(f"{load_path} does not exist")

        ckp = torch.load(load_path, map_location=('cuda' if self.use_cuda else 'cpu'))
        exploration_rate = ckp.get('exploration_rate')
        state_dict = ckp.get('model')

        print(f"Loading model at {load_path} with exploration rate {exploration_rate}")
        self.net.load_state_dict(state_dict)
        self.exploration_rate = exploration_rate

    def learn(self):
        if self.curr_step % self.sync_every == 0:
            self.sync_Q_target()

        if self.curr_step % self.save_every == 0:
            self.save()

        if self.curr_step < self.burnin:
            return None, None

        if self.curr_step % self.learn_every != 0:
            return None, None

        # Sample from memory
        state, next_state, action, reward, done = self.recall()

        # Get TD Estimate
        td_est = self.td_estimate(state, action).double()

        # Get TD Target
        td_tgt = self.td_target(reward, next_state, done).double()

        # Backpropagate loss through Q_online
        loss = self.update_Q_online(td_est, td_tgt)

        return (td_est.mean().item(), loss)

    def td_estimate(self, state, action):
        current_Q = self.net(state, model="online")[
            np.arange(0, self.batch_size), action
        ]  # Q_online(s,a)
        return current_Q

    @torch.no_grad()
    def td_target(self, reward, next_state, done):
        next_state_Q = self.net(next_state, model="online")
        best_action = torch.argmax(next_state_Q, axis=1)
        next_Q = self.net(next_state, model="target")[
            np.arange(0, self.batch_size), best_action
        ]
        return (reward + (1 - done.float()) * self.gamma * next_Q).float()

In [5]:

LAWNMOWER_LOCATION = Path().parent.absolute()
retro.data.Integrations.add_custom_path(LAWNMOWER_LOCATION)

""" CHECK NVIDIA CUDA AVAILABILITY """

use_cuda = torch.cuda.is_available()
print(f"Using CUDA: {use_cuda}\n")

""" START ENVIRONMENT """



try:
    save_states = [f'lawn{x}.state' for x in range(10, 0, -1)]
    env = retro.make(game='lawnmower',
                     state=save_states.pop(), # pops off lawn1.state
                     inttype=retro.data.Integrations.ALL)
except FileNotFoundError:
    print(f"ERROR: lawnmower integration directory not found in the following location: {LAWNMOWER_LOCATION}")
    sys.exit()

""" OBSERVATION WRAPPERS """

action_space = [
    ['LEFT', 'B'],
    ['RIGHT', 'B'],
    ['DOWN', 'B'],
    ['UP', 'B']
]

env = Discretizer(env, combos=action_space)
env = ResizeObservation(env, shape=84)
env = GrayScaleObservation(env, keep_dim=False)
env = TransformObservation(env, f=lambda x: x / 255.)
env = FrameStack(env, num_stack=4)

""" CHECKPOINT SAVING """

save_dir = Path("../checkpoints") / datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
save_dir.mkdir(parents=True)
#checkpoint = Path('..\\checkpoints\\2021-11-27T18-33-07\\Hank_net_18.chkpt')
hank = Hank(state_dim=(4, 84, 84), action_dim=env.action_space.n, save_dir=save_dir)#


Using CUDA: True



In [6]:
env.action_space.n

4

In [7]:

init_state = env.reset()

init_state.shape

(4, 84, 84)

In [8]:
state = init_state

state = state.__array__()

state = torch.tensor(state).cuda()

tensor = state

In [9]:
from torch import nn
import copy

class nn0(nn.Module):
    def __init__(self, input_shape=(4,84,84), output_shape=4):
        super().__init__()
        c, h, w = input_shape

        if h != 84:
            raise ValueError(f"Expecting input height: 84, got: {h}")
        if w != 84:
            raise ValueError(f"Expecting input width: 84, got: {w}")

        # playing around with size in neural
        self.nn = nn.Sequential(
            nn.Conv2d(in_channels=c, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(3136, 512),
            # nn.ReLU(),
            # nn.Linear(512, 4),
        )

    def _get_conv_out(self, shape):
        o = self.conv(torch.zeros(1, *shape))
        return int(np.prod(o.size()))

    def forward(self, input):
        return self.nn(input)





In [10]:
nn0model = nn0().double().cuda()


In [12]:
tensor.shape

torch.Size([4, 84, 84])

In [13]:
out = nn0model(tensor)

out.shape

RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x49 and 3136x512)

In [None]:
net = NeuralNet(hank.state_dim, hank.action_dim).double().cuda()

print(hank.state_dim)
print(hank.action_dim)

out = net(tensor, model = "online")

out.shape

In [None]:
action = hank.act(init_state)
prev_action = action
action_state = init_state  # current state when action is performed
next_state, _, _, info = env.step(action)


In [None]:
ram = env.get_ram()

tensor = memory_to_tensor(ram)

In [None]:
init_state.shape

In [None]:
tensor.shape

# Neural Network

In [None]:
from torch import nn
import copy

class NeuralNet(nn.Module):
    """mini cnn structure
    input -> (conv2d + relu) x 3 -> flatten -> (dense + relu) x 2 -> output
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # playing around with size in neural
        self.online = nn.Sequential(
            nn.Conv3d(in_channels=4, out_channels=32, kernel_size=(3,4,2), stride=4),
            nn.ReLU(),
            nn.Conv3d(in_channels=32, out_channels=64, kernel_size=(2,2,2), stride=2),
            nn.ReLU(),
            nn.Conv3d(in_channels=64, out_channels=64, kernel_size=(1,2,1), stride=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(3, 16),
            nn.ReLU(),
            nn.Linear(16, output_dim),
        )

        self.target = copy.deepcopy(self.online)

        # Q_target parameters are frozen.
        for p in self.target.parameters():
            p.requires_grad = False

    def forward(self, input, model):
        if model == "online":
            return self.online(input)
        elif model == "target":
            return self.target(input)







# Old Training

In [None]:
for e in range(episodes):

    # State reset between runs
    init_state = env.reset()

    # For randomly selecting save states to work with
    # save_state_no = np.random.randint(1,4)
    # save_state_file = f'lawn{save_state_no}.state'
    # env.load_state(save_state_file, inttype=retro.data.Integrations.ALL)

    # Variables to keep track of for reward function
    frame_count = 0
    frame_since_act = 0
    frame_since_OOF = 0
    fuel_pickups = 0
    turns = 0
    propane_points = 0  # aka cumulative_reward

    reward = 0

    act = False
    learn = False
    delay_act = False
    game_start = False
    # new_best = False # not used

    # initial action
    action = hank.act(init_state)
    prev_action = action
    action_state = init_state  # current state when action is performed
    next_state, _, _, info = env.step(action)
    done = False
    prev_info = info
    frames_until_act = 3

    # Episode training
    while True:

        """ FRAME SENSITIVE CONDITIONS """

        frame_count += 1
        frame_since_act += 1
        frames_until_act -= 1
        # cur_fuel_pickup = 0
        fuel_rew = 0



        if not game_start and info["FUEL_TIME"] < 254: # FUEL_TIME changes randomly
            game_start = True
            prev_action = action
            action = hank.act(next_state)
            act = False




        # equals True if action blocked, False if possible
        act_5fr = prev_info["FRAME_COUNTER_5"] == 3

        if act and act_5fr:
            delay_act = True

        # Run agent on the state if action is possible
        if ((act and not act_5fr) or delay_act) and game_start:
            # Hank is about to act.  Learn from prior actions

            hank.cache(action_state, next_state, prev_action, reward, done)

            #input(f"Learning done based on next_state = current render.  Reward = {reward}  Press any key to continue.")


            #print(f"action = {prev_action}, reward = {reward}")

            # Learn
            q, loss = hank.learn()
            propane_points += reward

            ### UNCOMMENT IF YOU WANT TO SEE INPUT BY INPUT WHAT'S GOING ON
            #print(f"prev_action={prev_action}, reward={reward}")
            #input()

            # Logging
            logger.log_step(reward, loss, q)

            reward = 0

            # Perform new action
            prev_action = action


            ### UNCOMMENT IF YOU WANT TO SEE INPUT BY INPUT WHAT'S GOING ON

            action = hank.act(next_state)

            #print(f"prev_action={action}, reward={reward}")

            #print(info)

            #action = int(input())

            ### DEBUGGING STUFF
            if debug is True:
                print(frame_since_act)

                ram = env.get_ram()
                ram_tensor = memory_to_tensor(ram)
                print_grid(ram_tensor)

                dir = input("Mow which direction?")

                action = int(int(cardinal_input(dir)))
                #input("Action made based on this state. Press any key to continue")
                #print(f"next_action={action}")





            action_state = next_state  # current state when action is performed
            frame_since_act = 0

            act = False  # if acted, then acting should not occur on next frame
            delay_act = False



        if debug is True:
            print(f"player x: {info['PLAYER_X']}")
            print(f"act? {act}")
            print(f"act 5 fr? {act_5fr}")
            print(f"frames until act: {frames_until_act}")

        # Agent performs action
        next_state, _, _, info = env.step(action)

        ram = env.get_ram()
        info["PLAYER_X"] = ram[0x00EA]
        info["PLAYER_Y"] = ram[0x00E8] - 2

        # Render frame
        env.render()





        # by default, no action on next possible frame
        if (prev_info["PLAYER_X"] != info["PLAYER_X"] or
            prev_info["PLAYER_Y"] != info["PLAYER_Y"] or
                (frame_since_act > 6 and act == False)
        ):
            act = True
            frames_until_act = 3

        # Hacky way to handle OOF'ing
        if info["FUEL"] == 0:
            frame_since_OOF += 1


        """ REWARD FUNCTION INFORMATION """

        ### TODO: clean up reward section

        if prev_info is not None:
            if info["FUEL"] > prev_info["FUEL"]:
                fuel_pickups += 1
                # cur_fuel_pickup = 1
                #fuel_rew = 1 * 100 * (1 - 1 / (1 + np.exp(-frame_count / 600)))
                fuel_rew = 2000
                frame_since_OOF = 0
                #print(f"Frame: {frame_count}, reward: {fuel_rew}")
            if info["DIRECTION"] != prev_info["DIRECTION"]:
                turns += 1
            else:
                turns = 0
            if info["GRASS_LEFT"] < prev_info["GRASS_LEFT"]:
                #reward += 10
                pass
                #print("Reward Updated")

            # Penalize for OOF'ing
            if frame_since_OOF > 3:
                reward -= 3000

        # Penalizes for turning too much
        #reward -= (turns - 1) * turns / 1000

        # reward for fuel pickup
        reward += fuel_rew

        # Penalizes for taking too long
        #reward -= (frame_since_act + 1) / 100

        """ STATE UPDATES """



        # Update state
        # state = next_state  # irrelevant now?

        # Store previous info
        prev_info = info



        if debug is True:
            if e > 0:
                pass
                #print(f"Reward = {reward}")
                #print(f"Turns = {turns}")
                #print("~~~current")
                #print(info)
                #print("~~~previous")
                #print(prev_info)
                #print("~~~")
                #print("~~~")

        """ DONE CONDITIONS """



        # Check if OOF
        if frame_since_OOF > 3 or info["GRASS_LEFT"] < 1:
            done = True
            if info["GRASS_LEFT"] < 1:
                reward += 10000  # maybe remove this?

            # Learn from final actions
            hank.cache(action_state, next_state, prev_action, reward, done)

            # Learn
            q, loss = hank.learn()
            propane_points += reward

            # Logging
            logger.log_step(reward, loss, q)

            if propane_points < best_propane_points:
                print(f"Run {e} - Propane Points = {round(propane_points,1)}  ||  Top Propane Points = {round(best_propane_points,1)}")
            elif propane_points >= best_propane_points:
                best_propane_points = propane_points
                # new_best = True # not used
                print(f"Run {e} ~~~ NEW BEST!  Good job, Hank!  New Top Propane Points = {round(best_propane_points,1)}")
            break

    logger.log_episode()

    """ SAVING & CHANGING LAWNS"""

    if e % 10 == 0:
        hank.save()
        logger.record(episode=e, epsilon=hank.exploration_rate, step=hank.curr_step)
        if len(lawn1_clear_ep)>0:
            print(f"Lawn 1 cleared on episode {lawn1_clear_ep}")
        elif len(lawn1_clear_ep)>1:
            print(f"Lawn 1 cleared on episodes {lawn1_clear_ep}")

    if info["GRASS_LEFT"] < 1 and save_states:
        hank.save()
        lawn1_clear_ep.append(e)
        logger.record(episode=e, epsilon=hank.exploration_rate, step=hank.curr_step)
        env.load_state(save_states.pop(), inttype = retro.data.Integrations.ALL)
    elif not save_states:
        sys.exit("HANK, YOU DID IT! YOU RAN THE GAUNTLET! LAWN 1-10 COMPLETE.")