In [2]:
import os, sys
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from src.game_utils import *
from src.training import *
from src.models import *

In [None]:
# Tabular Q-learning baseline.
# Environment used for training (rendering switched off).
game = Game(living_penalty=-0.04, render=False)

# Hyper-parameters: learning rate, discount factor, number of training episodes.
lr, y, num_episodes = .8, .95, 2000

# Train the table; the helper returns the table plus two per-episode lists
# (jList / rList -- presumably step counts and total rewards; confirm in src.training).
Q, jList, rList = get_trained_q_table(game, lr, y, num_episodes)

# Report the mean of rList over all episodes.
print("\nScore over time: " + str(sum(rList)/num_episodes))
print("\nFinal Q-Table Policy:\n")
# TODO: policy printing is currently disabled.
#print_policy(state_space)

# Plot rList against the episode index.
plt.plot(rList)
plt.show()

In [None]:
# Train a single Q-network on the game and report/plot the per-episode results.

# Make use of cuda
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Init Game Instance
game = Game(living_penalty=-0.07, render=False)

# init Q-Network (moved to the selected device)
Qnet = QNetwork(game.state_space, game.action_space).to(device)

# Set learning parameters
e = 0.1  # epsilon
lr = .03  # learning rate
y = .999  # discount factor
num_episodes = 2000

# Returns the trained network plus two per-episode lists
# (jList / rList -- presumably step counts and total rewards; confirm in src.training).
Qnet, jList, rList = trained_q_net(game, Qnet, e, lr, y, num_episodes)

# "\A" is not a valid escape sequence (it printed a literal backslash and warns on
# recent Python versions); a newline was intended, matching the prints below.
print("\nAverage steps per episode: " + str(sum(jList) / num_episodes))
print("\nScore over time: " + str(sum(rList) / num_episodes))
print("\nFinal Q-Network Policy:\n")
# TODO: policy printing is currently disabled.
#print_policy()
plt.plot(rList)
plt.savefig("j_q_network.png")
plt.show()

In [None]:
# Define ensemble net
class EnsembleQNetwork(nn.Module):
    """Q-network whose input is the one-step state distribution of every agent.

    For a given state each agent contributes one vector of length
    ``state_space`` (see ``get_agent_trajectory``). The vectors are stacked as
    input channels and passed through a convolution followed by two linear
    layers that produce one value per action.

    Args:
        agents: sequence of objects exposing ``get_q_values(state)``.
        state_space: number of discrete states.
        action_space: number of discrete actions.
        trajectory_depth: height of the trajectory "image" seen by the
            convolution. NOTE(review): ``get_agent_trajectory`` currently
            produces a single row, so only ``trajectory_depth=1`` matches the
            input size of ``fc1``.
        game_factory: optional zero-argument callable returning a fresh
            environment used for the one-step look-ahead. When ``None`` the
            original hard-coded ``Game(living_penalty=-0.04, render=False)``
            is used; pass a factory when training on a differently
            configured game.
    """

    def __init__(self, agents, state_space, action_space, trajectory_depth, game_factory=None):
        super(EnsembleQNetwork, self).__init__()

        self.agents = agents
        self.state_space = state_space
        # Stored on the instance so the methods do not depend on notebook globals.
        self.action_space = action_space
        self.hidden_size = state_space
        self.game_factory = game_factory

        # Image = (trajectory_depth, state_space), Num channels = num_agents
        self.conv1 = nn.Conv2d(
            len(agents), 1, kernel_size=(trajectory_depth, state_space), padding='same'
        )

        # NOTE(review): there is no non-linearity between fc1 and fc2.
        self.fc1 = nn.Linear(
            in_features=self.state_space * trajectory_depth, out_features=self.hidden_size
        )
        self.fc2 = nn.Linear(in_features=self.hidden_size, out_features=action_space)

    def _new_game(self):
        """Return a fresh environment for simulating a single action."""
        if self.game_factory is not None:
            return self.game_factory()
        # TODO - the default does not mirror the parameters of the training game.
        return Game(living_penalty=-0.04, render=False)

    def get_agent_trajectory(self, agent, game, state, n, max_steps):
        """Generate an agent's plan: the distribution over next states.

        The agent's Q-values for ``state`` are normalised by their sum (a
        uniform distribution is used when the sum is zero). Every action is
        then simulated once from ``state`` and its probability is added to the
        state it lands in.

        Args:
            agent: object exposing ``get_q_values(state)``; assumed to return
                an array-like of length ``action_space`` -- TODO confirm.
            game: unused; a fresh environment is created for each simulation.
            state: state id to plan from.
            n: unused (reserved for the recursion TODO below).
            max_steps: unused (reserved for the recursion TODO below).

        Returns:
            ``numpy`` array of shape ``(1, state_space)``.
        """
        actions_qs = np.asarray(agent.get_q_values(state), dtype=float)
        total = np.sum(actions_qs)

        # Check for a zero sum BEFORE dividing so no NaNs / warnings are produced,
        # and spread the mass over the real number of actions.
        # NOTE(review): sum-normalisation is not a softmax and is only a valid
        # distribution for non-negative Q-values.
        if total == 0:
            actions_probs = np.full(self.action_space, 1.0 / self.action_space)
        else:
            actions_probs = actions_qs / total

        # Set probabilities for new state
        state_prob = np.zeros(self.state_space)

        for action in range(self.action_space):
            # Simulate the action on a fresh environment placed in `state`.
            sim = self._new_game()
            sim.set_state(state)

            # Apply action (only the landing state is needed here)
            _, new_state, _ = sim.perform_action(action)

            # Update state values
            state_prob[new_state] += actions_probs[action]

        # For each non-zero landing spot, recurse - TODO
        # ...

        return np.stack([state_prob])

    def get_multi_agents_trajectories(self, game, state, max_steps):
        """Generate the plans of all agents, stacked to ``(num_agents, 1, state_space)``."""
        return np.stack([
            self.get_agent_trajectory(a, game=game, state=state, n=0, max_steps=max_steps)
            for a in self.agents
        ])

    def forward(self, state, game, trajectory_depth=1, method="ensemble"):
        """Forward pass.

        Args:
            state: state id.
            game: forwarded to the trajectory helpers (currently unused there).
            trajectory_depth: unused.
            method: ``"ensemble"`` runs the network on the agents' plans; any
                other value uses the simple aggregation described below.

        Returns:
            Tensor of shape ``(1, action_space)``. For ``"ensemble"`` these are
            the sigmoid-squashed network outputs. Otherwise it is a one-hot
            vector marking the greedy action of the agent whose greedy action
            yields the highest immediate reward.
        """
        if method == "ensemble":
            # Input as state id
            x = self.get_multi_agents_trajectories(
                game, state, max_steps=1
            )
            x = torch.unsqueeze(torch.tensor(x).float(), dim=0)
            # Keep the input on the same device as the parameters.
            x = x.to(self.conv1.weight.device)

            # Input = (batch, num agents, trajectory_depth, state_space)
            x = self.conv1(x)
            x = x.flatten(1)
            x = self.fc1(x)
            x = self.fc2(x)

            return torch.sigmoid(x)

        # Simple aggregation: try each agent's greedy action and keep the best one.
        best_action = 0
        best_reward = -np.inf  # so that arbitrarily negative rewards still register
        for a in self.agents:
            sim = self._new_game()
            sim.set_state(state)
            greedy_action = int(np.argmax(a.get_q_values(state)))
            reward, _, _ = sim.perform_action(greedy_action)
            if best_reward < reward:
                best_reward = reward
                best_action = greedy_action

        out_tensor = torch.zeros([1, self.action_space])
        out_tensor[0][best_action] = 1
        return out_tensor

    def one_hot_encoding(self, x):
        '''
        One-hot encodes the input data, based on the instance's state_space.

        Returns a float tensor of shape (1, state_space) with a 1 at index x.
        '''
        out_tensor = torch.zeros([1, self.state_space])
        out_tensor[0][x] = 1
        return out_tensor

In [None]:
# Two hand-written Q-tables (64 states x 4 actions) standing in for trained agents;
# each is intended to be good on a different part of the game.

# Table A: action 2 is preferred in states 0-7 and 16-22,
# action 1 in states 15, 23, 31, ..., 63 (every 8th state starting at 15).
Q_dummy_a = np.zeros(shape=(64, 4))
Q_dummy_a[np.r_[0:8, 16:23], 2] = 1
Q_dummy_a[15::8, 1] = 1

# Table B: action 1 is preferred in the listed states.
# NOTE(review): 40 is absent from this otherwise evenly spaced list -- confirm intentional.
Q_dummy_b = np.zeros_like(Q_dummy_a)
Q_dummy_b[np.array([0, 8, 16, 24, 32, 48, 56]), 1] = 1

In [None]:
# Train EnsembleQNetwork with one-step Q-learning on top of the two dummy table agents.

# Make use of cuda
# NOTE(review): `device` is defined but the ensemble below is never moved to it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Init Game Instance
game = Game(living_penalty=-0.07, map_size="simple", render=False)

# Define State and Action Space
state_space = game.max_row * game.max_col
action_space = len(actions)

# Set learning parameters
e = 0.1  # epsilon
lr = .03  # learning rate -- NOTE(review): only referenced by the commented-out SGD line below
y = .999  # discount factor
num_episodes = 2000

# create lists to contain total rewards and steps per episode
jList = []
rList = []

# init the table agents from the hand-written Q-tables
a1 = Agent(agent_type="table", state_space=64, action_space=4)
a1.agent = Q_dummy_a

a2 = Agent(agent_type="table", state_space=64, action_space=4)
a2.agent = Q_dummy_b

# The ensemble must combine BOTH agents (it was previously built from a1 twice,
# leaving a2 unused).
agent = EnsembleQNetwork(
    [a1, a2], state_space=64, action_space=4, trajectory_depth=1
)

# define optimizer and loss
# optimizer = optim.SGD(agent.parameters(), lr=lr)
optimizer = optim.Adam(params=agent.parameters())
criterion = nn.SmoothL1Loss()

for i in range(num_episodes):
    # Reset environment and get first new observation
    s = game.reset()
    rAll = 0
    j = 0

    # The Q-Network learning algorithm (at most 99 steps per episode)
    while j < 99:
        j += 1

        # One forward pass for the current state; it is reused for both the
        # greedy action choice and the loss, so gradients are kept.
        q_values = agent(s, game)
        a = q_values.detach().max(1)[1].view(1, 1)  # max(1)[1] returns index of highest value

        # e greedy exploration: sample uniformly over ALL actions
        # (randint's upper bound is exclusive; a lower bound of 1 never explored action 0).
        if np.random.rand() < e:
            a[0][0] = np.random.randint(0, q_values.shape[1])

        # Get new state and reward from environment:
        # perform action to get reward r, next state s1 and game_over flag
        r, s1, game_over = game.perform_action(game.actions[a.item()])

        # Q-value of the action that was actually taken (not the max, which is a
        # different entry whenever the exploratory action was chosen).
        q = q_values.gather(1, a)

        with torch.no_grad():
            if game_over:
                # No bootstrapping from a terminal state: the target is just r.
                target_q = torch.full_like(q, float(r))
            else:
                # Target Q-value for the action: r + y max_a' Q(s', a')
                target_q = r + y * agent(s1, game).max(1)[0].view(1, 1)

        # Calculate loss
        loss = criterion(q, target_q)
        if j == 1 and i % 100 == 0:
            print("loss and reward: ", i, loss.item(), r)

        # Optimize the model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Add reward to list
        rAll += r

        # Replace old state with new
        s = s1

        if game_over:
            # Reduce chance of random action as we train the model.
            e = 1. / ((i / 50) + 10)
            break
    rList.append(rAll)
    jList.append(j)

# "\A" is not a valid escape sequence; a newline was intended.
print("\nAverage steps per episode: " + str(sum(jList) / num_episodes))
print("\nScore over time: " + str(sum(rList) / num_episodes))
print("\nFinal Q-Network Policy:\n")

plt.plot(jList, label="steps per episode")
plt.plot(rList, label="total reward per episode")
plt.xlabel("episode")
plt.legend()
# Own file name so the figure of the plain Q-network run is not overwritten.
plt.savefig("j_ensemble_q_network.png")
plt.show()

In [None]:
# Define ensemble net
class EnsembleQNetwork2(nn.Module):
    """Gating network that selects which agent's Q-values to use for a state.

    For a given state each agent contributes one vector of length
    ``state_space`` (see ``get_agent_trajectory``). The vectors are stacked as
    input channels and passed through a convolution followed by two linear
    layers that produce one score per agent; the Q-values of the
    highest-scoring agent are returned.

    Args:
        agents: sequence of objects exposing ``get_q_values(state)``.
        state_space: number of discrete states.
        action_space: number of discrete actions.
        trajectory_depth: height of the trajectory "image" seen by the
            convolution. NOTE(review): ``get_agent_trajectory`` currently
            produces a single row, so only ``trajectory_depth=1`` matches the
            input size of ``fc1``.
        game_factory: optional zero-argument callable returning a fresh
            environment used for the one-step look-ahead. When ``None`` the
            original hard-coded ``Game(living_penalty=-0.04, render=False)``
            is used; pass a factory when training on a differently
            configured game.
    """

    def __init__(self, agents, state_space, action_space, trajectory_depth, game_factory=None):
        super(EnsembleQNetwork2, self).__init__()

        self.agents = agents
        self.state_space = state_space
        # Stored on the instance so the methods do not depend on notebook globals.
        self.action_space = action_space
        self.hidden_size = state_space
        self.game_factory = game_factory

        # Image = (trajectory_depth, state_space), Num channels = num_agents
        self.conv1 = nn.Conv2d(
            len(agents), 1, kernel_size=(trajectory_depth, state_space), padding='same'
        )

        # NOTE(review): there is no non-linearity between fc1 and fc2.
        self.fc1 = nn.Linear(
            in_features=self.state_space * trajectory_depth, out_features=self.hidden_size
        )
        # One output score per agent (not per action).
        self.fc2 = nn.Linear(in_features=self.hidden_size, out_features=len(self.agents))

    def _new_game(self):
        """Return a fresh environment for simulating a single action."""
        if self.game_factory is not None:
            return self.game_factory()
        # TODO - the default does not mirror the parameters of the training game.
        return Game(living_penalty=-0.04, render=False)

    def get_agent_trajectory(self, agent, game, state, n, max_steps):
        """Generate representation of agent trajectories (possible future paths).

        The agent's Q-values for ``state`` are normalised by their sum (a
        uniform distribution is used when the sum is zero). Every action is
        then simulated once from ``state`` and its probability is added to the
        state it lands in.

        Args:
            agent: object exposing ``get_q_values(state)``; assumed to return
                an array-like of length ``action_space`` -- TODO confirm.
            game: unused; a fresh environment is created for each simulation.
            state: state id to plan from.
            n: unused (reserved for the recursion TODO below).
            max_steps: unused (reserved for the recursion TODO below).

        Returns:
            ``numpy`` array of shape ``(1, state_space)``.
        """
        actions_qs = np.asarray(agent.get_q_values(state), dtype=float)
        total = np.sum(actions_qs)

        # Check for a zero sum BEFORE dividing so no NaNs / warnings are produced,
        # and spread the mass over the real number of actions.
        # NOTE(review): sum-normalisation is not a softmax and is only a valid
        # distribution for non-negative Q-values.
        if total == 0:
            actions_probs = np.full(self.action_space, 1.0 / self.action_space)
        else:
            actions_probs = actions_qs / total

        # Set probabilities for new state
        state_prob = np.zeros(self.state_space)

        for action in range(self.action_space):
            # Simulate the action on a fresh environment placed in `state`.
            sim = self._new_game()
            sim.set_state(state)

            # Apply action (only the landing state is needed here)
            _, new_state, _ = sim.perform_action(action)

            # Update state values
            state_prob[new_state] += actions_probs[action]

        # For each non-zero landing spot, recurse - TODO
        # ...

        return np.stack([state_prob])

    def get_multi_agents_trajectories(self, game, state, max_steps):
        """ Generate multiple agent plans, stacked to (num_agents, 1, state_space). """
        return np.stack([
            self.get_agent_trajectory(a, game=game, state=state, n=0, max_steps=max_steps)
            for a in self.agents
        ])

    def forward(self, state, game, trajectory_depth=1, method="ensemble"):
        """ Model inference.

        Args:
            state: state id.
            game: forwarded to the trajectory helpers (currently unused there).
            trajectory_depth: unused.
            method: ``"ensemble"`` lets the network pick an agent; any other
                value uses the simple aggregation described below.

        Returns:
            Tensor of shape ``(1, action_space)``. For ``"ensemble"`` it holds
            the Q-values of the selected agent (float32). Otherwise it is a
            one-hot vector marking the greedy action of the agent whose greedy
            action yields the highest immediate reward.
        """
        if method == "ensemble":
            # Input as state id
            x = self.get_multi_agents_trajectories(
                game, state, max_steps=1
            )
            x = torch.unsqueeze(torch.tensor(x).float(), dim=0)
            # Keep the input on the same device as the parameters.
            x = x.to(self.conv1.weight.device)

            # Input = (batch, num agents, trajectory_depth, state_space)
            x = self.conv1(x)
            x = x.flatten(1)
            x = self.fc1(x)
            x = self.fc2(x)  # (1, num_agents) selection scores

            selected_agent_i = torch.argmax(torch.sigmoid(x)).item()

            # Returning a freshly built tensor of the selected Q-values would cut
            # the autograd graph, so a loss computed from it could not be
            # back-propagated. Use a straight-through gate instead: its forward
            # value is the hard one-hot selection, while gradients flow through
            # the softmax of the scores.
            soft = torch.softmax(x, dim=1)
            hard = torch.zeros_like(soft)
            hard[0, selected_agent_i] = 1.0
            gate = hard + soft - soft.detach()

            # (num_agents, action_space) table of every agent's Q-values for `state`.
            agent_qs = torch.tensor(
                np.stack([
                    np.asarray(a.get_q_values(state), dtype=np.float32)
                    for a in self.agents
                ]),
                device=x.device,
            )
            return gate @ agent_qs

        # Use simple max action aggregation method:
        # try each agent's greedy action and keep the best one.
        best_action = 0
        best_reward = -np.inf  # so that arbitrarily negative rewards still register
        for a in self.agents:
            sim = self._new_game()
            sim.set_state(state)
            greedy_action = int(np.argmax(a.get_q_values(state)))
            reward, _, _ = sim.perform_action(greedy_action)
            if best_reward < reward:
                best_reward = reward
                best_action = greedy_action

        out_tensor = torch.zeros([1, self.action_space])
        out_tensor[0][best_action] = 1
        return out_tensor

    def one_hot_encoding(self, x):
        """ One-hot encodes the input data, based on the instance's state_space. """
        out_tensor = torch.zeros([1, self.state_space])
        out_tensor[0][x] = 1
        return out_tensor

# Train EnsembleQNetwork2 with one-step Q-learning on top of the two dummy table agents.

# Make use of cuda
# NOTE(review): `device` is defined but the ensemble below is never moved to it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Init Game Instance
game = Game(living_penalty=-0.07, map_size="simple", render=False)

# Define State and Action Space
state_space = game.max_row * game.max_col
action_space = len(actions)

# Set learning parameters
e = 0.1  # epsilon
lr = .03  # learning rate -- NOTE(review): only referenced by the commented-out SGD line below
y = .999  # discount factor
num_episodes = 2000

# create lists to contain total rewards and steps per episode
jList = []
rList = []

# init the table agents from the hand-written Q-tables
a1 = Agent(agent_type="table", state_space=64, action_space=4)
a1.agent = Q_dummy_a

a2 = Agent(agent_type="table", state_space=64, action_space=4)
a2.agent = Q_dummy_b

# The ensemble must combine BOTH agents (it was previously built from a1 twice,
# leaving a2 unused).
agent = EnsembleQNetwork2(
    [a1, a2], state_space=64, action_space=4, trajectory_depth=1
)

# define optimizer and loss
# optimizer = optim.SGD(agent.parameters(), lr=lr)
optimizer = optim.Adam(params=agent.parameters())
criterion = nn.SmoothL1Loss()

for i in range(num_episodes):
    # Reset environment and get first new observation
    s = game.reset()
    rAll = 0
    j = 0

    # The Q-Network learning algorithm (at most 99 steps per episode)
    while j < 99:
        j += 1

        # One forward pass for the current state; it is reused for both the
        # greedy action choice and the loss, so gradients are kept.
        q_values = agent(s, game)
        a = q_values.detach().max(1)[1].view(1, 1)  # max(1)[1] returns index of highest value

        # e greedy exploration: sample uniformly over ALL actions
        # (randint's upper bound is exclusive; a lower bound of 1 never explored action 0).
        if np.random.rand() < e:
            a[0][0] = np.random.randint(0, q_values.shape[1])

        # Get new state and reward from environment:
        # perform action to get reward r, next state s1 and game_over flag
        r, s1, game_over = game.perform_action(game.actions[a.item()])

        # Q-value of the action that was actually taken (not the max, which is a
        # different entry whenever the exploratory action was chosen).
        q = q_values.gather(1, a)

        with torch.no_grad():
            if game_over:
                # No bootstrapping from a terminal state: the target is just r.
                target_q = torch.full_like(q, float(r))
            else:
                # Target Q-value for the action: r + y max_a' Q(s', a')
                target_q = r + y * agent(s1, game).max(1)[0].view(1, 1)

        # Calculate loss
        loss = criterion(q, target_q)
        if j == 1 and i % 100 == 0:
            print("loss and reward: ", i, loss.item(), r)

        # Optimize the model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Add reward to list
        rAll += r

        # Replace old state with new
        s = s1

        if game_over:
            # Reduce chance of random action as we train the model.
            e = 1. / ((i / 50) + 10)
            break
    rList.append(rAll)
    jList.append(j)

# "\A" is not a valid escape sequence; a newline was intended.
print("\nAverage steps per episode: " + str(sum(jList) / num_episodes))
print("\nScore over time: " + str(sum(rList) / num_episodes))
print("\nFinal Q-Network Policy:\n")

plt.plot(jList, label="steps per episode")
plt.plot(rList, label="total reward per episode")
plt.xlabel("episode")
plt.legend()
# Own file name so the figures of the earlier runs are not overwritten.
plt.savefig("j_ensemble_q_network2.png")
plt.show()