In [7]:
import numpy as np
import laserhockey.laser_hockey_env as lh
import gymnasium as gym
from importlib import reload
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.init as init
from copy import deepcopy
import torch.nn.functional as F


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [3]:
#Version 1
class FCQV(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dims=(256, 256), activation_fc=F.relu):
        super(FCQV, self).__init__()

        self.activation_fc = activation_fc
        self.input_layer = nn.Linear(state_dim + action_dim, hidden_dims[0])
        self.bn_input = nn.BatchNorm1d(hidden_dims[0])
        
        self.hidden_layers = nn.ModuleList()
        self.bn_hidden = nn.ModuleList()
        
        # Add two hidden layers with 128 neurons each
        for i in range(len(hidden_dims) - 1):
            hidden_layer = nn.Linear(hidden_dims[i], hidden_dims[i+1])
            bn_layer = nn.BatchNorm1d(hidden_dims[i+1])
            self.hidden_layers.append(hidden_layer)
            self.bn_hidden.append(bn_layer)
        
        # Output layer with a single neuron for the Q-value estimate
        self.output_layer = nn.Linear(hidden_dims[-1], 1)

    def forward(self, state, action):
        # Combining state and action right at the beginning
        x = torch.cat((state, action), dim=1)
        x = self.activation_fc(self.bn_input(self.input_layer(x)))
        
        for hidden_layer, bn_layer in zip(self.hidden_layers, self.bn_hidden):
            x = self.activation_fc(bn_layer(hidden_layer(x)))
            
        x = self.output_layer(x)
        return x


In [10]:
#Version 2 
class FCQV(nn.Module):
    def __init__(self, 
                 input_dim, 
                 output_dim, 
                 hidden_dims=(128,128), 
                 activation_fc=F.relu):
        super(FCQV, self).__init__()
        self.activation_fc = activation_fc

        self.input_layer = nn.Linear(input_dim, hidden_dims[0])
        self.hidden_layers = nn.ModuleList()
        for i in range(len(hidden_dims)-1):
            in_dim = hidden_dims[i]
            if i == 0: 
                in_dim += output_dim
            hidden_layer = nn.Linear(in_dim, hidden_dims[i+1])
            self.hidden_layers.append(hidden_layer)
        self.output_layer = nn.Linear(hidden_dims[-1], 1)

        device = "cpu"
        if torch.cuda.is_available():
            device = "cuda:0"
        self.device = torch.device(device)
        self.to(self.device)
    
    def _format(self, state, action):
        x, u = state, action
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x, 
                             device=self.device, 
                             dtype=torch.float32)
            x = x.unsqueeze(0)
        if not isinstance(u, torch.Tensor):
            u = torch.tensor(u, 
                             device=self.device, 
                             dtype=torch.float32)
            u = u.unsqueeze(0)
        return x, u

    def forward(self, state, action):
        x, u = self._format(state, action)
        x = self.activation_fc(self.input_layer(x))
        for i, hidden_layer in enumerate(self.hidden_layers):
            if i == 0:
                x = torch.cat((x, u), dim=1)
            x = self.activation_fc(hidden_layer(x))
        return self.output_layer(x)
    
    def load(self, experiences):
        states, actions, new_states, rewards, is_terminals = experiences
        states = torch.from_numpy(states).float().to(self.device)
        actions = torch.from_numpy(actions).float().to(self.device)
        new_states = torch.from_numpy(new_states).float().to(self.device)
        rewards = torch.from_numpy(rewards).float().to(self.device)
        is_terminals = torch.from_numpy(is_terminals).float().to(self.device)
        return states, actions, new_states, rewards, is_terminals

In [78]:
#Version 1 Policy
class FCDP(nn.Module):
    def __init__(self, input_dim, action_bounds, hidden_dims=(256, 256), activation_fc=F.relu, out_activation_fc=F.tanh):
        super(FCDP, self).__init__()
        self.activation_fc = activation_fc
        self.out_activation_fc = out_activation_fc
        self.env_min, self.env_max = action_bounds
        
        self.input_layer = nn.Linear(input_dim, hidden_dims[0])
        self.bn_input = nn.BatchNorm1d(hidden_dims[0])
        
        self.hidden_layers = nn.ModuleList()
        self.bn_hidden = nn.ModuleList()
        
        # Add two hidden layers with 128 neurons each
        for i in range(len(hidden_dims) - 1):
            hidden_layer = nn.Linear(hidden_dims[i], hidden_dims[i+1])
            bn_layer = nn.BatchNorm1d(hidden_dims[i+1])
            self.hidden_layers.append(hidden_layer)
            self.bn_hidden.append(bn_layer)
            
        # Output layer with the same number of neurons as the action bounds
        self.output_layer = nn.Linear(hidden_dims[-1], len(self.env_max))
        
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.env_min_tensor = torch.tensor(self.env_min, dtype=torch.float32, device=device)
        self.env_max_tensor = torch.tensor(self.env_max, dtype=torch.float32, device=device)
        self.rescale_fn = lambda x: (x + 1) / 2 * (self.env_max_tensor - self.env_min_tensor) + self.env_min_tensor

    def forward(self, state):
        x = state  # Assuming state is already formatted
        x = self.activation_fc(self.bn_input(self.input_layer(x)))
        
        for hidden_layer, bn_layer in zip(self.hidden_layers, self.bn_hidden):
            x = self.activation_fc(bn_layer(hidden_layer(x)))
            
        x = self.output_layer(x)
        x = self.out_activation_fc(x)
        return self.rescale_fn(x)


In [11]:
#Version 2 Policy

class FCDP(nn.Module):
    def __init__(self, 
                 input_dim,
                 action_bounds,
                 hidden_dims=(128,128), 
                 activation_fc=F.relu,
                 out_activation_fc=F.tanh):
        super(FCDP, self).__init__()
        self.activation_fc = activation_fc
        self.out_activation_fc = out_activation_fc
        self.env_min, self.env_max = action_bounds

        self.input_layer = nn.Linear(input_dim, hidden_dims[0])
        self.hidden_layers = nn.ModuleList()
        for i in range(len(hidden_dims)-1):
            hidden_layer = nn.Linear(hidden_dims[i], hidden_dims[i+1])
            self.hidden_layers.append(hidden_layer)
        self.output_layer = nn.Linear(hidden_dims[-1], len(self.env_max))

        device = "cpu"
        if torch.cuda.is_available():
            device = "cuda:0"
        self.device = torch.device(device)
        self.to(self.device)
        
        self.env_min = torch.tensor(self.env_min,
                                    device=self.device, 
                                    dtype=torch.float32)

        self.env_max = torch.tensor(self.env_max,
                                    device=self.device, 
                                    dtype=torch.float32)
        
        self.nn_min = self.out_activation_fc(
            torch.Tensor([float('-inf')])).to(self.device)
        self.nn_max = self.out_activation_fc(
            torch.Tensor([float('inf')])).to(self.device)
        self.rescale_fn = lambda x: (x - self.nn_min) * (self.env_max - self.env_min) / \
                                    (self.nn_max - self.nn_min) + self.env_min

    def _format(self, state):
        x = state
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x, 
                             device=self.device, 
                             dtype=torch.float32)
            x = x.unsqueeze(0)
        return x

    def forward(self, state):
        x = self._format(state)
        x = self.activation_fc(self.input_layer(x))
        for hidden_layer in self.hidden_layers:
            x = self.activation_fc(hidden_layer(x))
        x = self.output_layer(x)
        x = self.out_activation_fc(x)
        return self.rescale_fn(x)

In [12]:
class DDPGAgent:
    def __init__(self, state_dim, action_dim, action_bounds, gamma=0.95, lr_value=0.0002, lr_policy=0.0002, value_max_grad_norm=1.0, policy_max_grad_norm=1.0):
        self.gamma = gamma
        self.tau = 0.005
        self.value_max_grad_norm = value_max_grad_norm
        self.policy_max_grad_norm = policy_max_grad_norm
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

       
        
        # Action bounds
        self.env_min, self.env_max = action_bounds
        
        # Action dimension
        self.action_dim = action_dim  
        
        # Network models
        self.online_value_model = FCQV(state_dim, action_dim)
        self.target_value_model = FCQV(state_dim, action_dim)
        self.online_policy_model = FCDP(state_dim, action_bounds)
        self.target_policy_model = FCDP(state_dim, action_bounds)
        self.target_value_model.load_state_dict(self.online_value_model.state_dict())
        self.target_policy_model.load_state_dict(self.online_policy_model.state_dict())
        self.value_optimizer = optim.Adam(self.online_value_model.parameters(), lr=lr_value)
        self.policy_optimizer = optim.Adam(self.online_policy_model.parameters(), lr=lr_policy)

    #def soft_update(self, online_model, target_model):
    #    for target_param, online_param in zip(target_model.parameters(), online_model.parameters()):
    #        target_param.data.copy_(self.tau * online_param.data + (1.0 - self.tau) * target_param.data)

    def soft_update(self, online_model, target_model):
        tau = 0.0001
        for target, online in zip(self.target_value_model.parameters(), 
                                  self.online_value_model.parameters()):
            target_ratio = (1.0 - tau) * target.data
            online_ratio = tau * online.data
            mixed_weights = target_ratio + online_ratio
            target.data.copy_(mixed_weights)

        for target, online in zip(self.target_policy_model.parameters(), 
                                  self.online_policy_model.parameters()):
            target_ratio = (1.0 - tau) * target.data
            online_ratio = tau * online.data
            mixed_weights = target_ratio + online_ratio
            target.data.copy_(mixed_weights)

    def optimize_model(self, experiences):
        states, actions, rewards, next_states, is_terminals = experiences
        batch_size = len(is_terminals)

        

        argmax_a_q_sp = self.target_policy_model(next_states)
        max_a_q_sp = self.target_value_model(next_states, argmax_a_q_sp)

        target_q_sa = rewards + self.gamma * max_a_q_sp * (1 - is_terminals)

        #L2 Loss
        q_sa = self.online_value_model(states, actions)
        td_error = q_sa - target_q_sa.detach()
        value_loss = td_error.pow(2).mul(0.5).mean()

        #Huber Loss
        #loss_function = torch.nn.SmoothL1Loss(reduction='mean')
        #value_loss = loss_function(q_sa, target_q_sa.detach())

        self.value_optimizer.zero_grad()
        value_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.online_value_model.parameters(), self.value_max_grad_norm)
        self.value_optimizer.step()
        
        argmax_a_q_s = self.online_policy_model(states)
        max_a_q_s = self.online_value_model(states, argmax_a_q_s)
        policy_loss = -max_a_q_s.mean()
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.online_policy_model.parameters(), self.policy_max_grad_norm)
        self.policy_optimizer.step()
        # Soft Update der Ziel-Netzwerke
        self.soft_update(self.online_value_model, self.target_value_model)
        self.soft_update(self.online_policy_model, self.target_policy_model)

        return value_loss.item(), policy_loss.item()
    

    
    def he_initialization(self):
        def init_weights(m):
            if isinstance(m, nn.Linear):
                init.kaiming_normal_(m.weight, nonlinearity='relu')
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            
        # Apply He initialization to the network models
        self.online_value_model.apply(init_weights)
        self.target_value_model.apply(init_weights)
        self.online_policy_model.apply(init_weights)
        self.target_policy_model.apply(init_weights)

    


In [10]:
class NormalNoiseStrategy():
    def __init__(self, low, high, exploration_noise_ratio=0.1):
        self.low = low
        self.high = high
        self.exploration_noise_ratio = exploration_noise_ratio
        self.ratio_noise_injected = 0

    def _noise_ratio_update(self):
        self.exploration_noise_ratio *= 0.9999
        return self.exploration_noise_ratio

    def select_action(self, model, state, max_exploration=False):
        if max_exploration:
            noise_scale = self.high
        else:
            noise_scale = self.exploration_noise_ratio * self.high

        with torch.no_grad():
            greedy_action = model(state).cpu().detach().data.numpy().squeeze()

        noise = np.random.normal(loc=0, scale=noise_scale, size=len(self.high))
        noisy_action = greedy_action + noise

        # Keep Player 2 static at all times
        noisy_action[3] = 0  # Movement in x-direction for Player 2
        noisy_action[4] = 0  # Movement in y-direction for Player 2
        noisy_action[5] = 0  # Rotation for Player 2

        action = np.clip(noisy_action, self.low, self.high)
        
        self.ratio_noise_injected = np.mean(abs((greedy_action - action) / (self.high - self.low)))
        self._noise_ratio_update()
        return action


In [44]:
# Epsilon Greedy Strategy
class EpsilonGreedyStrategy:
    def __init__(self, start_epsilon, end_epsilon, decay, action_space):
        self.epsilon = start_epsilon
        self.end_epsilon = end_epsilon
        self.decay = decay
        self.action_space = action_space

    def select_action(self, model, state_tensor, episode):
        if episode < train_start:
            # Only allow movements up or down for Player 1
            random_action = self.action_space.sample()
            random_action[4] = 0 # Zufällige Bewegung in y-Richtung für Spieler 2
            random_action[5] = 0  # Zufällige Rotation für Spieler 2
            random_action[6] = 0  # Zufällige Rotation für Spieler 2
            random_action[7] = 0  # Zufällige Rotation für Spieler 2
            self.epsilon = 1.0
            return random_action
        elif np.random.rand() > self.epsilon:  # Exploitation: Mit Wahrscheinlichkeit 1-epsilon
            with torch.no_grad():
                model_output = model(state_tensor).cpu().data.numpy().squeeze()
                # Resetting indices 4 and beyond to zero for defense 
                model_output[4:] = 0
                return model_output
        else:  # Exploration: Mit Wahrscheinlichkeit epsilon
            random_action = self.action_space.sample()
            #random_action[3] = 0  # Zufällige Bewegung in x-Richtung für Spieler 2
            random_action[4] = 0 # Zufällige Bewegung in y-Richtung für Spieler 2
            random_action[5] = 0  # Zufällige Rotation für Spieler 2
            random_action[6] = 0  # Zufällige Rotation für Spieler 2
            random_action[7] = 0  # Zufällige Rotation für Spieler 2
            return random_action


    def decay_epsilon(self):
        if self.epsilon > self.end_epsilon:
            self.epsilon *= self.decay



In [None]:
#Replay Buffer for DDPG
import random
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def store(self, *args):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = args
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return zip(*random.sample(self.memory, batch_size))

    def __len__(self):
        return len(self.memory)

In [48]:
# Initialize DDPG Agent and Environment
import laserhockey.hockey_env as h_env
from gymnasium import spaces

env = h_env.HockeyEnv()
env.reset(mode=h_env.HockeyEnv.TRAIN_DEFENSE)


#env.discrete_action_space = spaces.Discrete(7)

# Set the environment mode to TRAIN DEFENSE
#reload(lh)
#np.set_printoptions(suppress=True)
#import laserhockey.laser_hockey_env as lh
#env = lh.LaserHockeyEnv()
#env.reset(mode=lh.LaserHockeyEnv.TRAIN_DEFENSE)
# Set the environment mode to TRAIN DEFENSE
#env = h_env.HockeyEnv(mode=h_env.HockeyEnv.TRAIN_DEFENSE)
#env.reset()

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_bounds = (env.action_space.low, env.action_space.high)
#action_strategy = NormalNoiseDecayStrategy(action_bounds[0], action_bounds[1], initial_noise_ratio=1.2)
epsilon_strategy = EpsilonGreedyStrategy(1.0, 0.01, 0.995, env.action_space)

# Initialize the Replay Buffer and the DDPG Agent
replay_buffer = ReplayBuffer(100000)
agent = DDPGAgent(state_dim=state_dim, action_dim=action_dim, action_bounds=action_bounds)

# Use the method to apply He initialization
agent.he_initialization()

num_episodes = 10000
batch_size = 128
train_start = 1000  
update_frequency = 1  # Update the agent every 1 steps

losses = []
all_rewards = []  # Store all episode rewards for moving average calculation

wins =[]

for episode in range(num_episodes):
    obs = env.reset()
    obs_agent1 = env._get_obs()
    episode_reward = 0
    episode_value_losses = []
    episode_policy_losses = []

    

    # Determine the result for the current episode
    last_80_results = wins[-150:]  # Collect results of the last 80 steps
    if 1 in last_80_results:
        episode_result = "Won"
    elif -1 in last_80_results:
        episode_result = "Lost"
    else:
        episode_result = "Draw"

    print(f"Episode {episode + 1}: {episode_result}")

    touched = 0

    for step in range(150):
        if episode >= 1500:
            env.render()
        state_tensor = torch.FloatTensor(obs_agent1).unsqueeze(0)

        

        # Set the policy model to evaluation mode
        agent.online_policy_model.eval()
        action_player1 = epsilon_strategy.select_action(agent.online_policy_model, state_tensor, episode)

        

        agent.online_policy_model.train()
        
        next_obs, reward, done, _, info = env.step(action_player1)

        


        if info['reward_touch_puck'] > 0 and touched == 0:
            touched = 1
            reward += (150 - step) * 0.1  # Add the reward for touching the puck for the first time


        
        reward_closeness = info['reward_closeness_to_puck']
        puck_touch = info['reward_touch_puck']

        # Add reward for closeness
        reward += 5 * reward_closeness

        # Add penalty for not touching the puck at all in the current step
        reward -= (1 - puck_touch) * 0.01

        episode_reward += reward

        #Count wins (successes in defense)
        wins.append(info['winner'])

        obs_agent1 = env._get_obs()
        experience = (obs_agent1, action_player1, reward, next_obs, done)
        replay_buffer.store(*experience)

        # DDPG Training
        if episode > train_start and len(replay_buffer) > batch_size and step % update_frequency == 0:
            experiences = replay_buffer.sample(batch_size)

            
            states_np, actions_np, rewards_np, next_states_np, is_terminals_np = [np.array(x) for x in experiences]

            # Set all actions from index [4:] to zeros
            actions_np[:, 4:] = 0

            states, actions, rewards, next_states, is_terminals = [torch.FloatTensor(x_np).to(device) for x_np in [states_np, actions_np, rewards_np, next_states_np, is_terminals_np]]
            
            #print(actions_np[:, 0:])
            #print(actions_np[:, 4:])

            value_loss, policy_loss = agent.optimize_model((states, actions, rewards, next_states, is_terminals))

            episode_value_losses.append(value_loss)
            episode_policy_losses.append(policy_loss)

        if step == 79:  
            done = True

        if done:
            break


    epsilon_strategy.decay_epsilon()
    all_rewards.append(episode_reward)
    moving_avg_reward = np.mean(all_rewards[-100:])  # Calculate moving average of the last 100 episode rewards
    avg_value_loss = sum(episode_value_losses) / len(episode_value_losses) if episode_value_losses else 0
    avg_policy_loss = sum(episode_policy_losses) / len(episode_policy_losses) if episode_policy_losses else 0

    print(f"Episode {episode + 1}, Avg Value Loss: {avg_value_loss}, Avg Policy Loss: {avg_policy_loss}")
    

    print(f"Episode {episode + 1}, Reward: {episode_reward}, Moving Avg Reward: {moving_avg_reward}, Replay Buffer Size: {len(replay_buffer)}")
    print(f"Current Epsilon: {epsilon_strategy.epsilon}")

    # Calculate loss rate for the last 150 episodes (or less if episode number is < 150)
    recent_games = wins[-150 * 80:] if episode >= 150 else wins  # Look back at results of the last 150 episodes (each episode has 80 steps)
    recent_losses = [1 for i in range(0, len(recent_games), 80) if -1 in recent_games[i:i+80]].count(1)  # Count how many episodes in the recent games have a loss
    loss_rate = recent_losses / (len(recent_games) / 80)  # Calculate loss rate
    print(f"Loss rate over the last {int(len(recent_games)/80)} episodes: {loss_rate:.2f}")

    print(f"Episode {episode + 1}, Reward: {episode_reward}, Moving Avg Reward: {moving_avg_reward}, Replay Buffer Size: {len(replay_buffer)}")
    print(f"Current Epsilon: {epsilon_strategy.epsilon}")

# Save the agent's model after training
torch.save(agent.online_policy_model.state_dict(), 'online_policy_model_checkpoint.pth')
torch.save(agent.online_value_model.state_dict(), 'online_value_model_checkpoint.pth')
torch.save(agent.target_policy_model.state_dict(), 'target_policy_model_checkpoint.pth')
torch.save(agent.target_value_model.state_dict(), 'target_value_model_checkpoint.pth')

Episode 1: Draw
Episode 1, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 1, Reward: 12.643787730378742, Moving Avg Reward: 12.643787730378742, Replay Buffer Size: 20
Current Epsilon: 0.995
Loss rate over the last 0 episodes: 0.00
Episode 1, Reward: 12.643787730378742, Moving Avg Reward: 12.643787730378742, Replay Buffer Size: 20
Current Epsilon: 0.995
Episode 2: Won
Episode 2, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 2, Reward: 14.771109965632832, Moving Avg Reward: 13.707448848005786, Replay Buffer Size: 41
Current Epsilon: 0.995
Loss rate over the last 0 episodes: 0.00
Episode 2, Reward: 14.771109965632832, Moving Avg Reward: 13.707448848005786, Replay Buffer Size: 41
Current Epsilon: 0.995
Episode 3: Won
Episode 3, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 3, Reward: -22.59487860835752, Moving Avg Reward: 1.6066730292180178, Replay Buffer Size: 112
Current Epsilon: 0.995
Loss rate over the last 1 episodes: 0.00
Episode 3, Reward: -22.59487860835752, Moving Avg Reward: 1.606

KeyboardInterrupt: 

In [11]:
#Hindsight Experience Replay Buffer class
class HERBuffer:
    def __init__(self, buffer_size, goal_selection_strategy, reward_function):
        self.buffer_size = buffer_size
        self.goal_selection_strategy = goal_selection_strategy
        self.reward_function = reward_function
        self.memory = []
        self.position = 0
    
    def store(self, *args):
        if len(self.memory) < self.buffer_size:
            self.memory.append(None)
        self.memory[self.position] = args
        self.position = (self.position + 1) % self.buffer_size

    def store_episode(self, episode):
        goal = self.goal_selection_strategy(episode)
        
        for state, action, _, next_state, done in episode:
            reward = self.reward_function(state, action, goal)
            self.store(state, action, reward, next_state, goal)
            
            # HER storage
            additional_goals = self.goal_selection_strategy(episode)
            for g in additional_goals:
                her_reward = self.reward_function(state, action, g)
                self.store(state, action, her_reward, next_state, g)
    
    def sample(self, batch_size):
        return zip(*random.sample(self.memory, batch_size))

    def __len__(self):
        return len(self.memory)


In [42]:
#DDP Agent
import random
np.set_printoptions(suppress=True)
reload(lh)

# Set the environment mode to TRAIN DEFENSE
env = lh.LaserHockeyEnv()
env.reset(mode=lh.LaserHockeyEnv.TRAIN_DEFENSE)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_bounds = (env.action_space.low, env.action_space.high)
epsilon_strategy = EpsilonGreedyStrategy(1.0, 0.01, 0.995, env.action_space)

# Goal selection strategy for HERBuffer
def sample_goals(episode):
    goals = []

    for experience in episode:
        _, _, reward, next_state, _ = experience
        if abs(reward) == 0.2:
            goals.append(next_state)

    return goals

def compute_reward(state, action, goal):
    _, _, _, _, info = env.step(action)
    
    if (info.get('reward_closeness_to_puck', 0) > 0) or \
       (info.get('reward_touch_puck', 0) > 0) or \
       (info.get('reward_puck_direction', 0) > 0):
        return 0.2
    elif (info.get('reward_closeness_to_puck', 0) < 0) or \
         (info.get('reward_touch_puck', 0) < 0) or \
         (info.get('reward_puck_direction', 0) < 0):
        return -0.2
    
    return 0




her_buffer = HERBuffer(buffer_size=5000, goal_selection_strategy=sample_goals, reward_function=compute_reward)

agent = DDPGAgent(state_dim=state_dim, action_dim=action_dim, action_bounds=action_bounds)
agent.he_initialization()

num_episodes = 10000
batch_size = 256
train_start = 1000 
update_frequency = 1

losses = []
all_rewards = []
wins = []

for episode in range(num_episodes):
    obs = env.reset()
    obs_agent1 = env._get_obs()
    episode_reward = 0
    episode_value_losses = []
    episode_policy_losses = []

   

    episode_trajectory = []

    for step in range(80):
        state_tensor = torch.FloatTensor(obs_agent1).unsqueeze(0)
        agent.online_policy_model.eval()
        action_player1 = epsilon_strategy.select_action(agent.online_policy_model, state_tensor, episode)

        
        agent.online_policy_model.train()

        next_obs, reward, done, _, info = env.step(action_player1)
        reward += info['winner']
        reward += info['reward_closeness_to_puck']
        reward += info['reward_touch_puck']
        reward += info['reward_puck_direction']
        episode_reward += reward

        #print(f"info: {info}")

        obs_agent1 = env._get_obs()
        experience = (obs_agent1, action_player1, reward, next_obs, done)
        #Store trajectories for HER
        episode_trajectory.append(experience)
        her_buffer.store(*experience)
        

        # DDPG Training

        if episode > train_start and len(her_buffer) > batch_size:

            experiences = her_buffer.sample(batch_size)
            states, actions, rewards, next_states, is_terminals = [torch.FloatTensor(x_np).to(device) for x_np in [states_np, actions_np, rewards_np, next_states_np, is_terminals_np]]

            
            value_loss, policy_loss = agent.optimize_model((states, actions, rewards, next_states, is_terminals))
            episode_value_losses.append(value_loss)
            episode_policy_losses.append(policy_loss)

        if step == 79: 
            done = True

        if done:
            break

    
    #Store new trajectories in HER
    her_buffer.store_episode(episode_trajectory)

    if random.random() < 0.15:
        gradient = agent.online_policy_model.input_layer.weight.grad
        if gradient is not None:
            print(f"Mean gradient of online policy model's input layer: {gradient.mean()}")

        q_gradient = agent.online_value_model.input_layer.weight.grad
        if q_gradient is not None:
            print(f"Mean gradient of online value model's input layer: {q_gradient.mean()}")

    epsilon_strategy.decay_epsilon()
    all_rewards.append(episode_reward)
    moving_avg_reward = np.mean(all_rewards[-100:])
    avg_value_loss = sum(episode_value_losses) / len(episode_value_losses) if episode_value_losses else 0
    avg_policy_loss = sum(episode_policy_losses) / len(episode_policy_losses) if episode_policy_losses else 0

    print(f"Episode {episode + 1}, Avg Value Loss: {avg_value_loss}, Avg Policy Loss: {avg_policy_loss}")
    print(f"Episode {episode + 1}, Reward: {episode_reward}, Moving Avg Reward: {moving_avg_reward}, HER Buffer Size: {len(her_buffer)}")
    print(f"Current Epsilon: {epsilon_strategy.epsilon}")

    recent_games = wins[-150 * 80:] if episode >= 150 else wins
    recent_losses = [1 for i in range(0, len(recent_games), 80) if -1 in recent_games[i:i+80]].count(1)
    #loss_rate = recent_losses / (len(recent_games) / 80) if len(recent_games) > 0 else 0
    #print(f"Loss rate over the last {int(len(recent_games)/80)} episodes: {loss_rate:.2f}")


    #if loss_rate > 0.80 and episode > train_start:
    #    print("Bad Agent. Restarting Training...")
    #    agent = DDPGAgent(state_dim=state_dim, action_dim=action_dim, action_bounds=action_bounds)
    #    agent.he_initialization()
    #    wins.clear()

# Save the models
torch.save(agent.online_policy_model.state_dict(), "online_policy_model_defense.pt")
torch.save(agent.online_value_model.state_dict(), "online_value_model_defense.pt")
torch.save(agent.target_policy_model.state_dict(), "target_policy_model_defense.pt")
torch.save(agent.target_value_model.state_dict(), "target_value_model_defense.pt")



Episode 1, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 1, Reward: -0.15751856180049464, Moving Avg Reward: -0.15751856180049464, HER Buffer Size: 160
Current Epsilon: 0.995
Player 2 scored
Episode 2, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 2, Reward: -11.39032215801075, Moving Avg Reward: -5.773920359905623, HER Buffer Size: 242
Current Epsilon: 0.995
Player 2 scored
Episode 3, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 3, Reward: -11.356689256428453, Moving Avg Reward: -7.634843325413233, HER Buffer Size: 324
Current Epsilon: 0.995
Player 1 scored
Episode 4, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 4, Reward: -0.39075220754125634, Moving Avg Reward: -5.82382054594524, HER Buffer Size: 484
Current Epsilon: 0.995
Player 2 scored
Episode 5, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 5, Reward: -11.285190693622859, Moving Avg Reward: -6.916094575480765, HER Buffer Size: 570
Current Epsilon: 0.995
Player 2 scored
Episode 6, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 6

KeyboardInterrupt: 

In [None]:
def print_her_buffer(buffer, num_entries=1000):
    """Prints the first num_entries experiences from the HER buffer."""
    for idx, experience in enumerate(buffer.memory):
        if idx >= num_entries:
            break
        print(f"Experience {idx + 1}:")
        obs, action, reward, next_obs, done = experience
        print(f"State shape: {np.array(obs)}")
        print(f"Action shape: {np.array(action)}")
        print(f"Reward: {reward}")
        print(f"Next State shape: {np.array(next_obs)}")
        print(f"Done: {done}\n")

# Call the function
print_her_buffer(her_buffer)



def extract_data_from_buffer(buffer, batch_size):
    states, actions, rewards, next_states, dones = [], [], [], [], []
    
    sampled_experiences = buffer.sample(batch_size)
    
    for idx, experience in enumerate(sampled_experiences):
        if idx >= batch_size:
            break
        
        print(f"Experience {idx + 1} Length: {len(experience)}")
        print(experience)
        
        if len(experience) != 5:
            print(f"Error: Expected length of experience is 5, but got {len(experience)}")
            continue
        
        try:
            state, action, reward, next_state, done = experience
        except ValueError as e:
            print(f"Unexpected error during unpacking: {e}")
            continue
        
        states.append(np.array(state))
        actions.append(np.array(action))
        rewards.append(reward)
        next_states.append(np.array(next_state))
        dones.append(done)
    
    states_np = np.array(states)
    actions_np = np.array(actions)
    rewards_np = np.array(rewards)
    next_states_np = np.array(next_states)
    dones_np = np.array(dones)
    
    return states_np, actions_np, rewards_np, next_states_np, dones_np






Experience 1:
State shape: [ -6.          -0.00520372   0.00200054   0.999998     0.
  -0.27195147   0.10002708   6.75694084   3.29298735   0.
   1.           0.           0.           0.           0.72931671
   4.61147165 -10.9371109   -3.84231043]
Action shape: [ 0.        -0.4070022  0.5639671  0.         0.         0.       ]
Reward: -11.006225556078348
Next State shape: [ -6.          -0.00520372   0.00200054   0.999998     0.
  -0.27195147   0.10002708   6.75694084   3.29298735   0.
   1.           0.           0.           0.           0.72931671
   4.61147165 -10.9371109   -3.84231043]
Done: False

Experience 2:
State shape: [ -6.00000286  -0.01078224   0.00652385   0.99997872   0.
  -0.30552006   0.22616762   6.75694084   3.29298735   0.
   1.           0.           0.           0.           0.51079369
   4.53470182 -10.92617416  -3.83846807]
Action shape: [ 0.         -0.05837875  0.71119857  0.          0.          0.        ]
Reward: -11.006225556078348
Next State shape: [ 

In [None]:
# Print the number of layers and neurons in the policy model
policy_model_layers = list(agent.online_policy_model.children())
policy_input_neurons = state_dim
policy_hidden_neurons = [layer.out_features for layer in policy_model_layers if isinstance(layer, torch.nn.Linear)]
policy_output_neurons = action_dim

print("Policy Model:")
print(f"Number of Layers: {len(policy_model_layers)}")
print(f"Number of Input Neurons: {policy_input_neurons}")
print(f"Number of Hidden Layers: {len(policy_hidden_neurons)}")
print(f"Number of Hidden Neurons in each layer: {policy_hidden_neurons}")
print(f"Number of Output Neurons: {policy_output_neurons}")

# Print the number of layers and neurons in the value model
value_model_layers = list(agent.online_value_model.children())
value_input_neurons = state_dim + action_dim
value_hidden_neurons = [layer.out_features for layer in value_model_layers if isinstance(layer, torch.nn.Linear)]
value_output_neurons = 1

print("\nValue Model:")
print(f"Number of Layers: {len(value_model_layers)}")
print(f"Number of Input Neurons: {value_input_neurons}")
print(f"Number of Hidden Layers: {len(value_hidden_neurons)}")
print(f"Number of Hidden Neurons in each layer: {value_hidden_neurons}")
print(f"Number of Output Neurons: {value_output_neurons}")

Policy Model:
Number of Layers: 5
Number of Input Neurons: 18
Number of Hidden Layers: 2
Number of Hidden Neurons in each layer: [256, 6]
Number of Output Neurons: 6

Value Model:
Number of Layers: 5
Number of Input Neurons: 24
Number of Hidden Layers: 2
Number of Hidden Neurons in each layer: [256, 1]
Number of Output Neurons: 1


In [None]:
# Latest Initialize DDPG Agent and Environment
import random
np.set_printoptions(suppress=True)
reload(lh)

# Set the environment mode to TRAIN DEFENSE
env = lh.LaserHockeyEnv()
env.reset(mode=lh.LaserHockeyEnv.TRAIN_DEFENSE)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_bounds = (env.action_space.low, env.action_space.high)
epsilon_strategy = EpsilonGreedyStrategy(1.0, 0.01, 0.995, env.action_space)

def sample_goals(episode):
    goals = []
    for experience in episode:
        _, _, reward, next_state, done, info = experience
        # This assumes that you have added 'info' to your stored experiences
        if info['reward_closeness_to_puck'] > 0 or info['reward_touch_puck'] > 0 or info['reward_puck_direction'] > 0:
            goals.append(next_state)
    return goals



def compute_reward(state, action, goal):
    _, _, _, _, info = env.step(action)
    
    closeness_reward = info.get('reward_closeness_to_puck', 0)
    touch_reward = info.get('reward_touch_puck', 0)
    direction_reward = info.get('reward_puck_direction', 0)
    game_result = info.get('winner', 0)

         
    
    # Combine rewards, possibly with weights to prioritize certain behaviors
    total_reward = 3*closeness_reward + 10*touch_reward + 5*direction_reward

    if game_result >= 0:
        total_reward += 0.5 
    
    # If all conditions are met, give a positive reward
    if closeness_reward > 0 and touch_reward > 0 and direction_reward > 0:
        return 15
    
    # If any condition is not met, penalize the agent
    if closeness_reward <= 0:
        return -0.1
    if touch_reward <= 0:
        return -0.3
    if direction_reward <= 0:
        return -0.55
    
    return total_reward  # Return combined reward as a fallback


her_buffer = HERBuffer(buffer_size=100000, goal_selection_strategy=sample_goals, reward_function=compute_reward)

agent = DDPGAgent(state_dim=state_dim, action_dim=action_dim, action_bounds=action_bounds)
agent.he_initialization()

num_episodes = 10000
batch_size = 512
train_start = 200  
update_frequency = 1

losses = []
all_rewards = []
wins = []

for episode in range(num_episodes):
    
    obs = env.reset()

    obs_agent1 = env._get_obs()
    episode_reward = 0
    episode_value_losses = []
    episode_policy_losses = []

    episode_trajectory = []

    for step in range(80):
        
        if episode >= 200:
            obs = env.render()

        state_tensor = torch.FloatTensor(obs_agent1).unsqueeze(0)
        agent.online_policy_model.eval()
        action_player1 = epsilon_strategy.select_action(agent.online_policy_model, state_tensor, episode)
        
        agent.online_policy_model.train()

        next_obs, reward, done, _, info = env.step(action_player1)
        reward += info['winner']
        reward += info['reward_closeness_to_puck']
        reward += info['reward_touch_puck']
        reward += info['reward_puck_direction']

        #print(f"info: {info}")
        episode_reward += reward

        obs_agent1 = env._get_obs()
        experience = (obs_agent1, action_player1, reward, next_obs, done, info)  # Added 'info' at the end
        episode_trajectory.append(experience)


        # DDPG Training
        if episode > train_start and len(her_buffer) > batch_size:
            experiences = her_buffer.sample(batch_size)
            states, actions, rewards, next_states, is_terminals = [torch.FloatTensor(x_np).to(device) for x_np in experiences]
            
            value_loss, policy_loss = agent.optimize_model((states, actions, rewards, next_states, is_terminals))
            episode_value_losses.append(value_loss)
            episode_policy_losses.append(policy_loss)

        if step == 79: 
            done = True

        if done:
            break

    # Store trajectories experiences with HER logic, after episode is done
    her_goals = sample_goals(episode_trajectory)
    for experience in episode_trajectory:
        state, action, reward, next_state, done, info = experience
        # Store the original experience
        her_buffer.store(state, action, reward, next_state, done)

        for goal in her_goals:
            # Recompute the reward and store the adjusted experience
            her_reward = compute_reward(state, action, goal)
            her_buffer.store(state, action, her_reward, next_state, done)

    epsilon_strategy.decay_epsilon()
    all_rewards.append(episode_reward)
    moving_avg_reward = np.mean(all_rewards[-100:])
    avg_value_loss = sum(episode_value_losses) / len(episode_value_losses) if episode_value_losses else 0
    avg_policy_loss = sum(episode_policy_losses) / len(episode_policy_losses) if episode_policy_losses else 0

    print(f"Episode {episode + 1}, Avg Value Loss: {avg_value_loss}, Avg Policy Loss: {avg_policy_loss}")
    print(f"Episode {episode + 1}, Reward: {episode_reward}, Moving Avg Reward: {moving_avg_reward}, HER Buffer Size: {len(her_buffer)}")
    print(f"Current Epsilon: {epsilon_strategy.epsilon}")

    # Note: I left out the code regarding "loss_rate" because it wasn't completely provided. 

# Save the models
torch.save(agent.online_policy_model.state_dict(), "online_policy_model_defense.pt")
torch.save(agent.online_value_model.state_dict(), "online_value_model_defense.pt")
torch.save(agent.target_policy_model.state_dict(), "target_policy_model_defense.pt")
torch.save(agent.target_value_model.state_dict(), "target_value_model_defense.pt")


Player 1 scored
Episode 1, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 1, Reward: 0.6496168649744513, Moving Avg Reward: 0.6496168649744513, HER Buffer Size: 4000
Current Epsilon: 0.995
Player 2 scored
Episode 2, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 2, Reward: -11.217263197228286, Moving Avg Reward: -5.2838231661269175, HER Buffer Size: 4045
Current Epsilon: 0.995
Player 1 scored
Episode 3, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 3, Reward: 11.020645200824736, Moving Avg Reward: 0.15099962285696714, HER Buffer Size: 4529
Current Epsilon: 0.990025
Player 2 scored
Episode 4, Avg Value Loss: 0.0893888481524448, Avg Policy Loss: -0.15893647273959116
Episode 4, Reward: -11.272325482621142, Moving Avg Reward: -2.70483165351256, HER Buffer Size: 4572
Current Epsilon: 0.985074875
Player 1 scored
Episode 5, Avg Value Loss: 0.014409171733112163, Avg Policy Loss: -0.18131441102452475
Episode 5, Reward: 11.825610327606398, Moving Avg Reward: 0.2012567427112316, HER Buffer Size: 74

In [37]:
import numpy as np
import laserhockey.hockey_env as h_env
import gymnasium as gym
from importlib import reload
import time

env = h_env.HockeyEnv()

player1 = h_env.BasicOpponent(weak=False)

env.reset(mode=lh.LaserHockeyEnv)

# Laden Sie die Gewichte in Ihren Agenten
agent.online_policy_model.load_state_dict(torch.load("online_policy_model_defense.pt"))
agent.online_value_model.load_state_dict(torch.load("online_value_model_defense.pt"))
agent.target_policy_model.load_state_dict(torch.load("target_policy_model_defense.pt"))
agent.target_value_model.load_state_dict(torch.load("target_value_model_defense.pt"))

# Initialisieren Sie die Umgebung und den Basis-Opponenten
player2 = lh.BasicOpponent()

num_games = 10
max_steps_per_game = 100

for game in range(num_games):
    obs, info = env.reset()
    obs_agent2 = env.obs_agent_two()
    
    for step in range(max_steps_per_game):
        env.render()

        # Nutzen Sie den Agenten, um eine Aktion für player1 basierend auf dem aktuellen Zustand auszuwählen
        state_tensor = torch.FloatTensor(obs).unsqueeze(0)
        agent.online_policy_model.eval()
        with torch.no_grad():
            action_agent1 = np.squeeze(agent.online_policy_model(state_tensor).numpy())
        agent.online_policy_model.train()

        action_agent2 = player2.act(obs_agent2)
        obs, r, d, _, info = env.step(np.hstack([action_agent1, action_agent2]))

        obs_agent2 = env.obs_agent_two()

        if d:
            break




KeyboardInterrupt: 

In [36]:
env = h_env.HockeyEnv()

# Einstellen der Spieler
player1 = h_env.BasicOpponent()
player2 = h_env.HumanOpponent(env=env, player=2)

for game in range(10):
    obs, info = env.reset()
    obs_agent2 = env.obs_agent_two()

    while True:
        time.sleep(0.05)
        env.render()

        a1 = player1.act(obs)
        a2 = player2.act(obs_agent2)
        
        obs, r, d, _, info = env.step(np.hstack([a1, a2]))
        obs_agent2 = env.obs_agent_two()
        
        if d:  # Wenn das Spiel vorbei ist, brechen Sie die innere Schleife ab und starten Sie ein neues Spiel
            break

env.close()

NameError: name 'h_env' is not defined

In [23]:
# Normal Noise Decay Strategy Latest Initialize DDPG Agent and Environment
import numpy as n
import random
import torch

np.set_printoptions(suppress=True)
reload(lh)

# Set the environment mode to TRAIN DEFENSE
env = lh.LaserHockeyEnv()
env.reset(mode=lh.LaserHockeyEnv.TRAIN_DEFENSE)
initial_state_player2 = env._get_obs()[7:14]  # Capture the initial state of Player 2 after resetting the environment

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_bounds = (env.action_space.low, env.action_space.high)
noise_strategy = NormalNoiseStrategy(action_bounds[0], action_bounds[1])


def sample_goals(episode):
    goals = []
    for experience in episode:
        _, _, reward, next_state, done, info = experience
        # This assumes that you have added 'info' to your stored experiences
        if info['reward_closeness_to_puck'] > 0 or info['reward_touch_puck'] > 0 or info['reward_puck_direction'] > 0:
            goals.append(next_state)
    return goals



def compute_reward(state, action, goal):
    _, _, _, _, info = env.step(action)
    
    closeness_reward = info.get('reward_closeness_to_puck', 0)
    touch_reward = info.get('reward_touch_puck', 0)
    direction_reward = info.get('reward_puck_direction', 0)
    game_result = info.get('winner', 0)
    Y_max = action_bounds[1][1]
    Y_min = action_bounds[0][1]

    upper_threshold = Y_max - (1/3) * (Y_max - Y_min)
    lower_threshold = Y_min + (1/3) * (Y_max - Y_min)

         
    total_reward = 0  # Initialization of total_reward
    # Combine rewards, possibly with weights to prioritize certain behaviors
    

    if game_result >= 0:
        total_reward += 0.5

    else:
        total_reward -= 20

    
    if touch_reward == 1:
        total_reward += 100

    if closeness_reward >= 0:
        total_reward += 20

    if direction_reward >= 0:
        total_reward += 20

    # Add penalty for states that are in the top or bottom third of the field
    y_position_player1 = state[1]  # y-position of the defending player

    if y_position_player1 > upper_threshold or y_position_player1 < lower_threshold:
        total_reward -= 100  # Deduct a penalty from the total reward

    
    return total_reward  # Return combined reward as a fallback


her_buffer = HERBuffer(buffer_size=150000, goal_selection_strategy=sample_goals, reward_function=compute_reward)

agent = DDPGAgent(state_dim=state_dim, action_dim=action_dim, action_bounds=action_bounds)
agent.he_initialization()

num_episodes = 10000
batch_size = 1024
train_start = 100  
update_frequency = 1

losses = []
all_rewards = []
wins = []



for episode in range(num_episodes):
    
    obs = env.reset()

    obs_agent1 = env._get_obs()
    episode_reward = 0
    episode_value_losses = []
    episode_policy_losses = []

    episode_trajectory = []

    for step in range(80):
        
        if episode >= 10:
            obs = env.render()

        state_tensor = torch.FloatTensor(obs_agent1).unsqueeze(0)
        agent.online_policy_model.eval()
        action_player1 = noise_strategy.select_action(agent.online_policy_model, state_tensor, episode)

        
        agent.online_policy_model.train()

        next_obs, reward, done, _, info = env.step(action_player1)
        next_obs[7:14] = initial_state_player2  # Set the state of Player 2 to its initial state
        
        reward += info['winner']
        reward += info['reward_closeness_to_puck']
        reward += info['reward_touch_puck']
        reward += info['reward_puck_direction']

        #print(f"info: {info}")
        episode_reward += reward

        obs_agent1 = env._get_obs()
        experience = (obs_agent1, action_player1, reward, next_obs, done, info)  # Added 'info' at the end
        episode_trajectory.append(experience)


        # DDPG Training
        if episode > train_start and len(her_buffer) > batch_size:
            experiences = her_buffer.sample(batch_size)
            states, actions, rewards, next_states, is_terminals = [torch.FloatTensor(x_np).to(device) for x_np in experiences]
            
            value_loss, policy_loss = agent.optimize_model((states, actions, rewards, next_states, is_terminals))
            episode_value_losses.append(value_loss)
            episode_policy_losses.append(policy_loss)

        if step == 79: 
            done = True

        if done:
            break

    # Store trajectories experiences with HER logic, after episode is done
    her_goals = sample_goals(episode_trajectory)
    for experience in episode_trajectory:
        state, action, reward, next_state, done, info = experience
        # Store the original experience
        her_buffer.store(state, action, reward, next_state, done)

        for goal in her_goals:
            # Recompute the reward and store the adjusted experience
            her_reward = compute_reward(state, action, goal)
            her_buffer.store(state, action, her_reward, next_state, done)

    
    all_rewards.append(episode_reward)
    moving_avg_reward = np.mean(all_rewards[-100:])
    avg_value_loss = sum(episode_value_losses) / len(episode_value_losses) if episode_value_losses else 0
    avg_policy_loss = sum(episode_policy_losses) / len(episode_policy_losses) if episode_policy_losses else 0

    print(f"Episode {episode + 1}, Avg Value Loss: {avg_value_loss}, Avg Policy Loss: {avg_policy_loss}")
    print(f"Episode {episode + 1}, Reward: {episode_reward}, Moving Avg Reward: {moving_avg_reward}, HER Buffer Size: {len(her_buffer)}")
    

    # Note: I left out the code regarding "loss_rate" because it wasn't completely provided. 

# Save the models
torch.save(agent.online_policy_model.state_dict(), "online_policy_model_defense.pt")
torch.save(agent.online_value_model.state_dict(), "online_value_model_defense.pt")
torch.save(agent.target_policy_model.state_dict(), "target_policy_model_defense.pt")
torch.save(agent.target_value_model.state_dict(), "target_value_model_defense.pt")



Player 2 scored
Episode 1, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 1, Reward: -11.34528234995238, Moving Avg Reward: -11.34528234995238, HER Buffer Size: 44
Player 2 scored
Episode 2, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 2, Reward: -11.314224926309848, Moving Avg Reward: -11.329753638131113, HER Buffer Size: 89
Player 2 scored
Episode 3, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 3, Reward: -11.34327856418234, Moving Avg Reward: -11.334261946814856, HER Buffer Size: 134
Player 2 scored
Episode 4, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 4, Reward: -11.433561458543588, Moving Avg Reward: -11.359086824747038, HER Buffer Size: 177
Episode 5, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 5, Reward: 0.24532738213854308, Moving Avg Reward: -9.038203983369922, HER Buffer Size: 337
Player 1 scored
Player 2 scored
Player 2 scored
Episode 6, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 6, Reward: 11.02567151389122, Moving Avg Reward: -5.694224733826399, HER Buffer Size: 6



Player 2 scored
Episode 11, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 11, Reward: -11.264412706439584, Moving Avg Reward: -6.126712881154833, HER Buffer Size: 7746
Episode 12, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 12, Reward: -2.4852056086371235, Moving Avg Reward: -5.823253941778357, HER Buffer Size: 7826
Player 1 scored
Episode 13, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 13, Reward: -0.9401759301082587, Moving Avg Reward: -5.447632556265272, HER Buffer Size: 11026
Player 1 scored
Episode 14, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 14, Reward: 11.028643763542176, Moving Avg Reward: -4.270755676279025, HER Buffer Size: 11926
Player 2 scored
Episode 15, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 15, Reward: -11.385371196566403, Moving Avg Reward: -4.745063377631518, HER Buffer Size: 11969
Player 2 scored
Episode 16, Avg Value Loss: 0, Avg Policy Loss: 0
Episode 16, Reward: -11.369541840181174, Moving Avg Reward: -5.159093281540872, HER Buffer Size: 12010
Playe

KeyboardInterrupt: 

In [16]:
#Version 3 Policy

class FCDP(nn.Module):
    def __init__(self, 
                 input_dim,
                 action_bounds,
                 hidden_sizes=(256, 256), 
                 activation_fc=F.relu,
                 out_activation_fc=F.tanh,
                 learning_rate=0.0001,
                 lr_milestones=[1000, 5000],
                 lr_factor=0.5):
        super(FCDP, self).__init__()
        self.activation_fc = activation_fc
        self.out_activation_fc = out_activation_fc
        self.env_min, self.env_max = action_bounds
        self.device = device
        # Layers
        layer_sizes = [input_dim] + list(hidden_sizes) + [len(self.env_max)]
        self.layers = nn.ModuleList([nn.Linear(i, o) for i, o in zip(layer_sizes[:-1], layer_sizes[1:])])
        
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)
        
        self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        self.lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer, milestones=lr_milestones, gamma=lr_factor)

        # Action scaling
        self.env_min = torch.tensor(self.env_min, device=self.device, dtype=torch.float32)
        self.env_max = torch.tensor(self.env_max, device=self.device, dtype=torch.float32)
        
        self.nn_min = self.out_activation_fc(torch.Tensor([float('-inf')])).to(self.device)
        self.nn_max = self.out_activation_fc(torch.Tensor([float('inf')])).to(self.device)
        self.rescale_fn = lambda x: (x - self.nn_min) * (self.env_max - self.env_min) / (self.nn_max - self.nn_min) + self.env_min

    def _format(self, state):
        x = state
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x, device=self.device, dtype=torch.float32)
            x = x.unsqueeze(0)
        return x

    def forward(self, state):
        x = self._format(state)
        for layer in self.layers[:-1]:
            x = self.activation_fc(layer(x))
        x = self.out_activation_fc(self.layers[-1](x))
        return self.rescale_fn(x)

    def predict(self, state):
        state = np.array(state)
        with torch.no_grad():
            return self.forward(torch.from_numpy(state.astype(np.float32)).to(self.device)).cpu().numpy()


In [15]:
#Version 3 Value

class FCQV(nn.Module):
    def __init__(self, 
                 input_dim, 
                 output_dim, 
                 device,
                 learning_rate=0.001,
                 lr_milestones=[1000, 5000],
                 lr_factor=0.5,
                 hidden_sizes=(256, 256), 
                 activation_fc=F.relu):
        super(FCQV, self).__init__()
        
        self.num_inputs = input_dim
        self.n_actions = output_dim
        self.activation_fc = activation_fc
        self.device = device

        # Create layers
        layer_sizes = [self.num_inputs + self.n_actions] + list(hidden_sizes) + [1]
        self.layers = nn.ModuleList([nn.Linear(i, o) for i, o in zip(layer_sizes[:-1], layer_sizes[1:])])

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)

        self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        self.lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer, milestones=lr_milestones, gamma=lr_factor)
        self.loss = torch.nn.MSELoss()

    def _format(self, state, action):
        x, u = state, action
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x, device=self.device, dtype=torch.float32)
            x = x.unsqueeze(0)
        if not isinstance(u, torch.Tensor):
            u = torch.tensor(u, device=self.device, dtype=torch.float32)
            u = u.unsqueeze(0)
        return x, u

    def forward(self, state, action):
        x, u = self._format(state, action)
        x = torch.cat((x, u), dim=1)
        for layer in self.layers[:-1]:
            x = self.activation_fc(layer(x))
        return self.layers[-1](x)
    
    def predict(self, state, action):
        state = np.array(state)
        action = np.array(action)
        with torch.no_grad():
            return self.forward(torch.from_numpy(state.astype(np.float32)).to(self.device), 
                                torch.from_numpy(action.astype(np.float32)).to(self.device)).cpu().numpy()

    def load(self, experiences):
        states, actions, new_states, rewards, is_terminals = experiences
        states = torch.from_numpy(states).float().to(self.device)
        actions = torch.from_numpy(actions).float().to(self.device)
        new_states = torch.from_numpy(new_states).float().to(self.device)
        rewards = torch.from_numpy(rewards).float().to(self.device)
        is_terminals = torch.from_numpy(is_terminals).float().to(self.device)
        return states, actions, new_states, rewards, is_terminals
