In [None]:
# Remove any existing gym / gym-super-mario-bros so the pinned versions below are the ones in use.
!pip uninstall -y gym gym-super-mario-bros
# Pin gym to 0.26.2 and gym-super-mario-bros to 7.4.0.
!pip install gym==0.26.2 gym-notices gym-super-mario-bros==7.4.0
# NOTE(review): the packages on this line are unpinned, and stable-baselines3 is not
# imported by the notebook cells shown here — confirm it is still needed.
!pip install stable-baselines3 gym_super_mario_bros nes_py

Found existing installation: gym 0.25.2
Uninstalling gym-0.25.2:
  Successfully uninstalled gym-0.25.2
[0mCollecting gym==0.26.2
  Downloading gym-0.26.2.tar.gz (721 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m721.7/721.7 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting gym-super-mario-bros==7.4.0
  Downloading gym_super_mario_bros-7.4.0-py3-none-any.whl.metadata (10 kB)
Collecting nes-py>=8.1.4 (from gym-super-mario-bros==7.4.0)
  Downloading nes_py-8.2.1.tar.gz (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.7/77.7 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyglet<=1.5.21,>=1.4.0 (from nes-py>=8.1.4->gym-super-mario-bros==7.4.0)
  Downloading pyglet-1.5.21-py3-none-any.whl.metad

In [None]:
# Import core environment libraries
import gym
import gym_super_mario_bros
from gym.wrappers import FrameStack, GrayScaleObservation
from gym.spaces import Box
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT

# Import libraries for numerical computation and deep learning
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Import additional helper libraries
import random
from collections import deque
import matplotlib.pyplot as plt
import torchvision.transforms as T
import time

# Import libraries for interacting with Google Colab
from google.colab import files

In [None]:
# Define Wrappers
class SkipFrame(gym.Wrapper):
    """Environment wrapper that repeats every action for `skip` consecutive frames."""

    def __init__(self, env, skip):
        """Return only every `skip`-th frame.

        Args:
            env: Environment to wrap.
            skip: How many times each action is applied per call to `step`.
        """
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        """Repeat `action` up to `skip` times and sum the rewards.

        Args:
            action: Action forwarded unchanged to the wrapped environment.

        Returns:
            The `(obs, reward, done, truncated, info)` tuple of the last inner
            step, with `reward` replaced by the sum over all inner steps taken.
        """
        total_reward = 0.0
        for _ in range(self._skip):
            obs, reward, done, truncated, info = self.env.step(action)
            total_reward += reward
            # The episode is over on truncation as well as on termination;
            # stepping further would act on an environment that needs a reset.
            if done or truncated:
                break
        return obs, total_reward, done, truncated, info

class GrayScaleObservation(gym.ObservationWrapper):
    """Observation wrapper that converts colour frames to grayscale tensors.

    This class shadows the `GrayScaleObservation` name imported from
    `gym.wrappers` earlier in the notebook.

    NOTE(review): the declared observation space is a uint8 Box over the first
    two observation dimensions, while `observation` returns a float
    `torch.Tensor` that presumably keeps a leading channel dimension of size 1
    (torchvision's default `Grayscale` output) — confirm downstream wrappers
    rely only on the tensor, not on the declared space.
    """

    def __init__(self, env):
        """Wrap `env` and declare a 2-D uint8 observation space."""
        super().__init__(env)
        obs_shape = self.observation_space.shape[:2]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)
        # Build the transform once instead of on every observation.
        self._to_grayscale = T.Grayscale()

    def permute_orientation(self, observation):
        """Move the last axis to the front and return a float tensor copy.

        Args:
            observation: Array with three axes; axis 2 becomes axis 0.
        """
        observation = np.transpose(observation, (2, 0, 1))
        # `.copy()` materialises the transposed view before tensor conversion.
        observation = torch.tensor(observation.copy(), dtype=torch.float)
        return observation

    def observation(self, observation):
        """Return the grayscale version of `observation` as a float tensor."""
        observation = self.permute_orientation(observation)
        return self._to_grayscale(observation)

class ResizeObservation(gym.ObservationWrapper):
    """Observation wrapper that resizes frames and divides pixel values by 255."""

    def __init__(self, env, shape):
        """Wrap `env` and declare the resized observation space.

        Args:
            env: Environment to wrap.
            shape: Target size; an int `n` means `(n, n)`, otherwise any
                iterable that is converted to a tuple.
        """
        super().__init__(env)
        if isinstance(shape, int):
            self.shape = (shape, shape)
        else:
            self.shape = tuple(shape)

        # Keep any trailing dimensions of the wrapped space beyond the first two.
        obs_shape = self.shape + self.observation_space.shape[2:]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

        # Build the pipeline once instead of on every observation.
        # Normalize(0, 255) subtracts 0 and divides by 255.
        self._transforms = T.Compose(
            [T.Resize(self.shape, antialias=True), T.Normalize(0, 255)]
        )

    def observation(self, observation):
        """Resize and rescale `observation`, dropping a leading size-1 axis if present."""
        return self._transforms(observation).squeeze(0)

# Configure environment
#env = gym_super_mario_bros.make('SuperMarioBros-v0', apply_api_compatibility=True, render_mode="human")
#env = JoypadSpace(env, SIMPLE_MOVEMENT)

# Alternative environment configurations (disabled)
#env = gym_super_mario_bros.make('SuperMarioBros-v0', apply_api_compatibility=True, render_mode="none")
#env = gym_super_mario_bros.make('SuperMarioBrosRandomStages-v0', stages=['1-1'], apply_api_compatibility=True, render_mode="none")
env = gym_super_mario_bros.make('SuperMarioBrosRandomStages-v0', stages=['1-4'], apply_api_compatibility=True, render_mode="none")
env = JoypadSpace(env, SIMPLE_MOVEMENT)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Apply Wrappers to environment
env = SkipFrame(env, skip=4)
env = GrayScaleObservation(env)
env = ResizeObservation(env, shape=84)
# Compare versions numerically: a plain string comparison orders "0.9" after "0.26".
_gym_major_minor = tuple(int(part) for part in gym.__version__.split(".")[:2])
if _gym_major_minor < (0, 26):
    env = FrameStack(env, num_stack=4, new_step_api=True)
else:
    env = FrameStack(env, num_stack=4)

# Hyperparameters
state_space = env.observation_space.shape  # expected to be (4, 84, 84) after the wrappers above
action_space = env.action_space.n
learning_rate = 0.00015
gamma = 0.99  # discount factor

# Epsilon-greedy exploration schedule: multiplied by epsilon_decay per step, floored at epsilon_min
epsilon = 1.0
epsilon_min = 0.02
epsilon_decay = 0.99997
batch_size = 32
# NOTE(review): target_update and frame_skip are not referenced by the cells shown
# here (the training loop uses its own step interval for target syncs) — confirm
# whether they are still needed.
target_update = 20
replayBuffer_size = 300000
num_episodes = 14000
frame_skip = 1
step_count = 0
start_learning = 0  # learning and epsilon decay start once episode > start_learning
save_weights = 500  # checkpoint interval in episodes

# Replay Buffer
replayBuffer = deque(maxlen=replayBuffer_size)

# Training statistics (evaluation lists are declared but not filled in the cells shown here)
training_rewards = []
evaluation_rewards = []
moving_average_training = []
moving_average_evaluation = []
min_training_rewards = []
max_training_rewards = []
min_moving_average_training = []
max_moving_average_training = []
min_moving_average_evaluation = []
max_moving_average_evaluation = []

start_episode = 0
#evaluation_interval = 100  # Evaluate every 100 episodes
#evaluation_episodes = 10  # Number of episodes to evaluate

# Define Neural Network
class DQN(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers."""

    def __init__(self, input_shape, num_actions):
        """Build the layers for inputs of shape `input_shape`.

        Args:
            input_shape: Indexable `(channels, height, width)` of one sample.
            num_actions: Number of Q-values produced per sample.
        """
        super().__init__()
        self.conv1 = nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)

        def _extent_after_convs(extent):
            # Apply the unpadded-convolution size formula once per conv layer.
            for kernel, stride in ((8, 4), (4, 2), (3, 1)):
                extent = (extent - (kernel - 1) - 1) // stride + 1
            return extent

        feature_h = _extent_after_convs(input_shape[1])
        feature_w = _extent_after_convs(input_shape[2])
        flat_features = feature_w * feature_h * 64

        self.fc1 = nn.Linear(flat_features, 512)
        self.fc2 = nn.Linear(512, num_actions)

    def forward(self, x):
        """Return one row of Q-values per sample in the batch `x`."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = torch.relu(conv(x))
        # Collapse everything except the batch dimension.
        flat = x.view(x.size(0), -1)
        hidden = torch.relu(self.fc1(flat))
        return self.fc2(hidden)

# Build the policy (online) and target Q-networks on the selected device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

_network_input_shape = (4, 84, 84)  # stacked frames, height, width
policy_net = DQN(_network_input_shape, action_space).to(device)
target_net = DQN(_network_input_shape, action_space).to(device)

# The target network starts as an exact copy of the policy network and is
# switched to eval mode.
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Loss and optimizer; only the policy network's parameters are optimised.
# Alternative loss: nn.MSELoss()
loss_fn = nn.SmoothL1Loss()
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)

# Preprocess state
def preprocess_state(state):
    """Convert an observation into a float32 tensor with a leading batch axis.

    Args:
        state: Array-like observation accepted by `np.ascontiguousarray`.

    Returns:
        A float32 tensor on the module-level `device` with shape `(1, *state.shape)`.
    """
    # A contiguous copy removes negative strides, which torch.tensor rejects.
    contiguous = np.ascontiguousarray(state)
    batched = torch.tensor(contiguous, dtype=torch.float32).unsqueeze(0)
    return batched.to(device)

# Select action
def select_action(state, epsilon):
    """Choose an action epsilon-greedily.

    Args:
        state: Input passed to the module-level `policy_net` when acting greedily.
        epsilon: Probability threshold for taking a random action.

    Returns:
        A sample from `env.action_space` when exploring, otherwise the index
        of the largest value in `policy_net(state)`.
    """
    explore = random.random() < epsilon
    if explore:
        return env.action_space.sample()

    # Greedy branch: no gradients are needed for action selection.
    with torch.no_grad():
        q_values = policy_net(state)
        return q_values.argmax().item()


# Load pretrained networks if a checkpoint is available
import os

policy_net_weights_path = '/content/policy_net_weights.pth'
# NOTE(review): this path points at the policy checkpoint, not a target checkpoint.
# The target network is synchronised from the policy network below, so no separate
# target checkpoint is loaded — confirm that is the intent.
target_net_weights_path = '/content/policy_net_weights.pth'

if os.path.isfile(policy_net_weights_path):
    # map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
    policy_net.load_state_dict(torch.load(policy_net_weights_path, map_location=device))
    print(f"Loaded pretrained weights from {policy_net_weights_path}")
else:
    # A fresh runtime has no checkpoint; keep the randomly initialised weights.
    print(f"No checkpoint found at {policy_net_weights_path}; training from scratch")

# Keep the target network identical to the (possibly just loaded) policy network.
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)

import os

# Initialize log file: opening in write mode creates the file or truncates an
# existing one in a single step, so every run starts with just the CSV header.
file_path = "training_log_double_14.txt"

with open(file_path, 'w') as file:
    file.write("Episode,Total_Reward,Moving_Average,Episode_Length,Step_Count,Success\n")

# Overrides to enable only when resuming from pretrained weights:
# epsilon = 0.02
# step_count = 1379678
# Episode start 0 or else
# start_episode = 23000
# num_episodes = 50000

# Number of environment steps between hard copies of the policy weights into
# the target network.
target_sync_steps = 7500

# NOTE(review): each stored transition holds two float32 state tensors that live
# on `device`; with a large replay buffer this can exhaust GPU memory — consider
# storing compact CPU copies instead.

for episode in range(start_episode, num_episodes):
    # --- Training loop ---
    state, _ = env.reset()
    state = preprocess_state(state)
    done = False
    total_reward = 0
    # The next three values are only used by the disabled reward shaping below.
    frame_count = 0
    episode_length = 0
    x_pos_last = 40
    time_last = 400
    #standing_still_counter = 0

    while not done:
        action = select_action(state, epsilon)
        # Epsilon decays once per environment step after the warm-up episodes.
        if episode > start_learning:
            epsilon = max(epsilon * epsilon_decay, epsilon_min)

        next_state, reward, terminated, truncated, next_info = env.step(action)
        next_state = preprocess_state(next_state)
        done = terminated or truncated

        # Reward adjustment logic (disabled)
        #if next_info['time'] < time_last:
        #    reward += time_penalty
        #if next_info['x_pos'] == x_pos_last:
        #    reward += position_penalty
        #    standing_still_counter += 1
        #else:
        #    standing_still_counter = 0

        #if standing_still_counter >= 4:
        #    reward += -10

        #x_pos_last = next_info['x_pos']
        #time_last = next_info['time']
        total_reward += reward

        replayBuffer.append((state, action, reward, next_state, done))
        state = next_state

        if len(replayBuffer) > batch_size and episode > start_learning:
            batch = random.sample(replayBuffer, batch_size)
            states, actions, rewards, next_states, dones = zip(*batch)

            # Tensor conversion for training
            states = torch.cat(states)
            actions = torch.tensor(actions, dtype=torch.int64).to(device)
            # Pin the dtype so the targets match the float32 network outputs.
            rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
            next_states = torch.cat(next_states)
            dones = torch.tensor(dones, dtype=torch.float32).to(device)

            # Double DQN: the policy net picks the next action, the target net evaluates it
            current_q_values = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
            next_state_actions = policy_net(next_states).argmax(dim=1)
            next_q_values = target_net(next_states).gather(1, next_state_actions.unsqueeze(1)).squeeze(1)
            # (1 - dones) removes the bootstrap term for transitions that ended an episode
            expected_q_values = rewards + gamma * next_q_values * (1 - dones)

            loss = loss_fn(current_q_values, expected_q_values.detach())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        step_count += 1
        episode_length += 1

        # Sync the target network on a fixed step schedule. The check must run
        # on every step: evaluated only at episode end, it would fire only when
        # an episode happened to finish on an exact multiple of the interval.
        if step_count % target_sync_steps == 0:
            target_net.load_state_dict(policy_net.state_dict())

        if done:
            # Save training rewards and calculate moving averages over the last 100 episodes
            training_rewards.append(total_reward)
            moving_average_training.append(np.mean(training_rewards[-100:]))
            min_training_rewards.append(np.min(training_rewards[-100:]))
            max_training_rewards.append(np.max(training_rewards[-100:]))
            min_moving_average_training.append(np.min(moving_average_training[-100:]))
            max_moving_average_training.append(np.max(moving_average_training[-100:]))

            # Check if Mario completed the level
            if next_info['flag_get']:
                success = 1  # Mario successfully completed the level
            else:
                success = 0  # Mario did not complete the level

            # Output information about the episode
            print(f"Episode: {episode}, Total Reward: {total_reward}, Epsilon: {epsilon}")
            print(f"Replay Buffer Memory: {len(replayBuffer)}, Episode Length: {episode_length}, Step Count: {step_count}")
            print(f"Moving Average (Training): {moving_average_training[-1]}, Success: {success}")

            # Write episode data to log file
            with open(file_path, 'a') as file:
                file.write(f"{episode},{total_reward},{moving_average_training[-1]},{episode_length},{step_count},{success}\n")

    # --- Save model weights ---
    if episode % save_weights == 0:
        policy_weight_filename = f'policy_net_weights_{episode}.pth'
        target_weight_filename = f'target_net_weights_{episode}.pth'
        torch.save(policy_net.state_dict(), policy_weight_filename)
        torch.save(target_net.state_dict(), target_weight_filename)

# Persist the final network weights, then offer both files for download
_final_checkpoints = (
    (policy_net, 'policy_net_weights.pth'),
    (target_net, 'target_net_weights.pth'),
)
for _network, _checkpoint_name in _final_checkpoints:
    torch.save(_network.state_dict(), _checkpoint_name)
for _network, _checkpoint_name in _final_checkpoints:
    files.download(_checkpoint_name)

# Store the reward history and its moving average for later analysis
# (the evaluation series are not saved).
_reward_histories = (
    ('training_rewards.npy', training_rewards),
    ('moving_average_training.npy', moving_average_training),
)
for _history_name, _history in _reward_histories:
    np.save(_history_name, _history)
for _history_name, _history in _reward_histories:
    files.download(_history_name)

# One x-coordinate per recorded episode
episodes = range(len(moving_average_training))

plt.figure(figsize=(15, 5))

# Moving average of the training reward with its sliding minimum and maximum
plt.plot(episodes, moving_average_training, 'b-', label='Avg (Last 100 Episodes)', linewidth=2)
for _series, _line_style, _series_label in (
    (min_moving_average_training, 'r-', 'Min (Last 100 episodes)'),
    (max_moving_average_training, 'g-', 'Max (Last 100 episodes)'),
):
    plt.plot(episodes, _series, _line_style, linewidth=1.5, label=_series_label)

# Lightly shade the band between the minimum and maximum curves
plt.fill_between(
    episodes,
    min_moving_average_training,
    max_moving_average_training,
    facecolor='blue',
    alpha=0.07,
)

# Titles, axis labels, legend and grid
plt.title('Double DQN Moving Average Rewards (Training)')
plt.xlabel('Episodes')
plt.ylabel('Reward Value')
plt.legend(loc='upper left')
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.show()

# Release the environment once training and reporting are finished
env.close()