# **Imports and Installs**

For usage in Google Colab:

In [None]:
!pip install imageio-ffmpeg
!pip install gym-super-mario-bros
!pip install tensorflow_addons

In [None]:
# Mount Google Drive at /content/drive; the custom-module folder added to
# sys.path further below lives under this mount point.
from google.colab import drive
# force_remount=True requests a fresh mount even if the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Imports:

In [None]:
# Standard library
import csv
import sys

# Numerics and deep learning
import numpy as np

import tensorflow as tf
import tensorflow_addons as tfa

# Plotting (imported once; use the ggplot style for every figure)
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Super Mario Bros environment and its predefined action sets
import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY # 5 actions
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT # 7 actions
from gym_super_mario_bros.actions import COMPLEX_MOVEMENT # 12 actions

# Make the project's custom modules importable from the mounted Drive
sys.path.append('/content/drive/MyDrive/DRL_Group44/Modules')

# **Environment Creation and Preprocessing**

In [None]:
# On Google Colab the editor may flag this import with a `reportMissingImports`
# warning. The import nevertheless works as intended, and no warning appears
# when the notebook is run locally.
from helper_functions import make_env, preprocess_obs

# **Prioritized Experience Replay Buffer (PERB)**

The following PERB implementation is based on Schaul et al. (2016) https://arxiv.org/pdf/1511.05952.pdf. It uses a sum tree and was adapted from this tutorial: https://adventuresinmachinelearning.com/prioritised-experience-replay/

In [None]:
from perb import PERB

# **Policy, SoftQN and Training Classes**




SoftQN implementation according to Haarnoja et al. (2017) https://arxiv.org/abs/1702.08165

In [None]:
from softqn import Policy, SoftQN, train_SoftQN

# **Model Training**


The warnings emitted while running the training can be ignored: we deliberately train on the original, unaltered SMB world (the other versions are simplified representations of it, see: https://github.com/Kautenja/gym-super-mario-bros#environments).

In [None]:
# Train a SoftQN agent from scratch on level 1-1.
# Every value below is forwarded to make_env / Policy / PERB / train_SoftQN.

# Set hyperparameters (learning_rate is defined exactly once, here)
resize_env = (84,84)
num_epochs = 2000001
env_steps_per_epoch = 3
batch_size = 64
learning_rate = 0.00025
discount_factor = 0.9
entropy_factor = 1 / 600
tau = 1/3750
# Alternative value for `path`:
#path = "drive/MyDrive/DRL_Group44/SoftQ/Transfer_Learning/Pre_Trained_Differential"
# NOTE(review): "lol" looks like a scratch directory — confirm before a real run.
path = "drive/MyDrive/DRL_Group44/SoftQ/lol"
video_steps=1000
saving_epoch=5000
plotting_epoch=200
transfer = False


# Initialize the optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Create environment
num_skip = 4
num_stack = 4
reward_scale_factor = 600
env = make_env(level="SuperMarioBros-1-1-v0", # Select -<world>-<stage>-v<version>
               movement_type=RIGHT_ONLY, 
               num_skip=num_skip, 
               num_stack=num_stack, 
               reward_scale_factor=reward_scale_factor) 
# Shape of one preprocessed observation; used to size the buffer and build the networks
img_dim = tf.shape(preprocess_obs(obs=env.reset(), resize_env=resize_env))

# Initialize Policy and buffer
# NOTE(review): heat_param has the same value as entropy_factor — confirm whether
# the two should share one constant.
policy = Policy(heat_param=1/600) # heat_param still needs to be tuned
# NOTE(review): buffer_size equals batch_size here, whereas the transfer-learning
# cell uses 32000 — confirm that such a small buffer is intended.
buffer_size = 64
buffer = PERB(size=buffer_size, img_dim=img_dim)


# Initialize model network and build it dynamically (according to environment's observation space)
model_network = SoftQN(env)
model_network.build(input_shape=(None, img_dim[0], img_dim[1], img_dim[2]))

# Initialize target network and build it dynamically (according to environment's observation space)
target_network = SoftQN(env)
target_network.build(input_shape=(None, img_dim[0], img_dim[1], img_dim[2]))

# Copy weights from model to target network so both start identical
target_network.set_weights(model_network.get_weights())


# Train the model network and save the final weights
train_SoftQN(env=env, 
             model_network=model_network, 
             target_network=target_network, 
             policy=policy, 
             buffer=buffer, 
             optimizer=optimizer, 
             resize_env=resize_env,
             num_epochs=num_epochs, 
             env_steps_per_epoch=env_steps_per_epoch,
             batch_size=batch_size, 
             discount_factor=discount_factor, 
             tau=tau, 
             path=path,
             video_steps=video_steps,
             saving_epoch=saving_epoch,
             plotting_epoch=plotting_epoch,
             entropy_factor=entropy_factor,
             transfer=transfer)

# **Transfer Learning**

The warnings emitted while running the training can be ignored: we deliberately train on the original, unaltered SMB world (the other versions are simplified representations of it, see: https://github.com/Kautenja/gym-super-mario-bros#environments).

In [None]:
# Transfer learning: continue training on level 1-2, starting from weights
# that were saved by an earlier run.

# ---- Hyperparameters ----
resize_env = (84, 84)
num_epochs = 2000001
env_steps_per_epoch = 3
batch_size = 64
learning_rate = 0.00025
discount_factor = 0.9
entropy_factor = 1 / 600
tau = 1 / 3750
transfer = True

# ---- Bookkeeping (all forwarded to train_SoftQN) ----
path = "drive/MyDrive/DRL_Group44/SoftQ/Transfer_Learning/Pre_Trained_00025_all_earlier_10"
video_steps = 1000
saving_epoch = 5000
plotting_epoch = 200

# ---- Optimizer ----
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# ---- Environment ----
num_skip = 4
num_stack = 4
reward_scale_factor = 600
env = make_env(
    level="SuperMarioBros-1-2-v0",  # Select -<world>-<stage>-v<version>
    movement_type=RIGHT_ONLY,
    num_skip=num_skip,
    num_stack=num_stack,
    reward_scale_factor=reward_scale_factor,
)
# Shape of a single preprocessed observation
img_dim = tf.shape(preprocess_obs(obs=env.reset(), resize_env=resize_env))

# ---- Policy and replay buffer ----
policy = Policy(heat_param=1/600)
buffer_size = 32000
buffer = PERB(size=buffer_size, img_dim=img_dim)

# ---- Networks ----
# Model network: built to match the observation shape, then loaded with the
# pretrained weights.
model_network = SoftQN(env)
model_network.build(input_shape=(None, img_dim[0], img_dim[1], img_dim[2]))
model_network.load_weights("drive/MyDrive/DRL_Group44/SoftQ/Transfer_Learning/Pre_Trained_00025_all_earlier_10/SuperMarioBrosDQNWeights2_epoch_185000")

# Target network: same architecture, initialised as an exact copy of the model network.
target_network = SoftQN(env)
target_network.build(input_shape=(None, img_dim[0], img_dim[1], img_dim[2]))
target_network.set_weights(model_network.get_weights())

# Optional experiment (disabled): make only the last layers trainable.
# for layer in model_network.layers[:-5]:
#     layer.trainable=False
# model_network.compile()
#
# for layer in target_network.layers[:-5]:
#     layer.trainable=False
# target_network.compile()

# ---- Training ----
# Train the model network and save the final weights
train_SoftQN(
    # components
    env=env,
    model_network=model_network,
    target_network=target_network,
    policy=policy,
    buffer=buffer,
    optimizer=optimizer,
    # hyperparameters
    resize_env=resize_env,
    num_epochs=num_epochs,
    env_steps_per_epoch=env_steps_per_epoch,
    batch_size=batch_size,
    discount_factor=discount_factor,
    entropy_factor=entropy_factor,
    tau=tau,
    transfer=transfer,
    # bookkeeping
    path=path,
    video_steps=video_steps,
    saving_epoch=saving_epoch,
    plotting_epoch=plotting_epoch,
)

# **Evaluation**



In [None]:
from evaluation import evaluation

In [None]:
import os

# Directory holding the CSV logs to compare (placeholder — set before running).
# Joined with os.path.join so it works with or without a trailing separator.
path = "PathToData"

# One CSV log per training run; presumably paired index-wise with `labels`
# by `evaluation` — verify against that function.
data_paths = [
    os.path.join(path, file_name)
    for file_name in (
        "losses_rews_wins_SOFTQ_1-2_scratch.csv",
        "losses_rews_wins_SOFTQ_1-2_all_earlier_50.csv",
        "losses_rews_wins_SOFTQ_1-2_all_earlier_30.csv",
        "losses_rews_wins_SOFTQ_1-2_all_earlier_10.csv",
    )
]
labels = ["Scratch", "All_50", "All_30", "All_10"]

# Forwarded to `evaluation` as its `save` argument
save = True

In [None]:
# Run the evaluation in "reward" mode.
# avg_window is presumably a moving-average window size — confirm in `evaluation`.
mode = "reward"
avg_window = 500
save_path = "PathToSave"
evaluation(
    data_paths=data_paths,
    labels=labels,
    mode=mode,
    avg_window=avg_window,
    save=save,
    save_path=save_path,
)

In [None]:
# Run the evaluation in "loss" mode, with a larger window than the reward/win runs.
# avg_window is presumably a moving-average window size — confirm in `evaluation`.
mode = "loss"
avg_window = 15000
save_path = "PathToSave"
evaluation(
    data_paths=data_paths,
    labels=labels,
    mode=mode,
    avg_window=avg_window,
    save=save,
    save_path=save_path,
)

In [None]:
# Run the evaluation in "win" mode.
# avg_window is presumably a moving-average window size — confirm in `evaluation`.
mode = "win"
avg_window = 500
save_path = "PathToSave"
evaluation(
    data_paths=data_paths,
    labels=labels,
    mode=mode,
    avg_window=avg_window,
    save=save,
    save_path=save_path,
)