## Training and Testing

### 1. Importing Dependencies

In [None]:
import gym
import matplotlib.pyplot as plt
import os
import gym_super_mario_bros
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import RIGHT_ONLY, SIMPLE_MOVEMENT, COMPLEX_MOVEMENT
from gym.wrappers import GrayScaleObservation, ResizeObservation
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.monitor import Monitor

### 2. Setting up Environment for Training

In [None]:
class SkipFrame(gym.Wrapper):
    """Repeat every action for ``skip`` consecutive frames.

    The rewards of the repeated frames are summed, and the observation,
    ``done`` flag and ``info`` of the last executed frame are returned.

    Args:
        env: Environment to wrap. Its ``step`` must return the 4-tuple
            ``(obs, reward, done, info)``.
        skip: Number of frames each action is applied for; must be >= 1.

    Raises:
        ValueError: If ``skip`` is smaller than 1.
    """

    def __init__(self, env, skip):
        super().__init__(env)
        # With skip < 1 the loop in step() would never execute and there
        # would be no observation to return, so fail early and clearly.
        if skip < 1:
            raise ValueError("skip must be >= 1, got {!r}".format(skip))
        self._skip = skip

    def step(self, action):
        """Apply ``action`` up to ``skip`` times and accumulate the reward.

        Repetition stops as soon as the wrapped environment reports
        ``done`` so that a finished episode is never stepped again.

        Returns:
            ``(obs, total_reward, done, info)`` where ``total_reward`` is the
            sum over the executed frames and the other values come from the
            last executed frame.
        """
        total_reward = 0.0
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info

In [None]:
class CustomRewardAndDoneEnv(gym.Wrapper):
    """Reward-shaping wrapper that also ends the episode on goal or life loss.

    On top of the wrapped environment's reward, every step adds:

    * the distance gained beyond the furthest ``info['x_pos']`` seen so far
      in the current episode (never negative),
    * ``+500`` when ``info["flag_get"]`` is truthy,
    * ``-500`` when ``info["life"]`` is below 2.

    The last two cases also force ``done = True``. The resulting reward is
    divided by 10 before being returned.

    Attributes kept for inspection (none of them feeds back into the reward
    except ``max_x``): ``current_score``, ``current_x``, ``current_x_count``
    (number of consecutive steps without any change of ``x_pos``) and
    ``max_x``.
    """

    def __init__(self, env=None):
        super().__init__(env)
        self._reset_trackers()

    def _reset_trackers(self):
        """Put all per-episode bookkeeping back to its initial state."""
        self.current_score = 0
        self.current_x = 0
        self.current_x_count = 0
        # NOTE(review): progress is measured from 0, so the first step of an
        # episode is credited with whatever x_pos the episode starts at.
        self.max_x = 0

    def reset(self, **kwargs):
        """Clear the per-episode trackers and reset the wrapped environment."""
        self._reset_trackers()
        return self.env.reset(**kwargs)

    def step(self, action):
        """Step the wrapped environment and return the shaped transition.

        Returns:
            ``(state, shaped_reward / 10, done, info)``.
        """
        state, reward, done, info = self.env.step(action)
        x_pos = info['x_pos']

        # Progress bonus: only ground beyond the furthest point reached so
        # far in this episode is rewarded.
        reward += max(0, x_pos - self.max_x)

        # Count consecutive steps in which the horizontal position is stuck.
        if x_pos == self.current_x:
            self.current_x_count += 1
        else:
            self.current_x_count = 0

        if info["flag_get"]:
            reward += 500
            done = True
            print("GOAL")
        if info["life"] < 2:
            reward -= 500
            done = True

        self.current_score = info["score"]
        # Fold in the position of THIS step. Using the previous step's
        # position here made max_x lag by one step, so the same stretch of
        # ground earned the progress bonus on two consecutive steps.
        self.max_x = max(self.max_x, x_pos)
        self.current_x = x_pos
        return state, reward / 10., done, info

In [None]:
# Hand-picked action set, one list of button names per action.
# Not referenced elsewhere in this section; presumably meant to be assigned
# to MOVEMENT in place of one of the imported movement sets.
CUSTOM_MOVEMENT = [
    ['left', 'A'],
    ['right', 'B'],
    ['right', 'A', 'B'],
]

In [None]:
# Action set handed to JoypadSpace for training.
MOVEMENT = SIMPLE_MOVEMENT

# --- Single-environment pipeline (wrappers are applied innermost first) ---
# Base game on level 1-1, restricted to the chosen action set.
env = JoypadSpace(gym_super_mario_bros.make('SuperMarioBros-1-1-v0'), MOVEMENT)
# Optional custom reward / termination wrapper (currently disabled):
# env = CustomRewardAndDoneEnv(env)
# Grayscale frames.
env = GrayScaleObservation(env, keep_dim=True)
# Optional resize to 84 (currently disabled):
# env = ResizeObservation(env, shape=84)
# Repeat each action for 4 frames, then wrap in Monitor for the training logs.
env = Monitor(SkipFrame(env, skip=4))

# --- Vectorised pipeline ---
# One-environment DummyVecEnv with the last 4 observations stacked along the
# last axis. The lambda reads `env` while it still names the Monitor-wrapped
# environment, because the DummyVecEnv is built before `env` is rebound.
env = VecFrameStack(DummyVecEnv([lambda: env]), 4, channels_order='last')

### 3. Training the Agent

In [None]:
# Training and Logging Callback Function Class
class TrainAndLoggingCallback(BaseCallback):
    """Save a checkpoint of the model every ``check_freq`` callback calls.

    Args:
        check_freq: Interval, in callback calls (``self.n_calls``), between
            two checkpoints.
        save_path: Directory the checkpoints are written to. It is created
            on demand. ``None`` disables checkpointing altogether.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the checkpoint directory when one was configured."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Write ``model_<n_calls>`` into ``save_path`` on every interval.

        Returns:
            Always ``True``, so this callback never asks for training to stop.
        """
        # Mirror the None check of _init_callback: without it a callback
        # created with save_path=None crashed in os.path.join on the first
        # checkpoint instead of simply not saving.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [None]:
# Output locations: checkpoints go to CHECKPOINT_DIR, training logs to a
# 'logs/' folder inside it.
CHECKPOINT_DIR = './train/'
LOG_DIR = CHECKPOINT_DIR + 'logs/'

In [None]:
# Checkpointing callback: writes the model into CHECKPOINT_DIR once every
# 100,000 callback calls.
callback = TrainAndLoggingCallback(
    check_freq=100_000,
    save_path=CHECKPOINT_DIR,
)

In [None]:
# PPO agent with the CNN policy, trained on `env`; logs go to LOG_DIR.
model = PPO(
    policy='CnnPolicy',
    env=env,
    # Optimisation / rollout settings
    learning_rate=1e-4,
    n_steps=512,
    ent_coef=0.01,
    # Return / advantage estimation settings
    gamma=0.9,
    gae_lambda=1.0,
    # Reporting and reproducibility
    tensorboard_log=LOG_DIR,
    verbose=1,
    seed=11,
)

In [None]:
# Train for one million timesteps; `callback` takes care of the checkpoints.
model.learn(callback=callback, total_timesteps=1_000_000)

In [None]:
# Remove the trained agent from the namespace; the testing section binds
# `model` again by loading a saved checkpoint from disk.
del model

### 4. Testing the Model

In [None]:
# The evaluation pipeline has to mirror the training pipeline: loading a
# model with `env=` checks that the environment's observation and action
# spaces match the ones the model was trained with. The previous settings
# (RIGHT_ONLY actions, 84-pixel resize, no frame skip) did not match the
# training section above (SIMPLE_MOVEMENT, no resize, frame skip of 4).
# 1. Create the base environment
env = gym_super_mario_bros.make('SuperMarioBros-v0')
# 2. Simplify the controls (same action set as used for training)
env = JoypadSpace(env, SIMPLE_MOVEMENT)
# 3. Custom Reward
# env = CustomRewardAndDoneEnv(env)
# 4. Grayscale
env = GrayScaleObservation(env, keep_dim=True)
# 5. Resize (disabled, as in training; enable it in BOTH sections or neither)
# env = ResizeObservation(env, shape=84)
# 6. Frame Skip (same value as in training so stacked frames are spaced alike)
env = SkipFrame(env, skip=4)
# 7. Wrap inside the Dummy Environment
env = DummyVecEnv([lambda: env])
# 8. Stack the frames
env = VecFrameStack(env, 4, channels_order='last')

In [None]:
# Restore a saved agent and attach it to the evaluation environment.
# NOTE(review): 'model_name' looks like a placeholder for one of the
# checkpoint files written during training; confirm before running.
model = PPO.load(
    './train/model_name',
    env=env,
    clip_range=0.2,
)

In [None]:
# Test Model: play `episodes` episodes with rendering and print each
# episode's accumulated reward.
episodes = 15
try:
    for episode in range(1, episodes + 1):
        state = env.reset()
        done = False
        score = 0.0

        while not done:
            env.render()
            action, _ = model.predict(state)
            # `env` is a vectorised environment holding a single game, so
            # step() returns one-element arrays; take element 0 so the score
            # is a plain float instead of printing as e.g. "[123.]".
            state, rewards, dones, info = env.step(action)
            score += float(rewards[0])
            done = bool(dones[0])
        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Release the emulator/render window even if the loop is interrupted
    # or raises part-way through.
    env.close()

### 5. Opening Tensorboard

In [None]:
# Name of the sub-directory of LOG_DIR to open in TensorBoard.
# NOTE(review): 'log_name' looks like a placeholder — presumably it has to be
# replaced with the actual run folder created during training; confirm.
LOG_NAME = 'log_name'
# Full log path passed to the tensorboard command below.
TB_LOG = os.path.join(LOG_DIR, LOG_NAME)

!tensorboard --logdir={TB_LOG}