# Reinforcement Learning Project: Super Mario Bros

**Disclaimer:** most of the functionality in this notebook is based on the code in this GitHub repository: https://github.com/Sourish07/Super-Mario-Bros-RL

things I added myself (assisted by Gemini 1.5): 

- transferring the code in the repository into a single notebook
- saving progress data
- progress visualization graphs


### importing dependencies

In [10]:
import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY
from nes_py.wrappers import JoypadSpace
import os
import torch
from torch import nn
import pandas as pd
import numpy as np
from gym import Wrapper
from gym.wrappers import GrayScaleObservation, ResizeObservation, FrameStack
import time
import datetime


### initialising utilities to monitor progress


In [11]:
def get_current_date_time_string():
    """Return the current local time formatted as ``YYYY-MM-DD-HH_MM_SS``.

    The result contains only digits, dashes and underscores.
    """
    now = datetime.datetime.now()
    return f"{now:%Y-%m-%d-%H_%M_%S}"


class Timer():
    """Small stopwatch that can also accumulate a list of measured durations.

    The reference point is set when the timer is constructed and reset by
    every call to `start()`, so `print()`, `get()` and `store()` are safe to
    call even if `start()` was never called explicitly.

    Attributes:
        times: durations (seconds) recorded by `store()`.
        t: reference point of the current measurement, taken from
           `time.perf_counter()`; only differences to it are meaningful.
    """

    def __init__(self):
        self.times = []
        # Start counting immediately so the other methods never hit a missing
        # attribute when start() has not been called yet.
        self.t = time.perf_counter()

    def start(self):
        """(Re)start the measurement from now."""
        # perf_counter is monotonic, so elapsed values cannot go negative when
        # the system clock is adjusted.
        self.t = time.perf_counter()

    def print(self, msg=''):
        """Print the seconds elapsed since the last start, prefixed by `msg`."""
        print(f"Time taken: {msg}", self.get())

    def get(self):
        """Return the seconds elapsed since the last start."""
        return time.perf_counter() - self.t

    def store(self):
        """Append the seconds elapsed since the last start to `times`."""
        self.times.append(self.get())

    def average(self):
        """Return the mean of the stored durations, or 0.0 if none are stored."""
        if not self.times:
            return 0.0
        return sum(self.times) / len(self.times)

### basic preparation

In [12]:
# Each run writes its checkpoints into its own timestamped folder under "models".
model_path = os.path.join("models", get_current_date_time_string())
os.makedirs(model_path, exist_ok=True)

# Report whether a CUDA device is visible to PyTorch.
if not torch.cuda.is_available():
    print("CUDA is not available")
else:
    print("Using CUDA device:", torch.cuda.get_device_name(0))


Using CUDA device: NVIDIA GeForce RTX 2070 SUPER


In [13]:
class SkipFrame(Wrapper):
    """Environment wrapper that repeats one action for several frames.

    Each call to `step` forwards the same action to the wrapped environment
    up to `skip` times, sums the rewards, and returns the observation and
    info of the last frame that was actually executed.
    """

    def __init__(self, env, skip):
        """
        Args:
            env: environment to wrap.
            skip: number of frames each action is applied to; must be >= 1.

        Raises:
            ValueError: if `skip` is smaller than 1.
        """
        # With skip < 1 the loop in step() would never run and its return
        # statement would reference unbound locals; fail early instead.
        if skip < 1:
            raise ValueError(f"skip must be >= 1, got {skip}")
        super().__init__(env)
        self.skip = skip

    def step(self, action):
        """Apply `action` for up to `self.skip` frames.

        Returns:
            (next_state, total_reward, done, trunc, info), where
            `total_reward` is the sum of rewards over the executed frames and
            the remaining values come from the last executed frame.
        """
        total_reward = 0.0
        for _ in range(self.skip):
            next_state, reward, done, trunc, info = self.env.step(action)
            total_reward += reward
            # Stop as soon as the episode is over for either reason; stepping
            # an environment that has ended is not valid until it is reset.
            if done or trunc:
                break
        return next_state, total_reward, done, trunc, info
    

def apply_wrappers(env):
    """Wrap `env` with the preprocessing stack used for training.

    Wrappers are applied in this order: frame skipping (4 frames per action),
    resizing to 84x84 (from 240x256), grayscale conversion, and stacking of
    the 4 most recent frames.

    Returns:
        The wrapped environment.
    """
    skipped = SkipFrame(env, skip=4)
    resized = ResizeObservation(skipped, shape=84)
    gray = GrayScaleObservation(resized)
    # May need to change lz4_compress to False if issues arise
    return FrameStack(gray, num_stack=4, lz4_compress=True)

### setting up environment

In [14]:
# Run configuration. These globals are read by the cells below.
ENV_NAME = 'SuperMarioBros-1-1-v0'  # environment id passed to gym_super_mario_bros.make
SHOULD_TRAIN = True  # True: store transitions, learn and save checkpoints; False: load a saved model instead
DISPLAY = True  # NOTE(review): not referenced anywhere in the visible notebook - confirm whether still needed
CKPT_SAVE_INTERVAL = 1000  # a checkpoint is written every this many episodes
NUM_OF_EPISODES = 15000  # number of episodes the main loop runs

# Create the base environment and restrict the controller to the RIGHT_ONLY action set.
env = gym_super_mario_bros.make(ENV_NAME,  apply_api_compatibility=True)
env = JoypadSpace(env, RIGHT_ONLY)

# Add frame skipping, resizing, grayscale conversion and frame stacking.
env = apply_wrappers(env)

### setting up the DQNN to train the model

In [15]:
class AgentNN(nn.Module):
    """Convolutional Q-network: maps a stack of frames to one value per action.

    Args:
        input_shape: shape of a single observation as (channels, height, width).
        n_actions: number of outputs of the final linear layer.
        freeze: if True, gradients are disabled for all parameters.

    The module moves itself to CUDA when available, otherwise to the CPU; the
    chosen device name is stored as a string in `self.device`.
    """

    def __init__(self, input_shape, n_actions, freeze=False):
        super().__init__()
        in_channels = input_shape[0]

        # Convolutional layers
        self.conv_layers = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        )

        # The width of the first linear layer depends on the input resolution,
        # so it is measured with a dummy forward pass.
        n_features = self._get_conv_out(input_shape)

        # Full network: convolutional feature extractor followed by linear layers
        self.network = nn.Sequential(
            self.conv_layers,
            nn.Flatten(),
            nn.Linear(n_features, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions)
        )

        if freeze:
            self._freeze()

        if torch.cuda.is_available():
            self.device = 'cuda'
        else:
            self.device = 'cpu'
        self.to(self.device)

    def forward(self, x):
        """Run a batch of observations through the full network."""
        return self.network(x)

    def _get_conv_out(self, shape):
        """Return the number of elements the conv stack emits for one input of `shape`."""
        dummy = torch.zeros(1, *shape)
        return int(self.conv_layers(dummy).numel())

    def _freeze(self):
        """Disable gradient tracking for every parameter of the network."""
        for param in self.network.parameters():
            param.requires_grad_(False)
    

### setting up the agent that traverses the environment

In [16]:
import torch
import numpy as np
from agent_nn import AgentNN

from tensordict import TensorDict
from torchrl.data import TensorDictReplayBuffer, LazyMemmapStorage

class Agent:
    """Epsilon-greedy Q-learning agent with an online and a target network.

    The agent stores transitions in a replay buffer, samples mini-batches
    from it in `learn`, and periodically copies the online network weights
    into the target network.
    """

    def __init__(self, 
                 input_dims, 
                 num_actions, 
                 lr=0.00025, 
                 gamma=0.9, 
                 epsilon=1.0, 
                 eps_decay=0.99999975, 
                 eps_min=0.1, 
                 replay_buffer_capacity=100_000, 
                 batch_size=32, 
                 sync_network_rate=10000):
        """
        Args:
            input_dims: observation shape handed to `AgentNN`.
            num_actions: number of discrete actions.
            lr: learning rate of the Adam optimizer.
            gamma: discount factor applied to the bootstrapped target value.
            epsilon: initial probability of choosing a random action.
            eps_decay: factor `epsilon` is multiplied with after each learn step.
            eps_min: lower bound for `epsilon`.
            replay_buffer_capacity: capacity passed to the replay buffer storage.
            batch_size: number of transitions sampled per learn step.
            sync_network_rate: number of learn steps between target network syncs.
        """
        self.num_actions = num_actions
        self.learn_step_counter = 0

        # Hyperparameters
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_decay = eps_decay
        self.eps_min = eps_min
        self.batch_size = batch_size
        self.sync_network_rate = sync_network_rate

        # Networks
        self.online_network = AgentNN(input_dims, num_actions)
        self.target_network = AgentNN(input_dims, num_actions, freeze=True)

        # Optimizer and loss
        self.optimizer = torch.optim.Adam(self.online_network.parameters(), lr=self.lr)
        self.loss = torch.nn.MSELoss()
        # self.loss = torch.nn.SmoothL1Loss() # Feel free to try this loss function instead!

        # Replay buffer
        storage = LazyMemmapStorage(replay_buffer_capacity)
        self.replay_buffer = TensorDictReplayBuffer(storage=storage)

    def choose_action(self, observation):
        """Return an action index for `observation` using an epsilon-greedy policy.

        With probability `self.epsilon` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value.
        """
        if np.random.random() < self.epsilon:
            return np.random.randint(self.num_actions)
        # Passing in a list of numpy arrays is slower than creating a tensor from a numpy array
        # Hence the `np.array(observation)` instead of `observation`
        # observation is a LIST of numpy arrays because of the LazyFrame wrapper
        # Unsqueeze adds a dimension to the tensor, which represents the batch dimension
        observation = torch.tensor(np.array(observation), dtype=torch.float32) \
                        .unsqueeze(0) \
                        .to(self.online_network.device)
        # Action selection is pure inference, so no autograd graph is needed.
        with torch.no_grad():
            # Grabbing the index of the action that's associated with the highest Q-value
            return self.online_network(observation).argmax().item()
    
    def decay_epsilon(self):
        """Multiply `epsilon` by `eps_decay`, never going below `eps_min`."""
        self.epsilon = max(self.epsilon * self.eps_decay, self.eps_min)

    def store_in_memory(self, state, action, reward, next_state, done):
        """Add one transition to the replay buffer."""
        self.replay_buffer.add(TensorDict({
                                            "state": torch.tensor(np.array(state), dtype=torch.float32), 
                                            "action": torch.tensor(action),
                                            "reward": torch.tensor(reward), 
                                            "next_state": torch.tensor(np.array(next_state), dtype=torch.float32), 
                                            "done": torch.tensor(done)
                                          }, batch_size=[]))
        
    def sync_networks(self):
        """Copy the online weights into the target network every `sync_network_rate` learn steps."""
        if self.learn_step_counter % self.sync_network_rate == 0 and self.learn_step_counter > 0:
            self.target_network.load_state_dict(self.online_network.state_dict())

    def save_model(self, path):
        """Save the online network's state dict to `path`."""
        torch.save(self.online_network.state_dict(), path)

    def load_model(self, path):
        """Load a state dict from `path` into both the online and the target network."""
        # Read the file once and map the tensors onto the device the networks
        # live on, so a checkpoint written on a GPU can be loaded on a CPU host.
        state_dict = torch.load(path, map_location=self.online_network.device)
        self.online_network.load_state_dict(state_dict)
        self.target_network.load_state_dict(state_dict)

    def learn(self):
        """Run one optimisation step on a mini-batch sampled from the replay buffer.

        Does nothing until the buffer holds at least `batch_size` transitions.
        After the step, `learn_step_counter` is incremented and `epsilon` decays.
        """
        if len(self.replay_buffer) < self.batch_size:
            return
        
        self.sync_networks()
        
        self.optimizer.zero_grad()

        samples = self.replay_buffer.sample(self.batch_size).to(self.online_network.device)

        keys = ("state", "action", "reward", "next_state", "done")

        states, actions, rewards, next_states, dones = [samples[key] for key in keys]

        predicted_q_values = self.online_network(states) # Shape is (batch_size, n_actions)
        # Keep only the Q-value of the action that was actually taken in each transition
        predicted_q_values = predicted_q_values[np.arange(self.batch_size), actions.squeeze()]

        # The target is a constant with respect to the optimisation, so it is
        # computed without gradient tracking.
        with torch.no_grad():
            # Max returns two tensors, the first one is the maximum value, the second one is the index of the maximum value
            target_q_values = self.target_network(next_states).max(dim=1)[0]
            # The rewards of any future states don't matter if the current state is a terminal state
            # If done is true, then 1 - done is 0, so the part after the plus sign (representing the future rewards) is 0
            target_q_values = rewards + self.gamma * target_q_values * (1 - dones.float())

        loss = self.loss(predicted_q_values, target_q_values)
        loss.backward()
        self.optimizer.step()

        self.learn_step_counter += 1
        self.decay_epsilon()


        

### running the agent to train the model

In [17]:
# setting up a list for performance data:
# One entry is appended to each of these lists at the end of every episode
# by the training loop below.

episodes = []  # episode index
total_reward_list = []  # summed reward of the episode
epsilon = []  # agent.epsilon at the end of the episode
buffer_size = []  # number of transitions in the replay buffer
step_count = []  # agent.learn_step_counter at the end of the episode

# setting up the agent
agent = Agent(input_dims=env.observation_space.shape, num_actions=env.action_space.n)

# Evaluation mode: load a saved checkpoint and use a fixed exploration rate.
# folder_name / ckpt_name are empty placeholders and must be filled in with an
# existing checkpoint under "models" before running with SHOULD_TRAIN = False.
if not SHOULD_TRAIN:
    folder_name = ""
    ckpt_name = ""
    agent.load_model(os.path.join("models", folder_name, ckpt_name))
    agent.epsilon = 0.2
    agent.eps_min = 0.0
    agent.eps_decay = 0.0

# NOTE(review): the values returned by this single step are not read anywhere
# else in the visible notebook (the training loop resets the environment
# first) - presumably a smoke test of the environment; confirm.
env.reset()
next_state, reward, done, trunc, info = env.step(action=0)

In [18]:
# running the actual agent
# Each iteration plays one full episode, optionally trains on every step,
# records the per-episode metrics and periodically saves a checkpoint.
for i in range(NUM_OF_EPISODES):    
    print("Episode:", i)
    done = False
    truncated = False
    state, _ = env.reset()
    total_reward = 0
    # An episode ends when the environment reports termination OR truncation;
    # stepping further without a reset is not valid in either case.
    while not (done or truncated):
        a = agent.choose_action(state)
        new_state, reward, done, truncated, info  = env.step(a)
        total_reward += reward

        if SHOULD_TRAIN:
            # Only `done` (true termination) is stored, so that truncated
            # transitions still bootstrap from the next state in learn().
            agent.store_in_memory(state, a, reward, new_state, done)
            agent.learn()

        state = new_state

    print("Total reward:", total_reward, "Epsilon:", agent.epsilon, "Size of replay buffer:", len(agent.replay_buffer), "Learn step counter:", agent.learn_step_counter)
    episodes.append(i)
    total_reward_list.append(total_reward)  
    epsilon.append(agent.epsilon)
    buffer_size.append(len(agent.replay_buffer))
    step_count.append(agent.learn_step_counter)
    if SHOULD_TRAIN and (i + 1) % CKPT_SAVE_INTERVAL == 0:
        agent.save_model(os.path.join(model_path, "model_" + str(i + 1) + "_iter.pt"))

    print("average reward:", sum(total_reward_list)/len(total_reward_list))
    
env.close()


Episode: 0
Total reward: 626.0 Epsilon: 0.9999555009845417 Size of replay buffer: 209 Learn step counter: 178
average reward: 626.0
Episode: 1
Total reward: 637.0 Epsilon: 0.999917503392708 Size of replay buffer: 361 Learn step counter: 330
average reward: 631.5
Episode: 2
Total reward: 795.0 Epsilon: 0.9998565102776767 Size of replay buffer: 605 Learn step counter: 574
average reward: 686.0
Episode: 3
Total reward: 618.0 Epsilon: 0.9998337637979636 Size of replay buffer: 696 Learn step counter: 665
average reward: 669.0
Episode: 4
Total reward: 1212.0 Epsilon: 0.9996230710084099 Size of replay buffer: 1539 Learn step counter: 1508
average reward: 777.6
Episode: 5
Total reward: 233.0 Epsilon: 0.9996130748264299 Size of replay buffer: 1579 Learn step counter: 1548
average reward: 686.8333333333334
Episode: 6
Total reward: 237.0 Epsilon: 0.9996045781503414 Size of replay buffer: 1613 Learn step counter: 1582
average reward: 622.5714285714286
Episode: 7
Total reward: 582.0 Epsilon: 0.9995

In [19]:
# for execution:
# metrics = pd.DataFrame({
#     "Episodes": episodes,
#     "Total Reward": total_reward_list,
#     "Epsilon": epsilon,
#     "Buffer Size": buffer_size,
#     "Learn Step Counter": step_count
# })

In [9]:
# for review
# Load the metrics of an earlier training run from disk instead of rebuilding
# them from the in-memory lists (see the commented-out cell above).
metrics = pd.read_csv("metrics.csv")

In [38]:
# Write to the same file name that the other cells read ("metrics.csv") and
# omit the row index so repeated save/load cycles do not add an extra column.
metrics.to_csv("metrics.csv", index=False)

### visualization of progress

In [27]:
import plotly.graph_objs as go
import plotly.subplots as sp


# Load the recorded per-episode metrics; the plots use the "Total Reward"
# and "Epsilon" columns.
metrics = pd.read_csv("metrics.csv")
data_series = metrics["Total Reward"]
# Calculate moving averages
window_size = 100
moving_average_100 = data_series.rolling(window=window_size).mean()

window_size = 1000
moving_average_1000 = data_series.rolling(window=window_size).mean()

# Fit a trendline to the absolute values
x = np.arange(len(data_series))
coefficients = np.polyfit(x, data_series, 1)  # Linear fit (degree 1)
trendline = np.poly1d(coefficients)

# Create subplots
fig = sp.make_subplots(rows=4, cols=1, subplot_titles=("Moving Average, rolling window = 100", 
                                                       "Moving Average, rolling window = 1000", 
                                                       "every reward with trendline",
                                                       "Epsilon"))

# Add traces for the first moving average
fig.add_trace(go.Scatter(x=moving_average_100.index, y=moving_average_100, mode='lines', name='Moving Average (100)', line=dict(color='blue')), row=1, col=1)

# Add traces for the second moving average
fig.add_trace(go.Scatter(x=moving_average_1000.index, y=moving_average_1000, mode='lines', name='Moving Average (1000)', line=dict(color='blue')), row=2, col=1)

# Add traces for the absolute values and the trendline
fig.add_trace(go.Scatter(x=x, y=data_series, mode='markers', name='Data Points', marker=dict(color='blue', size=3)), row=3, col=1)
fig.add_trace(go.Scatter(x=x, y=trendline(x), mode='lines', name='Trendline', line=dict(color='red')), row=3, col=1)

# Add the exploration rate recorded at the end of each episode
fig.add_trace(go.Scatter(x=metrics["Epsilon"].index, y=metrics["Epsilon"], mode='lines', name='Epsilon', line=dict(color='blue')), row=4, col=1)

# Update layout
fig.update_layout(title='Analysis of Data', height=1200)

# Update x and y axis labels; every subplot gets both labels so each panel
# can be read on its own.
y_axis_titles = ('Moving Average', 'Moving Average', 'Absolute Value', 'Epsilon')
for row_number, y_title in enumerate(y_axis_titles, start=1):
    fig.update_xaxes(title_text='Index', row=row_number, col=1)
    fig.update_yaxes(title_text=y_title, row=row_number, col=1)

# Show plot
fig.show()
