### Code

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the Mario environment package, pinned to an exact version so reruns get the same release.
!pip install gym-super-mario-bros==7.3.0

Collecting gym-super-mario-bros==7.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/a0/b8/07460212c2568f78b02995834e7bdc25349e586473919e2983e01b984abf/gym_super_mario_bros-7.3.0-py2.py3-none-any.whl (198kB)
[K     |█▋                              | 10kB 23.8MB/s eta 0:00:01[K     |███▎                            | 20kB 31.2MB/s eta 0:00:01[K     |█████                           | 30kB 36.7MB/s eta 0:00:01[K     |██████▋                         | 40kB 29.2MB/s eta 0:00:01[K     |████████▎                       | 51kB 29.8MB/s eta 0:00:01[K     |██████████                      | 61kB 29.6MB/s eta 0:00:01[K     |███████████▌                    | 71kB 31.3MB/s eta 0:00:01[K     |█████████████▏                  | 81kB 21.3MB/s eta 0:00:01[K     |██████████████▉                 | 92kB 22.1MB/s eta 0:00:01[K     |████████████████▌               | 102kB 23.0MB/s eta 0:00:01[K     |██████████████████▏             | 112kB 23.0MB/s eta 0:00:01[K     |███████

In [None]:
import gym_super_mario_bros
from nes_py.wrappers import JoypadSpace
import gym

import torch
import torch.nn as nn
import numpy as np
import cv2
import random
import collections
import matplotlib.pyplot as plt

In [None]:
class proc_env1(gym.Wrapper):
    """Frame-skipping wrapper.

    Repeats each action for up to ``skip`` emulator steps, sums the rewards
    collected along the way, and returns the element-wise maximum of the most
    recent frames (at most two are retained).
    """

    def __init__(self, env=None, skip=4):
        super().__init__(env)
        self.skip = skip
        # Only the two newest frames are kept; older ones fall off automatically.
        self.buffer = collections.deque(maxlen=2)

    def reset(self):
        """Reset the wrapped env and restart the frame buffer with its first frame."""
        self.buffer.clear()
        first_frame = self.env.reset()
        self.buffer.append(first_frame)
        return first_frame

    def step(self, action):
        """Apply ``action`` for up to ``skip`` steps, stopping early when the episode ends.

        Returns the pixel-wise max of the buffered frames, the summed reward,
        the last ``done`` flag and the last ``info`` dict.
        """
        done = None
        reward_sum = 0.0
        for _ in range(self.skip):
            frame, reward, done, info = self.env.step(action)
            self.buffer.append(frame)
            reward_sum += reward
            if done:
                break
        stacked = np.stack(self.buffer)
        return stacked.max(axis=0), reward_sum, done, info

class proc_env2(gym.ObservationWrapper):
    """Convert raw RGB emulator frames into 1x84x84 grayscale arrays scaled to [0, 1]."""

    def __init__(self, env=None):
        super(proc_env2, self).__init__(env)
        space = gym.spaces.Box(low=0.0, high=1.0, shape=(1, 84, 84), dtype=np.float32)
        # Advertise the transformed space through the standard gym attribute so
        # outer wrappers and other consumers do not see the raw frame space.
        self.observation_space = space
        # Legacy alias: other cells in this notebook read `env.obs_space`.
        self.obs_space = space

    def observation(self, obs):
        """Return ``obs`` as a float32 array of shape (1, 84, 84) with values in [0, 1].

        ``obs`` must contain exactly 240*256*3 values (it is reshaped to 240x256x3).
        """
        img = np.reshape(obs, [240, 256, 3]).astype(np.float32)
        # Weighted sum of the three colour channels -> single luminance channel.
        gray = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114
        # cv2.resize takes (width, height): result is 110 rows x 84 columns.
        resized = cv2.resize(gray, (84, 110), interpolation=cv2.INTER_AREA)
        # Keep rows 18..101 to obtain a square 84x84 crop, then add the channel axis.
        cropped = resized[18:102, :]
        return (cropped[np.newaxis, :, :] / 255.0).astype(np.float32, copy=False)

def proc_env(env):
    """Wrap ``env`` with frame skipping, frame preprocessing and a reduced action set.

    The returned environment exposes four discrete actions, in this order:
    right, right+A, left, left+A.
    """
    button_combos = [["right"], ["right", "A"], ["left"], ["left", "A"]]
    frame_skipped = proc_env1(env)
    preprocessed = proc_env2(frame_skipped)
    return JoypadSpace(preprocessed, button_combos)

In [None]:
class DQN(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    Args:
        input_shape: (channels, height, width) of a single observation.
        n_actions: number of discrete actions; one Q-value is produced per action.

    The size of the first linear layer is derived from ``input_shape`` instead
    of being fixed, so observation sizes other than 84x84 are supported. For a
    (C, 84, 84) input it is 3136, exactly as before, and the layer indices
    inside ``self.conv`` are unchanged, so existing state dicts still load.
    """

    def __init__(self, input_shape, n_actions):
        super(DQN, self).__init__()
        feature_layers = [
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
        ]
        # Push a dummy observation through the conv stack to learn how many
        # features come out of Flatten for this input size.
        with torch.no_grad():
            n_flat = nn.Sequential(*feature_layers)(torch.zeros(1, *input_shape)).shape[1]

        self.conv = nn.Sequential(
            *feature_layers,
            nn.Linear(n_flat, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions)
        )

    def forward(self, x):
        """Return Q-values of shape (batch, n_actions) for a batch of observations."""
        return self.conv(x)


class Agent:
    """Epsilon-greedy DQN agent with a fixed-size, tensor-backed replay memory.

    Args:
        s_space: shape of one observation, e.g. (1, 84, 84).
        a_space: number of discrete actions.
        max_mem: capacity of the replay memory (oldest entries are overwritten).
        batch_size: number of transitions sampled per learning step.
        erate: initial exploration rate (epsilon).
        emin: lower bound for epsilon.
        decay: factor epsilon is multiplied by after every learning step.
        lr: Adam learning rate.
        gamma: discount factor.
        ddq: when True, bootstrap targets come from a second network ``Q2``
            that is synchronised with ``Q1`` every ``self.copy`` calls to ``ER``.
            NOTE(review): the target still uses max over Q2's own values, i.e.
            a target-network DQN rather than Double-DQN action decoupling.
    """

    def __init__(self, s_space, a_space, max_mem, batch_size, erate, emin, decay, lr, gamma, ddq):

        self.dev = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.s_space = s_space
        self.a_space = a_space
        self.max_mem = max_mem
        self.mem_samp = batch_size
        self.erate = erate
        self.emin = emin
        self.decay = decay
        self.loss = nn.SmoothL1Loss().to(self.dev)
        self.gamma = gamma
        self.ddq = ddq
        self.end_pos = 0   # next write position in the circular memory
        self.num_que = 0   # number of valid transitions currently stored
        self.step = 0      # number of ER() calls so far (drives target syncs)
        self.copy = 5000   # Change target weights every 5000 steps
        # state, next_state, action, reward, done
        self.memory = [torch.zeros(self.max_mem, *self.s_space), torch.zeros(self.max_mem, *self.s_space),
                       torch.zeros(self.max_mem, 1), torch.zeros(self.max_mem, 1), torch.zeros(self.max_mem, 1)]

        self.Q1 = DQN(s_space, a_space).to(self.dev)
        if self.ddq:
            self.Q2 = DQN(s_space, a_space).to(self.dev)

        self.optimizer = torch.optim.Adam(self.Q1.parameters(), lr=lr)

    def action(self, state, ep=None):
        """Pick an action epsilon-greedily; returns a (1, 1) tensor on the CPU.

        ``ep`` is accepted for backward compatibility and is not used.
        """
        if random.random() < self.erate:
            return torch.tensor([[random.randrange(self.a_space)]])
        return self.action_best(state)

    def action_best(self, state):
        """Return the greedy action under ``Q1`` as a (1, 1) tensor on the CPU."""
        # Action selection never needs gradients.
        with torch.no_grad():
            q_values = self.Q1(state.to(self.dev))
        return torch.argmax(q_values).unsqueeze(0).unsqueeze(0).cpu()

    def ER(self):
        """Run one experience-replay learning step.

        Does nothing (apart from target-network bookkeeping) until the memory
        holds at least one batch worth of transitions. Epsilon is decayed
        after every learning step and clamped at ``emin``.
        """
        if self.ddq:
            # Sync on the very first call (also covers weights loaded into Q1
            # after construction) and then once every `copy` calls. The
            # counter advances on every call, including random-action steps,
            # so the target network is genuinely held fixed between syncs.
            if self.step % self.copy == 0:
                self.Q2.load_state_dict(self.Q1.state_dict())
            self.step += 1

        if self.mem_samp > self.num_que:
            return

        s1, s2, act, rew, done = self.rand_samp()
        s1 = s1.to(self.dev)
        s2 = s2.to(self.dev)
        act = act.to(self.dev)
        rew = rew.to(self.dev)
        done = done.to(self.dev)

        # The bootstrap target is treated as a constant: no gradient may flow
        # through it, otherwise the online network is also pulled towards its
        # own prediction for the next state.
        target_net = self.Q2 if self.ddq else self.Q1
        with torch.no_grad():
            next_q = target_net(s2).max(1).values.unsqueeze(1)
            target = rew + torch.mul(self.gamma * next_q, 1 - done)

        self.optimizer.zero_grad()
        current = self.Q1(s1).gather(1, act.long())
        loss = self.loss(current, target)
        loss.backward()
        self.optimizer.step()

        self.erate = max(self.erate * self.decay, self.emin)

    def ER_data(self, s1, s2, act, rew, done):
        """Store one transition, overwriting the oldest entry once the memory is full."""
        idx = self.end_pos

        self.memory[0][idx] = s1
        self.memory[1][idx] = s2
        self.memory[2][idx] = act
        self.memory[3][idx] = rew
        self.memory[4][idx] = done

        self.end_pos = (self.end_pos + 1) % self.max_mem
        self.num_que = min(self.num_que + 1, self.max_mem)

    def rand_samp(self):
        """Sample ``mem_samp`` stored transitions uniformly, with replacement."""
        idx = random.choices(range(self.num_que), k=self.mem_samp)

        s1 = self.memory[0][idx]
        s2 = self.memory[1][idx]
        act = self.memory[2][idx]
        rew = self.memory[3][idx]
        done = self.memory[4][idx]

        return s1, s2, act, rew, done


In [None]:
def main():
    """Train a DQN agent on SuperMarioBros 1-1 and plot moving averages of the results.

    Q1's weights are written to ``PATH`` every 200 episodes. After training,
    the 500-episode moving averages of total reward and of level completions
    are plotted, provided more than 500 episodes were recorded.
    """
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = proc_env(env)
    agent = Agent(s_space=env.obs_space.shape, a_space=env.action_space.n,
                  max_mem=30000, batch_size=32, erate=1.0, emin=0.00001, decay=0.999, lr=0.00025, gamma=0.90, ddq=False) #ddq = True or False
    PATH = '/content/drive/MyDrive/ModelParams_Eps_Decay_0.999_eps_1'
    rewards = []
    flag_pole_vec = []
    num_eps = 10000 #1000 # 10000
    window = 500  # moving-average window (episodes) used for the plots

    for ep_num in range(num_eps):
        # Add a leading batch axis: (1, 84, 84) -> (1, 1, 84, 84).
        state = torch.tensor(np.asarray(state_arr := env.reset()), dtype=torch.float32).unsqueeze(0) \
            if False else torch.tensor(np.asarray(env.reset()), dtype=torch.float32).unsqueeze(0)
        tot_rew = 0

        while True:
            act = agent.action(state, ep_num)

            n_state, rew, done, info = env.step(int(act[0]))
            n_state = torch.tensor(np.asarray(n_state), dtype=torch.float32).unsqueeze(0)
            term = torch.tensor([int(done)])
            tot_rew += rew
            rew = torch.tensor([rew])

            agent.ER_data(state, n_state, act, rew, term)
            agent.ER()
            state = n_state

            if done:
                break

        flag_pole = 0
        if info['flag_get']:
            flag_pole = 1
            print('Level Completed')
        flag_pole_vec.append(flag_pole)
        rewards.append(tot_rew)

        if (ep_num + 1) % 200 == 0:
            torch.save(agent.Q1.state_dict(), PATH)

        print("Total reward after episode {} is {}".format(ep_num + 1, rewards[-1]))

    # Guard on the number of episodes actually recorded: with fewer samples
    # than the window, a 'valid' convolution does not yield a moving average.
    if len(rewards) > window:
        kernel = np.ones((window,)) / window
        padding = [0 for _ in range(window)]

        plt.figure(1)
        plt.title("Episodes trained vs. Average Rewards (per 500 eps)")
        plt.xlabel("Episode")
        plt.ylabel("Average reward")
        plt.plot(padding + np.convolve(rewards, kernel, mode="valid").tolist())

        plt.figure(2)
        plt.title("Episodes trained vs. Win Percentage (per 500 eps)")
        plt.xlabel("Episode")
        plt.ylabel("Win rate")
        plt.plot(padding + np.convolve(flag_pole_vec, kernel, mode="valid").tolist())
        plt.show()

main()                     


  return (self.ram[0x86] - self.ram[0x071c]) % 256


Total reward after episode 1 is 609.0
Total reward after episode 2 is 228.0
Total reward after episode 3 is 629.0
Total reward after episode 4 is 600.0
Total reward after episode 5 is 209.0
Total reward after episode 6 is 744.0
Total reward after episode 7 is 1013.0
Total reward after episode 8 is 1305.0
Total reward after episode 9 is 230.0
Total reward after episode 10 is 765.0
Total reward after episode 11 is 230.0
Total reward after episode 12 is 1026.0
Total reward after episode 13 is 627.0
Total reward after episode 14 is 627.0
Total reward after episode 15 is 1043.0
Total reward after episode 16 is 627.0
Total reward after episode 17 is 231.0
Total reward after episode 18 is 1031.0
Total reward after episode 19 is 570.0
Total reward after episode 20 is 231.0
Total reward after episode 21 is 231.0
Total reward after episode 22 is 602.0
Total reward after episode 23 is 619.0
Total reward after episode 24 is 665.0
Total reward after episode 25 is 722.0
Total reward after episode 26

KeyboardInterrupt: ignored

### Load Latest Model

In [None]:
model_save_name = 'Latest_Model_v2.pt'
# Google Drive is mounted at /content/drive at the top of this notebook, so the
# checkpoint has to be read from there (not from /content/gdrive).
path = F"/content/drive/MyDrive/Model Save Folder/{model_save_name}"
env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
env = proc_env(env)
# erate=0 / emin=0: the loaded agent always acts greedily.
agent = Agent(s_space=env.obs_space.shape,a_space=env.action_space.n,
                  max_mem=30000,batch_size=32,erate=0,emin=0,decay=0.99,lr=0.00025,gamma=0.90,ddq=True)
# map_location lets a checkpoint saved on a GPU runtime load on a CPU-only one.
agent.Q1.load_state_dict(torch.load(path, map_location=agent.dev))

In [None]:
# Install xvfb / OpenGL bindings and pyvirtualdisplay; all installer output is discarded.
# NOTE(review): presumably needed so frames can be rendered on a headless runtime - confirm.
!apt-get install -y xvfb python-opengl > /dev/null 2>&1
!pip install gym pyvirtualdisplay > /dev/null 2>&1
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
# Start an invisible 400x300 virtual display.
display = Display(visible=0, size=(400, 300))
display.start()

# Play one episode of level 2-1 with the loaded agent, rendering every step inline.
env = gym_super_mario_bros.make('SuperMarioBros-2-1-v0')
env = proc_env(env)
state = env.reset()
state = torch.Tensor([state])
tot_rew = 0
idx = 0

while True:
    # Greedy action straight from the online network; no gradients are needed
    # for action selection.
    with torch.no_grad():
        act = torch.argmax(agent.Q1(state.to(agent.dev))).unsqueeze(0).unsqueeze(0).cpu()
    idx += 1
    
    n_state, rew, term, info = env.step(int(act[0]))
    n_state = torch.Tensor([n_state])
    term = torch.tensor([int(term)])
    tot_rew += rew
    rew = torch.tensor([rew])

    # NOTE(review): these two calls keep training the agent while it plays;
    # drop them if this cell is meant to be a pure evaluation run.
    agent.ER_data(state, n_state, act, rew, term)
    agent.ER()            
    state = n_state
    screen = env.render(mode='rgb_array')
    # Clear the previous frame so images do not pile up on the same axes.
    plt.clf()
    plt.imshow(screen)
    ipythondisplay.clear_output(wait=True)
    ipythondisplay.display(plt.gcf())

    if term == 1:
        break
flag_pole = 0
if info['flag_get']:
  flag_pole = 1
  print('Level Competion')


# Report the accumulated episode reward, not the reward of the final step.
print("Total reward is {}".format(tot_rew))