In [4]:
import glob
import math
import random
from collections import deque
from datetime import timedelta
from timeit import default_timer as timer

import cv2
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from IPython.display import clear_output
from matplotlib import pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# from utils.wrappers import *
# from agents.DQN import Model as DQN_Agent
# from utils.ReplayMemory import ExperienceReplayMemory

#from utils.Replay.ipynb import ReplayBuffer
from utils.hyperparameters import Config
from utils.plot import plot_all_data
import Game.tetris_fun as game

pygame 2.0.1 (SDL 2.0.14, Python 3.8.8)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [None]:
# Work around pygame's "no available video device" error by selecting SDL's
# "dummy" video driver, so no real display is needed.
# NOTE(review): presumably this has to take effect before the game module
# initialises its display; the first cell already imports Game.tetris_fun,
# so confirm that the ordering of these cells is correct.
import os
os.environ.update(SDL_VIDEODRIVER="dummy")

In [None]:
class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions for experience replay.

    Each ``push`` records the state observed *before* an action, the action
    index, the reward received and whether the episode ended on that step.
    The successor state of entry ``i`` is taken to be the screen stored at
    ``i + 1`` (the state pushed on the following step), so the most recently
    pushed entry has no successor yet and is never sampled.

    Args:
        size: Maximum number of transitions kept; the oldest are dropped first.
        screen_shape: Stored for reference only; it is not used to validate or
            reshape pushed screens.
    """

    def __init__(self, size, screen_shape=(84, 84)):
        self.size = size
        self.screen_shape = screen_shape
        self.num_in_buffer = 0
        # All four deques are appended to together in push(), so an index
        # always refers to the same transition in each of them.
        self.screens = deque(maxlen=self.size)
        self.actions = deque(maxlen=self.size)
        self.rewards = deque(maxlen=self.size)
        self.terminal = deque(maxlen=self.size)

    def push(self, screen, action, reward, terminal=False):
        """Append one transition.

        Args:
            screen: Array-like state observed before taking ``action``.
            action: Integer action index (stored as ``np.uint8``).
            reward: Scalar reward received for the action.
            terminal: True when the episode ended on this step. Defaults to
                False so three-argument callers keep working.
        """
        self.screens.append(screen)
        self.actions.append(np.uint8(int(action)))
        self.rewards.append(reward)
        self.terminal.append(bool(terminal))

        self.num_in_buffer = len(self.screens)

    def can_sample(self, batch_size):
        """Returns true if `batch_size` different transitions can be sampled from the buffer."""
        # One extra entry is required because the newest entry has no stored
        # successor state and is therefore excluded from sampling.
        return batch_size + 1 <= self.num_in_buffer

    def _encode_sample(self, idxes):
        """Assemble the batch for the given buffer indices.

        Every index must be smaller than ``num_in_buffer - 1`` because the
        successor screen is read from ``idx + 1``.

        Returns:
            Tuple ``(obs_batch, act_batch, rew_batch, next_obs_batch, done_mask)``:
            screens stacked along a new leading batch axis (tensor), actions as
            an int64 tensor of shape ``(batch, 1)`` suitable for
            ``Tensor.gather(1, ...)``, rewards as a float32 tensor of shape
            ``(batch,)``, successor screens (tensor) and a float32 NumPy array
            holding 1.0 for terminal transitions and 0.0 otherwise.
        """
        # np.stack adds the batch axis; concatenating would merge the screens
        # along their first axis and fails outright on scalar actions/rewards.
        obs_batch = torch.from_numpy(
            np.stack([np.asarray(self.screens[idx]) for idx in idxes], 0))
        act_batch = torch.from_numpy(
            np.array([self.actions[idx] for idx in idxes], dtype=np.int64).reshape(-1, 1))
        rew_batch = torch.from_numpy(
            np.array([self.rewards[idx] for idx in idxes], dtype=np.float32))
        next_obs_batch = torch.from_numpy(
            np.stack([np.asarray(self.screens[idx + 1]) for idx in idxes], 0))
        done_mask = np.array(
            [1.0 if self.terminal[idx] else 0.0 for idx in idxes], dtype=np.float32)

        return obs_batch, act_batch, rew_batch, next_obs_batch, done_mask

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Raises:
            AssertionError: If the buffer does not yet hold enough transitions.
        """
        assert self.can_sample(batch_size), "not enough transitions in the buffer"
        # Exclude the newest entry: its successor screen has not been pushed yet.
        inds = random.sample(range(self.num_in_buffer - 1), batch_size)

        return self._encode_sample(inds)
        
        
        
        

In [None]:
# The DQN model
class DQN(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    Args:
        num_actions: Number of discrete actions; the width of the output layer.
        in_channels: Number of channels of the input image batch.
        input_shape: ``(height, width)`` of the input images. It is only used
            to size the first fully connected layer, so inputs passed to
            ``forward`` must have this spatial size.
    """

    def __init__(self, num_actions, in_channels=3, input_shape=(84, 84)):
        super(DQN, self).__init__()
        self.in_channels = in_channels
        self.num_actions = num_actions
        self.input_shape = tuple(input_shape)

        # could add batchnorm2d layers after each convnet if data volume is too large
        # see: https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

        self.conv1 = nn.Conv2d(self.in_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)

        self.fc1 = nn.Linear(self.feature_size(), 512)
        self.fc2 = nn.Linear(512, self.num_actions)

    def forward(self, x):
        """Return one Q-value per action for a ``(batch, in_channels, H, W)`` float input."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        # reshape the tensor to one dimension per sample for the fc layers
        x = x.view(x.size(0), -1)

        x = F.relu(self.fc1(x))

        return self.fc2(x)

    def feature_size(self):
        """Return the flattened size of the conv stack's output for one input image.

        Measured by pushing a dummy all-zero image through the conv layers
        instead of applying ``(size - (kernel_size - 1) - 1) // stride + 1``
        per layer by hand.
        """
        with torch.no_grad():
            dummy = torch.zeros(1, self.in_channels, *self.input_shape)
            return self.conv3(self.conv2(self.conv1(dummy))).view(1, -1).size(1)
    

In [None]:
# Hyperparameters and utilities

# --- optimisation ---
BATCH_SIZE = 128          # transitions drawn from the replay buffer per update
GAMMA = 0.999             # discount factor applied to the next-state value
lr = 1e-3                 # learning rate handed to the optimizer

# --- epsilon-greedy exploration ---
EPS_START = 0.9           # exploration threshold at step 0
EPS_END = 0.05            # floor the threshold decays towards
# NOTE(review): get_action computes exp(-steps_done / EPS_DECAY); with a value
# this small the exponential is ~0 after a single step. Confirm whether a
# step count (e.g. 1000) or a multiplication was intended.
EPS_DECAY = 1e-3

# --- training schedule ---
TARGET_UPDATE = 10        # period, in episodes, used for target network syncs
memory_size = 100_000     # replay buffer capacity
num_episodes = 1000       # episodes played by the training loop


def get_action(state, policy_net, num_actions=6):
    """Choose an action index with an epsilon-greedy policy.

    The exploration threshold decays from ``EPS_START`` towards ``EPS_END``
    with the number of calls made so far. That count is kept on the function
    itself (``get_action.steps_done``) so it survives between calls; a local
    counter would restart at 0 every time and the threshold would never decay.

    Args:
        state: Current observation as a NumPy array or tensor. A 2-D ``(H, W)``
            or 3-D ``(C, H, W)`` input gets leading axes added until it is 4-D;
            a 4-D input is passed through unchanged and must hold one sample.
        policy_net: Callable mapping a float batch to per-action values of
            shape ``(batch, num_actions)``.
        num_actions: Number of discrete actions to draw from when exploring.

    Returns:
        int: Index of the chosen action, i.e. the position of the 1 in the
        one-hot action array.
    """
    steps_done = getattr(get_action, "steps_done", 0)
    get_action.steps_done = steps_done + 1

    sample = random.random()
    # NOTE(review): with a very small EPS_DECAY this reaches EPS_END after a
    # single step — confirm the intended decay constant.
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)

    if sample <= eps_threshold:
        # Explore: uniform random action.
        return random.randrange(num_actions)

    # Exploit: the network needs a batched float tensor.
    state_t = torch.as_tensor(state, dtype=torch.float32)
    while state_t.dim() < 4:
        state_t = state_t.unsqueeze(0)

    # Move the input to wherever the network's weights live, when it has any.
    parameters = getattr(policy_net, "parameters", None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None
    if first_param is not None:
        state_t = state_t.to(first_param.device)

    with torch.no_grad():
        # argmax over dim 1 picks the action with the largest predicted value;
        # .item() turns it into a plain int like the exploration branch returns.
        return int(policy_net(state_t).argmax(dim=1).item())
    
def get_act_array(act_num, num_actions=6):
    """Return a one-hot integer array with a 1 at position ``act_num``.

    Args:
        act_num: Index of the chosen action. Anything ``int()`` accepts works,
            including a NumPy integer or a one-element tensor.
        num_actions: Length of the returned array.

    Returns:
        np.ndarray: Integer array of length ``num_actions``, all zeros except
        for a single 1.

    Raises:
        IndexError: If ``act_num`` is out of range for ``num_actions``.
    """
    action = np.zeros(num_actions, dtype=int)
    action[int(act_num)] = 1
    return action

def get_next_qs(target_net, next_obs_batch, done_mask, BATCH_SIZE):
    """Return the target network's best next-state value for each transition.

    Terminal transitions get a value of 0; every other transition gets the
    maximum over actions of ``target_net`` applied to its next state.

    Args:
        target_net: Callable mapping a batch of states to per-action values of
            shape ``(batch, num_actions)``.
        next_obs_batch: Tensor of next states with the batch on axis 0.
        done_mask: NumPy array or tensor of length ``BATCH_SIZE``; entries
            equal to 0 mark non-terminal transitions.
        BATCH_SIZE: Number of transitions in the batch.

    Returns:
        torch.Tensor: Gradient-free values of shape ``(BATCH_SIZE,)`` on the
        same device as ``next_obs_batch``.
    """
    device = next_obs_batch.device
    not_terminal = torch.as_tensor(done_mask, device=device) == 0

    # Allocate on the batch's device so the masked assignment below does not
    # mix CPU and GPU tensors.
    values = torch.zeros(BATCH_SIZE, device=device)

    # Skip the forward pass when every transition is terminal instead of
    # feeding an empty batch through the network.
    if not_terminal.any():
        with torch.no_grad():
            values[not_terminal] = target_net(next_obs_batch[not_terminal]).max(dim=1)[0]
    return values
            
            

def plot_durations(episode_durations):
    """Redraw the training curve of episode lengths in the notebook output.

    Plots one point per finished episode and, once at least 100 episodes are
    available, a 100-episode moving average (padded with zeros for the first
    99 episodes so both curves share the same x axis).

    Written for the inline matplotlib backend this notebook selects: the
    previous cell output is cleared and the refreshed figure is shown in its
    place.

    Args:
        episode_durations: Sequence with the number of steps of each episode.
    """
    plt.figure(2)
    plt.clf()
    durations_t = torch.tensor(episode_durations, dtype=torch.float)
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())
    # Take 100 episode averages and plot them too
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())

    # wait=True delays the clear until the new figure is ready, which avoids
    # flicker between redraws.
    clear_output(wait=True)
    plt.show()

In [None]:
# Environment / network configuration passed to train().
num_actions = 6            # number of discrete actions (length of the one-hot action array)
in_channels = 3            # channels of the network's input
screen_shape = (84, 84)    # size the game frames are resized to

# The training hyperparameters (BATCH_SIZE, GAMMA, EPS_*, TARGET_UPDATE, lr,
# memory_size, num_episodes) are defined once in the hyperparameter cell; the
# inert string copy that used to sit here was removed so the two cannot drift apart.


def _preprocess_frame(frame, screen_shape=(84, 84)):
    """Resize a raw game frame, convert it to grayscale and binarise it."""
    height, width = screen_shape  # assumes (height, width) ordering — TODO confirm
    gray = cv2.cvtColor(cv2.resize(frame, (width, height)), cv2.COLOR_BGR2GRAY)
    # Pixels brighter than 1 become 255: background black, tetriminos white.
    _, binary = cv2.threshold(gray, 1, 255, cv2.THRESH_BINARY)
    return binary


def train(env=None, num_actions=6, in_channels=3, memory_size=100000, screen_shape=(84, 84)):
    """Train a DQN agent on the game and return the trained network.

    A state is a stack of the last ``in_channels`` preprocessed frames, shape
    ``(in_channels, H, W)``, kept as uint8 in the replay buffer and scaled to
    [0, 1] floats only when fed to a network.

    Uses the module-level hyperparameters ``BATCH_SIZE``, ``GAMMA``,
    ``TARGET_UPDATE``, ``lr`` and ``num_episodes``.

    Args:
        env: Game object exposing ``frame_step(action_array)`` that returns
            ``(frame, reward, terminal)``. A fresh ``game.GameState()`` is
            created when omitted (lazily, so defining this function does not
            start a game).
            NOTE(review): the loop assumes the game restarts by itself after a
            terminal step, and that an all-zero action array is accepted as a
            "do nothing" action — confirm against Game.tetris_fun.
        num_actions: Number of discrete actions.
        in_channels: Number of stacked frames fed to the network.
        memory_size: Capacity of the replay buffer.
        screen_shape: Size the frames are resized to.
            NOTE(review): must match the input size the DQN is built for.

    Returns:
        tuple: ``(policy_net, episode_durations)`` — the trained network and
        the number of steps each episode lasted.
    """
    if env is None:
        env = game.GameState()

    # if GPU is available, use it otherwise use CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    policy_net = DQN(num_actions, in_channels).to(device)
    target_net = DQN(num_actions, in_channels).to(device)

    # set weight and bias of target net as policy net
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.Adam(params=policy_net.parameters(), lr=lr)
    criterion = nn.MSELoss()

    memory = ReplayBuffer(memory_size, screen_shape)

    episode_durations = []

    def to_input(batch):
        # uint8 {0, 255} frames -> float32 [0, 1] on the training device
        return batch.to(device=device, dtype=torch.float32) / 255.0

    for episode in range(num_episodes):
        # Step once with an all-zero action to obtain the first frame.
        frame, _, terminal = env.frame_step(np.zeros(num_actions))
        first = _preprocess_frame(frame, screen_shape)
        # No history yet: fill the whole stack with the first frame.
        state = np.stack([first] * in_channels, axis=0)

        timestep = 0

        while True:
            timestep += 1
            state_t = to_input(torch.from_numpy(state).unsqueeze(0))
            act_num = int(get_action(state_t, policy_net))
            act = get_act_array(act_num)
            next_frame, reward, terminal = env.frame_step(act)

            # The buffer pairs this state with the one pushed on the next step,
            # so only the state seen *before* the action is stored here.
            memory.push(state, act_num, reward, terminal)

            # Slide the frame window: drop the oldest frame, append the newest.
            newest = _preprocess_frame(next_frame, screen_shape)[np.newaxis]
            state = np.concatenate((state[1:], newest), axis=0)

            if memory.can_sample(BATCH_SIZE):
                obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = memory.sample(BATCH_SIZE)

                obs_batch = to_input(obs_batch)
                next_obs_batch = to_input(next_obs_batch)
                # gather() needs int64 indices of shape (batch, 1)
                act_batch = act_batch.to(device).long().view(-1, 1)
                rew_batch = rew_batch.to(device).float().view(-1)

                curr_qs = policy_net(obs_batch).gather(1, act_batch)
                next_qs = get_next_qs(target_net, next_obs_batch, done_mask, BATCH_SIZE).to(device)

                target_q_values = rew_batch + GAMMA * next_qs

                loss = criterion(curr_qs, target_q_values.unsqueeze(1))
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            if terminal:
                episode_durations.append(timestep)
                plot_durations(episode_durations)
                break

        # Sync the target network once every TARGET_UPDATE episodes (after the
        # episode has finished, not on every step of such an episode).
        if episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())

    return policy_net, episode_durations
                
            
                
            
                
            
            
            
        
        
    
    
    
    

In [7]:
# Scratch check: a NumPy integer array can index a torch tensor both for
# assignment and for lookup, and np.where(mask) returns a tuple of index arrays.
arr = np.arange(1, 4)            # positions 1, 2, 3
test = torch.zeros(10)
print(test)
test[arr] = 1.0                  # write through the NumPy index array
print(test)

new_id = np.arange(2, 9, 3)      # positions 2, 5, 8
test_new = test[new_id]          # read through a NumPy index array
print("new ", test_new)
matches = np.where(new_id == 2)
print(matches)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
tensor([0., 1., 1., 1., 0., 0., 0., 0., 0., 0.])
new  tensor([1., 0., 0.])
(array([0]),)
