In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
from snakegame.game import SnakeGameAI, Direction, Point
from collections import deque
import numpy as np
import matplotlib.pyplot as plt
from IPython import display


pygame 2.1.3 (SDL 2.28.4, Python 3.11.6)
Hello from the pygame community. https://www.pygame.org/contribute.html


# Model and Trainer

In [2]:
class LinearQnet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
        )
    
    def forward(self, x):
        return self.model(x)
    
    def save(self, path='model.h5'):
        torch.save(self.model, path)

class QTrainer:
    def __init__(self, model, lr, gamma):
        self.lr = lr
        self.gamma = gamma
        self.model = model
        self.optimizer = optim.Adam(model.parameters(), self.lr)
        # losss = (Q_new - Q)**2
        self.criterion = nn.MSELoss()
    
    def train_step(self, state, action, reward, state_new, done):
        state =     torch.tensor(state, dtype=torch.float)
        state_new = torch.tensor(state_new, dtype=torch.float)
        action =    torch.tensor(action, dtype=torch.long)
        reward =    torch.tensor(reward, dtype=torch.float)

        # (n,x)
        # Why do we need this 
        if len(state.shape) == 1:
            # (1,x)
            state =         torch.unsqueeze(state, 0)
            state_new =    torch.unsqueeze(state_new, 0)
            action =        torch.unsqueeze(action, 0)
            reward =        torch.unsqueeze(reward, 0)
            done =          (done,)
        
        # 1: predicted Q valus with current state
        pred = self.model(state)

        # 2: Q_new = r + discounted_factor * max(next_pred Q value) 
        # pred.clone()
        # preds[argmax(action)] = Q_new
        # only do this if not done
        target = pred.clone()
        for idx in range(len(done)):
            q_new = reward[idx]
            if not done[idx]:
                q_new = reward[idx] + self.gamma * torch.max(self.model(state_new[idx]))
            target[idx][torch.argmax(action).item()] = q_new
        
        self.optimizer.zero_grad()
        loss = self.criterion(target, pred)
        loss.backward()

        self.optimizer.step()

In [3]:
plt.ion()

def plot(scores, mean_scores):
    display.clear_output(wait=True)
    display.display(plt.gcf())
    plt.clf()
    plt.title('Training...')
    plt.xlabel('Number of Games')
    plt.ylabel('Score')
    plt.plot(scores)
    plt.plot(mean_scores)
    plt.ylim(ymin=0)
    plt.text(len(scores)-1, scores[-1], str(scores[-1]))
    plt.text(len(mean_scores)-1, mean_scores[-1], str(mean_scores[-1]))
    plt.show(block=False)
    plt.pause(.1)


In [4]:
MAX_MEM = 100000
BATCH_SIZE = 1000
LEARNING_RATE = 0.001

class Agent:

    def __init__(self):
        self.n_games = 0
        self.epsilon = 0 #choose to explore or exploit policy 
        self.gamma = 0.9 # discounted factor
        self.memory = deque(maxlen = MAX_MEM)
        self.model = LinearQnet(11, 256, 3) #11 states, 256 hidden states, 3 action states
        self.trainer = QTrainer(self.model, lr = LEARNING_RATE, gamma = self.gamma)

        # TODO: model, trainer

    def get_state(self, game: SnakeGameAI):
        #There are 11 states in the game of snake, each is a boolean
        # [danger sraight, danger right, danger left, 
        # direction left, right, up, down, 
        # food left, food right, food up, food down]
        head = game.head
        
        point_left = Point(head.x - 20, head.y)
        point_right = Point(head.x + 20, head.y)
        point_up = Point(head.x, head.y - 20)
        point_down = Point(head.x, head.y + 20)

        # 4 points of direction
        dir_left = game.direction == Direction.LEFT
        dir_right = game.direction == Direction.RIGHT
        dir_up = game.direction == Direction.UP
        dir_down = game.direction == Direction.DOWN

        state = [
            # 3 state for danger state (straight, right, left)
            #danger straight
           (dir_up and game.is_collision(point_up)) or 
           (dir_right and game.is_collision(point_right)) or 
           (dir_down and game.is_collision(point_down)) or
           (dir_left and game.is_collision(point_left)),
           #danger right, this is relative to the direction of the snake
           #but the point should be relative to world
           #so danger right when going down is to its left point 
           (dir_up and game.is_collision(point_right)) or 
           (dir_right and game.is_collision(point_down)) or 
           (dir_down and game.is_collision(point_left)) or
           (dir_left and game.is_collision(point_up)),
           #same principle for danger left
           (dir_up and game.is_collision(point_left)) or 
           (dir_right and game.is_collision(point_up)) or 
           (dir_down and game.is_collision(point_right)) or
           (dir_left and game.is_collision(point_down)),

            # 4 dim for move direction
            dir_up, 
            dir_right, 
            dir_down, 
            dir_left, 

            # 4 dim for food location ( relative to the snake head)
            # up, right, down, left
            game.food.y < game.head.y, #up
            game.food.x > game.head.x, #right
            game.food.x < game.head.x, #left
            game.food.y > game.head.y, #down
        ]

        # convert True false to 1, 0
        return np.array(state, dtype = int)
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
    
    def train_long_memory(self):
        if len(self.memory) > BATCH_SIZE:
            sample = random.sample(self.memory, BATCH_SIZE) #returns list of tuples
        else:
            sample = self.memory
        
        states, actions, rewards, next_states, dones = zip(*sample)
        self.trainer.train_step(states, actions, rewards, next_states, dones)

    def train_short_memory(self, state, action, reward, next_state, done):
        self.trainer.train_step(state, action, reward, next_state, done)
    
    def get_action(self, state):
        self.epsilon = 80 - self.n_games
        final_move = [0,0,0]
        if random.randint(0,200) < self.epsilon:
            move = random.randint(0,2)
            final_move[move] = 1
        else:
            pred_action = self.model(torch.tensor(state, dtype=torch.float))
            move = torch.argmax(pred_action).item()
            final_move[move] = 1
        return final_move

def train():
    plot_scores = []
    plot_mean_scores = []
    total_score = 0
    record = 0
    agent = Agent()
    game = SnakeGameAI()

    while True:
        # get old state
        state_old = agent.get_state(game)
        # get move
        final_move = agent.get_action(state_old)
        #perform move and get new state
        reward, done, score = game.play_step(final_move)
        state_new = agent.get_state(game)

        #train short memory
        agent.train_short_memory(state_old, final_move, reward, state_new, done)

        #remember
        agent.remember(state_old, final_move, reward, state_new, done)

        if done:
            # train long memory, plot result
            game.reset()
            agent.n_games += 1
            agent.train_long_memory()

            if score > record:
                record = score
                #save agent
            
            print('Game', agent.n_games, 'Score = ', score, ',Record=', record)

            #TODO: plot
            plot_scores.append(score)
            total_score += score
            mean_score = total_score / agent.n_games
            plot_mean_scores.append(mean_score)

            plot(plot_scores, plot_mean_scores)

In [5]:
train()

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'