In [579]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from agents import QLearningAgent
from collections import deque
from collections.abc import Iterable

In [587]:
class DQNAgentModel(torch.nn.Module):
    def __init__(self, ip_size, op_size, lr = 0.1, gamma = 0.95):
        super().__init__()
        self.ip_size = ip_size
        self.op_size = op_size
        self.net = torch.nn.Sequential(
                torch.nn.Linear(ip_size, op_size)
        )
        self.memory = deque(maxlen=1000000)
        self.maxmemory = 1000
        self.batch_size = 4
        self.lr = lr
        self.gamma = gamma
        self.optim = torch.optim.SGD(lr = lr, params = self.parameters())
        self.criterion = torch.nn.MSELoss()
        self.losses = []
        self.steps = 0
    def forward(self, x):
        if not isinstance(x, torch.Tensor):# assumes bs 1
            x = torch.tensor(x, dtype = torch.float)
        return self.net(x)
    def get_random_memories(self, batch_size):
        indices = np.random.choice(len(self.memory), batch_size)
        states = []
        actions = []
        rewards = []
        next_states = []
        finished = []
        for i in indices:
            states.append(self.memory[i][0])
            actions.append(self.memory[i][1])
            rewards.append(self.memory[i][2])
            next_states.append(self.memory[i][3])
            finished.append(self.memory[i][4])
        return {'states' :torch.tensor(states, dtype = torch.float).reshape(len(states),self.ip_size),
                'actions' : torch.tensor(actions, dtype = torch.long),
                'rewards': torch.tensor(rewards, dtype = torch.float),
                'next_states' : torch.tensor(next_states, dtype = torch.float).reshape(len(states),self.ip_size),
                'finished' : torch.tensor(finished)
               }
    def addObservation(self,state, action, reward, next_state, finished):
        self.memory.append([state, action, reward, next_state, finished])
    def train_step(self, states, labels):
        self.steps += 1
        optim.zero_grad()
        pred = self.forward(states).squeeze()
#         print(pred, labels)
        loss = criterion(labels, pred)
        loss.backward()
        losses.append(loss.item())
        if self.steps % 10 == 0:
            print(f'{self.steps}: loss = {loss}, model = {model.net[0].weight.data},{model.net[0].bias.data}')
        optim.step()
    
    def replay_experiences(self):
        if len(self.memory)<self.batch_size:
            return
        memories = self.get_random_memories(self.batch_size)
        states = memories['states']
        actions = memories['actions']
        labels = self.forward(states)
        next_state_qvals = torch.max(self.forward(memories['next_states']), axis = 1)[0]
        next_state_qvals[memories['finished']] *= 0
        
        labels[range(labels.shape[0]), actions] = \
            memories['rewards'] + self.gamma * next_state_qvals
#         print(states, labels)
        self.train_step(states, labels)
        
    def plot_stats(self):
        plt.plot(self.losses)

In [588]:
a= torch.randn(1,2)
print(a)


tensor([[ 0.0832, -0.1354]])


In [589]:

model = DQNAgentModel(1,2)
a = model([[0]])
a
# model

# model.addObservation(0.1,2,0.5,[0,0],True)
# model.addObservation(0.2,1,0.5,[0,1],False)
# model.memory

# model.get_random_memories(2)

# model.replay_experiences()

tensor([[0.2275, 0.7924]], grad_fn=<AddmmBackward0>)

In [590]:
class DeepQLearningAgent(QLearningAgent):
    def __init__(self, state_dim, action_dim, lr = 0.01, gamma = 0.95):
        super().__init__()
        self.model = DQNAgentModel(state_dim, action_dim, lr, gamma)
    def train(self,*args,**kwargs):
        super().train(*args,**kwargs)
        self.model.train()
    def getBestAction(self,state, actions):
        if not isinstance(state, Iterable):
            state = [state]
        qVals = self.model([state])[0].data
        best_a = max(zip(qVals[actions], actions))[1]
#         print(actions,qVals, best_a)
        return best_a
    def _update(self, state, action, reward, next_state, next_state_actions, done = False):
        self.model.addObservation(state, action, reward, next_state, finished = done)
        self.model.replay_experiences()
        

In [591]:
import simplegame

In [592]:
agent = DeepQLearningAgent(1,2)
print(agent.model)
env = simplegame.OneDtarget()
agent.train(epsilon = 0.2)
simplegame.playOneEpisode(env, agent, 1000)

DQNAgentModel(
  (net): Sequential(
    (0): Linear(in_features=1, out_features=2, bias=True)
  )
  (criterion): MSELoss()
)
[  0   0   0   0   0 100   0   0   0   1   0]
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
10: loss = 2.8977768421173096, model = tensor([[ 0.2225],
        [-0.4289]]),tensor([0.2275, 0.7924])
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0

420: loss = 4.336092472076416, model = tensor([[ 0.2225],
        [-0.4289]]),tensor([0.2275, 0.7924])
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
430: loss = 2.896655321121216, model = tensor([[ 0.2225],
        [-0.4289]]),tensor([0.2275, 0.7924])
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0

[  0   0   0   0   0 100   0   0   0   1   0]
890: loss = 1.4572179317474365, model = tensor([[ 0.2225],
        [-0.4289]]),tensor([0.2275, 0.7924])
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
900: loss = 0.01778058521449566, model = tensor([[ 0.2225],
        [-0.4289]]),tensor([0.2275, 0.7924])
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
[  0   0   0   0   0 100   0   0   0   0   1]
[  0   0   0   0   0 100   0   0   0   1   0]
[  0   0 

In [576]:
agent.model

DQNAgentModel(
  (net): Sequential(
    (0): Linear(in_features=1, out_features=2, bias=True)
  )
  (criterion): MSELoss()
)