In [15]:
import torch
import torch.nn as nn
import os
import numpy as np
import gym
import gym_simpleflappy
import random

In [16]:

class DGNAgent(nn.Module):
    def __init__(self, num_actions=2, num_features=3, lr=1e-5, weights_file=''):
        super(DGNAgent, self).__init__()
        
        device = torch.device('cpu')
        if torch.cuda.is_available():
            device = torch.device('cuda')

        net = nn.Sequential(
#             nn.Flatten((1,num_features)),
            nn.Linear(num_features, 16),
            nn.ReLU(),
            nn.Dropout(p=.1),
            nn.Linear(16, num_actions),
            nn.ReLU(),
            nn.Sigmoid()
            
        ).to(device)
        self.net = net

        torch.optim.SGD(self.net.parameters(), lr=lr)
        self.criterion = nn.MSELoss()
        
        
        if weights_file and os.path.exists(weights_file):
            self.net.load_state_dict(torch.load(weights_file))
            self.net.eval()

        print('neural network created')
        print(self.net)
        
    
    def forward(self, input_f):
        output = self.net(input_f)
        return output.detach()
    
    def save(self, path='weights'):
        torch.save(self.net.state_dict(), path)
        


In [17]:
flappyAgent = DGNAgent()

neural network created
Sequential(
  (0): Linear(in_features=3, out_features=16, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.1, inplace=False)
  (3): Linear(in_features=16, out_features=2, bias=True)
  (4): ReLU()
  (5): Sigmoid()
)


In [18]:
from collections import deque


ENV_NAME = 'SimpleFlappyDistance-v0'
env = gym.make(ENV_NAME)


max_eps = 1000
episode = 0
steps = 0
screen_width = 288
screen_height = 512
max_speed = 10.0

done = False
steps = 0
epsilon = 0.1
ground_y = screen_height * 0.9
pipe_max = ground_y - screen_height * 0.1
pipe_min = screen_height * 0.4
discount = .9
replay = deque()
replay_memory = 10000
observe = 3200 # timesteps to observe before training
steps = 0
FINAL_EPSILON = 0.0001 # final value of epsilon
INITIAL_EPSILON = 0.1 # starting value of epsilon
explore = 3000000 # frames over which to anneal epsilon
loss = 0



In [19]:
def preprocess_features(feats):
    # features => [bird speed, hor. distance to next pipe, ver. distance to next pipe]
    new_feats = [feats[1], feats[2], feats[3] - feats[0]]
#     new_feats[0] /= max_speed
#     new_feats[1] /= screen_width
#     new_feats[2] /= screen_height    
    return new_feats

In [20]:
observation = env.reset()

features = preprocess_features(list(observation))
features = torch.cuda.FloatTensor(features)


flappyAgent.net.zero_grad()

while not done:  #  Game cycle
    if random.random() <= epsilon:
        action = random.randrange(2)
    else:
        q = flappyAgent.forward(features)
        action =  np.argmax(q.cpu()).item()

    if epsilon > FINAL_EPSILON and steps > observe:
        epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / explore

    observation, reward, done, info = env.step(action)  #  Make a move
    features1 = preprocess_features(list(observation))
    features1 = torch.cuda.FloatTensor(features1)

    replay.append((features, action, reward, features1, done))
    
    if len(replay) > replay_memory:
        replay.popleft()

    if steps > observe:
        # sample a minibatch of size 32 from replay memory
        flappyAgent.net.train()

        minibatch = random.sample(replay, 32)
        f, a, r, f1, alive = zip(*minibatch)
        f = np.concatenate(f)
        f1 = np.concatenate(f1)
        targets = model.forward(f)
        targets[range(32), a] = r + discount*np.max(model.forward(f1), axis=1)*(not done)
#         loss += model.train_on_batch(f, targets)
#         for i in range(32):
        
            ## TREBA LOSS

#     for i, (images, labels) in enumerate(train_loader):
#         optimizer.zero_grad()
#         output = net(images.float())
#         loss = criterion(output.float(), labels.long())

        if i % 10 == 0:
            print('TRAIN - Epoch %d, Batch: %d, Loss: %f' % (epoch, i, loss.detach().cpu().item()))

        loss.backward()
        optimizer.step()


        print(targets)


    steps += 1    
    features = features1
    
    print("STEPS: "+ str(steps) + ", EPSILON: " + str(epsilon) + ", ACTION: " + str(action) + ", REWARD: " + str(reward) + ", Loss: " + str(loss))


STEPS: 1, EPSILON: 0.1, ACTION: 1, REWARD: 1.0, Loss: 0
STEPS: 2, EPSILON: 0.1, ACTION: 1, REWARD: 1.0, Loss: 0
STEPS: 3, EPSILON: 0.1, ACTION: 1, REWARD: 1.0, Loss: 0
STEPS: 4, EPSILON: 0.1, ACTION: 1, REWARD: 1.0, Loss: 0
STEPS: 5, EPSILON: 0.1, ACTION: 0, REWARD: 1.0, Loss: 0
STEPS: 6, EPSILON: 0.1, ACTION: 0, REWARD: 1.0, Loss: 0
STEPS: 7, EPSILON: 0.1, ACTION: 1, REWARD: 1.0, Loss: 0
STEPS: 8, EPSILON: 0.1, ACTION: 0, REWARD: 1.0, Loss: 0
STEPS: 9, EPSILON: 0.1, ACTION: 0, REWARD: 1.0, Loss: 0
STEPS: 10, EPSILON: 0.1, ACTION: 0, REWARD: 1.0, Loss: 0
STEPS: 11, EPSILON: 0.1, ACTION: 0, REWARD: 1.0, Loss: 0
STEPS: 12, EPSILON: 0.1, ACTION: 0, REWARD: 1.0, Loss: 0
STEPS: 13, EPSILON: 0.1, ACTION: 1, REWARD: 1.0, Loss: 0
STEPS: 14, EPSILON: 0.1, ACTION: 0, REWARD: 1.0, Loss: 0
STEPS: 15, EPSILON: 0.1, ACTION: 0, REWARD: 1.0, Loss: 0
STEPS: 16, EPSILON: 0.1, ACTION: 0, REWARD: 1.0, Loss: 0
STEPS: 17, EPSILON: 0.1, ACTION: 0, REWARD: 1.0, Loss: 0
STEPS: 18, EPSILON: 0.1, ACTION: 0, REWA