In [1]:
from gym import make
import numpy as np
import torch
import copy
import random
from collections import deque
from torch.autograd import Variable
from matplotlib import pyplot as plt
import torch.nn.functional as F
import torch.nn as nn

In [0]:
# Number of consecutive environment steps whose rewards are folded into one
# stored transition (the training loop keeps deques of this length).
N_STEP = 2
# Per-step discount factor; the agents bootstrap with GAMMA ** N_STEP.
GAMMA = 0.999

In [2]:
def transform_state(state):
    """Rescale a raw ``(position, velocity)`` observation.

    The position component is shifted by 1.2 and divided by 1.8; the velocity
    component is divided by 0.07. The result is returned as a new NumPy array.
    """
    offset = np.array((1.2, 0))
    scale = np.array((1.8, 0.07))
    normalized = (np.array(state) + offset) / scale
    # Rebuild the array from its elements so the caller gets a fresh copy.
    return np.array(list(normalized))

In [0]:
class Replay:
    """Fixed-capacity experience buffer that overwrites its oldest entries.

    Transitions are indexable sequences laid out as
    ``(state, action, next_state, reward, done)``.
    """

    def __init__(self, replay_size):
        self.rs = replay_size   # maximum number of stored transitions
        self.arr = []           # backing storage
        self.ind = 0            # slot that the next overwrite will hit

    def add(self, transition):
        """Store ``transition``, replacing the oldest one once the buffer is full."""
        if len(self.arr) >= self.rs:
            self.arr[self.ind] = transition
            self.ind = (self.ind + 1) % self.rs
            return
        self.arr.append(transition)

    def get(self, batch_size):
        """Sample ``batch_size`` distinct transitions and return them column-wise.

        Returns ``(states, actions, next_states, rewards, dones)`` as NumPy
        arrays. Note that the returned ``dones`` is a *continuation mask*:
        0 where the stored done flag is truthy, 1 otherwise.
        """
        batch = random.sample(self.arr, batch_size)

        def column(k):
            return np.array([item[k] for item in batch])

        states = column(0)
        actions = column(1)
        next_states = column(2)
        rewards = column(3)
        mask = np.array([0 if flag else 1 for flag in column(4)])
        return states, actions, next_states, rewards, mask

In [0]:
class DQN(torch.nn.Module):
    """Deep Q-network agent with a replay buffer and a periodically synced target net.

    ``self.model`` is the online network that is trained; ``self.target`` is a
    frozen copy used to compute bootstrap values.
    """

    def __init__(self,
                 state_dim,
                 action_dim,
                 hd1=32,
                 hd2=32,
                 batch_size=100,
                 replay_size=10000,
                 update_rate=100):
        """Build the online/target networks and the replay buffer.

        Args:
            state_dim: size of the (transformed) observation vector.
            action_dim: number of discrete actions (network outputs).
            hd1, hd2: widths of the two hidden layers.
            batch_size: number of transitions sampled per gradient step.
            replay_size: capacity of the replay buffer.
            update_rate: number of gradient steps between target-network syncs.
        """
        super().__init__()
        # Bootstrap discount for transitions that span N_STEP environment steps.
        self.gamma = GAMMA ** N_STEP
        self.model = torch.nn.Sequential(
            torch.nn.Linear(state_dim, hd1),
            torch.nn.Sigmoid(),
            torch.nn.Linear(hd1, hd2),
            torch.nn.Sigmoid(),
            torch.nn.Linear(hd2, action_dim))
        self.replay = Replay(replay_size)
        self.bs = batch_size
        self.Loss = torch.nn.MSELoss()
        self.target = copy.deepcopy(self.model)
        # Despite the name this is a counter of gradient steps; it drives both
        # the decaying learning rate and the target-sync schedule.
        self.lr = 0
        self.ur = update_rate

    def forward(self, state):
        """Return the online network's Q-values for ``state`` (a tensor)."""
        return self.model(state)

    def update(self, transition):
        """Store ``transition`` and, once enough data is buffered, take one gradient step.

        ``transition`` is ``(state, action, next_state, reward, done)``.
        Nothing is trained until the buffer holds at least ``batch_size`` items.
        """
        self.replay.add(transition)
        if len(self.replay.arr) < self.bs:
            return
        batch_size = int(self.bs)
        # The last element returned by the buffer is a continuation mask
        # (0 for terminal transitions, 1 otherwise).
        states, actions, next_states, rewards, not_done = self.replay.get(batch_size)
        self.lr += 1

        rows = torch.arange(batch_size)
        taken = torch.as_tensor(actions, dtype=torch.long)
        q_taken = self.get_probs(states)[rows, taken]

        # The TD target is a constant for this step: no gradient flows through it.
        with torch.no_grad():
            next_best = self.get_target(next_states).numpy().max(axis=1)
        td_target = torch.as_tensor(rewards + self.gamma * next_best * not_done,
                                    dtype=torch.float32)

        # NOTE(review): a fresh Adam is built on every step, so its moment
        # estimates never accumulate; kept as-is to preserve training dynamics.
        optim = torch.optim.Adam(self.model.parameters(), lr=1. / pow(self.lr, 0.8))
        loss = self.Loss(q_taken, td_target)
        optim.zero_grad()
        loss.backward()
        optim.step()

        # Sync the target network once every `update_rate` gradient steps.
        # (The previous `self.lr > self.ur` test re-synced on *every* step once
        # the counter passed the threshold, which defeats the target network.)
        if self.lr % max(int(self.ur), 1) == 0:
            self.update_target()

    def get_target(self, state):
        """Q-values of ``state`` (array-like) under the target network."""
        return self.target(torch.as_tensor(state, dtype=torch.float32))

    def get_probs(self, state):
        """Q-values of ``state`` (array-like) under the online network."""
        return self.model(torch.as_tensor(state, dtype=torch.float32))

    def act(self, state, target=False):
        """Return the greedy action index for ``state``.

        ``target`` is accepted for interface compatibility and is not used.
        """
        with torch.no_grad():
            return int(torch.argmax(self.get_probs(state)))

    def update_target(self):
        """Replace the target network with a copy of the online network."""
        self.target = copy.deepcopy(self.model)

    def save(self, path=None):
        """Write the online network's weights to ``"agent" + str(path) + ".pkl"``."""
        torch.save(self.model.state_dict(), "agent"+str(path)+".pkl")

    def load(self, path=None):
        """Load online weights from ``"agent" + str(path) + ".pkl"`` and sync the target.

        ``torch.load`` unpickles the file: only load checkpoints you trust.
        """
        self.model.load_state_dict(torch.load("agent"+str(path)+".pkl"))
        # Keep the target consistent with the freshly loaded weights.
        self.update_target()

In [0]:
def eps(eps_min, i, top=800, top2=400):
    """Linearly decayed exploration rate.

    Falls from 1.0 at ``i == 0`` to ``eps_min`` at ``i == top`` and stays at
    ``eps_min`` afterwards. ``top2`` is accepted but not used.
    """
    slope = (eps_min - 1.0) / top
    return slope * i + 1.0 if i < top else eps_min

In [0]:
class DDQN(torch.nn.Module):
    """Double Q-learning agent built from two independent Q-networks.

    On each update one of ``dqnA`` / ``dqnB`` is chosen at random and trained;
    the trained network picks the next action and the *other* network scores
    it. Acting always uses ``dqnA``.
    """

    def __init__(self,
                 state_dim,
                 action_dim,
                 hd1=24,
                 hd2=24,
                 batch_size=64,
                 replay_size=10000,
                 update_rate=0.5):
        """Build both networks and the replay buffer.

        Args:
            state_dim: size of the (transformed) observation vector.
            action_dim: number of discrete actions (network outputs).
            hd1, hd2: widths of the two hidden layers.
            batch_size: number of transitions sampled per gradient step.
            replay_size: capacity of the replay buffer.
            update_rate: threshold for the per-update uniform draw; a draw
                greater than it trains ``dqnA``, otherwise ``dqnB``.
        """
        super().__init__()
        # Bootstrap discount for transitions that span N_STEP environment steps.
        self.gamma = GAMMA ** N_STEP
        self.dqnA = self._build_net(state_dim, hd1, hd2, action_dim)
        self.dqnB = self._build_net(state_dim, hd1, hd2, action_dim)
        # Per-network counters of gradient steps; they drive the decaying
        # learning rate of the respective network.
        self.lrA = 0
        self.lrB = 0
        self.replay = Replay(replay_size)
        self.bs = batch_size
        self.Loss = torch.nn.MSELoss()
        self.ur = update_rate

    @staticmethod
    def _build_net(state_dim, hd1, hd2, action_dim):
        """Create one two-hidden-layer ReLU Q-network."""
        return torch.nn.Sequential(
            torch.nn.Linear(state_dim, hd1),
            torch.nn.ReLU(),
            torch.nn.Linear(hd1, hd2),
            torch.nn.ReLU(),
            torch.nn.Linear(hd2, action_dim))

    def forward(self, state):
        """Return ``dqnA``'s Q-values for ``state`` (a tensor)."""
        return self.dqnA(state)

    def update(self, transition):
        """Store ``transition`` and, once enough data is buffered, train one network.

        ``transition`` is ``(state, action, next_state, reward, done)``.
        Nothing is trained until the buffer holds at least ``batch_size`` items.
        """
        self.replay.add(transition)
        if len(self.replay.arr) < self.bs:
            return
        batch = self.replay.get(int(self.bs))
        if np.random.rand() > self.ur:
            self.lrA += 1
            self._train_step(self.dqnA, self.dqnB, self.lrA, batch)
        else:
            self.lrB += 1
            self._train_step(self.dqnB, self.dqnA, self.lrB, batch)

    def _train_step(self, online, evaluator, step, batch):
        """Take one gradient step on ``online`` using ``evaluator`` for bootstrap values.

        ``step`` is the 1-based count of updates applied to ``online`` and sets
        the learning rate ``1 / step ** 0.8``.
        """
        # The last element of the batch is a continuation mask
        # (0 for terminal transitions, 1 otherwise).
        states, actions, next_states, rewards, not_done = batch
        rows = torch.arange(len(actions))
        taken = torch.as_tensor(actions, dtype=torch.long)
        states_t = torch.as_tensor(states, dtype=torch.float32)
        next_states_t = torch.as_tensor(next_states, dtype=torch.float32)

        # The TD target is a constant for this step: no gradient flows through it.
        with torch.no_grad():
            next_actions = torch.argmax(online(next_states_t), 1)
            next_q = evaluator(next_states_t)[rows, next_actions].numpy()
        td_target = torch.as_tensor(rewards + self.gamma * next_q * not_done,
                                    dtype=torch.float32)

        q_taken = online(states_t)[rows, taken]
        # NOTE(review): a fresh Adam is built on every step, so its moment
        # estimates never accumulate; kept as-is to preserve training dynamics.
        optimizer = torch.optim.Adam(online.parameters(), lr=1. / pow(step, 0.8))
        loss = self.Loss(q_taken, td_target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    def get_probsA(self, state):
        """Q-values of ``state`` (array-like) under ``dqnA``."""
        return self.dqnA(torch.as_tensor(state, dtype=torch.float32))

    def get_probsB(self, state):
        """Q-values of ``state`` (array-like) under ``dqnB``."""
        return self.dqnB(torch.as_tensor(state, dtype=torch.float32))

    def act(self, state, target=False):
        """Return the greedy action index for ``state`` according to ``dqnA``.

        ``target`` is accepted for interface compatibility and is not used.
        """
        with torch.no_grad():
            return int(torch.argmax(self.get_probsA(state)))

    def save(self, path=None):
        """Write both networks' weights to ``agentCarA<path>.pkl`` / ``agentCarB<path>.pkl``."""
        torch.save(self.dqnA.state_dict(), "agentCarA"+str(path)+".pkl")
        torch.save(self.dqnB.state_dict(), "agentCarB"+str(path)+".pkl")

    def load(self, path=None):
        """Load both networks' weights from the files written by :meth:`save`.

        ``torch.load`` unpickles the files: only load checkpoints you trust.
        """
        self.dqnA.load_state_dict(torch.load("agentCarA"+str(path)+".pkl"))
        self.dqnB.load_state_dict(torch.load("agentCarB"+str(path)+".pkl"))

In [0]:
# Create the continuous-action Mountain Car environment.
# The training and evaluation cells below each re-create `env` themselves.
env = make("MountainCarContinuous-v0")

In [10]:
def act(x, n):
    """Map discrete action index ``x`` (out of ``n``) to a 1-element action array.

    Index 0 maps to -1.0 and each further index adds ``2 / n``, so index ``n``
    would map to +1.0.
    """
    scaled = 2. * x / n
    return np.array([scaled - 1.])

In [0]:
def trans_act(action):
    """Map a continuous action to an integer bin: ``int(5 * (1 + action))``.

    The result is truncated toward zero, so -1.0 -> 0, 0.0 -> 5 and 1.0 -> 10.
    """
    shifted = 1. + action
    return int(5 * shifted)

In [None]:
# Train the Double DQN agent with epsilon-greedy exploration and n-step returns.
# After every training episode one greedy episode is played; the agent is
# checkpointed whenever that greedy return beats the best seen so far.
SEED = 420

env = make("MountainCarContinuous-v0")
env.seed(SEED)
# Exploration draws from `random` and `np.random`, and network initialisation
# draws from torch, so seed all of them (not only the environment).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

n = 15  # number of discrete actions the continuous action range is split into
ddqn = DDQN(state_dim=2,
            action_dim=n,
            hd1=12,
            hd2=12,
            update_rate=0.2)
eps_min = 0.001
rw = []          # greedy-episode return after each training episode
episodes = 1200
best = 0         # best greedy return so far; only returns above 0 are saved


def n_step_return(rewards):
    """Discounted sum of ``rewards`` (first reward undiscounted)."""
    return sum((GAMMA ** k) * r for k, r in enumerate(rewards))


for i in range(episodes):
    # ---- one training episode ----
    state = transform_state(env.reset())
    done = False
    # Epsilon depends only on the episode index, so compute it once.
    epsilon = eps(eps_min, i, top=1000)
    reward_buffer = deque(maxlen=N_STEP)
    state_buffer = deque(maxlen=N_STEP)
    action_buffer = deque(maxlen=N_STEP)
    while not done:
        if random.random() < epsilon:
            action = np.random.randint(n)
        else:
            action = ddqn.act(state)
        next_state, reward, done, _ = env.step(act(action, n))
        next_state = transform_state(next_state)
        reward_buffer.append(reward)
        state_buffer.append(state)
        action_buffer.append(action)
        if len(reward_buffer) == N_STEP:
            # Oldest buffered state/action paired with the state reached
            # N_STEP steps later and the discounted reward in between.
            ddqn.update((state_buffer[0],
                         action_buffer[0],
                         next_state,
                         n_step_return(reward_buffer),
                         done))
        state = next_state
    if len(reward_buffer) == N_STEP:
        # Flush the transitions that start fewer than N_STEP steps before the
        # end of the episode, using the rewards that remain.
        rb = list(reward_buffer)
        for k in range(1, N_STEP):
            ddqn.update((state_buffer[k],
                         action_buffer[k],
                         next_state,
                         n_step_return(rb[k:]),
                         done))

    # ---- one greedy evaluation episode ----
    rwc = 0
    state = transform_state(env.reset())
    done = False
    while not done:
        action = ddqn.act(state)
        next_state, reward, done, _ = env.step(act(action, n))
        next_state = transform_state(next_state)
        rwc += reward
        state = next_state
    if rwc > best:
        ddqn.save(i)
        best = rwc
    rw.append(rwc)
    # NOTE(review): the window sum is always divided by 50, even before 50
    # episodes have been played; confirm that an average of 200 is reachable
    # in this environment, otherwise this early stop never fires.
    if sum(rw[-50:])/50 > 200:
        break

In [141]:
# Evaluate a saved DDQN checkpoint greedily and report the mean episode return.
# Relies on `n` (number of discrete actions) from the training cell.
EVAL_EPISODES = 100  # single source for both the loop count and the divisor

env = make("MountainCarContinuous-v0")
total_reward = 0
tdqn = DDQN(2, n, 12, 12)
# NOTE(review): checkpoint index 2 is hard-coded; it only exists if the
# training run happened to save at episode 2.
tdqn.load(2)
for i in range(EVAL_EPISODES):
    state = transform_state(env.reset())
    done = False
    while not done:
        action = tdqn.act(state)
        next_state, reward, done, _ = env.step(act(action, n))
        next_state = transform_state(next_state)
        total_reward += reward
        state = next_state
tr = total_reward / EVAL_EPISODES
tr

97.61666222222524

In [0]:
# NOTE(review): this deletes every file whose name starts with "agent" in the
# working directory, which includes the agentCarA*/agentCarB* checkpoints
# written by DDQN.save. The Agent class further down loads "agentCarA2.pkl" and
# "agentCarB2.pkl", so on a top-to-bottom run this cell removes them first.
!rm agent*

In [13]:
class Agent:
    """Inference-only wrapper around the two saved Q-networks.

    Both networks are rebuilt with fixed sizes (2 inputs, two hidden layers of
    12 units, 15 outputs) and their weights are read from ``agentCarA2.pkl`` and
    ``agentCarB2.pkl`` in the working directory. Acting uses network A only.
    """

    @staticmethod
    def _make_net(state_dim, hidden1, hidden2, action_dim):
        """Create one two-hidden-layer ReLU network."""
        layers = [
            torch.nn.Linear(state_dim, hidden1),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden1, hidden2),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden2, action_dim),
        ]
        return torch.nn.Sequential(*layers)

    def __init__(self):
        sizes = dict(state_dim=2, hidden1=12, hidden2=12, action_dim=15)
        self.dqnA = self._make_net(**sizes)
        self.dqnB = self._make_net(**sizes)

        checkpoints = ((self.dqnA, "agentCarA2.pkl"),
                       (self.dqnB, "agentCarB2.pkl"))
        for net, filename in checkpoints:
            net.load_state_dict(torch.load(filename))

    def get_probsA(self, state):
        """Q-values of ``state`` (array-like) under network A."""
        return self.dqnA(Variable(torch.Tensor(state)))

    def get_probsB(self, state):
        """Q-values of ``state`` (array-like) under network B."""
        return self.dqnB(Variable(torch.Tensor(state)))

    def act(self, state, target=False):
        """Return the greedy action index for ``state``; ``target`` is unused."""
        q_values = self.get_probsA(state).data
        return int(torch.argmax(q_values))



In [14]:
# Instantiate the inference agent; its constructor reads "agentCarA2.pkl" and
# "agentCarB2.pkl" from the working directory, so both files must exist.
ag = Agent()

In [15]:
# Evaluate the standalone `ag` agent greedily and report the mean episode return.
EVAL_EPISODES = 100  # single source for both the loop count and the divisor
N_ACTIONS = 15       # must match the number of outputs of the Agent networks

env = make("MountainCarContinuous-v0")
total_reward = 0
for i in range(EVAL_EPISODES):
    state = transform_state(env.reset())
    done = False
    while not done:
        action = ag.act(state)
        next_state, reward, done, _ = env.step(act(action, N_ACTIONS))
        next_state = transform_state(next_state)
        total_reward += reward
        state = next_state
tr = total_reward / EVAL_EPISODES
tr

96.59444000000585