In [2]:
import matplotlib
import math
import numpy as np
%matplotlib inline
matplotlib.use("TkAgg")
import gym
import gridworld
from gym import wrappers, logger
    
import torch
import torchvision.datasets as datasets
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F
from torch import nn
from torch import optim
import copy

from random import sample
from collections import deque
# Pick the compute device once for the whole notebook: CUDA when PyTorch
# reports it as available, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("device : ",device)

device :  cuda


In [3]:
def UONoise(theta=0.15, sigma=0.8, x0=0):
    """Infinite generator of temporally correlated exploration noise.

    Each step applies the Ornstein-Uhlenbeck-style update
    ``x <- x - theta * x + sigma * N(0, 1)``: the value is pulled back
    towards zero with strength ``theta`` and perturbed by Gaussian noise
    scaled by ``sigma``.

    Args:
        theta: mean-reversion rate applied to the current value.
        sigma: scale of the standard-normal perturbation added at each step.
        x0: first value yielded by the generator.

    Yields:
        The current noise value; the first one is ``x0`` itself.
    """
    state = x0
    while True:
        yield state
        # The Gaussian draw comes from NumPy's global RNG, so np.random.seed
        # controls the sequence.
        state += -theta*state+sigma*np.random.randn()

class NN_Q(nn.Module):
    """Critic network: maps a (state, action) pair to a scalar Q-value.

    The state goes through a first hidden layer on its own; the action is
    concatenated to that hidden representation before the second layer.

    Args:
        state_dim: number of features in one observation (default 2).
        action_dim: number of components in one action (default 1).
    """
    def __init__(self, state_dim=2, action_dim=1):
        super(NN_Q, self).__init__()
        # Layers are created in the same order as before so that default
        # construction consumes the RNG identically and state_dict keys match.
        self.f1 = nn.Linear(state_dim,100)
        # f2 sees the 100 hidden units plus the raw action.
        self.f2 = nn.Linear(100+action_dim,150)
        self.f3 = nn.Linear(150,1)
        
        self.bn0 = nn.BatchNorm1d(state_dim)
        self.bn1 = nn.BatchNorm1d(100)
        self.bn2 = nn.BatchNorm1d(150)
        
    def forward(self, x , action ):
        """Return Q(x, action).

        Args:
            x: batch of states, shape (batch, state_dim).
            action: batch of actions, shape (batch, action_dim).

        Returns:
            Tensor of shape (batch, 1) with the estimated Q-values.
        """
        x = self.bn0(x)
        x = torch.relu(self.f1(x))
        x = self.bn1(x)
        # Inject the action after the first hidden layer.
        x = torch.cat((x,action),dim=1)
        x = torch.relu(self.f2(x))
        x = self.bn2(x)
        x = self.f3(x)
        return x

class NN_mu(nn.Module):
    """Actor network: maps a state to a deterministic action in [-1, 1].

    Args:
        state_dim: number of features in one observation (default 2).
        action_dim: number of components in one action (default 1).
    """
    def __init__(self, state_dim=2, action_dim=1):
        super(NN_mu, self).__init__()
        # Same creation order as before, so default construction consumes the
        # RNG identically and state_dict keys are unchanged.
        self.f1 = nn.Linear(state_dim,50)
        self.f2 = nn.Linear(50,25)
        self.f3 = nn.Linear(25,action_dim)
        
        self.bn0 = nn.BatchNorm1d(state_dim)
        self.bn1 = nn.BatchNorm1d(50)
        self.bn2 = nn.BatchNorm1d(25)
        
    def forward(self, x ):
        """Return the action for each state in the batch.

        Args:
            x: batch of states, shape (batch, state_dim).

        Returns:
            Tensor of shape (batch, action_dim); the final tanh bounds every
            component to [-1, 1].
        """
        x = self.bn0(x)
        x = torch.relu(self.f1(x))
        x = self.bn1(x)
        x = torch.relu(self.f2(x))
        x = self.bn2(x)
        x = torch.tanh(self.f3(x))
        return x

class Memory():
    """Bounded replay buffer of transitions.

    Each entry is the list [last_obs, action, reward, obs, done]; once N
    entries are held, appending a new one drops the oldest.
    """
    def __init__(self, N=500000):
        self.data = deque(maxlen=N)

    def sample(self,n):
        """Draw n stored transitions without replacement and batch them.

        Returns the tuple (lastobs, action, r, obs, done). Observations and
        rewards are stacked along a new leading dimension, actions are
        concatenated along dimension 0, and the done flags become a float
        column tensor moved to the module-level `device`.
        """
        batch = sample(self.data,n)

        def column(k):
            # k-th field of every sampled transition, in sampling order
            return [transition[k] for transition in batch]

        lastobs = torch.stack(column(0),dim=0)
        action = torch.cat(column(1),dim=0)
        r = torch.stack(column(2),dim=0)
        obs = torch.stack(column(3),dim=0)
        flags = torch.Tensor(column(4))
        done = flags.unsqueeze(1).to(device)

        return lastobs,action,r,obs,done

    def store(self,last_obs,a,r,obs,done):
        """Append one transition to the buffer."""
        transition = [last_obs,a,r,obs,done]
        self.data.append(transition)

class DDPG_agent():
    """Deep Deterministic Policy Gradient agent (actor `mu`, critic `Q`).

    The agent keeps a learned and a target copy of both networks, a replay
    memory, and a noise generator for exploration. `act` is the only entry
    point the training loop needs: it records the previous transition,
    optionally trains, and returns the next action.

    Relies on two module-level names: `device` (where tensors and networks
    live) and `writer` (a TensorBoard SummaryWriter used by `update`).

    Args:
        tau: soft-update rate of the target networks.
        gamma: discount factor.
        batch: mini-batch size sampled from the replay memory.
        update_freq: training only happens when the episode counter is a
            multiple of this value.
        max_explo: number of episodes over which the exploration noise is
            linearly scaled down to zero; 0 or less disables exploration.
        epochs: number of critic/actor gradient steps per call to `update`.
        start_train: training starts once the episode counter exceeds it.
    """
    def __init__(self,tau=0.001,gamma=0.99,batch=64,update_freq=1,max_explo=200,epochs=1,start_train=1):
        
        #Creating Q functions
        self.Q = NN_Q().to(device,torch.double)
        self.Q_target = NN_Q().to(device,torch.double)
        self.Q_loss = nn.MSELoss()
        self.Q_target.load_state_dict(self.Q.state_dict())
        self.opt_Q = torch.optim.Adam(self.Q.parameters(),lr=0.001)
        
        #Creating mu functions
        self.mu = NN_mu().to(device,torch.double)
        self.mu_target = NN_mu().to(device,torch.double)
        self.mu_target.load_state_dict(self.mu.state_dict())
        self.opt_mu = torch.optim.Adam(self.mu.parameters(),lr=0.0001)
        # The actor is kept in eval mode outside of `update` so that acting on
        # a single observation uses BatchNorm running statistics.
        self.mu.eval()
        
        #Memory storage
        self.memory = Memory()
        self.last_obs = torch.zeros(0)   # empty tensor = "no previous observation yet"
        self.last_a = None
        
        #Hyperparameters
        self.tau = tau
        self.gamma = gamma
        self.batch = batch
        self.update_freq = update_freq
        self.epochs = epochs
        self.start_train = start_train
        
        #Noise handling and indexing
        self.max_explo = max_explo
        self.episodes = 0
        self.exploration = UONoise() #generator of correlated exploration noise
        self.i = 0                   #nb of updates
        
    def phi(self,obs):
        """Convert a raw observation into a double-precision tensor on `device`."""
        return torch.Tensor(obs).to(device,torch.double)
        
    def update(self):
        """Run `epochs` critic/actor gradient steps, then soft-update the targets.

        Does nothing while the replay memory holds fewer than `batch`
        transitions, because sampling a mini-batch would raise ValueError.
        """
        if len(self.memory.data) < self.batch:
            return

        self.mu.train()
        
        for _ in range(self.epochs):
            self.i +=1
            lastobs,action,r,obs,done = self.memory.sample(self.batch)
               
            #1- Critic update (Q)
            self.opt_Q.zero_grad()
            self.opt_mu.zero_grad()
            with torch.no_grad():
                # Bootstrapped target; (1-done) removes the future term on terminal transitions.
                y = r + self.gamma * (1-done) * self.Q_target.forward(obs,self.mu_target.forward(obs))
            Qloss = self.Q_loss( y , self.Q.forward(lastobs,action) )
            writer.add_scalar('QLoss',Qloss.item(),self.i)
            Qloss.backward()
            self.opt_Q.step()


            #2- Actor update (mu)
            self.opt_Q.zero_grad()
            self.opt_mu.zero_grad()
            # Maximise Q(s, mu(s)) by minimising its negation.
            mu_loss = -self.Q.forward( lastobs , self.mu.forward(lastobs) ).mean()
            # Logged with the sign flipped back, i.e. the mean Q-value of the actor's actions.
            writer.add_scalar('mu_loss',-mu_loss.item(),self.i)
            mu_loss.backward()
            self.opt_mu.step()

        #3- smooth update of Q and mu
        # NOTE(review): the target networks are never switched to eval mode and
        # only parameters() are blended here, not BatchNorm buffers - confirm
        # this is intended.
        for p_target,p in zip(self.Q_target.parameters(),self.Q.parameters()):
            p_target.data.copy_( self.tau * p.data + (1-self.tau) * p_target.data )
        for p_target,p in zip(self.mu_target.parameters(),self.mu.parameters()):
            p_target.data.copy_( self.tau * p.data + (1-self.tau) * p_target.data )
            
        self.mu.eval()

        
    
    def act(self,obs,r,done):
        """Store the previous transition, maybe train, and choose the next action.

        Args:
            obs: current observation from the environment.
            r: reward obtained by the previous action.
            done: whether the previous action ended its episode.

        Returns:
            The chosen action as a Python float (clamped to [-1, 1]).
        """
        obs = self.phi(obs)
        r = torch.Tensor([r]).to(device,torch.double)
        # Noise scale decays linearly from 1 to 0 over the first `max_explo`
        # episodes; a non-positive `max_explo` means no exploration at all
        # (and avoids dividing by zero).
        if self.max_explo > 0:
            exploration = 1 - min(self.episodes,self.max_explo) / self.max_explo
        else:
            exploration = 0.0
        
        with torch.no_grad():
            action = torch.clamp(self.mu.forward(obs.unsqueeze(0)) + next(self.exploration)*exploration ,-1 ,1 )
                 
        
        # (r, done) describe the outcome of the PREVIOUS action, so the
        # transition can only be completed now that `obs` is known.
        if self.last_obs.shape[0] != 0:
            self.memory.store(self.last_obs,self.last_a,r,obs,done)
            
        if self.episodes%self.update_freq==0 and self.episodes>self.start_train:
            self.update()
        
        if done:
            self.episodes +=1
        
        self.last_obs = obs
        self.last_a = action
        return action.item()

In [4]:
# Train a DDPG agent on a continuous-control gym environment.
ENV_NAME = 'MountainCarContinuous-v0'
# Log under the environment that is actually trained on, so TensorBoard runs
# are not filed under another environment's name.
writer = SummaryWriter("runs/" + ENV_NAME + "/DDPG")
env = gym.make(ENV_NAME)
agent = DDPG_agent()
env.seed(0)
# `reward` and `done` are deliberately NOT reset at the start of an episode:
# the agent receives the outcome of the previous step on its next call, and
# that is how it stores the terminal transition and counts finished episodes.
reward = 0
done = False
rsum = 0
episode_count = 5000

#Training phase
print("Starting training phase on ",episode_count," episodes :")
try:
    for i in range(1,episode_count+1):
        obs = env.reset()
        j = 0
        rsum = 0

        while True:
            action = agent.act(obs,reward,done)
            obs, reward, done, _ = env.step([action])
            rsum += reward
            j += 1
            # Only render every 50th episode to keep training fast.
            if (i%50==0):
                env.render()
            if done:
                writer.add_scalar('train rewards',rsum,i)
                print("Episode : " + str(i) + " rsum=" + str(round(rsum,2)) + ", " + str(j) + " actions")
                break
finally:
    # Release the environment and push pending scalars to disk even when
    # training is interrupted or raises.
    writer.flush()
    env.close()

Starting training phase on  5000  episodes :


RuntimeError: running_mean should contain 8 elements not 2

In [None]:
# env.   (incomplete scratch expression, commented out so this cell no longer raises SyntaxError)