https://github.com/ranjitation/DQN-for-LunarLander/blob/master/dqn_agent.py

In [37]:
import gym
import csv
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from collections import deque, namedtuple
import matplotlib.pyplot as plt
%matplotlib inline

In [38]:
class QNetwork(nn.Module):
    """Three-layer fully connected network: state in, one value per action out."""

    def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64):
        """Seed torch's global RNG and create the linear layers.

        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in second hidden layer
        """
        super().__init__()
        # Keep whatever torch.manual_seed() hands back on the instance.
        self.seed = torch.manual_seed(seed)

        # Layers are created input -> output, right after seeding, so two
        # networks built with the same seed start from the same weights.
        hidden_in, hidden_out = fc1_units, fc2_units
        self.fc1 = nn.Linear(state_size, hidden_in)
        self.fc2 = nn.Linear(hidden_in, hidden_out)
        self.fc3 = nn.Linear(hidden_out, action_size)

    def forward(self, state):
        """Map a batch of states to action values."""
        activations = state
        # Both hidden layers are followed by a ReLU; the output layer is linear.
        for hidden_layer in (self.fc1, self.fc2):
            activations = F.relu(hidden_layer(activations))
        return self.fc3(activations)

In [39]:
# Hyperparameters read by Agent (and passed on to ReplayBuffer).
BUFFER_SIZE = 100000    # replay buffer size (number of stored experiences)
BATCH_SIZE = 64         # minibatch size drawn for each learning step
GAMMA = 0.99            # discount factor
TAU = 0.001             # interpolation factor for the soft target update
LR = 0.0005             # learning rate handed to the Adam optimizer
UPDATE_EVERY = 4        # learn once every this many agent steps

# Use the first CUDA device when torch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

env = gym.make('LunarLander-v2')
env.seed(0)

[0]

In [40]:
class Agent():
    """DQN agent: records transitions, acts epsilon-greedily and trains a Q-network."""

    def __init__(self, state_size, action_size, seed):
        """Create the local/target networks, the optimizer and the replay memory.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() seeds Python's RNG and returns None, so this attribute is None.
        self.seed = random.seed(seed)

        # Local network is the one being optimized; the target network only
        # changes through soft_update().
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Experience storage
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Step counter, wrapped modulo UPDATE_EVERY in step()
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and, every UPDATE_EVERY calls, learn from a sampled batch."""
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step != 0:
            return
        # Learning only starts once the memory holds more than one batch.
        if len(self.memory) <= BATCH_SIZE:
            return
        batch = self.memory.sample()
        self.learn(batch, GAMMA)

    def act(self, state, eps=0.):
        """Pick an action for `state` with epsilon-greedy selection.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state_batch = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Evaluate without tracking gradients, then restore training mode.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state_batch)
        self.qnetwork_local.train()

        greedy = random.random() > eps
        if greedy:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Run one optimization step on a batch, then soft-update the target network.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Highest target-network value for each next state, kept as a column.
        next_values = self.qnetwork_target(next_states).detach()
        best_next = next_values.max(1)[0].unsqueeze(1)
        # The (1 - dones) factor removes the bootstrap term where done is 1.
        td_targets = rewards + (gamma * best_next * (1 - dones))

        # Local-network values of the actions that were actually taken.
        taken_values = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(taken_values, td_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Blend the local weights into the target model in place.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        param_pairs = zip(target_model.parameters(), local_model.parameters())
        for target_param, local_param in param_pairs:
            blended = tau*local_param.data + (1.0-tau)*target_param.data
            target_param.data.copy_(blended)


class ReplayBuffer:
    """Bounded store of experience tuples with uniform random sampling."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Set up an empty buffer.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size
        self.batch_size = batch_size
        # A deque with maxlen drops its oldest entry once it is full.
        self.memory = deque(maxlen=buffer_size)
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"],
        )
        # random.seed() seeds Python's RNG and returns None, so this attribute is None.
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Append one experience tuple to the buffer."""
        self.memory.append(
            self.experience(state, action, reward, next_state, done)
        )

    def sample(self):
        """Draw `batch_size` experiences at random and return them as tensors on `device`."""
        drawn = random.sample(self.memory, k=self.batch_size)
        batch = [exp for exp in drawn if exp is not None]

        def column(field):
            # Stack one field of every sampled experience into a 2-D array.
            return np.vstack([getattr(exp, field) for exp in batch])

        states = torch.from_numpy(column("state")).float().to(device)
        actions = torch.from_numpy(column("action")).long().to(device)
        rewards = torch.from_numpy(column("reward")).float().to(device)
        next_states = torch.from_numpy(column("next_state")).float().to(device)
        # Done flags go through uint8 before becoming floats.
        done_flags = column("done").astype(np.uint8)
        dones = torch.from_numpy(done_flags).float().to(device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Number of experiences currently stored."""
        return len(self.memory)

In [41]:
def dqn(n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.

    Runs epsilon-greedy episodes using the module-level ``env`` and ``agent``.
    After every episode the score is appended to "DQN_LUNAR_train_scores.csv"
    (the file is overwritten at the start of each call) and the mean of the
    last 100 scores is printed.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon

    Returns
    =======
        list: the score of every episode, in the order they were played
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon

    # newline='' is what the csv module requires for files it writes to;
    # without it, platforms that translate "\n" turn the writer's "\r\n"
    # into "\r\r\n" and the file shows an empty row after every record.
    with open("DQN_LUNAR_train_scores.csv", "w", newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(["episode", "score"])

        for i_episode in range(1, n_episodes+1):
            state = env.reset()
            score = 0
            for _ in range(max_t):
                action = agent.act(state, eps)
                next_state, reward, done, _info = env.step(action)
                agent.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break

            scores_window.append(score)       # save most recent score
            scores.append(score)              # save most recent score
            eps = max(eps_end, eps_decay*eps) # decrease epsilon
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
            writer.writerow([i_episode, score])
    return scores

# Build the agent for 8 state dimensions and 4 actions; dqn() looks this
# module-level name up when it runs, so it must exist before the call below.
agent = Agent(state_size=8, action_size=4, seed=0)
# Train with dqn()'s default arguments; `scores` is the list it returns
# (one entry per episode).
scores = dqn()

Episode 1	Average Score: -280.78
Episode 2	Average Score: -253.76
Episode 3	Average Score: -191.90
Episode 4	Average Score: -175.44
Episode 5	Average Score: -162.29
Episode 6	Average Score: -150.11
Episode 7	Average Score: -139.52
Episode 8	Average Score: -140.86
Episode 9	Average Score: -135.67
Episode 10	Average Score: -161.64
Episode 11	Average Score: -149.89
Episode 12	Average Score: -155.69
Episode 13	Average Score: -170.63
Episode 14	Average Score: -173.98
Episode 15	Average Score: -173.85
Episode 16	Average Score: -172.03
Episode 17	Average Score: -159.53
Episode 18	Average Score: -156.12
Episode 19	Average Score: -155.45
Episode 20	Average Score: -152.78
Episode 21	Average Score: -154.58
Episode 22	Average Score: -150.26
Episode 23	Average Score: -150.21
Episode 24	Average Score: -158.17
Episode 25	Average Score: -169.37
Episode 26	Average Score: -166.74
Episode 27	Average Score: -163.22
Episode 28	Average Score: -162.83
Episode 29	Average Score: -159.63
Episode 30	Average Scor

Episode 240	Average Score: -85.97
Episode 241	Average Score: -87.24
Episode 242	Average Score: -84.05
Episode 243	Average Score: -83.16
Episode 244	Average Score: -82.89
Episode 245	Average Score: -82.10
Episode 246	Average Score: -79.10
Episode 247	Average Score: -78.46
Episode 248	Average Score: -77.44
Episode 249	Average Score: -76.93
Episode 250	Average Score: -75.94
Episode 251	Average Score: -74.22
Episode 252	Average Score: -74.10
Episode 253	Average Score: -72.05
Episode 254	Average Score: -72.03
Episode 255	Average Score: -71.54
Episode 256	Average Score: -71.91
Episode 257	Average Score: -71.90
Episode 258	Average Score: -73.10
Episode 259	Average Score: -72.30
Episode 260	Average Score: -73.73
Episode 261	Average Score: -72.89
Episode 262	Average Score: -71.35
Episode 263	Average Score: -70.26
Episode 264	Average Score: -73.00
Episode 265	Average Score: -72.22
Episode 266	Average Score: -75.03
Episode 267	Average Score: -74.22
Episode 268	Average Score: -73.69
Episode 269	Av

Episode 482	Average Score: 9.55
Episode 483	Average Score: 8.73
Episode 484	Average Score: 8.73
Episode 485	Average Score: 8.35
Episode 486	Average Score: 8.81
Episode 487	Average Score: 8.76
Episode 488	Average Score: 11.03
Episode 489	Average Score: 11.79
Episode 490	Average Score: 10.37
Episode 491	Average Score: 10.59
Episode 492	Average Score: 10.76
Episode 493	Average Score: 11.43
Episode 494	Average Score: 11.32
Episode 495	Average Score: 13.27
Episode 496	Average Score: 16.43
Episode 497	Average Score: 18.63
Episode 498	Average Score: 21.24
Episode 499	Average Score: 22.98
Episode 500	Average Score: 23.27
Episode 501	Average Score: 23.79
Episode 502	Average Score: 25.22
Episode 503	Average Score: 25.97
Episode 504	Average Score: 24.87
Episode 505	Average Score: 26.70
Episode 506	Average Score: 26.73
Episode 507	Average Score: 27.21
Episode 508	Average Score: 29.70
Episode 509	Average Score: 30.24
Episode 510	Average Score: 30.71
Episode 511	Average Score: 30.96
Episode 512	Aver

Episode 729	Average Score: 163.22
Episode 730	Average Score: 163.29
Episode 731	Average Score: 164.36
Episode 732	Average Score: 165.54
Episode 733	Average Score: 165.64
Episode 734	Average Score: 164.84
Episode 735	Average Score: 165.25
Episode 736	Average Score: 166.24
Episode 737	Average Score: 168.85
Episode 738	Average Score: 169.88
Episode 739	Average Score: 171.26
Episode 740	Average Score: 173.84
Episode 741	Average Score: 175.64
Episode 742	Average Score: 175.98
Episode 743	Average Score: 176.31
Episode 744	Average Score: 177.20
Episode 745	Average Score: 177.92
Episode 746	Average Score: 182.56
Episode 747	Average Score: 182.78
Episode 748	Average Score: 182.42
Episode 749	Average Score: 181.98
Episode 750	Average Score: 182.19
Episode 751	Average Score: 182.06
Episode 752	Average Score: 181.88
Episode 753	Average Score: 181.01
Episode 754	Average Score: 181.55
Episode 755	Average Score: 181.68
Episode 756	Average Score: 181.58
Episode 757	Average Score: 183.85
Episode 758	Av

Episode 970	Average Score: 225.96
Episode 971	Average Score: 226.52
Episode 972	Average Score: 224.34
Episode 973	Average Score: 224.78
Episode 974	Average Score: 224.30
Episode 975	Average Score: 224.75
Episode 976	Average Score: 224.67
Episode 977	Average Score: 224.44
Episode 978	Average Score: 224.64
Episode 979	Average Score: 222.84
Episode 980	Average Score: 223.25
Episode 981	Average Score: 223.61
Episode 982	Average Score: 221.76
Episode 983	Average Score: 221.50
Episode 984	Average Score: 222.03
Episode 985	Average Score: 222.07
Episode 986	Average Score: 224.06
Episode 987	Average Score: 224.22
Episode 988	Average Score: 223.94
Episode 989	Average Score: 225.10
Episode 990	Average Score: 225.57
Episode 991	Average Score: 226.19
Episode 992	Average Score: 226.98
Episode 993	Average Score: 227.44
Episode 994	Average Score: 226.88
Episode 995	Average Score: 226.81
Episode 996	Average Score: 225.69
Episode 997	Average Score: 225.43
Episode 998	Average Score: 225.29
Episode 999	Av

Episode 1205	Average Score: 193.71
Episode 1206	Average Score: 193.95
Episode 1207	Average Score: 194.26
Episode 1208	Average Score: 194.49
Episode 1209	Average Score: 194.34
Episode 1210	Average Score: 194.17
Episode 1211	Average Score: 194.43
Episode 1212	Average Score: 197.31
Episode 1213	Average Score: 195.33
Episode 1214	Average Score: 192.62
Episode 1215	Average Score: 194.66
Episode 1216	Average Score: 195.55
Episode 1217	Average Score: 195.59
Episode 1218	Average Score: 195.45
Episode 1219	Average Score: 195.29
Episode 1220	Average Score: 192.75
Episode 1221	Average Score: 193.52
Episode 1222	Average Score: 193.12
Episode 1223	Average Score: 192.68
Episode 1224	Average Score: 192.20
Episode 1225	Average Score: 191.73
Episode 1226	Average Score: 191.97
Episode 1227	Average Score: 191.34
Episode 1228	Average Score: 191.62
Episode 1229	Average Score: 191.39
Episode 1230	Average Score: 192.03
Episode 1231	Average Score: 192.08
Episode 1232	Average Score: 192.77
Episode 1233	Average

Episode 1440	Average Score: 193.10
Episode 1441	Average Score: 192.93
Episode 1442	Average Score: 193.71
Episode 1443	Average Score: 193.88
Episode 1444	Average Score: 194.08
Episode 1445	Average Score: 194.54
Episode 1446	Average Score: 193.88
Episode 1447	Average Score: 194.20
Episode 1448	Average Score: 194.70
Episode 1449	Average Score: 191.97
Episode 1450	Average Score: 191.65
Episode 1451	Average Score: 191.80
Episode 1452	Average Score: 188.71
Episode 1453	Average Score: 188.80
Episode 1454	Average Score: 186.03
Episode 1455	Average Score: 185.41
Episode 1456	Average Score: 185.93
Episode 1457	Average Score: 185.41
Episode 1458	Average Score: 182.53
Episode 1459	Average Score: 182.01
Episode 1460	Average Score: 182.85
Episode 1461	Average Score: 182.87
Episode 1462	Average Score: 183.38
Episode 1463	Average Score: 183.79
Episode 1464	Average Score: 186.13
Episode 1465	Average Score: 187.89
Episode 1466	Average Score: 187.94
Episode 1467	Average Score: 186.16
Episode 1468	Average

Episode 1675	Average Score: 179.55
Episode 1676	Average Score: 179.32
Episode 1677	Average Score: 179.05
Episode 1678	Average Score: 178.85
Episode 1679	Average Score: 179.34
Episode 1680	Average Score: 177.34
Episode 1681	Average Score: 177.54
Episode 1682	Average Score: 175.33
Episode 1683	Average Score: 175.54
Episode 1684	Average Score: 173.05
Episode 1685	Average Score: 172.04
Episode 1686	Average Score: 171.97
Episode 1687	Average Score: 172.04
Episode 1688	Average Score: 171.84
Episode 1689	Average Score: 172.69
Episode 1690	Average Score: 173.36
Episode 1691	Average Score: 174.58
Episode 1692	Average Score: 177.27
Episode 1693	Average Score: 177.46
Episode 1694	Average Score: 177.84
Episode 1695	Average Score: 177.93
Episode 1696	Average Score: 177.88
Episode 1697	Average Score: 174.80
Episode 1698	Average Score: 172.13
Episode 1699	Average Score: 171.99
Episode 1700	Average Score: 172.21
Episode 1701	Average Score: 175.02
Episode 1702	Average Score: 175.59
Episode 1703	Average

Episode 1910	Average Score: 151.10
Episode 1911	Average Score: 151.19
Episode 1912	Average Score: 151.39
Episode 1913	Average Score: 150.87
Episode 1914	Average Score: 151.94
Episode 1915	Average Score: 147.76
Episode 1916	Average Score: 147.28
Episode 1917	Average Score: 146.66
Episode 1918	Average Score: 146.04
Episode 1919	Average Score: 145.04
Episode 1920	Average Score: 143.10
Episode 1921	Average Score: 141.95
Episode 1922	Average Score: 142.32
Episode 1923	Average Score: 142.65
Episode 1924	Average Score: 142.70
Episode 1925	Average Score: 145.60
Episode 1926	Average Score: 146.55
Episode 1927	Average Score: 143.91
Episode 1928	Average Score: 147.30
Episode 1929	Average Score: 147.21
Episode 1930	Average Score: 147.71
Episode 1931	Average Score: 147.99
Episode 1932	Average Score: 151.58
Episode 1933	Average Score: 152.11
Episode 1934	Average Score: 155.75
Episode 1935	Average Score: 153.22
Episode 1936	Average Score: 152.93
Episode 1937	Average Score: 150.32
Episode 1938	Average

In [None]:
# Draw the per-episode scores on a single-axes figure.
fig = plt.figure()
ax = fig.add_subplot(111)
episode_index = np.arange(len(scores))
ax.plot(episode_index, scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()