In [1]:
# Mountain Car- v0 solution using action-value neural network function approximator

In [1]:
import torch
import torch.nn as nn
from torchvision import transforms
import matplotlib.pyplot as plt
# from torch.autograd import Variable

# One hidden layer function approximator
class action_value_function(nn.Module):

    def __init__(self):

        super(action_value_function, self).__init__()
        self.block = nn.Sequential(
            torch.nn.Linear(2, 100, bias = False),
#             torch.nn.ReLU(),
#             torch.nn.Linear(20, 4),
#             torch.nn.ReLU(),
            torch.nn.Linear(100, 3, bias = False),
#             torch.nn.Softmax(),
        )

    def forward(self, input):
        return self.block(input)
    
# I tried different weight initializations but found they did not perform well.
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Linear') != -1:
        nn.init.normal_(m.weight, 0, 1)


In [2]:
# Instaciate action_value_function class
q_hat = action_value_function()
# q_hat.apply(weights_init)

In [3]:
import gym
import numpy as np

done = False
LEARNING_RATE = 0.1
DISCOUNT = 0.99
EPISODES = 25000
SHOW_EVERY = 300
BATCH_SIZE = 4

env = gym.make("MountainCar-v0")
env.reset()

DISCRETE_OS_SIZE = [20, 20] #* len(env.observation_space.high)
discrete_os_win_size = (env.observation_space.high - env.observation_space.low)/DISCRETE_OS_SIZE


In [4]:
# Test number of actions and other parameters
print('observation_space:',env.observation_space.low,'to',env.observation_space.high,'| Number of action values:',env.action_space.n)

observation_space: [-1.2  -0.07] to [0.6  0.07] | Number of action values: 3


In [5]:
# Function for discretization of observation state space
def get_discrete_state(state):
    discrete_state = (state - env.observation_space.low)/discrete_os_win_size
    return tuple(discrete_state.astype(np.int))  # we use this tuple to look up the 3 Q values for the available actions in the q-table

# Sampled version of q value table
q_table = np.random.uniform(low=-2, high=0, size=(DISCRETE_OS_SIZE + [env.action_space.n]))

In [6]:
import torch.optim as optim

criterion = nn.MSELoss()
optimizer = optim.SGD(q_hat.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)


In [7]:
avg_loss = 0
epsilon = 0.3
curr_high_pos = 0
curr_low_pos = 0
successes = 0
tot_reward = np.array([])

In [None]:
# Train agent
import random

for episode in range(EPISODES):
   
    state = torch.from_numpy(env.reset()).float()
    done = False
    
    if episode%SHOW_EVERY == 0:
        print('#'*50,'| RENDERING |','#'*49)
        RENDER = True
    else:
        RENDER = False

    running_loss = 0

    while not done:
        if RENDER:
            env.render()
        
        q = q_hat(state)
        
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = torch.argmax(q).item()
        
        next_state, reward, done, _ = env.step(action)
        next_state = torch.from_numpy(next_state).float()
        
        q_target = q.clone()
        q_target = q_target.data
        
        q_target[action] = reward + DISCOUNT * max(q_hat(next_state).detach())
        # Calculate loss
        loss = criterion(q, q_target)
        optimizer.zero_grad()
    
        loss.backward()
    
        optimizer.step()
        # zero the parameter gradients

        if next_state[0] >= 0.5:
            epsilon *= .99
            scheduler.step()
            break 
        

        else:
            state = next_state
    
        running_loss += loss.item()
            
    avg_loss += running_loss
    env.close()
    if episode%100 == 0:
        print('Episode:',episode,'| Average Loss:',avg_loss/100, '| Epsilon:',epsilon)
        avg_loss = 0


# env.close()

################################################## | RENDERING | #################################################
Episode: 0 | Average Loss: 0.6572934195399285 | Epsilon: 0.3
Episode: 100 | Average Loss: 91.24136128859058 | Epsilon: 0.3
Episode: 200 | Average Loss: 138.94484069248472 | Epsilon: 0.3
################################################## | RENDERING | #################################################
Episode: 300 | Average Loss: 131.32130029336025 | Epsilon: 0.3
Episode: 400 | Average Loss: 143.99756023009036 | Epsilon: 0.3
Episode: 500 | Average Loss: 155.47570345211935 | Epsilon: 0.3
################################################## | RENDERING | #################################################
Episode: 600 | Average Loss: 140.07814855983068 | Epsilon: 0.3
Episode: 700 | Average Loss: 132.555411678661 | Epsilon: 0.3
Episode: 800 | Average Loss: 136.43251731984927 | Epsilon: 0.3
################################################## | RENDERING | ############################

In [29]:
# Test your policy
env.close()
state = torch.from_numpy(env.reset()).float()
done = False
while not done:
    env.render()
    q = q_hat(state)
    action = torch.argmax(q).item()

    next_state, reward, done, _ = env.step(action)
    next_state = torch.from_numpy(next_state).float()
    state = next_state
    
env.close()