In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import gym
from collections import deque
import random
import itertools
from torch import nn
from torch.nn import functional
import matplotlib.pyplot as plt




device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [6]:
class Fourier_Basis:
  def __init__(self, order, k):
    #self.env = env
    self.order = [order]*k
    self.coefficients = np.array([])

  def get_coefficients(self):
    prods = [range(0, i+1) for i in self.order]
    #print(prods)
    coeffs = [v for v in itertools.product(*prods)]
    self.coefficients = np.array(coeffs)
    return self.coefficients
  
  def value(self, state):
    self.get_coefficients()
    return np.cos(np.pi*np.dot(self.coefficients, state))

In [15]:
class Model(nn.Module):
    def __init__(self, input_features, output_values):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(in_features=input_features, out_features=32)
        self.fc2 = nn.Linear(in_features=32, out_features=32)
        self.fc3 = nn.Linear(in_features=32, out_features=output_values)

    def forward(self, x):
        x = functional.selu(self.fc1(x))
        x = functional.selu(self.fc2(x))
        x = self.fc3(x)
        return x



n_features = len(env.observation_space.high)
n_actions = env.action_space.n

memory = deque(maxlen=memory_len)
# each memory entry is in form: (state, action, env_reward, next_state)
device = torch.device("cuda" if use_cuda and torch.cuda.is_available() else "cpu")
criterion = nn.MSELoss()
FB = Fourier_Basis(order, k)
# policy_net = Model(n_features, n_actions).to(device)
# target_net = Model(n_features, n_actions).to(device)
input_features = (order + 1)**k
output_values = env.action_space.n
policy_net = Model(input_features, output_values).to(device)
target_net = Model(input_features, output_values).to(device)

target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

u_state = env.observation_space.high
l_state = env.observation_space.low
d_state = u_state - l_state


def get_states_tensor(sample, states_idx):
    sample_len = len(sample)
    states_tensor = torch.empty((sample_len, n_features), dtype=torch.float32, requires_grad=False)

    features_range = range(n_features)
    for i in range(sample_len):
        for j in features_range:
            states_tensor[i, j] = sample[i][states_idx][j].item()

    return states_tensor


def normalize_state(state):
    #state = state[0]
    if len(state) != 2:
        state = state[0]
    state[0] /= 1.8
    state[1] /= 0.14
    #state[2] /= 0.3
    #state[3] /= 0.3


def state_reward(state, env_reward):
    #print((abs(state[0]) + abs(state[2])) / 2.5)
    #print(env_reward - (abs(state[0]) + abs(state[2])) / 2.5)
    return env_reward - (abs(state[0]) + abs(state[1])) / 2.5


def get_action(state, e=min_epsilon):
    if random.random() < e:
        action = random.randrange(0, n_actions)
    else:
        #print(state)
        encoded_state = (state - l_state) / d_state
        state_cos = FB.value(encoded_state)

        encoded_state = torch.tensor(state_cos, dtype=torch.float32, device=device)
        action = policy_net(encoded_state).argmax().item()

    return action


def fit(model, inputs, labels):
    inputs = inputs.to(device)
    labels = labels.to(device)
    train_ds = TensorDataset(inputs, labels)
    train_dl = DataLoader(train_ds, batch_size=5)

    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
    model.train()
    total_loss = 0.0

    for x, y in train_dl:
        out = model(x)
        loss = criterion(out, y)
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()

    return total_loss / len(inputs)


def optimize_model(train_batch_size=100):
    train_batch_size = min(train_batch_size, len(memory))
    train_sample = random.sample(memory, train_batch_size)
    

    state = torch.tensor([s[0] for s in train_sample], dtype=torch.float32, device=device)
    next_state = torch.tensor([s[3] for s in train_sample], dtype=torch.float32, device=device)

    q_estimates = policy_net(state).detach()
    next_state_q_estimates = target_net(next_state).detach()
    next_actions = policy_net(next_state).argmax(dim=1)

    for i in range(len(train_sample)):
        next_action = next_actions[i].item()
        q_estimates[i][train_sample[i][1]] = (train_sample[i][2] +
                                              gamma * next_state_q_estimates[i][next_action].item())

    fit(policy_net, state, q_estimates)




def train_one_episode():
    global epsilon
    current_state = env.reset()
    #normalize_state(current_state)
    done = False
    score = 0
    reward = 0
    steps = 0
    while not done and steps < 200:
        if len(current_state) != 4:
            current_state = current_state[0]
        action = get_action(current_state, epsilon)
        next_state, env_reward, done, info, _ = env.step(action)
        #normalize_state(next_state)
        encoded_current_state = FB.value((current_state - l_state) / d_state)
        encoded_next_state = FB.value((next_state - l_state) / d_state)
        #print(encoded_current_state)
        memory.append((encoded_current_state, action, env_reward, encoded_next_state))
        current_state = next_state
        score += env_reward
        reward += state_reward(next_state, env_reward)
        #print(reward)

        optimize_model(100)

        epsilon -= epsilon_decay
        steps += 1

    return score, reward

def test():
    state = env.reset()
    #normalize_state(state)
    done = False
    score = 0
    steps = 0
    reward = 0
    while not done and steps < 200:
        #encoded_state = FB.value((state - l_state) / d_state)
        if len(state) != 4:
            state = state[0]
        action = get_action(state)
        state, env_reward, done, info, _ = env.step(action)
        #normalize_state(state)
        score += env_reward
        #print(score)
        reward += state_reward(state, env_reward)
        steps += 1


    return score, reward



def main():
    best_test_reward = 0
    global reward_list
    reward_list = []

    for i in range(episode_limit):
        score, reward = train_one_episode()
        reward_list.append(reward)

        print(f'Episode {i + 1}: score: {score} - reward: {reward}')

        if i % target_update_delay == 0:
            target_net.load_state_dict(policy_net.state_dict())
            target_net.eval()

        if (i + 1) % test_delay == 0:
            test_score, test_reward = test()
            print(f'Test Episode {i + 1}: test score: {test_score} - test reward: {test_reward}')
            if test_reward > best_test_reward:
                print('New best test reward. Saving model')
                best_test_reward = test_reward
                torch.save(policy_net.state_dict(), 'policy_net.pth')

    if episode_limit % test_delay != 0:
        test_score, test_reward = test()
        print(f'Test Episode {episode_limit}: test score: {test_score} - test reward: {test_reward}')
        if test_reward > best_test_reward:
            print('New best test reward. Saving model')
            best_test_reward = test_reward
            torch.save(policy_net.state_dict(), 'policy_net.pth')

    print(f'best test reward: {best_test_reward}')


if __name__ == '__main__':
    main()

Episode 1: score: -200.0 - reward: -241.87732808589942
Episode 2: score: -200.0 - reward: -243.47483144998554
Episode 3: score: -200.0 - reward: -242.41111849546425
Episode 4: score: -200.0 - reward: -241.18677787780754
Episode 5: score: -200.0 - reward: -242.8466920018196
Episode 6: score: -200.0 - reward: -242.43963247537616
Episode 7: score: -200.0 - reward: -241.29141353368755
Episode 8: score: -200.0 - reward: -247.29067519903177
Episode 9: score: -200.0 - reward: -240.79542729854572
Episode 10: score: -200.0 - reward: -244.92836153507236
Test Episode 10: test score: -500.0 - test reward: -619.5503972768786
Episode 11: score: -200.0 - reward: -246.4071639299393
Episode 12: score: -200.0 - reward: -243.0556904196739
Episode 13: score: -200.0 - reward: -243.17475211620334
Episode 14: score: -200.0 - reward: -242.6519859194756
Episode 15: score: -200.0 - reward: -241.14762754440292
Episode 16: score: -200.0 - reward: -240.97588378190997
Episode 17: score: -200.0 - reward: -239.940153

In [12]:
# Parameters
use_cuda = True
env = gym.make('MountainCar-v0')
env._max_episode_steps = 200
episode_limit = 500
target_update_delay = 2  # update target net every target_update_delay episodes
test_delay = 10
order = 10
k = env.observation_space.shape[0]
learning_rate = 1e-4
epsilon = 0.5  # initial epsilon
min_epsilon = 0.1
epsilon_decay = 0.9 / 2.5e3
gamma = 0.99
memory_len = 10000