In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import os

In [None]:
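# Load the dataset here.
# NOTE(review): later cells use `A`, `wind`, `means`, `reached_goal`, `clusters`,
# `num_concepts` and `examples_per_concept`, none of which are defined in the cells
# visible in this notebook; presumably they are meant to be defined in this cell.
# Confirm before relying on Restart Kernel -> Run All.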

In [None]:
class DQNAgent:
  """Deep Q-learning agent with experience replay and epsilon-greedy exploration.

  The agent keeps a bounded replay buffer of
  ``(state, action, reward, next_state, done)`` transitions and a small
  fully connected Keras network that maps a state to one Q-value per action.

  Parameters
  ----------
  state_size : int
      Number of features in a state vector (input width of the network).
  action_size : int
      Number of discrete actions (output width of the network).
  """

  def __init__(self, state_size, action_size):
    self.state_size = state_size
    self.action_size = action_size
    self.memory = deque(maxlen=20000)  # oldest transitions are dropped once full
    self.gamma = 0.9995    # discount rate
    self.epsilon = 1.0  # exploration rate
    self.epsilon_min = 0.01
    self.epsilon_decay = 0.995  # multiplicative decay applied once per replay() call
    self.learning_rate = 0.001
    self.model = self._build_model()

  def _build_model(self):
    """Build and compile the Q-network (24 -> 48 -> action_size, MSE loss, Adam)."""
    model = Sequential()
    model.add(Dense(24, input_dim=self.state_size, activation='relu'))
    model.add(Dense(48, activation='relu'))
    model.add(Dense(self.action_size, activation='linear'))
    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated in TF 2.x and rejected by newer Keras releases.
    model.compile(loss='mse',
                  optimizer=Adam(learning_rate=self.learning_rate))
    return model

  def remember(self, state, action, reward, next_state, done):
    """Append one transition to the replay buffer."""
    self.memory.append((state, action, reward, next_state, done))

  def act(self, state):
    """Pick an action with the epsilon-greedy policy.

    With probability ``epsilon`` a uniformly random action index is
    returned; otherwise the greedy action from `exploit` is returned.
    """
    if np.random.rand() <= self.epsilon:
      return random.randrange(self.action_size)
    return self.exploit(state)

  def exploit(self, state):
    """Return the greedy action (arg-max Q-value) for `state`, with no exploration.

    `state` is passed straight to ``model.predict`` and the first row of
    the prediction is used, so it should be a batch containing one state.
    """
    # verbose=0 keeps Keras from printing a progress bar on every step.
    act_values = self.model.predict(state, verbose=0)
    return np.argmax(act_values[0])

  def replay(self, batch_size):
    """Train the network on a random minibatch drawn from the replay buffer.

    The target for the taken action is ``reward`` for terminal transitions
    and ``reward + gamma * max_a Q(next_state, a)`` otherwise; Q-values of
    the other actions are left at the network's current predictions.
    After the update, ``epsilon`` is decayed once if it is still above
    ``epsilon_min``.

    Raises
    ------
    ValueError
        From ``random.sample`` when the buffer holds fewer than
        `batch_size` transitions.
    """
    minibatch = random.sample(self.memory, batch_size)
    states, actions, rewards, next_states, dones = zip(*minibatch)

    # Reshape explicitly instead of np.squeeze: squeeze also drops the batch
    # axis when batch_size == 1 (or the feature axis when state_size == 1).
    state_b = np.reshape(np.array(states), (batch_size, self.state_size))
    action_b = np.reshape(np.array(actions, dtype=int), batch_size)
    reward_b = np.reshape(np.array(rewards, dtype=float), batch_size)
    next_state_b = np.reshape(np.array(next_states), (batch_size, self.state_size))
    done_b = np.reshape(np.array(dones), batch_size) == 1

    next_q = self.model.predict(next_state_b, verbose=0)
    target = reward_b + self.gamma * np.amax(next_q, axis=1)
    # Terminal transitions have no future return to bootstrap from.
    target[done_b] = reward_b[done_b]

    target_f = self.model.predict(state_b, verbose=0)
    # Overwrite only the Q-value of the action that was actually taken.
    target_f[np.arange(batch_size), action_b] = target
    self.model.train_on_batch(state_b, target_f)

    if self.epsilon > self.epsilon_min:
      self.epsilon *= self.epsilon_decay

  def load(self, name):
    """Load network weights from the file/path `name`."""
    self.model.load_weights(name)

  def save(self, name):
    """Save network weights to the file/path `name`."""
    self.model.save_weights(name)

In [None]:
# ---- Training: run EPISODES episodes of at most 100 steps each ----
EPISODES = 30

state_size, action_size = 2, 4  # state: 2D coordinates; actions: up, down, right, left
agent = DQNAgent(state_size, action_size)
done = False
batch_size = 32
reward = 0
episode_reward_list = deque(maxlen=50)  # only the 50 most recent episode returns are kept
agent.memory.clear()
goal = np.array([3,3])

for e in range(EPISODES):
    # Draw the start position from a Gaussian centred at (-4, 0) whose two
    # diagonal variances are themselves sampled uniformly from [0.01, 1).
    var_x1 = np.random.uniform(0.01, 1)
    var_x2 = np.random.uniform(0.01, 1)
    cov = [[var_x1, 0], [0, var_x2]]
    x1, x2 = np.random.multivariate_normal([-4,0], cov, check_valid='warn').T
    state = np.array([x1,x2]).reshape(1, state_size)

    total_reward = 0
    for time in range(100):
        distance1 = np.linalg.norm(goal-state[0])
        action_index = agent.act(state)
        action = A[action_index]

        wind_value, severity = wind(state, means)

        # Transition: half a step along the chosen action plus a tenth of the wind.
        next_state = state + 0.5*action + 0.1*wind_value
        done = reached_goal(next_state)
        distance2 = np.linalg.norm(goal-next_state[0])

        # 0 when the step moved us closer to the goal, -1 otherwise;
        # reaching the goal adds a bonus of 10 on top.
        reward = 0.0 if (distance2-distance1) < 0 else -1.0
        if done:
            reward += 10
        # Disabled experiments kept for reference:
        # reward -= severity*10
        # if action_index == 0 or action_index == 2: #encourage moving up and right
        #          reward += 0.75

        total_reward += reward
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action_index, reward, next_state, done)
        state = next_state

        if not done:
            # Learn from a minibatch once the buffer holds more than batch_size samples.
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
            continue

        # Goal reached: report the step count and end the episode.
        print("episode: {}/{}, score: {}, e: {:.2}"
              .format(e, EPISODES, time, agent.epsilon))
        break

    episode_reward_list.append(total_reward)
    episode_reward_avg = np.mean(np.array(episode_reward_list))
    print("episode: {}/{}, score: {}, e: {:.2}, last 50 ep. avg. rew.: {:.2f}"
          .format(e, EPISODES, total_reward, agent.epsilon, episode_reward_avg))

In [None]:
# Persist the trained weights and/or restore them into a freshly built agent.
# Flip the two flags below to choose what this cell does.
save, load = True, False

if save:
    agent.save("latest_trained_agent")

if load:
    # The network must be rebuilt with the same sizes before weights are loaded.
    state_size, action_size = 2, 4  # 2D coordinates; up, down, right, left
    agent = DQNAgent(state_size, action_size)
    agent.load("latest_trained_agent")
    

In [None]:
# Evaluate the trained agent on a single rollout (at most 100 steps) from a
# fixed start position, then plot the trajectory over the data clusters.
s = np.array([-3,-3])
history = [s]

state_size = 2 #2D coordinates
action_size = 4 #up, down, right, left

goal = np.array([3, 3])
total_reward = 0
for t in range(100):
    s = np.reshape(s, [1, state_size])
    distance1 = np.linalg.norm(goal-s[0])
    # Use the greedy policy for evaluation: agent.exploit() never explores and,
    # unlike forcing agent.epsilon = 0.01 and calling act(), does not mutate the agent.
    action = agent.exploit(s)
    wind_value, severity = wind(s, means)
    # NOTE(review): this step model (0.4 * (action + wind)) differs from the one
    # used in the training cell (0.5*action + 0.1*wind) -- confirm that is intended.
    s = s + 0.4*(A[action] + wind_value)  # transition
    distance2 = np.linalg.norm(goal-s[0])
    history.append(s[0])
    if reached_goal(s):
        print(t)
        break
    # Same shaping as in training: 0 when the step got closer to the goal, -1 otherwise.
    reward = 0.0 if (distance2-distance1) < 0 else -1.0
    # reward -= severity*4
    total_reward += reward

print(total_reward)
history = np.array(history)

fig, ax = plt.subplots(figsize=(12,6))
ax.plot(history[:, 0], history[:, 1], label="Trajectory")
for i in range(num_concepts):
    # Rows of `clusters` are sliced in consecutive groups of examples_per_concept.
    start = i*examples_per_concept
    end = (i+1)*examples_per_concept
    ax.scatter(clusters[start:end,0], clusters[start:end,1], label=f"Cluster {i}")

ax.set_xlabel("x1")
ax.set_ylabel("x2")
ax.set_title("Greedy rollout of the trained agent")
ax.legend()
plt.show()