In [3]:
import gym
import random
import numpy as np
from collections import deque
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential






## Build Model
<!-- <h3>Base Model</h3> -->
- Epsilon = 1
- Epsilon_min = 0.01
- Epsilon_decay = 0.99
- Learning_rate = 0.01
- Discount_rate = 0.8
- Train_start = 1000
- Batch_size = 64

In [None]:
# Number of training episodes executed by each call to main().
EPISODES = 50

In [None]:
class DQNAgent:
    """Q-learning style agent with a replay buffer and a target network.

    Two identical dense networks are built: ``model`` is trained online and
    ``t_model`` (the target network) supplies the bootstrap values used to
    build training targets. ``t_model`` only changes when
    ``update_t_weights`` copies the online weights into it.

    Args:
        state_size: Number of input features of the network.
        action_size: Number of outputs of the network.
        epsilon_decay: Factor applied to ``epsilon`` every time a transition
            is stored, until ``epsilon`` is no longer above ``epsilon_min``.
        learning_rate: Learning rate handed to the Adam optimizer.
        discount_rate: Weight of the bootstrapped next-state value.
    """

    def __init__(self, state_size, action_size, epsilon_decay=0.99, learning_rate=0.01, discount_rate=0.8):
        self.state_size = state_size
        self.action_size = action_size

        # Read by the training loop to decide whether to call env.render().
        self.render = False

        # Exploration schedule.
        self.epsilon = 1
        self.epsilon_min = 0.01
        self.epsilon_decay = epsilon_decay

        # Optimisation settings.
        self.learning_rate = learning_rate
        self.discount_rate = discount_rate
        self.train_start = 1000  # minimum number of stored transitions before fitting
        self.batch_size = 64

        # Replay buffer; the oldest transitions are dropped once 3000 are stored.
        self.memory = deque(maxlen=3000)

        self.model = self.build_model()
        self.t_model = self.build_model()

        # Start with both networks holding the same weights.
        self.update_t_weights()

    def build_model(self):
        """Build and compile a 24-24 ReLU network with a linear output layer."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation="relu"))
        model.add(Dense(24, activation="relu"))
        model.add(Dense(self.action_size, activation="linear"))
        model.summary()
        # `learning_rate` replaces the deprecated `lr` keyword, which newer
        # Keras releases no longer accept.
        model.compile(loss="mse", optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def append_sample(self, state, action, reward, next_state, done):
        """Store one transition and decay epsilon while it is above its floor."""
        self.memory.append((state, action, reward, next_state, done))

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def update_t_weights(self):
        """Copy the online network's weights into the target network."""
        self.t_model.set_weights(self.model.get_weights())

    def get_action(self, state, env):
        """Return a random action with probability epsilon, else the network output.

        Args:
            state: Batch of one observation, as accepted by ``model.predict``.
            env: Environment whose ``action_space.sample()`` provides the
                exploratory action.

        Returns:
            Either ``env.action_space.sample()`` or the first row of the
            online network's prediction for ``state``.
        """
        if np.random.rand() < self.epsilon:
            return env.action_space.sample()
        # NOTE(review): the raw network output row is handed back as the action
        # (there is no argmax over discrete actions) - confirm this is intended.
        q_value = self.model.predict(state, verbose=0)
        return q_value[0]

    def train(self):
        """Fit the online network on one random mini-batch from the replay buffer.

        Does nothing until at least ``train_start`` transitions are stored.
        """
        if len(self.memory) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        # Transpose the list of transitions into per-field tuples.
        states, _actions, rewards, next_states, dones = zip(*mini_batch)
        states = np.asarray(states).reshape(batch_size, self.state_size)
        next_states = np.asarray(next_states).reshape(batch_size, self.state_size)

        target = self.model.predict(states, verbose=0)
        # Bootstrap values come from the target network, not the online one;
        # otherwise t_model would be built and synced but never used.
        t_target = self.t_model.predict(next_states, verbose=0)

        # Iterate over the sampled batch size (not self.batch_size) so the
        # loop can never run past the arrays built above.
        for i in range(batch_size):
            # NOTE(review): the whole output row is overwritten, matching the
            # original behaviour; stored actions are not used to select an entry.
            if dones[i]:
                target[i] = rewards[i]
            else:
                target[i] = rewards[i] + self.discount_rate * t_target[i]

        self.model.fit(states, target, epochs=1, batch_size=batch_size, verbose=0)

    def save_weights(self, filename):
        """Write the online network's weights to ``filename``."""
        self.model.save_weights(filename)



def main(epsilon_decay=0.99, learning_rate=0.01, discount_rate=0.8, filename="best_pendulum.h5"):
    """Train a DQNAgent on Pendulum-v0 for EPISODES episodes.

    Args:
        epsilon_decay: Passed to DQNAgent.
        learning_rate: Passed to DQNAgent.
        discount_rate: Passed to DQNAgent.
        filename: Path the agent's weights are saved to whenever an episode
            beats the best score seen so far in this run.

    Returns:
        A ``(scores, episodes)`` tuple of equal-length lists: the accumulated
        reward of every finished episode and the matching episode index.
    """
    env = gym.make("Pendulum-v0")
    scores = []
    episodes = []

    try:
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.shape[0]

        agent = DQNAgent(state_size, action_size, epsilon_decay, learning_rate, discount_rate)
        best_score = -np.inf

        for e in range(EPISODES):
            done = False
            score = 0

            state = env.reset()
            state = np.reshape(state, [1, state_size])

            while not done:
                if agent.render:
                    env.render()

                action = agent.get_action(state, env)
                next_state, reward, done, info = env.step(action)
                next_state = np.reshape(next_state, [1, state_size])

                # On the terminating step the environment reward is replaced
                # by -16, unless the running score is still exactly 0.
                reward = reward if not done or score == 0 else -16

                agent.append_sample(state, action, reward, next_state, done)
                agent.train()

                score += reward
                state = next_state

            # Episode finished: record it so callers actually receive the
            # learning curve (previously both lists were returned empty).
            scores.append(score)
            episodes.append(e)

            if score > best_score:
                best_score = score
                agent.save_weights(filename)

            # Sync the target network once per episode.
            agent.update_t_weights()
            print("episode:", e, "  score:", score, "  memory length:",
                  len(agent.memory), "  epsilon:", agent.epsilon)
    finally:
        # Release the environment even if training raises.
        env.close()

    return scores, episodes

In [None]:
# Baseline run: train with main()'s default hyperparameters and keep the
# (scores, episodes) lists it returns.
res_scores, res_episodes = main()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Learning curve of the baseline run: score per episode.
# (x must be the episode indices, not the scalar EPISODES constant.)
sns.lineplot(x=res_episodes, y=res_scores)

plt.xlabel('Episode')
plt.ylabel('Score')
plt.show()

# Tune the model

In [None]:
# Candidate values for the hyperparameters we want to tune.
epsilon_decay = [0.99, 0.98, 0.97]
# Build 0.01 ... 0.09 from integers and round, so the values are exactly the
# two-decimal floats. A float-step np.arange yields e.g. 0.060000000000000005,
# which then leaks into checkpoint filenames and plot legends.
learning_rate = np.round(np.arange(1, 10) * 0.01, 2)
discount_rate = [0.8, 0.9, 0.95]

### Epsilon Decay

In [None]:
# One training run per candidate epsilon decay; keep each run's score curve.
epsilon_decay_scores = []

for decay in epsilon_decay:
    res_scores, res_episodes = main(decay, filename=f"best_epislion_decay{decay}.h5")
    # Store the scores (not the episode indices) - this list feeds the score plot.
    epsilon_decay_scores.append(res_scores)

In [None]:
# One score curve per epsilon-decay value, indexed by episode number.
for decay, scores in zip(epsilon_decay, epsilon_decay_scores):
    # x must be a vector matching y; the scalar EPISODES constant is not valid here.
    sns.lineplot(x=np.arange(len(scores)), y=scores, label=decay)

plt.xlabel('Episode')
plt.ylabel('Score')
plt.show()

### Learning Rate


In [None]:
# One training run per candidate learning rate; keep each run's score curve.
learning_rate_scores = []

for lr in learning_rate:
    # epsilon_decay is held at the baseline 0.99. The previous placeholder of
    # 0.01 dropped epsilon to its floor after a single stored sample, so the
    # sweep ran with almost no exploration.
    # TODO: replace with the best value found in the epsilon-decay sweep.
    res_scores, res_episodes = main(epsilon_decay=0.99, learning_rate=lr, filename=f"best_learning{lr}.h5")
    # Store the scores (not the episode indices) - this list feeds the score plot.
    learning_rate_scores.append(res_scores)

In [None]:
# One score curve per learning rate, indexed by episode number.
for lr, scores in zip(learning_rate, learning_rate_scores):
    # x must be a vector matching y; the scalar EPISODES constant is not valid here.
    sns.lineplot(x=np.arange(len(scores)), y=scores, label=lr)

plt.xlabel('Episode')
plt.ylabel('Score')
plt.show()

In [None]:
### Discount rate
# One training run per candidate discount rate; keep each run's score curve.
discount_rate_scores = []

for gamma in discount_rate:
    # epsilon_decay is held at the baseline 0.99; the previous value of 0.01
    # dropped epsilon to its floor after a single stored sample.
    res_scores, res_episodes = main(epsilon_decay=0.99, learning_rate=0.01, discount_rate=gamma, filename=f"best_discount{gamma}.h5")
    # Store the scores (not the episode indices) - this list feeds the score plot.
    discount_rate_scores.append(res_scores)

In [None]:
# One score curve per discount rate, indexed by episode number.
for gamma, scores in zip(discount_rate, discount_rate_scores):
    # x must be a vector matching y; the scalar EPISODES constant is not valid here.
    sns.lineplot(x=np.arange(len(scores)), y=scores, label=gamma)

plt.xlabel('Episode')
plt.ylabel('Score')
plt.show()

### Final DQN Model

In [None]:
# Final DQN configuration; main() saves the weights of the best-scoring
# episode of this run to the given filename.
res_scores, res_episodes = main(epsilon_decay=0.99, learning_rate=0.01, discount_rate=0.8, filename="best_DQN_pendulum.h5")

In [None]:
# Learning curve of the final model: score per episode.
# (x must be the episode indices, not the scalar EPISODES constant.)
sns.lineplot(x=res_episodes, y=res_scores)

plt.xlabel('Episode')
plt.ylabel('Score')
plt.show()

### DDPG

In [None]:
class DDPG:
    """Stub for a DDPG agent; constructing it sets up no state."""

    def __init__(self):
        # Nothing to initialise yet.
        pass