In [None]:
import numpy as np
import tensorflow as tf
import gym
import os
import datetime
from statistics import mean
from gym import wrappers


class MyModel(tf.keras.Model):
    """Fully connected Keras model producing one linear output per action.

    The network is an input layer sized for ``num_states`` features, one
    tanh ``Dense`` layer per entry of ``hidden_units`` (each entry is the
    width of that layer), and a linear ``Dense`` head with ``num_actions``
    outputs. All dense kernels use the ``'RandomNormal'`` initializer.
    """

    def __init__(self, num_states, hidden_units, num_actions):
        super().__init__()
        dense = tf.keras.layers.Dense
        self.input_layer = tf.keras.layers.InputLayer(input_shape=(num_states,))
        # One tanh layer per requested width, created in the given order.
        self.hidden_layers = [
            dense(width, activation='tanh', kernel_initializer='RandomNormal')
            for width in hidden_units
        ]
        self.output_layer = dense(
            num_actions, activation='linear', kernel_initializer='RandomNormal')

    @tf.function
    def call(self, inputs):
        """Run ``inputs`` through every layer in order and return the head's output."""
        activations = self.input_layer(inputs)
        for hidden in self.hidden_layers:
            activations = hidden(activations)
        return self.output_layer(activations)


class DQN:
    """Q-learning agent: a ``MyModel`` network, an Adam optimizer and a replay buffer.

    The replay buffer is a dict of parallel lists keyed by ``'s'`` (state),
    ``'a'`` (action), ``'r'`` (reward), ``'s2'`` (next state) and ``'done'``.
    It holds at most ``max_experiences`` transitions, and training is
    skipped until it holds at least ``min_experiences``.
    """

    def __init__(self, num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr):
        # Hyper-parameters.
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.gamma = gamma
        self.max_experiences = max_experiences
        self.min_experiences = min_experiences
        # Learner and its network.
        self.optimizer = tf.optimizers.Adam(lr)
        self.model = MyModel(num_states, hidden_units, num_actions)
        # Replay buffer: one list per transition field, kept in lockstep.
        self.experience = {'s': [], 'a': [], 'r': [], 's2': [], 'done': []}

    def predict(self, inputs):
        """Return the network output for ``inputs`` cast to float32 and made 2-D."""
        batch = np.atleast_2d(inputs.astype('float32'))
        return self.model(batch)

    def train(self, TargetNet):
        """Run one gradient step on a random minibatch from the replay buffer.

        ``TargetNet`` supplies the next-state values used to build the
        regression targets. Returns the int ``0`` when the buffer holds
        fewer than ``min_experiences`` transitions, otherwise the mean
        squared error loss produced by TensorFlow.
        """
        buffer = self.experience
        stored = len(buffer['s'])
        if stored < self.min_experiences:
            return 0

        # Indices are drawn with replacement from the whole buffer.
        ids = np.random.randint(low=0, high=stored, size=self.batch_size)

        def column(key):
            return np.asarray([buffer[key][i] for i in ids])

        states = column('s')
        actions = column('a')
        rewards = column('r')
        states_next = column('s2')
        dones = column('done')

        # Target: the reward alone on terminal transitions, otherwise the
        # reward plus the discounted best next-state value from TargetNet.
        value_next = np.max(TargetNet.predict(states_next), axis=1)
        actual_values = np.where(dones, rewards, rewards + self.gamma * value_next)

        with tf.GradientTape() as tape:
            q_values = self.predict(states)
            action_mask = tf.one_hot(actions, self.num_actions)
            # Keep only the output belonging to the action actually taken.
            selected_action_values = tf.math.reduce_sum(q_values * action_mask, axis=1)
            loss = tf.math.reduce_mean(tf.square(actual_values - selected_action_values))

        variables = self.model.trainable_variables
        self.optimizer.apply_gradients(zip(tape.gradient(loss, variables), variables))
        return loss

    def get_action(self, states, epsilon):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the argmax of the network output for ``states``.
        """
        explore = np.random.random() < epsilon
        if explore:
            return np.random.choice(self.num_actions)
        q_values = self.predict(np.atleast_2d(states))[0]
        return np.argmax(q_values)

    def add_experience(self, exp):
        """Append transition ``exp`` (a dict keyed like the buffer) to the buffer.

        When the buffer is already at ``max_experiences`` the oldest entry
        of every field is dropped first.
        """
        buffer = self.experience
        if len(buffer['s']) >= self.max_experiences:
            for column in buffer.values():
                column.pop(0)
        for field, value in exp.items():
            buffer[field].append(value)

    def copy_weights(self, TrainNet):
        """Overwrite this agent's trainable variables with those of ``TrainNet``."""
        own_variables = self.model.trainable_variables
        source_variables = TrainNet.model.trainable_variables
        for target_var, source_var in zip(own_variables, source_variables):
            target_var.assign(source_var.numpy())


def play_game(env, TrainNet, TargetNet, epsilon, copy_step):
    """Play one episode, training ``TrainNet`` after every environment step.

    Args:
        env: environment exposing ``reset()`` and a ``step(action)`` that
            returns ``(observation, reward, done, info)``.
        TrainNet: agent being trained; must provide ``get_action``,
            ``add_experience`` and ``train``.
        TargetNet: agent passed to ``TrainNet.train`` and refreshed from
            ``TrainNet`` via ``copy_weights``.
        epsilon: exploration rate forwarded to ``TrainNet.get_action``.
        copy_step: ``TargetNet`` is synchronised every ``copy_step`` steps
            (the counter restarts with each episode).

    Returns:
        ``(rewards, mean_loss)``: the sum of the rewards reported by the
        environment (the terminal penalty below is NOT included) and the
        mean of the per-step training losses.
    """
    rewards = 0
    step_count = 0  # was `iter`, which shadowed the builtin
    done = False
    observations = env.reset()
    losses = []
    while not done:
        action = TrainNet.get_action(observations, epsilon)
        prev_observations = observations
        observations, reward, done, _ = env.step(action)
        rewards += reward
        if done:
            # The penalty only affects the stored transition; `rewards`
            # above already recorded the environment's own reward.
            # NOTE(review): this also fires when `done` comes from an
            # episode time limit rather than a failure - confirm intended.
            # The environment is not reset here: the observation would be
            # discarded and the next episode resets on entry anyway.
            reward = -200

        exp = {'s': prev_observations, 'a': action, 'r': reward, 's2': observations, 'done': done}
        TrainNet.add_experience(exp)
        loss = TrainNet.train(TargetNet)
        # `train` may return a plain int (no update performed) or an object
        # exposing `.numpy()`; normalise both to a number.
        losses.append(loss if isinstance(loss, int) else loss.numpy())
        step_count += 1
        if step_count % copy_step == 0:
            TargetNet.copy_weights(TrainNet)
    return rewards, mean(losses)

def make_video(env, TrainNet):
    """Play one greedy episode under a recording wrapper and print its score.

    ``env`` is wrapped in ``wrappers.Monitor`` writing to a ``videos``
    directory under the current working directory (``force=True``).
    Actions come from ``TrainNet.get_action`` with epsilon 0, i.e. with no
    random exploration. The step count and summed reward are printed.
    """
    video_dir = os.path.join(os.getcwd(), "videos")
    recorder = wrappers.Monitor(env, video_dir, force=True)
    total_reward = 0
    step_count = 0
    observation = recorder.reset()
    while True:
        recorder.render()
        greedy_action = TrainNet.get_action(observation, 0)
        observation, reward, finished, _ = recorder.step(greedy_action)
        step_count += 1
        total_reward += reward
        if finished:
            break
    print("Testing steps: {} rewards {}: ".format(step_count, total_reward))


def main():
    """Train a DQN agent on CartPole-v0, log progress, then record a video.

    Runs ``N`` training episodes with an exponentially decaying epsilon
    (floored at ``min_epsilon``). Per-episode reward, a running average
    over the most recent 100 episodes and the mean loss are written as
    TensorBoard scalars under ``logs/dqn/<timestamp>``. A progress line is
    printed every 100 episodes. After training, one greedy episode is
    recorded with ``make_video``.
    """
    env = gym.make('CartPole-v0')
    try:
        gamma = 0.99
        copy_step = 25
        num_states = len(env.observation_space.sample())
        num_actions = env.action_space.n
        hidden_units = [200, 200]
        max_experiences = 10000
        min_experiences = 100
        batch_size = 32
        lr = 1e-2
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        log_dir = 'logs/dqn/' + current_time
        summary_writer = tf.summary.create_file_writer(log_dir)

        TrainNet = DQN(num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)
        TargetNet = DQN(num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)
        N = 50000
        avg_window = 100  # number of most recent episodes in the running average
        total_rewards = np.empty(N)
        epsilon = 0.99
        decay = 0.9999
        min_epsilon = 0.1
        for n in range(N):
            epsilon = max(min_epsilon, epsilon * decay)
            total_reward, losses = play_game(env, TrainNet, TargetNet, epsilon, copy_step)
            total_rewards[n] = total_reward
            # Episodes n-99..n inclusive: exactly `avg_window` entries once
            # enough episodes exist (the previous slice covered 101).
            avg_rewards = total_rewards[max(0, n - avg_window + 1):(n + 1)].mean()
            with summary_writer.as_default():
                tf.summary.scalar('episode reward', total_reward, step=n)
                tf.summary.scalar('running avg reward(100)', avg_rewards, step=n)
                # Tag previously carried a stray ')' ("average loss)").
                tf.summary.scalar('average loss', losses, step=n)
            if n % 100 == 0:
                print("episode:", n, "episode reward:", total_reward, "eps:", epsilon, "avg reward (last 100):", avg_rewards,
                      "episode loss: ", losses)
        print("avg reward for last 100 episodes:", avg_rewards)
        make_video(env, TrainNet)
    finally:
        # Release the environment even if training is interrupted or fails.
        env.close()


if __name__ == '__main__':
    # Run the whole train-and-record pipeline three times back to back;
    # every call to main() builds its own environment and networks.
    for _ in range(3):
        main()

episode: 0 episode reward: 23.0 eps: 0.989901 avg reward (last 100): 23.0 episode loss:  0
episode: 100 episode reward: 12.0 eps: 0.980050830419928 avg reward (last 100): 20.653465346534652 episode loss:  68.23142
episode: 200 episode reward: 17.0 eps: 0.9702986765411791 avg reward (last 100): 22.18811881188119 episode loss:  65.00485
episode: 300 episode reward: 18.0 eps: 0.960643563042708 avg reward (last 100): 23.85148514851485 episode loss:  184.54599
episode: 400 episode reward: 16.0 eps: 0.9510845243085565 avg reward (last 100): 23.84158415841584 episode loss:  294.69516
episode: 500 episode reward: 14.0 eps: 0.9416206043312847 avg reward (last 100): 22.663366336633665 episode loss:  167.42046
episode: 600 episode reward: 13.0 eps: 0.9322508566163586 avg reward (last 100): 21.623762376237625 episode loss:  307.902
episode: 700 episode reward: 12.0 eps: 0.9229743440874912 avg reward (last 100): 23.425742574257427 episode loss:  261.20093
episode: 800 episode reward: 23.0 eps: 0.91

episode: 6600 episode reward: 16.0 eps: 0.5116147681731024 avg reward (last 100): 24.504950495049506 episode loss:  185.02132
episode: 6700 episode reward: 59.0 eps: 0.5065238628945193 avg reward (last 100): 27.386138613861387 episode loss:  277.4452
episode: 6800 episode reward: 14.0 eps: 0.501483615490118 avg reward (last 100): 27.0 episode loss:  186.41364
episode: 6900 episode reward: 26.0 eps: 0.4964935218805499 avg reward (last 100): 25.861386138613863 episode loss:  199.8156
episode: 7000 episode reward: 12.0 eps: 0.49155308300238854 avg reward (last 100): 24.257425742574256 episode loss:  379.40692
episode: 7100 episode reward: 29.0 eps: 0.48666180475821974 avg reward (last 100): 28.465346534653467 episode loss:  220.12282
episode: 7200 episode reward: 13.0 eps: 0.48181919796722483 avg reward (last 100): 24.22772277227723 episode loss:  707.77167
episode: 7300 episode reward: 14.0 eps: 0.477024778316258 avg reward (last 100): 25.485148514851485 episode loss:  448.20355
episode:

episode: 13200 episode reward: 45.0 eps: 0.26442004908856137 avg reward (last 100): 34.73267326732673 episode loss:  483.48557
episode: 13300 episode reward: 27.0 eps: 0.2617888947368702 avg reward (last 100): 36.118811881188115 episode loss:  543.8843
episode: 13400 episode reward: 31.0 eps: 0.25918392211098357 avg reward (last 100): 33.42574257425743 episode loss:  772.10986
episode: 13500 episode reward: 11.0 eps: 0.2566048706854152 avg reward (last 100): 38.366336633663366 episode loss:  196.72615
episode: 13600 episode reward: 110.0 eps: 0.25405148252708043 avg reward (last 100): 32.13861386138614 episode loss:  384.07675
episode: 13700 episode reward: 40.0 eps: 0.25152350226949866 avg reward (last 100): 38.475247524752476 episode loss:  501.1154
episode: 13800 episode reward: 68.0 eps: 0.24902067708725495 avg reward (last 100): 34.693069306930695 episode loss:  637.8317
episode: 13900 episode reward: 45.0 eps: 0.24654275667071443 avg reward (last 100): 33.13861386138614 episode l

episode: 19700 episode reward: 90.0 eps: 0.13803488341191486 avg reward (last 100): 46.16831683168317 episode loss:  731.533
episode: 19800 episode reward: 55.0 eps: 0.13666134503830696 avg reward (last 100): 49.37623762376238 episode loss:  248.58652
